<a href="https://colab.research.google.com/github/sabire113/Master/blob/main/Neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural network

In [5]:
from google.colab import files
uploaded = files.upload()


Saving OSEBX_Market_Macro_Data_2015_2024.csv to OSEBX_Market_Macro_Data_2015_2024.csv


In [6]:
import pandas as pd

# Load the dataset
file_path = "OSEBX_Market_Macro_Data_2015_2024.csv"
df = pd.read_csv(file_path)

# Display basic information and first few rows
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7080 entries, 0 to 7079
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     7080 non-null   object 
 1   Instrument               7080 non-null   object 
 2   First Trade Date         7080 non-null   object 
 3   ClosePrice               7080 non-null   float64
 4   OpenPrice                7080 non-null   float64
 5   Volume                   7080 non-null   float64
 6   BidPrice                 7080 non-null   float64
 7   AskPrice                 7080 non-null   float64
 8   DividendYield            7079 non-null   float64
 9   BookValuePerShare        7080 non-null   float64
 10  Beta                     7069 non-null   float64
 11  MarketCap                7080 non-null   float64
 12  CommonSharesOutstanding  7080 non-null   float64
 13  MonthlyReturn            7080 non-null   float64
 14  EconomicSector          

(None,
          Date Instrument First Trade Date  ClosePrice   OpenPrice   Volume  \
 0  2015-01-31    AFGA.OL       1997-09-08   79.420230   79.890172      0.0   
 1  2015-02-28    AFGA.OL       1997-09-08   85.059536   85.529479      0.0   
 2  2015-03-31    AFGA.OL       1997-09-08   93.048554   93.988438  29730.0   
 3  2015-04-30    AFGA.OL       1997-09-08   97.747976   93.988438  31574.0   
 4  2015-05-31    AFGA.OL       1997-09-08  105.267051  100.567629      0.0   
 
      BidPrice    AskPrice  DividendYield  BookValuePerShare  ...  \
 0   79.420230   80.595086            NaN          15.058302  ...   
 1   84.354623   85.059536       5.524862          15.723256  ...   
 2   92.578611   93.518496       5.050505          15.723256  ...   
 3   96.808091   97.747976       4.807692          15.723256  ...   
 4  104.327166  105.267051       4.464286          15.723256  ...   
 
    TurnoverRatio   BrentOil    USDNOK    EURNOK US10Y    USCPI USGDPGrowth  \
 0       0.000000  52.

# New Section

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop rows with missing target variable (OSEBXReturns)
df_clean = df.dropna(subset=["OSEBXReturns"]).copy()

# Convert Date column to datetime format
df_clean["Date"] = pd.to_datetime(df_clean["Date"])

# Selecting Features (X) and Target (Y)
features = [
    "Momentum_3M", "Momentum_6M", "Momentum_12M",
    "Volatility_3M", "Volatility_6M", "Volatility_12M",
    "Volume", "TurnoverRatio", "BidAskSpread",
    "MarketCap", "DividendYield", "BookValuePerShare",
    "EarningsPerShare", "Beta", "USDNOK", "EURNOK",
    "US10Y", "USCPI", "USGDPGrowth", "NorgesBank10Y", "NorwegianCPI",
    "BrentOil"
]

target = "OSEBXReturns"

# Drop remaining rows with missing features
df_clean = df_clean.dropna(subset=features)

# Sorting dataset by Date
df_clean = df_clean.sort_values(by="Date")

# Splitting data into training (2015-2019), validation (2020-2022), and test (2023-2024)
train = df_clean[(df_clean["Date"].dt.year >= 2015) & (df_clean["Date"].dt.year <= 2019)]
valid = df_clean[(df_clean["Date"].dt.year >= 2020) & (df_clean["Date"].dt.year <= 2022)]
test = df_clean[(df_clean["Date"].dt.year >= 2023)]

# Extract features and target
X_train, y_train = train[features], train[target]
X_valid, y_valid = valid[features], valid[target]
X_test, y_test = test[features], test[target]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=features, index=X_valid.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)

# Display the cleaned and split dataset info
display(df_clean)

# Summary
{
    "Train Set": X_train_scaled.shape,
    "Validation Set": X_valid_scaled.shape,
    "Test Set": X_test_scaled.shape,
    "Target Variable": target,
    "Feature Count": len(features)
}


Unnamed: 0,Date,Instrument,First Trade Date,ClosePrice,OpenPrice,Volume,BidPrice,AskPrice,DividendYield,BookValuePerShare,...,TurnoverRatio,BrentOil,USDNOK,EURNOK,US10Y,USCPI,USGDPGrowth,OSEBXReturns,NorgesBank10Y,NorwegianCPI
480,2015-01-31,AKSOA.OL,2014-09-29,24.114697,24.650013,0.0,24.114697,24.197544,11.898241,22.904350,...,0.0,52.990002,7.72540,8.72510,1.68,234.747,3.6,-0.020052,1.512,98.5
2760,2015-01-31,FLNG.OL,2007-04-19,86.190732,86.190732,0.0,86.190732,88.653324,10.452511,16.617404,...,0.0,52.990002,7.72540,8.72510,1.68,234.747,3.6,-0.020052,1.512,98.5
6600,2015-01-31,VEI.OL,1986-06-23,67.629914,67.841920,0.0,67.417908,67.629914,13.295551,17.980553,...,0.0,52.990002,7.72540,8.72510,1.68,234.747,3.6,-0.020052,1.512,98.5
4800,2015-01-31,NYKD.OL,2020-01-27,100.450000,102.000000,0.0,19.200000,19.200000,0.000000,3.134430,...,0.0,52.990002,7.72540,8.72510,1.68,234.747,3.6,-0.020052,1.512,98.5
600,2015-01-31,ATEA.OL,1985-03-28,75.787717,77.113450,0.0,75.566762,75.787717,73.954984,34.743722,...,0.0,52.990002,7.72540,8.72510,1.68,234.747,3.6,-0.020052,1.512,98.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5759,2024-12-31,SCHB.OL,2015-06-01,334.200000,332.600000,0.0,333.200000,334.200000,21.170455,185.422331,...,0.0,72.940002,11.32762,11.78811,4.58,317.603,2.3,-0.020052,3.599,137.6
1559,2024-12-31,BWLPG.OL,2013-11-21,125.300000,127.000000,0.0,125.300000,125.400000,25.788244,11.047649,...,0.0,72.940002,11.32762,11.78811,4.58,317.603,2.3,-0.020052,3.599,137.6
5879,2024-12-31,SNI.OL,1996-02-02,289.000000,292.000000,0.0,289.000000,290.000000,10.533452,35.611202,...,0.0,72.940002,11.32762,11.78811,4.58,317.603,2.3,-0.020052,3.599,137.6
4919,2024-12-31,NYKD.OL,2020-01-27,3.046000,3.000000,0.0,3.020000,3.046000,0.000000,0.569875,...,0.0,72.940002,11.32762,11.78811,4.58,317.603,2.3,-0.020052,3.599,137.6


{'Train Set': (3528, 22),
 'Validation Set': (2124, 22),
 'Test Set': (1416, 22),
 'Target Variable': 'OSEBXReturns',
 'Feature Count': 22}

In [8]:
pip install tensorflow scikit-learn pandas numpy matplotlib




In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.metrics import r2_score
import numpy as np

# Load your cleaned dataset (X_train_scaled, X_valid_scaled, etc.)
# Ensure you have run the data preprocessing steps before this!

# Set random seed for reproducibility
tf.random.set_seed(42)

# Define the Neural Network (NN3) Model
model = keras.Sequential([
    layers.Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.01), input_shape=(X_train_scaled.shape[1],)),
    layers.BatchNormalization(),
    layers.Dense(16, activation="relu", kernel_regularizer=regularizers.l2(0.01)),
    layers.BatchNormalization(),
    layers.Dense(8, activation="relu", kernel_regularizer=regularizers.l2(0.01)),
    layers.BatchNormalization(),
    layers.Dense(1, activation="linear")  # Output layer (predicting excess return)
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse", metrics=["mae"])

# Train the model with early stopping
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_valid_scaled, y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate the model
y_train_pred = model.predict(X_train_scaled).flatten()
y_valid_pred = model.predict(X_valid_scaled).flatten()
y_test_pred = model.predict(X_test_scaled).flatten()

# Compute R-squared
r2_train = r2_score(y_train, y_train_pred)
r2_valid = r2_score(y_valid, y_valid_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print Results
print(f"Train R²: {r2_train:.4f}")
print(f"Validation R²: {r2_valid:.4f}")
print(f"Test R² (Out-of-Sample): {r2_test:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 1.8483 - mae: 0.8675 - val_loss: 8.7090 - val_mae: 1.9806
Epoch 2/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.8057 - mae: 0.3858 - val_loss: 9.3910 - val_mae: 2.0790
Epoch 3/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.6514 - mae: 0.2673 - val_loss: 7.8976 - val_mae: 1.9135
Epoch 4/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.5746 - mae: 0.2086 - val_loss: 6.2019 - val_mae: 1.6743
Epoch 5/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.5187 - mae: 0.1732 - val_loss: 4.8642 - val_mae: 1.4690
Epoch 6/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.4713 - mae: 0.1483 - val_loss: 3.8897 - val_mae: 1.3060
Epoch 7/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms

In [10]:
!pip install tensorflow scikit-learn pandas numpy matplotlib

import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load the dataset
file_path = "/content/OSEBX_Market_Macro_Data_2015_2024.csv"
df = pd.read_csv(file_path)

# Preprocessing
df["Date"] = pd.to_datetime(df["Date"])
df = df.dropna(subset=["OSEBXReturns"])  # Drop missing target values

features = [
    "Momentum_3M", "Momentum_6M", "Momentum_12M",
    "Volatility_3M", "Volatility_6M", "Volatility_12M",
    "Volume", "TurnoverRatio", "BidAskSpread",
    "MarketCap", "DividendYield", "BookValuePerShare",
    "EarningsPerShare", "Beta", "USDNOK", "EURNOK",
    "US10Y", "USCPI", "USGDPGrowth", "NorgesBank10Y", "NorwegianCPI",
    "BrentOil"
]

target = "OSEBXReturns"
df = df.dropna(subset=features)  # Drop missing feature values

# Splitting data by year
df = df.sort_values(by="Date")
train = df[(df["Date"].dt.year >= 2015) & (df["Date"].dt.year <= 2019)]
valid = df[(df["Date"].dt.year >= 2020) & (df["Date"].dt.year <= 2022)]
test = df[(df["Date"].dt.year >= 2023)]

X_train, y_train = train[features], train[target]
X_valid, y_valid = valid[features], valid[target]
X_test, y_test = test[features], test[target]

# Standardizing features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Feature Selection using Elastic Net and Random Forest
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)
elastic_net_importance = abs(elastic_net.coef_)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_importance = rf.feature_importances_

# Select Top Features
feature_importance_df = pd.DataFrame({
    "Feature": features,
    "ElasticNet Importance": elastic_net_importance,
    "RandomForest Importance": rf_importance
})

feature_importance_df["Avg Importance"] = (feature_importance_df["ElasticNet Importance"] + feature_importance_df["RandomForest Importance"]) / 2
feature_importance_df = feature_importance_df.sort_values(by="Avg Importance", ascending=False)

top_features = feature_importance_df["Feature"].head(10).tolist()

X_train_selected = pd.DataFrame(X_train_scaled, columns=features)[top_features]
X_valid_selected = pd.DataFrame(X_valid_scaled, columns=features)[top_features]
X_test_selected = pd.DataFrame(X_test_scaled, columns=features)[top_features]

# Define Updated Neural Network with Regularization
model = keras.Sequential([
    layers.Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.1), input_shape=(X_train_selected.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(16, activation="relu", kernel_regularizer=regularizers.l2(0.1)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(8, activation="relu", kernel_regularizer=regularizers.l2(0.1)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(1, activation="linear")  # Output layer
])

# Compile Model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse", metrics=["mae"])

# Train the model with Early Stopping
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True)

history = model.fit(
    X_train_selected, y_train,
    validation_data=(X_valid_selected, y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate Model
y_train_pred = model.predict(X_train_selected).flatten()
y_valid_pred = model.predict(X_valid_selected).flatten()
y_test_pred = model.predict(X_test_selected).flatten()

# Compute R² Scores
r2_train = r2_score(y_train, y_train_pred)
r2_valid = r2_score(y_valid, y_valid_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Train R²: {r2_train:.4f}")
print(f"Validation R²: {r2_valid:.4f}")
print(f"Test R² (Out-of-Sample): {r2_test:.4f}")




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 5.3907 - mae: 0.7567 - val_loss: 3.9183 - val_mae: 0.4973
Epoch 2/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3.6161 - mae: 0.4827 - val_loss: 2.6468 - val_mae: 0.3355
Epoch 3/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.4730 - mae: 0.3648 - val_loss: 1.7387 - val_mae: 0.2260
Epoch 4/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.6306 - mae: 0.2888 - val_loss: 1.1417 - val_mae: 0.1839
Epoch 5/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0625 - mae: 0.2286 - val_loss: 0.7671 - val_mae: 0.2021
Epoch 6/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.6859 - mae: 0.1919 - val_loss: 0.4790 - val_mae: 0.1556
Epoch 7/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/

In [11]:
!pip install tensorflow scikit-learn pandas numpy matplotlib

import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Load the dataset
file_path = "/content/OSEBX_Market_Macro_Data_2015_2024.csv"
df = pd.read_csv(file_path)

# Preprocessing
df["Date"] = pd.to_datetime(df["Date"])
df = df.dropna(subset=["OSEBXReturns"])  # Drop missing target values

features = [
    "Momentum_3M", "Momentum_6M", "Momentum_12M",
    "Volatility_3M", "Volatility_6M", "Volatility_12M",
    "Volume", "TurnoverRatio", "BidAskSpread",
    "MarketCap", "DividendYield", "BookValuePerShare",
    "EarningsPerShare", "Beta", "USDNOK", "EURNOK",
    "US10Y", "USCPI", "USGDPGrowth", "NorgesBank10Y", "NorwegianCPI",
    "BrentOil"
]

target = "OSEBXReturns"
df = df.dropna(subset=features)  # Drop missing feature values

# Splitting data by year
df = df.sort_values(by="Date")
train = df[(df["Date"].dt.year >= 2015) & (df["Date"].dt.year <= 2019)]
valid = df[(df["Date"].dt.year >= 2020) & (df["Date"].dt.year <= 2022)]
test = df[(df["Date"].dt.year >= 2023)]

X_train, y_train = train[features], train[target]
X_valid, y_valid = valid[features], valid[target]
X_test, y_test = test[features], test[target]

# Standardizing features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Feature Selection using Elastic Net and Random Forest
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)
elastic_net_importance = abs(elastic_net.coef_)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_importance = rf.feature_importances_

# Select Top Features
feature_importance_df = pd.DataFrame({
    "Feature": features,
    "ElasticNet Importance": elastic_net_importance,
    "RandomForest Importance": rf_importance
})

feature_importance_df["Avg Importance"] = (feature_importance_df["ElasticNet Importance"] + feature_importance_df["RandomForest Importance"]) / 2
feature_importance_df = feature_importance_df.sort_values(by="Avg Importance", ascending=False)

top_features = feature_importance_df["Feature"].head(10).tolist()

X_train_selected = pd.DataFrame(X_train_scaled, columns=features)[top_features]
X_valid_selected = pd.DataFrame(X_valid_scaled, columns=features)[top_features]
X_test_selected = pd.DataFrame(X_test_scaled, columns=features)[top_features]

# Set random seed for reproducibility
tf.random.set_seed(42)

# Define NN1 (1 Hidden Layer)
model_nn1 = keras.Sequential([
    layers.Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.1), input_shape=(X_train_selected.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(1, activation="linear")  # Output layer
])

# Define NN2 (2 Hidden Layers)
model_nn2 = keras.Sequential([
    layers.Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.1), input_shape=(X_train_selected.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    layers.Dense(16, activation="relu", kernel_regularizer=regularizers.l2(0.1)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    layers.Dense(1, activation="linear")  # Output layer
])

# Compile Models
for model in [model_nn1, model_nn2]:
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse", metrics=["mae"])

# Early stopping for both models
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True)

# Train NN1
history_nn1 = model_nn1.fit(
    X_train_selected, y_train,
    validation_data=(X_valid_selected, y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Train NN2
history_nn2 = model_nn2.fit(
    X_train_selected, y_train,
    validation_data=(X_valid_selected, y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate NN1
y_train_pred_nn1 = model_nn1.predict(X_train_selected).flatten()
y_valid_pred_nn1 = model_nn1.predict(X_valid_selected).flatten()
y_test_pred_nn1 = model_nn1.predict(X_test_selected).flatten()

r2_train_nn1 = r2_score(y_train, y_train_pred_nn1)
r2_valid_nn1 = r2_score(y_valid, y_valid_pred_nn1)
r2_test_nn1 = r2_score(y_test, y_test_pred_nn1)

# Evaluate NN2
y_train_pred_nn2 = model_nn2.predict(X_train_selected).flatten()
y_valid_pred_nn2 = model_nn2.predict(X_valid_selected).flatten()
y_test_pred_nn2 = model_nn2.predict(X_test_selected).flatten()

r2_train_nn2 = r2_score(y_train, y_train_pred_nn2)
r2_valid_nn2 = r2_score(y_valid, y_valid_pred_nn2)
r2_test_nn2 = r2_score(y_test, y_test_pred_nn2)

# Print Results
print(f"NN1 Train R²: {r2_train_nn1:.4f}")
print(f"NN1 Validation R²: {r2_valid_nn1:.4f}")
print(f"NN1 Test R² (Out-of-Sample): {r2_test_nn1:.4f}")

print(f"NN2 Train R²: {r2_train_nn2:.4f}")
print(f"NN2 Validation R²: {r2_valid_nn2:.4f}")
print(f"NN2 Test R² (Out-of-Sample): {r2_test_nn2:.4f}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 4.4920 - mae: 1.2846 - val_loss: 4.1732 - val_mae: 0.7511
Epoch 2/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.3924 - mae: 0.8251 - val_loss: 3.1543 - val_mae: 0.6351
Epoch 3/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.6948 - mae: 0.6452 - val_loss: 2.2153 - val_mae: 0.6119
Epoch 4/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.1675 - mae: 0.4943 - val_loss: 1.5014 - val_mae: 0.5201
Epoch 5/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.8718 - mae: 0.4088 - val_loss: 1.1100 - val_mae: 0.4934
Epoch 6/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.6179 - mae: 0.3171 - val_loss: 0.7856 - val_mae: 0.3631
Epoch 7/100
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - los

In [13]:
# Re-load the dataset
file_path = "OSEBX_Market_Macro_Data_2015_2024.csv"
df = pd.read_csv(file_path)

# Convert Date column to datetime format
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows with missing target variable (OSEBXReturns)
df_clean = df.dropna(subset=["OSEBXReturns"]).copy()

# Selecting Features (X) and Target (Y)
features = [
    "Momentum_3M", "Momentum_6M", "Momentum_12M",
    "Volatility_3M", "Volatility_6M", "Volatility_12M",
    "Volume", "TurnoverRatio", "BidAskSpread",
    "MarketCap", "DividendYield", "BookValuePerShare",
    "EarningsPerShare", "Beta", "USDNOK", "EURNOK",
    "US10Y", "USCPI", "USGDPGrowth", "NorgesBank10Y", "NorwegianCPI",
    "BrentOil"
]

target = "OSEBXReturns"

# Drop remaining rows with missing features
df_clean = df_clean.dropna(subset=features)

# Sorting dataset by Date
df_clean = df_clean.sort_values(by="Date")

# Splitting data into training (2015-2019), validation (2020-2022), and test (2023-2024)
train = df_clean[(df_clean["Date"].dt.year >= 2015) & (df_clean["Date"].dt.year <= 2019)]
valid = df_clean[(df_clean["Date"].dt.year >= 2020) & (df_clean["Date"].dt.year <= 2022)]
test = df_clean[(df_clean["Date"].dt.year >= 2023)]

# Extract features and target
X_train, y_train = train[features], train[target]
X_valid, y_valid = valid[features], valid[target]
X_test, y_test = test[features], test[target]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features, index=X_train.index)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=features, index=X_valid.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features, index=X_test.index)

# Feature Selection using Elastic Net and Random Forest
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)
elastic_net_importance = abs(elastic_net.coef_)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_importance = rf.feature_importances_

# Select Top Features
feature_importance_df = pd.DataFrame({
    "Feature": features,
    "ElasticNet Importance": elastic_net_importance,
    "RandomForest Importance": rf_importance
})

feature_importance_df["Avg Importance"] = (feature_importance_df["ElasticNet Importance"] + feature_importance_df["RandomForest Importance"]) / 2
feature_importance_df = feature_importance_df.sort_values(by="Avg Importance", ascending=False)

top_features = feature_importance_df["Feature"].head(10).tolist()

X_train_selected = X_train_scaled[top_features]
X_valid_selected = X_valid_scaled[top_features]
X_test_selected = X_test_scaled[top_features]

# Import required libraries for XGBoost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the XGBoost model with hyperparameter tuning
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Define hyperparameter grid
param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.7, 1.0],
    "colsample_bytree": [0.7, 1.0]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring="r2", n_jobs=-1, verbose=1)
grid_search.fit(X_train_selected, y_train)

# Get the best model from grid search
best_xgb_model = grid_search.best_estimator_

# Evaluate on training, validation, and test sets
y_train_pred_xgb = best_xgb_model.predict(X_train_selected)
y_valid_pred_xgb = best_xgb_model.predict(X_valid_selected)
y_test_pred_xgb = best_xgb_model.predict(X_test_selected)

# Compute R-squared scores
r2_train_xgb = r2_score(y_train, y_train_pred_xgb)
r2_valid_xgb = r2_score(y_valid, y_valid_pred_xgb)
r2_test_xgb = r2_score(y_test, y_test_pred_xgb)

# Results Summary
{
    "XGBoost Train R²": r2_train_xgb,
    "XGBoost Validation R²": r2_valid_xgb,
    "XGBoost Test R² (Out-of-Sample)": r2_test_xgb
}


Fitting 3 folds for each of 108 candidates, totalling 324 fits


{'XGBoost Train R²': 0.4499276726254987,
 'XGBoost Validation R²': -0.0032004416905613287,
 'XGBoost Test R² (Out-of-Sample)': -0.023906247402601055}