In [285]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from sklearn.metrics import r2_score
from tqdm import tqdm
import pandas as pd
import os


In [303]:
os.chdir("C:/Users/luisb/OneDrive/Documents/UT_ORIE/fall23/Applied_ML/project")
test_data_top = pd.read_csv('data/test_dataset_top.csv')
test_data_top.drop(test_data_top.columns[0], axis=1, inplace=True)
test_data_top.dropna(inplace=True)
#test_data_top = test_data_top.iloc[:, :97]

train_data_top = pd.read_csv('data/train_dataset_top.csv', index_col=None)
train_data_top.drop(train_data_top.columns[0], axis=1, inplace=True)
train_data_top.dropna(inplace=True)
#train_data_top = train_data_top.iloc[:, :97]

val_data_top = pd.read_csv('data/val_dataset_top.csv')
val_data_top.drop(val_data_top.columns[0], axis=1, inplace=True)
val_data_top.dropna(inplace=True)
#val_data_top = val_data_top.iloc[:, :97]

In [304]:
train_data_top.head()
#test_data_top[test_data_top.isnull().any(axis=1)] #check for rows with null values  

Unnamed: 0,permno,DATE,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,...,invest*dfy,invest*svar,absacc*dp_sp,absacc*ep_sp,absacc*bm_sp,absacc*ntis,absacc*tbl,absacc*tms,absacc*dfy,absacc*svar
0,14593,201506,1.0,-0.392488,-0.843512,0.319507,0.968018,-0.718208,-0.60459,-0.119343,...,-0.365988,-0.374302,-0.754788,-0.816891,-0.810511,-0.28503,-0.96976,-0.831976,-0.819573,-0.974298
1,14593,201503,0.993421,-0.373529,-0.83235,0.346878,0.97109,-0.711564,-0.659332,-0.020089,...,-0.379144,-0.372207,-0.774483,-0.819861,-0.822897,-0.271672,-0.957228,-0.872915,-0.837108,-0.960373
2,14593,201504,0.930708,-0.359465,-0.823821,0.354844,0.965641,-0.717428,-0.627439,-0.256641,...,-0.364658,-0.377033,-0.761046,-0.813335,-0.812858,-0.275138,-0.96976,-0.85335,-0.815734,-0.98597
3,14593,201505,0.921025,-0.351181,-0.818698,0.343828,0.974312,-0.720239,-0.596675,-0.189967,...,-0.367983,-0.375246,-0.761733,-0.818006,-0.814626,-0.281153,-0.96976,-0.845632,-0.825331,-0.978333
4,14593,201507,0.906199,-0.377502,-0.83472,0.362267,0.956196,-0.719959,-0.62092,-0.267217,...,-0.358673,-0.373687,-0.75806,-0.823116,-0.811265,-0.284888,-0.95464,-0.845632,-0.798459,-0.971669


In [305]:
#Prepare data
X_train_tensor = torch.tensor(train_data_top.drop(columns=['DATE','permno','RET']).values, dtype=torch.float32)
y_train_tensor = torch.tensor(train_data_top['RET'].values, dtype=torch.float32)

X_test_tensor = torch.tensor(test_data_top.drop(columns=['DATE','permno','RET']).values, dtype=torch.float32)
y_test_tensor = torch.tensor(test_data_top['RET'].values, dtype=torch.float32)

X_val_tensor = torch.tensor(val_data_top.drop(columns=['DATE','permno','RET']).values, dtype=torch.float32)
y_val_tensor = torch.tensor(val_data_top['RET'].values, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)


batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [306]:
number_of_features = train_dataset[0][0].shape[0]
print(number_of_features)

911


In [315]:
#MLP Model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        super(MLP, self).__init__()
        #Create fully connected layers
        self.fc1 = nn.Linear(input_size, hidden_size1)
        #self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        #self.bn2 = nn.BatchNorm1d(hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        #self.bn3 = nn.BatchNorm1d(hidden_size3)
        self.fc4 = nn.Linear(hidden_size3, output_size)
        

        #Weight Initialization
        #nn.init.xavier_uniform_(self.fc1.weight)
        #nn.init.xavier_uniform_(self.fc2.weight)
        #nn.init.xavier_uniform_(self.fc3.weight)
        #nn.init.xavier_uniform_(self.fc4.weight)

    def forward(self, x):
        x = x.view(-1, 911)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [318]:
# Adjust input_size based on the number of features in your input data
input_size = train_dataset[0][0].shape[0]
print(input_size)
hidden_size1, hidden_size2, hidden_size3 = 32, 16, 8
output_size = 1  # For regression
learning_rate = 0.000001

model = MLP(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



# R-squared function
def calculate_r_squared(y_true, y_pred):
    return r2_score(y_true, y_pred)

911


In [319]:
num_epochs = 10
#Training
for epoch in range(num_epochs):
    model.train()
    for batch_x, batch_y in tqdm(train_loader):
        outputs = model(batch_x)

        loss = criterion(outputs, batch_y.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


    #Validation
    model.eval()
    val_predictions = []
    val_targets = []
    val_loss = 0.0
    with torch.no_grad():
        for val_batch_x, val_batch_y in tqdm(val_loader):
            val_outputs = model(val_batch_x)
            val_loss += criterion(val_outputs, val_batch_y.unsqueeze(1)).item()

            #Validation Set R-squared
            val_predictions.extend(val_outputs.numpy())
            val_targets.extend(val_batch_y.unsqueeze(1).numpy())

    val_loss /= len(val_loader)
    val_r_squared = calculate_r_squared(val_targets, val_predictions)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}, Validation R-squared: {val_r_squared:.4f}')

print('Training finished.')


100%|██████████| 1123/1123 [00:03<00:00, 347.22it/s]
100%|██████████| 374/374 [00:00<00:00, 1229.16it/s]


Epoch 1/10, Loss: 0.0072, Val Loss: 0.0081, Validation R-squared: -0.4378


100%|██████████| 1123/1123 [00:03<00:00, 339.30it/s]
100%|██████████| 374/374 [00:00<00:00, 1064.87it/s]


Epoch 2/10, Loss: 0.0078, Val Loss: 0.0063, Validation R-squared: -0.1241


100%|██████████| 1123/1123 [00:03<00:00, 289.45it/s]
100%|██████████| 374/374 [00:00<00:00, 813.18it/s]


Epoch 3/10, Loss: 0.0053, Val Loss: 0.0059, Validation R-squared: -0.0496


100%|██████████| 1123/1123 [00:04<00:00, 226.63it/s]
100%|██████████| 374/374 [00:00<00:00, 683.42it/s]


Epoch 4/10, Loss: 0.0049, Val Loss: 0.0058, Validation R-squared: -0.0310


100%|██████████| 1123/1123 [00:05<00:00, 196.06it/s]
100%|██████████| 374/374 [00:00<00:00, 662.20it/s]


Epoch 5/10, Loss: 0.0062, Val Loss: 0.0057, Validation R-squared: -0.0236


100%|██████████| 1123/1123 [00:06<00:00, 182.96it/s]
100%|██████████| 374/374 [00:00<00:00, 502.55it/s]


Epoch 6/10, Loss: 0.0062, Val Loss: 0.0057, Validation R-squared: -0.0183


100%|██████████| 1123/1123 [00:09<00:00, 115.38it/s]
100%|██████████| 374/374 [00:00<00:00, 375.03it/s]


Epoch 7/10, Loss: 0.0052, Val Loss: 0.0057, Validation R-squared: -0.0163


100%|██████████| 1123/1123 [00:08<00:00, 135.44it/s]
100%|██████████| 374/374 [00:01<00:00, 373.62it/s]


Epoch 8/10, Loss: 0.0100, Val Loss: 0.0057, Validation R-squared: -0.0146


100%|██████████| 1123/1123 [00:10<00:00, 103.29it/s]
100%|██████████| 374/374 [00:01<00:00, 299.05it/s]


Epoch 9/10, Loss: 0.0090, Val Loss: 0.0057, Validation R-squared: -0.0127


100%|██████████| 1123/1123 [00:06<00:00, 161.57it/s]
100%|██████████| 374/374 [00:00<00:00, 632.43it/s]

Epoch 10/10, Loss: 0.0042, Val Loss: 0.0057, Validation R-squared: -0.0123
Training finished.





In [320]:
# Testing
model.eval()
test_loss = 0.0
all_predictions = []
all_targets = []

with torch.no_grad():
    for test_batch_x, test_batch_y in test_loader:
        test_outputs = model(test_batch_x)
        test_batch_y = test_batch_y.view(-1, 1)  # Ensure the target tensor has shape [batch_size, 1]
        test_loss += criterion(test_outputs, test_batch_y).item()

        all_predictions.extend(test_outputs.numpy())
        all_targets.extend(test_batch_y.numpy())

test_loss /= len(test_loader)

# Calculate R-squared
r_squared = calculate_r_squared(all_targets, all_predictions)

print(f'Test Loss: {test_loss:.4f}')
print(f'R-squared: {r_squared:.4f}')

print('Training and Testing finished.')

Test Loss: 0.0109
R-squared: -0.0015
Training and Testing finished.


In [199]:
r_squared

-0.010011854942911569

In [197]:
all_targets

[0.074229,
 -0.068037,
 -0.001236,
 0.017333,
 0.10494,
 0.042438,
 0.058657,
 -0.066119,
 0.064982,
 -0.005502,
 -0.079532,
 -0.102526,
 -0.050434,
 0.061528,
 0.176291,
 0.099109,
 0.076218,
 0.051717,
 0.00734,
 0.114574,
 -0.060012,
 -0.007693,
 0.084989,
 0.09549,
 0.216309,
 -0.049252,
 0.069602,
 -0.053518,
 0.014588,
 0.004225,
 -0.07047,
 -0.032722,
 -0.087579,
 0.039924,
 -0.067397,
 0.043034,
 0.042892,
 0.026602,
 -0.015576,
 0.067355,
 0.039006,
 -0.035328,
 -0.03737,
 0.028058,
 0.090461,
 -0.035754,
 0.165132,
 0.120663,
 0.000372,
 0.10258,
 0.007371,
 0.060058,
 0.04344,
 0.110559,
 0.147114,
 0.147386,
 0.025389,
 -0.11431,
 -0.045292,
 0.05401,
 0.084956,
 -0.012785,
 -0.026541,
 0.129567,
 0.079455,
 0.136326,
 -0.069761,
 0.098784,
 0.041749,
 -0.076855,
 0.027612,
 0.077424,
 0.155374,
 -0.008303,
 0.059427,
 -0.030478,
 0.031216,
 0.008487,
 0.015044,
 -0.180709,
 0.017244,
 -0.049464,
 -0.062214,
 0.110684,
 -0.004824,
 0.2689,
 -0.202192,
 -0.01657,
 -0.078613,

In [198]:
all_predictions

[array([-0.00597654], dtype=float32),
 array([-0.00500271], dtype=float32),
 array([-0.00280816], dtype=float32),
 array([-0.00138816], dtype=float32),
 array([-0.00493495], dtype=float32),
 array([-0.00457401], dtype=float32),
 array([-0.00518202], dtype=float32),
 array([-0.00111628], dtype=float32),
 array([-0.00406744], dtype=float32),
 array([-0.00561002], dtype=float32),
 array([-0.00678351], dtype=float32),
 array([-0.00254017], dtype=float32),
 array([-0.00630204], dtype=float32),
 array([-0.00080157], dtype=float32),
 array([-0.00127775], dtype=float32),
 array([-0.00506701], dtype=float32),
 array([-0.00829867], dtype=float32),
 array([-0.00069372], dtype=float32),
 array([-0.00792769], dtype=float32),
 array([-0.00477006], dtype=float32),
 array([-0.00405661], dtype=float32),
 array([-0.00129267], dtype=float32),
 array([-0.00085808], dtype=float32),
 array([-0.00474503], dtype=float32),
 array([-0.00171005], dtype=float32),
 array([-0.00281575], dtype=float32),
 array([-0.0