In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import talib
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
import torch.optim as optim
import os
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
import seaborn as sns

import optuna
from optuna.samplers import TPESampler
from optuna.trial import TrialState
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau 
import shap
import plotly.graph_objs as go
import plotly.offline as pyo
from tqdm.auto import tqdm
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F
from tqdm.notebook import tqdm
from optuna.pruners import HyperbandPruner

from sklearn.model_selection import KFold
import copy
from torch.cuda.amp import GradScaler, autocast
import os


In [2]:
# Pick the compute device once; later cells read the module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("gpu")

# Report the runtime versions so the saved output documents the environment.
print(torch.__version__)
print('CUDA available:', torch.cuda.is_available())
print('CUDA version:', torch.version.cuda)
print('cuDNN version:', torch.backends.cudnn.version())


gpu
2.1.2+cu121
CUDA available: True
CUDA version: 12.1
cuDNN version: 8902


In [3]:
# Load the long-format merged dataset and discard the leftover index column
# ("Unnamed: 0") written by the CSV export. Chained instead of inplace=True so the
# cell does not mutate an object in place and yields the same frame on every run.
pivoted_data = pd.read_csv("../Data/longmerged_deneme_51.csv").drop(columns=["Unnamed: 0"])

# Keep only rows whose period end lies strictly between 2011-03-31 and 2020-01-01.
# PENDS is compared against string literals; this orders correctly only while the
# column holds ISO "YYYY-MM-DD" text, as in the rows displayed below.
period_end = pivoted_data['PENDS']
pivoted_data = pivoted_data[(period_end > '2011-03-31') & (period_end < '2020-01-01')]

pivoted_data

Unnamed: 0,Date,OFTIC,PENDS,MEAN,STDEV,BPS,CPS,CPX,CSH,DPS,...,commodity_trade_Close_quarterly_return,C_Discretionary_Close_quarterly_return,C_Staples_Close_quarterly_return,Energy_Close_quarterly_return,Financials_Close_quarterly_return,Health_care_Close_quarterly_return,industrials_Close_quarterly_return,information_Close_quarterly_return,materials_Close_quarterly_return,utilities_Close_quarterly_return
0,2011-06-30,AAPL,2011-06-30,17.869831,1.698451,2.6718,0.422900,777.000000,0.209015,0.00,...,0.062957,2.970550,4.378340,-5.517243,-6.345335,7.308964,-1.141483,-1.381423,-1.624192,5.051769
1,2011-09-30,AAPL,2011-09-30,17.834875,3.131120,2.9872,0.397600,1645.000000,0.258659,0.00,...,0.120922,-13.305144,-5.027217,-22.349038,-23.061887,-10.695185,-21.535988,-8.171207,-25.425447,0.418158
2,2011-12-31,AAPL,2011-12-31,21.771714,3.087308,3.4500,0.665700,1321.000000,0.249095,0.00,...,-0.087046,11.933447,9.541476,18.150742,10.076208,9.328708,15.503083,7.838985,14.100815,7.019633
3,2012-03-31,AAPL,2012-03-31,26.426070,2.380069,3.9154,0.528200,1457.000000,0.153558,0.00,...,-0.191850,15.556124,4.893814,3.789965,21.538450,8.417417,10.874069,18.506872,10.358213,-2.612559
4,2012-06-30,AAPL,2012-06-30,27.652509,3.109565,4.2582,0.384300,2056.000000,0.204900,0.00,...,0.484614,-2.905304,2.024644,-7.498254,-7.341768,1.063541,-4.676644,-4.741380,-4.544226,5.565071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,2018-12-31,ZION,2018-12-31,53.000000,4.600140,37.3900,1.760000,45.779944,0.504142,0.30,...,0.355782,-15.534891,-5.840908,-24.280433,-13.633068,-9.080399,-17.844385,-17.722026,-12.791300,0.512814
14056,2019-03-31,ZION,2019-03-31,52.800000,3.001754,41.5751,-0.210000,41.655270,0.300673,0.30,...,-0.178828,14.988381,10.496262,15.292074,7.934506,6.057101,16.488114,19.393354,9.857481,9.920635
14057,2019-06-30,ZION,2019-06-30,49.574468,4.422004,42.9480,1.380000,44.643278,0.516266,0.30,...,0.081598,4.699164,3.493137,-3.644893,7.351231,0.970027,3.185392,5.459461,5.405405,2.509890
14058,2019-09-30,ZION,2019-09-30,51.193548,3.780823,40.7500,1.583422,44.768947,0.615809,0.34,...,-0.036101,1.258389,5.768897,-7.078949,1.449274,-2.709415,0.271247,3.190669,-0.512819,8.569506


In [4]:
# Mean absolute EPS over every company/period row in the filtered frame.
# NOTE(review): presumably a scale reference for judging absolute errors later — confirm.
mean_abs_eps = pivoted_data['EPS'].abs().mean()

mean_abs_eps

1.17573528154396

In [5]:
# Number of distinct tickers (OFTIC) in each Sector.
unique_counts = pivoted_data.groupby('Sector')['OFTIC'].nunique()
unique_counts

Sector
Communication Services     5
Consumer Discretionary    32
Consumer Staples          20
Energy                    14
Financials                38
Health Care               38
Industrials               36
Information Technology    34
Materials                 19
Real Estate               20
Utilities                 20
Name: OFTIC, dtype: int64

In [6]:
pivoted_data.columns

Index(['Date', 'OFTIC', 'PENDS', 'MEAN', 'STDEV', 'BPS', 'CPS', 'CPX', 'CSH',
       'DPS', 'EBG', 'EBI', 'EBS', 'EBT', 'ENT', 'EPS', 'FFO', 'GPS', 'GRM',
       'NAV', 'NDT', 'NET', 'OPR', 'PRE', 'ROA', 'ROE', 'SAL',
       'Real_Estate_Index_Price', 'VIX_Close', 'Gold_Close',
       'Three_Month_Yield', 'Brent_Close', 'Hrc_close',
       'commodity_trade_Close', 'C_Discretionary_Close', 'C_Staples_Close',
       'Energy_Close', 'Financials_Close', 'Health_care_Close',
       'industrials_Close', 'information_Close', 'materials_Close',
       'utilities_Close', 'Sector', 'numeric_sector',
       'Real_Estate_Index_Price_quarterly_return',
       'VIX_Close_quarterly_return', 'Gold_Close_quarterly_return',
       'Three_Month_Yield_quarterly_return', 'Brent_Close_quarterly_return',
       'Hrc_close_quarterly_return', 'commodity_trade_Close_quarterly_return',
       'C_Discretionary_Close_quarterly_return',
       'C_Staples_Close_quarterly_return', 'Energy_Close_quarterly_return',
   

In [7]:
# drop_ticker = {'BK',
#                 'CAH',
#                 'CPB',
#                 'CRM',
#                 'EIX',
#                 'EMR',
#                 'ETN',
#                 'ETR',
#                 'EXPD',
#                 'FAST',
#                 'FCX',
#                 'GIS',
#                 'GLW',
#                 'HIG',
#                 'HON',
#                 'IBM',
#                 'ILMN',
#                 'ITW',
#                 'KMB',
#                 'KR',
#                 'LLY',
#                 'MAS',
#                 'MCO',
#                 'MO',
#                 'MOH',
#                 'NRG',
#                 'NTRS',
#                 'ORLY',
#                 'OXY',
#                 'PM',
#                 'PNC',
#                 'PPL',
#                 'PSA',
#                 'RMD',
#                 'STT',
#                 'TXN',
#                 'VFC',
#                 'VMC',
#                 'XEL',
#                 'DFS',
#                 'ICE'}

# pivoted_data = pivoted_data[~pivoted_data['OFTIC'].isin(drop_ticker)]

In [8]:
# pivoted_data = pivoted_data[pivoted_data["Sector"] == "Financials"]
# pivoted_data

In [9]:
pivoted_data[pivoted_data["OFTIC"] == "ACN"]

Unnamed: 0,Date,OFTIC,PENDS,MEAN,STDEV,BPS,CPS,CPX,CSH,DPS,...,commodity_trade_Close_quarterly_return,C_Discretionary_Close_quarterly_return,C_Staples_Close_quarterly_return,Energy_Close_quarterly_return,Financials_Close_quarterly_return,Health_care_Close_quarterly_return,industrials_Close_quarterly_return,information_Close_quarterly_return,materials_Close_quarterly_return,utilities_Close_quarterly_return


In [10]:
# Coerce the two text-formatted price columns to numbers. `assign` returns a new
# frame, so the filtered frame from the load cell is never written through.
#   * Hrc_close : keep only the text before the first comma, then parse.
#   * Gold_Close: strip every comma, parse, and round to 2 decimals.
# Unparseable values become NaN (errors='coerce').
# NOTE(review): the two columns treat commas differently (truncate vs. remove);
# confirm that the source formats really differ and Hrc_close is not losing digits.
pivoted_data = pivoted_data.assign(
    Hrc_close=lambda frame: pd.to_numeric(
        frame['Hrc_close'].astype(str).str.split(",").str[0], errors='coerce'
    ),
    Gold_Close=lambda frame: pd.to_numeric(
        frame['Gold_Close'].astype(str).str.replace(",", ""), errors='coerce'
    ).round(2),
)

In [11]:
# Distinct tickers in first-appearance order; iterated later when building the rolling windows.
company_list = pivoted_data["OFTIC"].unique()
company_list

array(['AAPL', 'AFL', 'ALB', 'AMT', 'AMZN', 'ANSS', 'AON', 'AOS', 'APA',
       'APD', 'APH', 'ARE', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO',
       'BA', 'BDX', 'BEN', 'BG', 'BIIB', 'BK', 'BLK', 'BMY', 'BR', 'BSX',
       'BWA', 'BX', 'CAH', 'CBOE', 'CCI', 'CCL', 'CDNS', 'CE', 'CF',
       'CHD', 'CHRW', 'CHTR', 'CI', 'CMA', 'CMCSA', 'CME', 'CMI', 'CMS',
       'CNC', 'CNP', 'COF', 'COO', 'COST', 'CPB', 'CPRT', 'CPT', 'CRM',
       'CSX', 'CTSH', 'CVS', 'CVX', 'D', 'DAL', 'DE', 'DFS', 'DG', 'DGX',
       'DHI', 'DHR', 'DLR', 'DLTR', 'DOV', 'DPZ', 'DTE', 'DXCM', 'EBAY',
       'ECL', 'ED', 'EFX', 'EIX', 'EMR', 'EOG', 'EQIX', 'EQR', 'ESS',
       'ETN', 'ETR', 'EW', 'EXC', 'EXPD', 'EXPE', 'EXR', 'F', 'FAST',
       'FCX', 'FE', 'FFIV', 'FMC', 'FRT', 'FSLR', 'FTNT', 'GD', 'GILD',
       'GIS', 'GLW', 'GM', 'GNRC', 'GRMN', 'GS', 'GWW', 'HAL', 'HCA',
       'HD', 'HES', 'HIG', 'HOLX', 'HON', 'HPQ', 'HSY', 'HUM', 'IBM',
       'ICE', 'IDXX', 'IEX', 'ILMN', 'INCY', 'IP', 'ISRG', 'IT', 'ITW

In [12]:
pivoted_data.shape

(9660, 61)

In [13]:
# Distinct tickers in first-appearance order.
# NOTE(review): this repeats the earlier `company_list` assignment; the only cell in
# between just displays `pivoted_data.shape`, so this cell looks redundant.
company_list = pivoted_data["OFTIC"].unique()
company_list

array(['AAPL', 'AFL', 'ALB', 'AMT', 'AMZN', 'ANSS', 'AON', 'AOS', 'APA',
       'APD', 'APH', 'ARE', 'AVB', 'AVGO', 'AVY', 'AWK', 'AXP', 'AZO',
       'BA', 'BDX', 'BEN', 'BG', 'BIIB', 'BK', 'BLK', 'BMY', 'BR', 'BSX',
       'BWA', 'BX', 'CAH', 'CBOE', 'CCI', 'CCL', 'CDNS', 'CE', 'CF',
       'CHD', 'CHRW', 'CHTR', 'CI', 'CMA', 'CMCSA', 'CME', 'CMI', 'CMS',
       'CNC', 'CNP', 'COF', 'COO', 'COST', 'CPB', 'CPRT', 'CPT', 'CRM',
       'CSX', 'CTSH', 'CVS', 'CVX', 'D', 'DAL', 'DE', 'DFS', 'DG', 'DGX',
       'DHI', 'DHR', 'DLR', 'DLTR', 'DOV', 'DPZ', 'DTE', 'DXCM', 'EBAY',
       'ECL', 'ED', 'EFX', 'EIX', 'EMR', 'EOG', 'EQIX', 'EQR', 'ESS',
       'ETN', 'ETR', 'EW', 'EXC', 'EXPD', 'EXPE', 'EXR', 'F', 'FAST',
       'FCX', 'FE', 'FFIV', 'FMC', 'FRT', 'FSLR', 'FTNT', 'GD', 'GILD',
       'GIS', 'GLW', 'GM', 'GNRC', 'GRMN', 'GS', 'GWW', 'HAL', 'HCA',
       'HD', 'HES', 'HIG', 'HOLX', 'HON', 'HPQ', 'HSY', 'HUM', 'IBM',
       'ICE', 'IDXX', 'IEX', 'ILMN', 'INCY', 'IP', 'ISRG', 'IT', 'ITW

In [14]:
pivoted_data[pivoted_data["OFTIC"] == "AAPL"].sort_values(by="PENDS")

Unnamed: 0,Date,OFTIC,PENDS,MEAN,STDEV,BPS,CPS,CPX,CSH,DPS,...,commodity_trade_Close_quarterly_return,C_Discretionary_Close_quarterly_return,C_Staples_Close_quarterly_return,Energy_Close_quarterly_return,Financials_Close_quarterly_return,Health_care_Close_quarterly_return,industrials_Close_quarterly_return,information_Close_quarterly_return,materials_Close_quarterly_return,utilities_Close_quarterly_return
0,2011-06-30,AAPL,2011-06-30,17.869831,1.698451,2.6718,0.4229,777.0,0.209015,0.0,...,0.062957,2.97055,4.37834,-5.517243,-6.345335,7.308964,-1.141483,-1.381423,-1.624192,5.051769
1,2011-09-30,AAPL,2011-09-30,17.834875,3.13112,2.9872,0.3976,1645.0,0.258659,0.0,...,0.120922,-13.305144,-5.027217,-22.349038,-23.061887,-10.695185,-21.535988,-8.171207,-25.425447,0.418158
2,2011-12-31,AAPL,2011-12-31,21.771714,3.087308,3.45,0.6657,1321.0,0.249095,0.0,...,-0.087046,11.933447,9.541476,18.150742,10.076208,9.328708,15.503083,7.838985,14.100815,7.019633
3,2012-03-31,AAPL,2012-03-31,26.42607,2.380069,3.9154,0.5282,1457.0,0.153558,0.0,...,-0.19185,15.556124,4.893814,3.789965,21.53845,8.417417,10.874069,18.506872,10.358213,-2.612559
4,2012-06-30,AAPL,2012-06-30,27.652509,3.109565,4.2582,0.3843,2056.0,0.2049,0.0,...,0.484614,-2.905304,2.024644,-7.498254,-7.341768,1.063541,-4.676644,-4.74138,-4.544226,5.565071
5,2012-09-30,AAPL,2012-09-30,26.301,2.756554,4.495,0.3443,3461.0,0.278036,0.0946,...,-0.256538,6.875291,3.048609,10.637332,6.489072,5.551173,2.410991,7.309434,4.278828,-1.622066
6,2012-12-31,AAPL,2012-12-31,22.322095,3.309063,4.8425,0.8832,2317.0,0.272748,0.0946,...,0.072421,1.389181,-2.595591,-2.737304,5.131497,-0.5982,3.75035,-6.422314,2.010874,-4.039575
7,2013-03-31,AAPL,2013-03-31,18.713368,2.228731,5.1473,0.4721,2008.0,0.161689,0.0946,...,-0.059795,11.720071,13.954151,11.047325,11.104331,15.220661,10.184688,4.922011,4.368672,11.970219
8,2013-06-30,AAPL,2013-06-30,19.420477,2.793346,4.8496,0.3025,1885.0,0.222812,0.1089,...,0.031464,6.415097,-0.251452,-1.273477,6.809444,3.612622,2.107282,1.057151,-2.118432,-3.759584
9,2013-09-30,AAPL,2013-09-30,21.519667,1.59732,4.9071,0.3893,1955.0,0.362124,0.1089,...,-0.220967,7.499999,0.327706,5.887612,2.365034,6.259188,8.841465,4.740114,9.543677,-0.690944


In [15]:
# Push the identifier/categorical columns to the right-hand edge so that every
# remaining (numeric feature) column occupies the leading positions, in its original order.
cols_to_move = ['OFTIC', 'PENDS', 'Date', 'Sector', 'numeric_sector']
leading_cols = [name for name in pivoted_data.columns if name not in cols_to_move]
pivoted_data = pivoted_data[leading_cols + cols_to_move]
pivoted_data

Unnamed: 0,MEAN,STDEV,BPS,CPS,CPX,CSH,DPS,EBG,EBI,EBS,...,Health_care_Close_quarterly_return,industrials_Close_quarterly_return,information_Close_quarterly_return,materials_Close_quarterly_return,utilities_Close_quarterly_return,OFTIC,PENDS,Date,Sector,numeric_sector
0,17.869831,1.698451,2.6718,0.422900,777.000000,0.209015,0.00,0.230530,9379.0,0.505000,...,7.308964,-1.141483,-1.381423,-1.624192,5.051769,AAPL,2011-06-30,2011-06-30,Information Technology,3
1,17.834875,3.131120,2.9872,0.397600,1645.000000,0.258659,0.00,0.303239,8710.0,0.551793,...,-10.695185,-21.535988,-8.171207,-25.425447,0.418158,AAPL,2011-09-30,2011-09-30,Information Technology,3
2,21.771714,3.087308,3.4500,0.665700,1321.000000,0.249095,0.00,0.274352,17340.0,0.885858,...,9.328708,15.503083,7.838985,14.100815,7.019633,AAPL,2011-12-31,2011-12-31,Information Technology,3
3,26.426070,2.380069,3.9154,0.528200,1457.000000,0.153558,0.00,0.192177,15384.0,0.626165,...,8.417417,10.874069,18.506872,10.358213,-2.612559,AAPL,2012-03-31,2012-03-31,Information Technology,3
4,27.652509,3.109565,4.2582,0.384300,2056.000000,0.204900,0.00,0.246704,11573.0,0.666070,...,1.063541,-4.676644,-4.741380,-4.544226,5.565071,AAPL,2012-06-30,2012-06-30,Information Technology,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,53.000000,4.600140,37.3900,1.760000,45.779944,0.504142,0.30,0.519314,297.0,0.587800,...,-9.080399,-17.844385,-17.722026,-12.791300,0.512814,ZION,2018-12-31,2018-12-31,Financials,5
14056,52.800000,3.001754,41.5751,-0.210000,41.655270,0.300673,0.30,0.245076,278.0,0.327376,...,6.057101,16.488114,19.393354,9.857481,9.920635,ZION,2019-03-31,2019-03-31,Financials,5
14057,49.574468,4.422004,42.9480,1.380000,44.643278,0.516266,0.30,0.489922,277.0,0.540476,...,0.970027,3.185392,5.459461,5.405405,2.509890,ZION,2019-06-30,2019-06-30,Financials,5
14058,51.193548,3.780823,40.7500,1.583422,44.768947,0.615809,0.34,0.638738,298.0,0.654236,...,-2.709415,0.271247,3.190669,-0.512819,8.569506,ZION,2019-09-30,2019-09-30,Financials,5


In [16]:
# Plot reported EPS against period end for a single ticker.
stock_symbol = "AFL"
is_selected = pivoted_data["OFTIC"] == stock_symbol
stock = pivoted_data.loc[is_selected].sort_values(by="PENDS")

# Column name -> Series lookup for the selected ticker.
stock_features_dict = {column: stock[column] for column in stock.columns}

plot_title = f"{stock_symbol}: EPS - PENDS"

trace = go.Scatter(
    x=stock_features_dict["PENDS"],
    y=stock_features_dict["EPS"],
    mode="lines+markers",
    name=plot_title,
)

layout = go.Layout(
    title=plot_title,
    xaxis=dict(title='Date'),
    yaxis=dict(title='EPS', side='left', rangemode='tozero'),
    height=600,
)

fig = go.Figure(data=trace, layout=layout)
pyo.iplot(fig)

In [17]:
# Everything except the EPS target, and the EPS target series itself.
# X_ still contains the identifier columns (OFTIC, PENDS, Date, Sector, numeric_sector),
# because only "EPS" is dropped here.
X_ = pivoted_data.drop(columns=["EPS"])
y_ = pivoted_data['EPS']
y_

0        0.2782
1        0.2518
2        0.4954
3        0.4393
4        0.3329
          ...  
14055    1.0800
14056    1.0400
14057    0.9900
14058    1.1700
14059    0.9700
Name: EPS, Length: 9660, dtype: float64

In [18]:
# Numeric-only view used for the correlation analysis: drop the identifier columns
# that were moved to the end of the frame (cols_to_move lists exactly those five).
x_values = pivoted_data.drop(columns=cols_to_move)
x_values

Unnamed: 0,MEAN,STDEV,BPS,CPS,CPX,CSH,DPS,EBG,EBI,EBS,...,commodity_trade_Close_quarterly_return,C_Discretionary_Close_quarterly_return,C_Staples_Close_quarterly_return,Energy_Close_quarterly_return,Financials_Close_quarterly_return,Health_care_Close_quarterly_return,industrials_Close_quarterly_return,information_Close_quarterly_return,materials_Close_quarterly_return,utilities_Close_quarterly_return
0,17.869831,1.698451,2.6718,0.422900,777.000000,0.209015,0.00,0.230530,9379.0,0.505000,...,0.062957,2.970550,4.378340,-5.517243,-6.345335,7.308964,-1.141483,-1.381423,-1.624192,5.051769
1,17.834875,3.131120,2.9872,0.397600,1645.000000,0.258659,0.00,0.303239,8710.0,0.551793,...,0.120922,-13.305144,-5.027217,-22.349038,-23.061887,-10.695185,-21.535988,-8.171207,-25.425447,0.418158
2,21.771714,3.087308,3.4500,0.665700,1321.000000,0.249095,0.00,0.274352,17340.0,0.885858,...,-0.087046,11.933447,9.541476,18.150742,10.076208,9.328708,15.503083,7.838985,14.100815,7.019633
3,26.426070,2.380069,3.9154,0.528200,1457.000000,0.153558,0.00,0.192177,15384.0,0.626165,...,-0.191850,15.556124,4.893814,3.789965,21.538450,8.417417,10.874069,18.506872,10.358213,-2.612559
4,27.652509,3.109565,4.2582,0.384300,2056.000000,0.204900,0.00,0.246704,11573.0,0.666070,...,0.484614,-2.905304,2.024644,-7.498254,-7.341768,1.063541,-4.676644,-4.741380,-4.544226,5.565071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,53.000000,4.600140,37.3900,1.760000,45.779944,0.504142,0.30,0.519314,297.0,0.587800,...,0.355782,-15.534891,-5.840908,-24.280433,-13.633068,-9.080399,-17.844385,-17.722026,-12.791300,0.512814
14056,52.800000,3.001754,41.5751,-0.210000,41.655270,0.300673,0.30,0.245076,278.0,0.327376,...,-0.178828,14.988381,10.496262,15.292074,7.934506,6.057101,16.488114,19.393354,9.857481,9.920635
14057,49.574468,4.422004,42.9480,1.380000,44.643278,0.516266,0.30,0.489922,277.0,0.540476,...,0.081598,4.699164,3.493137,-3.644893,7.351231,0.970027,3.185392,5.459461,5.405405,2.509890
14058,51.193548,3.780823,40.7500,1.583422,44.768947,0.615809,0.34,0.638738,298.0,0.654236,...,-0.036101,1.258389,5.768897,-7.078949,1.449274,-2.709415,0.271247,3.190669,-0.512819,8.569506


In [19]:
# Pairwise correlation between all numeric columns (the EPS target included).
corr = x_values.corr()

# Annotated heatmap: cell colour is the correlation, cell text is its 2-decimal rounding.
heatmap_trace = go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    text=corr.round(2).values,
    texttemplate="%{text}",
    colorscale='Viridis',
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap_trace)

# Reverse the y axis so the diagonal runs from top-left to bottom-right.
fig.update_layout(
    title='Correlation Matrix',
    xaxis=dict(title="Variables", side='bottom'),
    yaxis=dict(title="Variables", autorange='reversed'),
    width=900,
    height=800,
)

fig.show()

In [20]:
# Hierarchically cluster the rows of the correlation matrix (Ward linkage on the
# pairwise distances between rows) and take the dendrogram's leaf order.
pairwise_dist = sch.distance.pdist(corr)
linked = sch.linkage(pairwise_dist, method='ward')
cluster_order = sch.dendrogram(linked, no_plot=True)['leaves']

# Apply the same permutation to rows and columns so correlated variables sit together.
correlation_matrix_ordered = corr.iloc[cluster_order, cluster_order]

ordered_trace = go.Heatmap(
    z=correlation_matrix_ordered.values,
    x=correlation_matrix_ordered.columns,
    y=correlation_matrix_ordered.columns,
    text=correlation_matrix_ordered.round(2).values,
    texttemplate="%{text}",
    colorscale='Viridis',
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=ordered_trace)

# Reverse the y axis so the diagonal runs from top-left to bottom-right.
fig.update_layout(
    title='Correlation Matrix',
    xaxis=dict(title="Variables", side='bottom'),
    yaxis=dict(title="Variables", autorange='reversed'),
    width=900,
    height=800,
)

fig.show()

In [21]:
# cluster_order only covers the columns that took part in the correlation clustering
# (one entry per row of `corr`). pivoted_data has further identifier columns to the
# right of those; append their positions unchanged so a positional reindex keeps them.
#
# The list is rebuilt from the frame widths rather than extended in place with a
# hard-coded count: re-running this cell then gives the same list instead of
# appending more (out-of-range) positions each time.
n_clustered = corr.shape[0]
trailing_positions = list(range(n_clustered, pivoted_data.shape[1]))
cluster_order = list(cluster_order[:n_clustered]) + trailing_positions
cluster_order

[27,
 32,
 35,
 37,
 39,
 38,
 31,
 34,
 36,
 7,
 30,
 44,
 49,
 43,
 50,
 52,
 54,
 48,
 53,
 47,
 51,
 18,
 8,
 20,
 10,
 19,
 11,
 4,
 16,
 23,
 33,
 28,
 24,
 26,
 46,
 25,
 41,
 3,
 12,
 14,
 0,
 1,
 40,
 55,
 29,
 45,
 42,
 2,
 15,
 5,
 21,
 22,
 6,
 13,
 9,
 17,
 56,
 57,
 58,
 59,
 60]

In [22]:
# Reorder the columns by position using cluster_order: the clustered feature columns
# in dendrogram leaf order, then the trailing positions appended to cluster_order.
# NOTE(review): this rebinds pivoted_data to the permuted frame, so re-running the
# cell permutes the already-permuted columns again — run it once per kernel session.
pivoted_data = pivoted_data.iloc[:, cluster_order]
pivoted_data

Unnamed: 0,Three_Month_Yield,C_Staples_Close,Health_care_Close,information_Close,utilities_Close,materials_Close,C_Discretionary_Close,Financials_Close,industrials_Close,EBG,...,ROE,DPS,FFO,EBS,NDT,OFTIC,PENDS,Date,Sector,numeric_sector
0,0.03,31.230000,35.529999,25.700001,33.480000,39.369999,40.209999,12.469537,37.240002,0.230530,...,44.69,0.00,0.221201,0.505000,-76156.000000,AAPL,2011-06-30,2011-06-30,Information Technology,3
1,0.02,29.660000,31.730000,23.600000,33.619999,29.360001,34.860001,9.593826,29.219999,0.303239,...,36.30,0.00,0.323584,0.551793,-25952.000000,AAPL,2011-09-30,2011-09-30,Information Technology,3
2,0.02,32.490002,34.689999,25.450001,35.980000,33.500000,39.020000,10.560520,33.750000,0.274352,...,62.71,0.00,0.369230,0.885858,-30156.000000,AAPL,2011-12-31,2011-12-31,Information Technology,3
3,0.07,34.080002,37.610001,30.160000,35.040001,36.970001,45.090000,12.835093,37.419998,0.192177,...,48.29,0.00,0.177561,0.626165,-28538.000000,AAPL,2012-03-31,2012-03-31,Information Technology,3
4,0.09,34.770000,38.009998,28.730000,36.990002,35.290001,43.779999,11.892770,35.669998,0.246704,...,32.95,0.00,0.277505,0.666070,-27654.000000,AAPL,2012-06-30,2012-06-30,Information Technology,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14055,2.45,50.779999,86.510002,61.980000,52.919998,50.520000,99.010002,23.820000,64.410004,0.519314,...,12.40,0.30,0.566079,0.587800,0.525924,ZION,2018-12-31,2018-12-31,Financials,5
14056,2.40,56.110001,91.750000,74.000000,58.169998,55.500000,113.849998,25.709999,75.029999,0.245076,...,11.90,0.30,0.294405,0.327376,0.296725,ZION,2019-03-31,2019-03-31,Financials,5
14057,2.12,58.070000,92.639999,78.040001,59.630001,58.500000,119.199997,27.600000,77.419998,0.489922,...,10.80,0.30,0.538019,0.540476,0.526172,ZION,2019-06-30,2019-06-30,Financials,5
14058,1.88,61.419998,90.129997,80.529999,64.739998,58.200001,120.699997,28.000000,77.629997,0.638738,...,12.10,0.34,0.665659,0.654236,0.641649,ZION,2019-09-30,2019-09-30,Financials,5


In [23]:
# Per-company chronological train/test split with min-max scaling.
#
# For every ticker the rows are sorted by PENDS and split 80/20 without shuffling,
# so the test rows are the most recent periods. Scalers are fitted on the training
# rows only and then applied to the test rows, which is why test values can fall
# outside [0, 1].
#
# company_dict[ticker] holds:
#   "X_train", "X_test" : scaled feature frames plus a "lagged_EPS" column
#   "y_train", "y_test" : one-column ("EPS") frames with the scaled target
#   "scaler_y"          : the fitted target scaler, kept to invert predictions
company_dict = {}

# Identifier columns and the target itself never enter the feature matrix.
non_feature_cols = ["EPS", "Sector", "PENDS", "OFTIC", "Date"]

for company in tqdm(pivoted_data["OFTIC"].unique()):
    comp_data = pivoted_data[pivoted_data["OFTIC"] == company].sort_values(by='PENDS')
    X = comp_data.drop(columns=non_feature_cols)
    y = comp_data['EPS']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Explicit copies: the scaled columns are written into frames this loop owns,
    # not into the objects returned by train_test_split.
    X_train_df = X_train.copy()
    X_test_df = X_test.copy()

    # Embedding columns and the sector code keep their raw values.
    scaled_cols = [
        column for column in X_train_df.columns
        if "embedding" not in column and "numeric_sector" not in column
    ]

    if scaled_cols:
        # MinMaxScaler works feature by feature, so one fit over all selected columns
        # gives the same numbers as fitting a separate scaler per column.
        scaler = MinMaxScaler()
        train_scaled = scaler.fit_transform(X_train_df[scaled_cols].to_numpy())
        test_scaled = scaler.transform(X_test_df[scaled_cols].to_numpy())
        for position, column in enumerate(scaled_cols):
            X_train_df[column] = train_scaled[:, position]
            X_test_df[column] = test_scaled[:, position]

    scaler_y = MinMaxScaler()

    y_train_df = pd.DataFrame(
        scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1)),
        columns=['EPS'],
        index=y_train.index,
    )
    y_test_df = pd.DataFrame(
        scaler_y.transform(y_test.to_numpy().reshape(-1, 1)),
        columns=['EPS'],
        index=y_test.index,
    )

    # Same-row scaled EPS, exposed as a feature. It is "lagged" only relative to the
    # window target: the windowing code later in the notebook pairs feature rows
    # i .. i+time_steps-1 with the target at row i+time_steps.
    X_train_df["lagged_EPS"] = y_train_df["EPS"]
    X_test_df["lagged_EPS"] = y_test_df["EPS"]

    company_dict[company] = {
        "X_train": X_train_df,
        "X_test": X_test_df,
        "y_train": y_train_df,
        "y_test": y_test_df,
        "scaler_y": scaler_y,
    }


  0%|          | 0/276 [00:00<?, ?it/s]

In [24]:
company_dict["AFL"]["X_test"]

Unnamed: 0,Three_Month_Yield,C_Staples_Close,Health_care_Close,information_Close,utilities_Close,materials_Close,C_Discretionary_Close,Financials_Close,industrials_Close,EBG,...,GRM,CSH,ROA,ROE,DPS,FFO,EBS,NDT,numeric_sector,lagged_EPS
79,1.115607,0.803158,1.015309,1.096844,0.944302,0.921078,1.120578,0.927933,0.913025,0.826128,...,0.89575,0.805589,0.377757,0.196025,1.0,0.86474,0.717296,0.835628,5,1.106667
80,1.265896,0.891296,1.24475,1.236968,0.979561,0.916587,1.239801,0.981983,1.058773,1.207215,...,1.244318,1.171794,0.530168,0.176148,1.0,1.264188,1.184915,1.184623,5,1.026667
81,1.416185,0.775615,1.075172,0.917743,0.993357,0.678858,0.965678,0.7767,0.757589,1.002111,...,0.950453,1.11587,-0.176453,0.167923,1.0,1.101017,1.076053,1.060511,5,1.0
82,1.387283,0.971355,1.178018,1.205165,1.261625,0.838627,1.189071,0.879888,0.986222,0.444183,...,0.484448,0.347483,0.129171,0.193283,1.090909,0.248053,0.253698,0.476706,5,1.266667
83,1.225434,1.043335,1.195486,1.30177,1.336229,0.934873,1.269607,0.983075,1.037675,0.88743,...,1.049653,0.941124,0.424311,0.107608,1.090909,0.975364,0.930325,0.976911,5,1.293333
84,1.086705,1.166361,1.146222,1.36131,1.597343,0.925249,1.292187,1.004914,1.042196,1.218511,...,1.33594,1.274189,0.563968,0.072653,1.090909,1.296393,1.356457,1.365598,5,1.373333
85,0.895954,1.22365,1.376447,1.62769,1.591211,1.028553,1.363239,1.156692,1.124866,0.8508,...,0.901552,0.985987,0.454371,-0.048663,1.090909,1.028198,0.984323,0.957014,5,1.026667


In [25]:
company_dict["AFL"]["X_train"].shape

(28, 57)

In [26]:
company_dict["AFL"]["y_test"]

Unnamed: 0,EPS
79,1.106667
80,1.026667
81,1.0
82,1.266667
83,1.293333
84,1.373333
85,1.026667


In [27]:
# Custom Dataset
class RollingWindowDataset(Dataset):
    """Dataset of paired rolling windows and their scalar targets.

    Each item of ``X`` is indexed as ``X[idx][0]`` (returned first, the "normal"
    window) and ``X[idx][1]`` (returned second, the "years" window). In this
    notebook ``X`` is built with shape ``(N, 2, time_steps, n_features)``.

    Args:
        X: tensor of samples; copied, detached and cast to float32.
        y: tensor of targets aligned with ``X``; copied, detached and cast to float32.
        device: device that every returned item is moved to.
    """

    def __init__(self, X, y, device=device):
        self.X = X.clone().detach().to(torch.float)
        self.y = y.clone().detach().to(torch.float)
        # Remember the requested device so __getitem__ honours it instead of
        # reading the notebook-level global.
        self.device = device

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(normal_window, years_window, target)`` on ``self.device``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self) - 1]``.
        """
        n_samples = len(self.X)
        # Valid positions are 0 .. n_samples-1 (plus their negative counterparts);
        # idx == n_samples is already out of range.
        if idx >= n_samples or idx < -n_samples:
            raise IndexError("Index out of bounds")

        sample = self.X[idx]
        X_window_normal = sample[0]
        X_window_years = sample[1]
        y_target = self.y[idx]

        # The stored tensors are already detached float32; clone so callers receive
        # independent copies rather than views into the dataset's storage.
        return (
            X_window_normal.clone().to(self.device),
            X_window_years.clone().to(self.device),
            y_target.clone().to(self.device),
        )

In [28]:
# Build the rolling-window samples for every company.
#
# Each sample stacks two views of the same company's feature rows:
#   * "normal": the time_steps consecutive rows i .. i+time_steps-1
#   * "years" : num_freq rows taken every quarters_per_year rows, ending
#               quarters_per_year rows before the target (oldest first)
# The target is the scaled EPS at row i+time_steps. A sample goes to the train or
# test set according to the split ("Type") of its target row.
# np.stack requires both views to have the same number of rows, i.e.
# time_steps == num_freq.
X_train = []
y_train = []

X_test = []
y_test = []

num_freq = 4
time_steps = 4
quarters_per_year = 4

# The oldest "years" row sits quarters_per_year * num_freq rows before the target.
# Windows that would reach before row 0 are skipped (this is the former literal 12).
first_window_start = max(0, quarters_per_year * num_freq - time_steps)

# Sanity print of the split sizes for the last company in company_list.
print(len(company_dict[company_list[-1]]["X_train"]))
print((len(company_dict[company_list[-1]]["X_test"])))

# Route a finished sample to the right pair of lists by the label of its target row.
samples_by_split = {"Train": (X_train, y_train), "Test": (X_test, y_test)}

for company in company_list:
    company_dict[company]["y_train"]["Type"] = "Train"
    company_dict[company]["y_test"]["Type"] = "Test"

    comp_df_X = pd.concat([company_dict[company]['X_train'], company_dict[company]['X_test']], axis=0)
    comp_df_y = pd.concat([company_dict[company]['y_train'], company_dict[company]['y_test']], axis=0)

    # Convert once per company; positional slicing on arrays replaces the per-window
    # DataFrame slicing and concatenation.
    feature_rows = comp_df_X.to_numpy()
    target_values = comp_df_y["EPS"].to_numpy()
    split_labels = comp_df_y["Type"].to_numpy()

    for i in range(first_window_start, len(comp_df_X) - time_steps):
        target_pos = i + time_steps

        destination = samples_by_split.get(split_labels[target_pos])
        if destination is None:
            continue

        # Row positions one, two, ... num_freq years before the target, oldest first.
        yearly_positions = [
            target_pos - quarters_per_year * (freq + 1)
            for freq in range(num_freq - 1, -1, -1)
        ]

        normal_data = feature_rows[i:target_pos]
        years_data = feature_rows[yearly_positions]

        combined_sample = np.stack([normal_data, years_data], axis=0)

        destination[0].append(combined_sample)
        destination[1].append(target_values[target_pos])


X_train, y_train = np.array(X_train, dtype=np.float32), np.array(y_train, dtype=np.float32)
X_test, y_test = np.array(X_test, dtype=np.float32), np.array(y_test, dtype=np.float32)

28
7


In [29]:
y_test

array([0.46232828, 0.6600527 , 1.1005967 , ..., 0.8924731 , 1.0860215 ,
       0.87096775], dtype=float32)

In [30]:
X_test.shape

(1932, 2, 4, 57)

In [31]:
X_train.shape

(3312, 2, 4, 57)

In [32]:
X_train

array([[[[0.02312139, 0.549394  , 0.5711482 , ..., 0.82054096,
          3.        , 0.09463022],
         [0.01156069, 0.56738895, 0.6315996 , ..., 0.92179054,
          3.        , 0.14319412],
         [0.02312139, 0.6915168 , 0.71933264, ..., 0.88832325,
          3.        , 0.7120855 ],
         [0.01734104, 0.70069784, 0.8001963 , ..., 0.06945615,
          3.        , 0.45885944]],

        [[0.01734104, 0.05765698, 0.07458291, ..., 0.4613404 ,
          3.        , 0.03663105],
         [0.05202312, 0.1876607 , 0.12325807, ..., 0.7198991 ,
          3.        , 0.11252949],
         [0.02312139, 0.3676092 , 0.31167814, ..., 0.6401917 ,
          3.        , 0.0208131 ],
         [0.02312139, 0.549394  , 0.5711482 , ..., 0.82054096,
          3.        , 0.09463022]]],


       [[[0.01156069, 0.56738895, 0.6315996 , ..., 0.92179054,
          3.        , 0.14319412],
         [0.02312139, 0.6915168 , 0.71933264, ..., 0.88832325,
          3.        , 0.7120855 ],
         [0.01

In [33]:
# Copy the window arrays into float tensors on the selected device.
tensor_kwargs = dict(dtype=torch.float, device=device)

X_train_tensor = torch.tensor(X_train, **tensor_kwargs)
y_train_tensor = torch.tensor(y_train, **tensor_kwargs)

X_test_tensor = torch.tensor(X_test, **tensor_kwargs)
y_test_tensor = torch.tensor(y_test, **tensor_kwargs)

# Wrap each split in the dataset class consumed by the data loaders.
train_data = RollingWindowDataset(X_train_tensor, y_train_tensor, device=device)
test_data = RollingWindowDataset(X_test_tensor, y_test_tensor, device=device)

# Spot check: element 1 of an item is X_window_years, element 0 is X_window_normal.
print(test_data[0][1])
print(test_data[1][0])


tensor([[2.3121e-02, 5.4939e-01, 5.7115e-01, 3.5270e-01, 5.5084e-01, 6.5063e-01,
         4.7990e-01, 4.8476e-01, 5.3477e-01, 3.5875e-01, 1.9533e-02, 6.7355e-01,
         8.5791e-01, 2.2857e-01, 5.5722e-01, 6.7070e-01, 7.6954e-01, 5.1818e-01,
         5.1253e-01, 5.6915e-01, 5.7159e-01, 8.3693e-02, 8.9501e-02, 9.2823e-02,
         1.5051e-01, 1.7960e-01, 4.3189e-01, 4.6840e-01, 7.2820e-01, 1.5264e-01,
         1.0000e+00, 8.7710e-01, 1.0000e+00, 3.7497e-01, 1.6527e-01, 6.1584e-02,
         1.4052e-01, 1.0619e-01, 9.4630e-02, 3.0118e-01, 3.9878e-01, 7.0497e-01,
         6.2991e-01, 5.5769e-01, 4.5900e-01, 6.7100e-01, 5.6242e-01, 2.3496e-01,
         3.1670e-01, 1.3845e-01, 1.6157e-01, 7.4603e-01, 3.8222e-01, 0.0000e+00,
         8.2054e-01, 3.0000e+00, 9.4630e-02],
        [5.7803e-03, 6.5883e-01, 8.3729e-01, 4.2563e-01, 4.0777e-01, 6.1052e-01,
         6.2652e-01, 5.5750e-01, 5.3477e-01, 2.1812e-01, 5.8262e-01, 8.4487e-01,
         4.7481e-01, 9.5238e-02, 5.4219e-01, 4.9866e-01, 6.2303

In [34]:
# class TwoDimCNNLSTMModel(nn.Module):
#     def __init__(self, input_dim, hidden_size, layer_size, output_dim, dropout_prob, conv_channels, kernel_size, pool_size, stride):
#         super(TwoDimCNNLSTMModel, self).__init__()

#         self.hidden_size = hidden_size
#         self.layer_size = layer_size
#         self.hn, self.cn = None, None

#         conv_output_width = (input_dim - kernel_size)  + 1
#         # Pooling output width (no padding, considering the stride for pooling)
#         pool_output_width = (conv_output_width - pool_size) // stride + 1

#         self.lstm_input_size = conv_channels * pool_output_width  # LSTM input dimensions

#         self.conv = nn.Conv2d(in_channels=1, out_channels=conv_channels, kernel_size=(1, kernel_size))
#         self.relu1 = nn.ReLU()
#         self.maxpool = nn.MaxPool2d(kernel_size=(1,pool_size), stride=(1, stride))

#         self.lstm = nn.LSTM(input_size = self.lstm_input_size, hidden_size = self.hidden_size, num_layers=self.layer_size,
#                             dropout=(dropout_prob if self.layer_size > 1 else 0), batch_first=True)
                            
#         self.dropout = nn.Dropout(dropout_prob)
        
#         self.fc = nn.Linear(self.hidden_size, output_dim)

#     def init_hidden(self, batch_size):
#         # Initialize hidden and cell states with zeros
#         h0 = torch.zeros(self.layer_size, batch_size, self.hidden_size).to(device)
#         c0 = torch.zeros(self.layer_size, batch_size, self.hidden_size).to(device)
#         return (h0, c0)
    
#     def reset_hidden(self):
#         self.hn = None
#         self.cn = None

#     def forward(self, x):

#         batch_size, seq_len, num_of_feature = x.shape
#         x = x.view(batch_size, 1, seq_len, num_of_feature)

#         x = self.conv(x)
#         x = self.relu1(x)
#         x = self.maxpool(x)

#         batch_size, channels, height, width = x.shape
#         x = x.permute(0, 2, 3, 1) # [batch, height, width, channels]
#         x = x.reshape(batch_size, height*width, channels)
#         # print(x.shape)

#         assert x.size(-1) == self.lstm.input_size, f"Mismatch in LSTM input size. Expected: {self.lstm.input_size}, Got: {x.size(-1)}"

#         if self.hn == None or self.cn == None:
#             self.hn, self.cn = self.init_hidden(x.size(0))
        
#         else:
#             self.hn, self.cn = self.hn.detach(), self.cn.detach()
            
#         if self.hn.size(1) > x.size(0):
#             self.hn = self.hn[:, -x.size(0):, :].contiguous()
#             self.cn = self.cn[:, -x.size(0):, :].contiguous()

#         # Forward propagate LSTM
#         out, (self.hn, self.cn) = self.lstm(x, (self.hn, self.cn))

#         out = self.dropout(out[:, -1, :])  # Add dropout

#         out = self.fc(out)

#         return out

In [35]:

class TwoDimCNNLSTMModel(nn.Module):
    """Two-branch CNN + LSTM regressor over a quarterly and a yearly feature window.

    Each branch applies Conv2d -> ReLU -> MaxPool2d (pooling along the feature axis
    only) -> Dropout, flattens channels and pooled width into one feature vector per
    time step, and feeds the sequence to its own LSTM. The last time step of both
    LSTM outputs is concatenated, passed through dropout, and projected by a linear
    layer to ``output_dim``.

    The LSTM hidden/cell states are kept on the instance between ``forward`` calls.
    On every call after the first, the (detached) state of the *last sample* of the
    previous batch is broadcast to all samples of the new batch. Call
    ``reset_hidden()`` to start again from zeros.

    Args:
        input_dim: number of features per time step (width of the conv input).
        hidden_size: hidden size of each LSTM.
        layer_size: number of stacked LSTM layers.
        output_dim: size of the final linear output.
        dropout_prob: dropout probability (also used between LSTM layers when
            ``layer_size > 1``).
        conv_channels: number of output channels of each Conv2d.
        kernel_size: conv kernel width along the feature axis (height is fixed to 3).
        pool_size: max-pool window along the feature axis.
        stride: max-pool stride along the feature axis.

    Raises:
        ValueError: if the conv/pool settings leave no features for the LSTM.
    """

    def __init__(self, input_dim, hidden_size, layer_size, output_dim, dropout_prob, conv_channels, kernel_size, pool_size, stride):
        super().__init__()

        self.hidden_size = hidden_size
        self.layer_size = layer_size
        self.hn_quarterly, self.hn_yearly, self.cn_quarterly, self.cn_yearly = None, None, None, None
        self.padding = (1, 1)
        kernel_size = (3, kernel_size)
        conv_stride = (1, 1)

        # Standard conv / pool output-size arithmetic along the feature axis.
        conv_output_width = (input_dim - kernel_size[1] + 2 * self.padding[1]) // conv_stride[1] + 1
        pool_output_width = (conv_output_width - pool_size) // stride + 1
        if pool_output_width < 1:
            raise ValueError(
                f"kernel_size={kernel_size[1]}, pool_size={pool_size}, stride={stride} leave no features "
                f"for input_dim={input_dim} (pooled width would be {pool_output_width})."
            )

        self.lstm_input_size = conv_channels * pool_output_width  # LSTM input dimensions

        # Sub-modules are registered in the same order as before so parameter
        # initialisation order and state_dict key order are unchanged.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.maxpool = nn.MaxPool2d(kernel_size=(1, pool_size), stride=(1, stride))

        self.conv_quarterly = nn.Conv2d(in_channels=1, out_channels=conv_channels, kernel_size=kernel_size, stride=conv_stride, padding=self.padding)
        self.conv_yearly = nn.Conv2d(in_channels=1, out_channels=conv_channels, kernel_size=kernel_size, stride=conv_stride, padding=self.padding)

        lstm_dropout = dropout_prob if self.layer_size > 1 else 0
        self.lstm_quarterly = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.hidden_size, num_layers=self.layer_size,
                                      dropout=lstm_dropout, batch_first=True)
        self.lstm_yearly = nn.LSTM(input_size=self.lstm_input_size, hidden_size=self.hidden_size, num_layers=self.layer_size,
                                   dropout=lstm_dropout, batch_first=True)

        self.fc = nn.Linear(self.hidden_size * 2, output_dim)

    def init_hidden(self, batch_size):
        """Return zero ``(h0, c0)`` states shaped (layer_size, batch_size, hidden_size).

        The tensors are created on the model's own device/dtype (taken from the
        final linear layer) instead of a notebook-global ``device`` variable.
        """
        reference = self.fc.weight
        shape = (self.layer_size, batch_size, self.hidden_size)
        h0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return (h0, c0)

    def reset_hidden(self):
        """Forget the carried LSTM states; the next forward pass starts from zeros."""
        self.hn_quarterly, self.hn_yearly, self.cn_quarterly, self.cn_yearly = None, None, None, None

    def _encode(self, conv, window):
        """Run one CNN branch and return a (batch, seq_len, channels*width) sequence."""
        # Add the single input channel expected by Conv2d: (batch, 1, seq_len, features).
        # unsqueeze also works for non-contiguous inputs, unlike .view.
        features = conv(window.unsqueeze(1))
        features = self.relu(features)
        features = self.maxpool(features)
        features = self.dropout(features)

        batch_size, channels, height, width = features.shape
        features = features.permute(0, 2, 3, 1)  # [batch, height, width, channels]
        return features.reshape(batch_size, height, channels * width)

    @staticmethod
    def _carry_state(state, batch_size):
        """Detach ``state`` and broadcast its last sample's slice to ``batch_size`` samples."""
        return state.detach()[:, -1:, :].repeat(1, batch_size, 1)

    def forward(self, data_quarterly, data_yearly):
        """Predict from a quarterly and a yearly window, each (batch, seq_len, features).

        Returns a tensor of shape (batch, output_dim). Updates the carried LSTM
        states as a side effect.
        """
        quarterly_batch = data_quarterly.size(0)
        yearly_batch = data_yearly.size(0)

        # Quarterly branch first, then yearly, so dropout masks are drawn in the same order as before.
        out_quarterly = self._encode(self.conv_quarterly, data_quarterly)
        out_yearly = self._encode(self.conv_yearly, data_yearly)

        assert out_quarterly.size(-1) == self.lstm_quarterly.input_size, f"Mismatch in LSTM Quarterly input size. Expected: {self.lstm_quarterly.input_size}, Got: {out_quarterly.size(-1)}"
        assert out_yearly.size(-1) == self.lstm_yearly.input_size, f"Mismatch in LSTM Yearly input size. Expected: {self.lstm_yearly.input_size}, Got: {out_yearly.size(-1)}"

        if self.hn_quarterly is None or self.hn_yearly is None:
            self.hn_quarterly, self.cn_quarterly = self.init_hidden(quarterly_batch)
            self.hn_yearly, self.cn_yearly = self.init_hidden(yearly_batch)
        else:
            # Truncate backprop at the batch boundary and resize the carried state to
            # the current batch by repeating the previous batch's last-sample state.
            self.hn_quarterly = self._carry_state(self.hn_quarterly, quarterly_batch)
            self.cn_quarterly = self._carry_state(self.cn_quarterly, quarterly_batch)
            self.hn_yearly = self._carry_state(self.hn_yearly, yearly_batch)
            self.cn_yearly = self._carry_state(self.cn_yearly, yearly_batch)

        # Forward propagate LSTM
        out_quarterly, (self.hn_quarterly, self.cn_quarterly) = self.lstm_quarterly(out_quarterly, (self.hn_quarterly, self.cn_quarterly))
        out_yearly, (self.hn_yearly, self.cn_yearly) = self.lstm_yearly(out_yearly, (self.hn_yearly, self.cn_yearly))

        out = torch.cat((out_quarterly, out_yearly), dim=2)

        out = self.dropout(out[:, -1, :])  # keep only the last time step, then dropout

        out = self.fc(out)

        return out

In [36]:
def find_supported_splits(comp_size, num_companies):
    """Return every split count that divides ``comp_size`` evenly, in ascending order.

    ``num_companies`` is accepted for call compatibility but is not used.
    """
    return [candidate for candidate in range(1, comp_size + 1) if comp_size % candidate == 0]


def custom_time_series_folds(data, n_splits):
    """Yield expanding-window ``(train_idx, val_idx)`` index lists, company by company.

    ``data`` is treated as ``len(company_list)`` consecutive, equally sized blocks
    (one per company, using the notebook-global ``company_list``). Each block is cut
    into ``n_splits`` chunks; fold ``i`` trains on the first ``i + 1`` chunks of
    every company and validates on the chunk that follows, so ``n_splits - 1``
    folds are produced in total.

    Args:
        data: any sized container; only ``len(data)`` is used.
        n_splits: number of chunks per company.

    Yields:
        Tuples ``(train_indices, val_indices)`` of plain ``int`` lists.

    Raises:
        ValueError: if the per-company sample count is not divisible by ``n_splits``.
    """
    num_companies = len(company_list)
    total_size = len(data)
    # NOTE(review): integer division silently ignores trailing samples when
    # len(data) is not a multiple of the number of companies — confirm upstream.
    comp_size = total_size // num_companies
    comp_fold_size = comp_size // n_splits

    if comp_size % n_splits != 0:
        supported_splits = find_supported_splits(comp_size, n_splits)
        raise ValueError(
            f"n_splits={n_splits} does not evenly divide the {comp_size} samples per company "
            f"(fold size would be {comp_fold_size}); supported n_splits: {supported_splits}"
        )

    for fold_number in range(1, n_splits):
        current_fold_train_idx = []
        current_fold_val_idx = []

        for company_position in range(num_companies):
            start_idx = company_position * comp_size
            val_start_idx = start_idx + fold_number * comp_fold_size
            end_idx = val_start_idx + comp_fold_size

            # Everything before the validation chunk is training data for this company.
            current_fold_train_idx.extend(range(start_idx, val_start_idx))
            current_fold_val_idx.extend(range(val_start_idx, end_idx))

        yield current_fold_train_idx, current_fold_val_idx


In [37]:
class ModelActioner:
    """Cross-validates, trains and evaluates a ``TwoDimCNNLSTMModel`` with MSE loss.

    Args:
        train_data: dataset yielding ``(quarterly_window, yearly_window, target)``
            samples; used for cross-validation and final training.
        test_data: dataset with the same sample layout, used by ``test``.
        device: torch device the model and every batch are moved to.

    Every ``config`` argument is a dict with the keys ``batch_size``, ``epochs``,
    ``hidden_size``, ``num_layers``, ``learning_rate``, ``dropout_prob``,
    ``weight_decay``, ``lr_step_size``, ``gamma``, ``kernel_size``,
    ``conv_channels``, ``pool_size`` and ``stride`` (``test`` only reads
    ``batch_size``).
    """

    def __init__(self, train_data, test_data, device):
        self.train_data = train_data
        self.test_data = test_data
        self.device = device
        self.model = None
        self.optimizer = None
        self.criterion = nn.MSELoss()

    def _build_training_state(self, config):
        """Create a fresh model and Adam optimizer on ``self``; return the LR scheduler."""
        self.model = TwoDimCNNLSTMModel(
            input_dim=self.train_data[0][0].shape[1],  # feature count of the quarterly window
            hidden_size=config["hidden_size"],
            layer_size=config["num_layers"],
            dropout_prob=config["dropout_prob"],
            output_dim=1,
            conv_channels=config["conv_channels"],
            kernel_size=config["kernel_size"],
            pool_size=config["pool_size"],
            stride=config["stride"],
        ).to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])

        # Plateau patience is a fraction of the epoch budget: epochs // lr_step_size.
        patience = config["epochs"] // config["lr_step_size"]
        return ReduceLROnPlateau(self.optimizer, patience=patience, factor=config["gamma"], mode="min")

    def _run_epoch(self, loader, training, collected_preds=None):
        """Run one pass over ``loader`` and return the sample-weighted mean loss.

        Args:
            loader: iterable of ``(quarterly, yearly, target)`` batches.
            training: when True, the model is put in train mode and the optimizer
                is stepped per batch; otherwise eval mode with gradients disabled.
            collected_preds: optional list extended with one numpy row per sample.

        Raises:
            ValueError: if ``loader`` yields no samples.
        """
        self.model.train(training)

        running_loss = 0.0
        total_samples = 0

        with torch.set_grad_enabled(training):
            for data_normal, data_year, target in loader:
                # Make sure the batch lives on the same device as the model.
                data_normal = data_normal.to(self.device)
                data_year = data_year.to(self.device)
                target = target.to(self.device).reshape(-1, 1)

                if training:
                    self.optimizer.zero_grad()

                preds = self.model(data_normal, data_year)
                loss = self.criterion(preds, target)

                if training:
                    loss.backward()
                    self.optimizer.step()  # Update model params

                # The criterion returns a batch mean; weight it by batch size so the
                # smaller last batch does not distort the epoch average.
                running_loss += loss.item() * data_normal.size(0)
                total_samples += data_normal.size(0)

                if collected_preds is not None:
                    collected_preds.extend(preds.detach().cpu().numpy())

        if total_samples == 0:
            raise ValueError("The data loader produced no samples.")
        return running_loss / total_samples

    def custom_time_series_cv(self, config, trial):
        """Expanding-window cross-validation used as the Optuna objective.

        A fresh model is trained for ``config["epochs"]`` epochs on every fold.
        The validation loss of each epoch is reported to ``trial`` for pruning.

        Returns:
            The mean over folds of each fold's final-epoch validation loss.

        Raises:
            optuna.TrialPruned: when the trial's pruner asks to stop.
            ValueError: if no fold completed at least one epoch.
        """
        batch_size = config["batch_size"]
        epochs = config["epochs"]

        num_of_fold = 5
        # custom_time_series_folds yields one fold fewer than the number of chunks.
        expected_folds = num_of_fold - 1
        fold_results = []

        for fold, (train_idx, val_idx) in enumerate(custom_time_series_folds(self.train_data, num_of_fold)):
            print(f"Fold: {fold+1}/{expected_folds}")

            # Order matters for the stateful LSTM, so batches are never shuffled.
            train_loader = DataLoader(Subset(self.train_data, train_idx), batch_size=batch_size, shuffle=False)
            val_loader = DataLoader(Subset(self.train_data, val_idx), batch_size=batch_size, shuffle=False)

            scheduler = self._build_training_state(config)
            val_loss = None

            for epoch in range(epochs):
                print('epochs {}/{}'.format(epoch+1,epochs))

                # Hidden state is reset once per epoch; validation continues from
                # the state left by the last training batch.
                self.model.reset_hidden()
                train_loss = self._run_epoch(train_loader, training=True)
                val_loss = self._run_epoch(val_loader, training=False)
                scheduler.step(train_loss)

                # Pruner steps must be unique across folds.
                unique_step = fold * epochs + epoch
                trial.report(val_loss, unique_step)

                if trial.should_prune():
                    raise optuna.TrialPruned()

                current_lr = self.optimizer.param_groups[0]['lr']

                print(f'Current Learning Rate: {current_lr}')
                print(f"train_loss: {train_loss}, val_loss: {val_loss}")

            # Score the fold by its final validation loss only; averaging every
            # epoch would mostly measure the early, untrained part of the curve.
            if val_loss is not None:
                fold_results.append(val_loss)

        if not fold_results:
            raise ValueError("Cross-validation produced no fold results.")

        mean_val_loss = np.mean(fold_results)
        print(f"Mean validation loss: {mean_val_loss}")
        return mean_val_loss

    def train(self, config):
        """Train a fresh model on all of ``train_data`` and return it.

        The learning-rate scheduler monitors the training loss because no
        validation split is held out here.
        """
        epochs = config["epochs"]

        scheduler = self._build_training_state(config)

        # Creating data loader
        train_loader = DataLoader(dataset=self.train_data, batch_size=config["batch_size"], shuffle=False)

        # Training Loop
        for epoch in range(epochs):
            print('epochs {}/{}'.format(epoch+1,epochs))

            self.model.reset_hidden()
            train_loss = self._run_epoch(train_loader, training=True)

            scheduler.step(train_loss)
            current_lr = self.optimizer.param_groups[0]['lr']

            print(f'Current Learning Rate: {current_lr}')
            print(f"train_loss: {train_loss}")

        return self.model

    def test(self, config):
        """Evaluate the current model on ``test_data``.

        Prints the sample-weighted mean test loss and returns the predictions as
        a list with one numpy row per test sample, in dataset order.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model available; call train() before test().")

        test_loader = DataLoader(dataset=self.test_data, batch_size=config["batch_size"], shuffle=False)

        all_preds = []
        test_loss = self._run_epoch(test_loader, training=False, collected_preds=all_preds)
        print(f"test_loss: {test_loss}")

        return all_preds
    


In [38]:
def objective(trial):
    """Optuna objective: sample a hyperparameter config and return its CV loss.

    Uses the notebook-global ``train_data``, ``test_data`` and ``device``.
    ``learning_rate`` and ``weight_decay`` are sampled on a log scale because
    their ranges span five orders of magnitude; uniform sampling would almost
    never propose values below 1e-3.

    Args:
        trial: the ``optuna.trial.Trial`` being evaluated.

    Returns:
        The value returned by ``ModelActioner.custom_time_series_cv``.
    """
    config = {
        "batch_size": trial.suggest_int("batch_size", 22, 100),
        "epochs": trial.suggest_int("epochs", 100, 400),
        "hidden_size": trial.suggest_int("hidden_size", 50, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-1, log=True),
        "dropout_prob": trial.suggest_float("dropout_prob", 0.1, 0.3),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True),
        # Divisor of the epoch budget: scheduler patience = epochs // lr_step_size.
        "lr_step_size": trial.suggest_int("lr_step_size", 3, 10),
        "gamma": trial.suggest_float("gamma", 1e-4, 1e-1),
        "conv_channels": trial.suggest_int("conv_channels", 16, 128, step=16),
        "kernel_size": trial.suggest_int("kernel_size", 3, 11, step=2),
        "num_layers": trial.suggest_int("num_layers", 1, 6),
        "pool_size": trial.suggest_int("pool_size", 3, 17, step=2),
        "stride": trial.suggest_int("stride", 1, 4)
    }

    trainer = ModelActioner(train_data, test_data, device)

    return trainer.custom_time_series_cv(config, trial)

In [39]:
# study_name = "CNN-LSTM-Tunner"
# storage_url = "sqlite:///db.sqlite3"

# storage = optuna.storages.RDBStorage(url=storage_url)

# # Check if the study exists
# study_names = [study.study_name for study in optuna.study.get_all_study_summaries(storage=storage)]
# if study_name in study_names:
#     # Delete the study if it exists
#     print(f"Deleting study '{study_name}'")
#     optuna.delete_study(study_name=study_name, storage=storage_url)
# else:
#     print(f"Study '{study_name}' does not exist in the storage.")
    
# study = optuna.create_study(direction='minimize', 
#                             storage=storage_url, 
#                             sampler=TPESampler(),
#                             pruner=HyperbandPruner(
#                             min_resource=3,  # Minimum resource allocated to a trial
#                             max_resource='auto',  # Maximum resource allocated to a trial, 'auto' calculates it based on the first trial
#                             reduction_factor=2  # Reduction factor for pruning
#                             ),
#                             study_name=study_name,
#                             load_if_exists=False)

# pbar = tqdm(total=20, desc='Optimizing', unit='trial')

# def callback(study, trial):
#     # Update the progress bar
#     pbar.update(1)
#     pbar.set_postfix_str(f"Best Value: {study.best_value:.4f}")

# study.optimize(objective, n_trials=20, callbacks=[callback])
# pbar.close()

# # Best hyperparameters
# print('Number of finished trials:', len(study.trials))
# print('Best trial:')
# trial = study.best_trial

# print('Value:', trial.value)
# print('Params:')
# for key, value in trial.params.items():
#     print(f'{key}: {value}')

In [40]:
# Fixed hyperparameters for the final training run.
# NOTE(review): presumably the best trial of the Optuna study — confirm.
config = dict(
    batch_size=99,
    epochs=201,
    hidden_size=393,
    learning_rate=0.0007614988524301935,
    dropout_prob=0.14575155207300822,
    weight_decay=0.0007841290102129483,
    lr_step_size=10,
    gamma=0.09641306473538694,
    conv_channels=48,
    kernel_size=5,
    num_layers=2,
    pool_size=17,
    stride=1,
)

In [41]:
# config = {
#     "batch_size": 43,
#     "epochs": 338,
#     "hidden_size": 178,
#     "learning_rate": 0.003583015916083878,
#     "dropout_prob": 0.2531041717674941,
#     "weight_decay":  0.038916614038269626,
#     "lr_step_size": 8, 
#     "gamma":  0.002654162304612564,
#     "conv_channels": 48,
#     "kernel_size": 5,
#     "num_layers": 1,
#     "pool_size": 7,
#     "stride": 1
# }

# `model` is the ModelActioner wrapper (not the network itself); train() returns the
# fitted network, which the notebook displays as this cell's output.
model = ModelActioner(train_data, test_data, device)
model.train(config)

epochs 1/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.10587439333777064
epochs 2/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.06751127926754238
epochs 3/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.07086904529158188
epochs 4/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.0614387351391918
epochs 5/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.061844915870334145
epochs 6/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.06073295057285577
epochs 7/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.05797642583796835
epochs 8/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.05684410342845418
epochs 9/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.05574033086431091
epochs 10/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.05499178215699351
epochs 11/201
Current Learning Rate: 0.0007614988524301935
train_loss: 0.05435772642047833
epochs 1

TwoDimCNNLSTMModel(
  (relu): ReLU()
  (dropout): Dropout(p=0.14575155207300822, inplace=False)
  (maxpool): MaxPool2d(kernel_size=(1, 17), stride=(1, 1), padding=0, dilation=1, ceil_mode=False)
  (conv_quarterly): Conv2d(1, 48, kernel_size=(3, 5), stride=(1, 1), padding=(1, 1))
  (conv_yearly): Conv2d(1, 48, kernel_size=(3, 5), stride=(1, 1), padding=(1, 1))
  (lstm_quarterly): LSTM(1872, 393, num_layers=2, batch_first=True, dropout=0.14575155207300822)
  (lstm_yearly): LSTM(1872, 393, num_layers=2, batch_first=True, dropout=0.14575155207300822)
  (fc): Linear(in_features=786, out_features=1, bias=True)
)

In [42]:
# Column-vector view of the test targets, matching the (n_samples, 1) prediction layout.
y_test = y_test.reshape(-1, 1)
print(y_test.shape[0])
y_test[:4]

1932


array([[0.46232828],
       [0.6600527 ],
       [1.1005967 ],
       [0.50395447]], dtype=float32)

In [43]:
# Evaluate on the held-out test set: prints the mean test loss and returns one
# prediction row per test sample (still in the scaled target space — the next
# cells apply each company's scaler_y.inverse_transform).
preds = model.test(config)

test_loss: 0.18060287689755422


In [44]:
# Stack the per-sample prediction rows into one array and sanity-check the count
# against the number of test targets.
preds = np.array(preds)
len(preds)

1932

In [45]:
# Undo the per-company target scaling and score the predictions in original EPS units.
# Assumes preds / y_test are ordered company by company (in company_list order) with the
# same number of test samples per company — TODO confirm against the dataset builder.
preds_inverse = []
y_true_inverse = []

idx = 0
data_size = len(y_test)//len(company_list)
print(data_size)

preds_comp_data = []
y_true_comp_data = pd.DataFrame()

for company in company_list:
    scaler = company_dict[company]["scaler_y"]
    company_preds = scaler.inverse_transform(preds[idx:idx+data_size])
    company_true = scaler.inverse_transform(y_test[idx:idx+data_size].reshape(-1, 1))

    preds_inverse.append(company_preds)
    y_true_inverse.append(company_true)

    # Exactly one record per company. Re-walking the whole preds_inverse list here
    # would re-add every earlier company's predictions under the current label.
    preds_comp_data.append({"Prediction": company_preds, "Company": company})

    idx += data_size

# Per-company blocks, shape (n_companies, data_size, 1); used by the plotting cells.
comp_preds_inverse = np.array(preds_inverse)
comp_true_inverse = np.array(y_true_inverse)

# Flat vectors for the aggregate metrics.
preds_inverse = comp_preds_inverse.flatten()
y_true_inverse = comp_true_inverse.flatten()
print(y_true_inverse.shape)
print(preds_inverse.shape)

mse = mean_squared_error(y_true_inverse, preds_inverse)
# MAPE divides by |y_true|, so EPS values at or near zero dominate (and can blow up) this score.
mape = mean_absolute_percentage_error(y_true_inverse, preds_inverse)*100
mae = mean_absolute_error(y_true_inverse, preds_inverse)
r2 = r2_score(y_true_inverse,preds_inverse)

print(f"MAPE Score: %{mape:.2f}")
print(f"MSE Score: {mse:.2f}")
print(f"MAE Score: {mae:.2f}")
print(f"R_2 Score: {r2:.2f}")

7
(1932,)
(1932,)
MAPE Score: %39903412224000.00
MSE Score: 2.16
MAE Score: 0.51
R_2 Score: 0.82


In [46]:
preds_inverse

array([0.53763795, 0.60360396, 0.9559994 , ..., 0.78240067, 0.7939631 ,
       0.76260823], dtype=float32)

In [47]:
len(y_true_inverse)

1932

In [48]:
comp_preds_inverse

array([[[0.53763795],
        [0.60360396],
        [0.9559994 ],
        ...,
        [0.49962127],
        [0.4873571 ],
        [0.659146  ]],

       [[0.86927736],
        [0.93015146],
        [0.85797304],
        ...,
        [0.8986802 ],
        [0.9699023 ],
        [0.8639244 ]],

       [[1.1256521 ],
        [1.1355134 ],
        [1.2155911 ],
        ...,
        [1.157536  ],
        [1.0934092 ],
        [1.1320057 ]],

       ...,

       [[0.9923983 ],
        [1.082393  ],
        [1.1534083 ],
        ...,
        [0.6768262 ],
        [1.0467108 ],
        [0.8009682 ]],

       [[2.169678  ],
        [2.3004222 ],
        [2.4550488 ],
        ...,
        [1.6822634 ],
        [1.6945587 ],
        [1.8440305 ]],

       [[0.87012047],
        [0.83966094],
        [0.9537636 ],
        ...,
        [0.78240067],
        [0.7939631 ],
        [0.76260823]]], dtype=float32)

In [49]:
y_true_inverse

array([0.585     , 0.7275    , 1.045     , ..., 0.98999995, 1.17      ,
       0.96999997], dtype=float32)

In [50]:
# NOTE(review): preds_inverse was already flattened in the metrics cell above, so
# this re-flatten appears to be a no-op copy — confirm before removing.
preds_inverse = np.array(preds_inverse).flatten()
preds_inverse

array([0.53763795, 0.60360396, 0.9559994 , ..., 0.78240067, 0.7939631 ,
       0.76260823], dtype=float32)

In [51]:
comp_true_inverse

array([[[0.585     ],
        [0.7275    ],
        [1.045     ],
        ...,
        [0.545     ],
        [0.75750005],
        [1.2475    ]],

       [[1.06      ],
        [1.03      ],
        [1.02      ],
        ...,
        [1.13      ],
        [1.16      ],
        [1.03      ]],

       [[1.36      ],
        [1.3100001 ],
        [1.5300001 ],
        ...,
        [1.55      ],
        [1.5300001 ],
        [1.7299999 ]],

       ...,

       [[0.92      ],
        [1.46      ],
        [1.41      ],
        ...,
        [0.73      ],
        [0.75      ],
        [0.41      ]],

       [[2.48      ],
        [2.88      ],
        [3.1000001 ],
        ...,
        [3.0200002 ],
        [3.43      ],
        [3.56      ]],

       [[0.89000005],
        [1.04      ],
        [1.0799999 ],
        ...,
        [0.98999995],
        [1.17      ],
        [0.96999997]]], dtype=float32)

In [52]:
len(y_test)

1932

In [53]:
# One integer position per test sample, for plotting the flattened series.
# (The former bare `y_test.flatten()` discarded its result and had no effect.)
x_indices = np.arange(len(y_true_inverse))
preds = np.array(preds)
preds

array([[0.39661157],
       [0.488142  ],
       [0.9771047 ],
       ...,
       [0.66924804],
       [0.6816807 ],
       [0.64796585]], dtype=float32)

In [54]:
x_indices

array([   0,    1,    2, ..., 1929, 1930, 1931])

In [55]:
pivoted_data.shape[1]

61

In [56]:
len(y_test)//len(company_list)

7

In [57]:
len(y_test)

1932

In [58]:
# Quarter labels of the test window, repeated once per company.
# NOTE: this rebinds x_indices (previously an integer range) to a nested list.
quarter_labels = ["2018-Q2", "2018-Q3", "2018-Q4", "2019-Q1", "2019-Q2", "2019-Q3", "2019-Q4"]
x_indices = [quarter_labels] * len(company_list)

In [59]:
x_indices

[['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019-Q1', '2019-Q2', '2019-Q3', '2019-Q4'],
 ['2018-Q2', '2018-Q3', '2018-Q4', '2019

In [60]:
# trace_true = go.Scatter(x=x_indices, y=y_true_inverse, mode="lines+markers", name=f"All Stocks True: EPS - PENDS")
# trace_preds = go.Scatter(x=x_indices, y=preds_inverse, mode="lines+markers", name=f"All Stocks Preds: EPS - PENDS")


stock_symbol = "Multiple Companies"

# Figure shell: title, axis labels and canvas size.
layout = go.Layout(
    title=f"{stock_symbol}: EPS - PENDS",
    xaxis=dict(title='Date'),
    yaxis=dict(title='EPS', side='left', rangemode='tozero'),
    height=700,
    width=1400
    )
fig = go.Figure(layout=layout)

# Concatenated series across all companies, in company_list order.
extended_x_indices = []
all_true_y = []
all_preds_y = []

for i, company in enumerate(company_list):
    # Suffix each quarter label with the company so x values stay unique across companies.
    extended_x_indices.extend(f"{x} {company}" for x in x_indices[i])
    all_true_y.extend(comp_true_inverse[i].flatten())
    all_preds_y.extend(comp_preds_inverse[i].flatten())

# True vs. predicted EPS over the concatenated axis.
trace_true = go.Scatter(x=extended_x_indices, y=all_true_y, mode="lines+markers", name="All Companies True: EPS - PENDS")
trace_preds = go.Scatter(x=extended_x_indices, y=all_preds_y, mode="lines+markers", name="All Companies Preds: EPS - PENDS")

for trace in (trace_true, trace_preds):
    fig.add_trace(trace)

# Show the figure
fig.show()

In [61]:
all_true_y

[0.585,
 0.7275,
 1.045,
 0.61499995,
 0.545,
 0.75750005,
 1.2475,
 1.06,
 1.03,
 1.02,
 1.12,
 1.13,
 1.16,
 1.03,
 1.36,
 1.3100001,
 1.5300001,
 1.23,
 1.55,
 1.5300001,
 1.7299999,
 0.69,
 0.83,
 0.62,
 0.8899999,
 0.9599999,
 1.1199999,
 1.26,
 0.25349998,
 0.2875,
 0.302,
 0.3545,
 0.261,
 0.21149999,
 0.3235,
 1.35,
 1.3100001,
 2.13,
 1.2900001,
 1.6099999,
 1.42,
 2.24,
 1.71,
 1.3,
 2.1599998,
 3.3100002,
 1.8699999,
 1.45,
 2.5300002,
 0.65999997,
 0.60999995,
 0.74,
 0.53,
 0.60999995,
 0.53,
 0.55999994,
 0.5,
 0.63000005,
 0.31000003,
 0.099999994,
 0.110000014,
 -0.29,
 0.080000006,
 1.9499999,
 2.0,
 1.86,
 1.92,
 2.17,
 2.27,
 2.14,
 0.45,
 0.495,
 0.525,
 0.445,
 0.45999998,
 0.475,
 0.49,
 0.56999993,
 1.99,
 -0.29999998,
 1.1100001,
 0.68000007,
 -0.44,
 1.7399999,
 1.8399999,
 1.39,
 2.79,
 1.23,
 1.2099999,
 1.9999999,
 1.1999999,
 4.992362,
 5.004212,
 6.2013674,
 6.032232,
 6.6098003,
 6.4043884,
 6.7741413,
 1.66,
 1.45,
 1.52,
 1.48,
 1.72,
 1.66,
 1.7300001,

In [62]:
pivoted_data[pivoted_data["OFTIC"] == "AFL"]["EPS"].tail(3)

83    1.13
84    1.16
85    1.03
Name: EPS, dtype: float64

In [63]:
# Flat comparison of every test sample: true vs. predicted EPS in original units.
# The x positions are built here rather than read from `x_indices`, because an
# earlier cell rebinds `x_indices` to a nested per-company list of quarter labels
# that no longer lines up one-to-one with the flattened value arrays.
flat_positions = np.arange(len(y_true_inverse))

# Distinct legend names so the two series can be told apart.
trace_true = go.Scatter(x=flat_positions, y=y_true_inverse, mode="lines+markers", name="All Stocks True: EPS - PENDS")
trace_preds = go.Scatter(x=flat_positions, y=preds_inverse, mode="lines+markers", name="All Stocks Preds: EPS - PENDS")


layout = go.Layout(
    title = f"{stock_symbol}: EPS - PENDS",
    xaxis=dict(title='Test sample index'),
    yaxis=dict(title='EPS', side='left', rangemode='tozero'),
    height=600,
)

fig = go.Figure(data=[trace_true, trace_preds], layout=layout)
fig.show()