# Full Two-Step Pipeline

In [None]:
# imports
import pandas as pd
import numpy as np
import wandb

## Creating Surface

In [None]:
from data_handler import preprocessData, fitSurface

In [None]:
# Preprocess one year of raw option data (spy_options_data_<yy>.json), fit a
# surface to it with fitSurface, and save the result as CSV.
from pathlib import Path

# Two-digit year to process. Both paths are derived from it so the input file
# and the output file cannot drift apart (e.g. set to 22 for the 2022 data).
year = 23
json_path = f"../data/raw/yearwise/spy_options_data_{year}.json"
csv_save_path = f"../data/processed/pca/predicted_iv{year}.csv"

preprocess = preprocessData(json_path)
df = preprocess.fit()
print(df.head())

# Make sure the output folder exists before the surface fit runs, so a missing
# directory is not discovered only when to_csv() is called at the very end.
Path(csv_save_path).parent.mkdir(parents=True, exist_ok=True)

fit = fitSurface(df)
predicted_iv = fit.fit()
print(predicted_iv)
predicted_iv.to_csv(csv_save_path, index=False)

## PCA Feature Extraction

In [None]:
from feature_extraction import featureExtractor

In [None]:
# Yearly surface CSVs (predicted_iv16 ... predicted_iv20) that feed the
# feature extractor; the extracted features are written to `save_path`.
paths = [
    f"../data/processed/pca/predicted_iv{yy}.csv"
    for yy in range(16, 21)
]
save_path = "../data/processed/features_pca_iv16-20.csv"

# NOTE(review): transform() is called before fit(), the reverse of the usual
# scikit-learn order - confirm this is what featureExtractor expects.
fe = featureExtractor(paths, save_path)
fe.transform()
fe.fit()

In [None]:
# Sanity check on the saved feature file: preview the first rows, count NaNs
# per column, count fully duplicated rows, show the shape, preview the last rows.
df = pd.read_csv(save_path)
print(
    df.head(),
    df.isnull().sum(),
    df.duplicated().sum(),
    df.shape,
    df.tail(),
    sep="\n",
)

## VAE Feature Extraction

In [None]:
from models.vae_master import VAE, IVSDataset, IVSFeatureExtractor

In [None]:
# TODO: bring the VAE feature-extraction steps into this cell (currently a placeholder)

## LSTM

In [None]:
from models.lstm import CustomLSTMCell, CustomLSTMModel, ModelManager, DatasetManager

In [None]:
# pca_path = "../data/processed/features_pca_iv16-20.csv"
# dataset = DatasetManager(pca_path)
# features, targets = dataset.make_train_target_pairs()
# print('Features shape:', features.shape)
# print('Targets shape:', targets.shape)

In [None]:
# #Split the dataset into training and validation sets
# split = int(0.8 * len(features))
# train_features, val_features = features[:split], features[split:]
# train_targets, val_targets = targets[:split], targets[split:]

# no split
# train_features, train_targets = features, targets

In [None]:
# print('Initializing model...')
# model_path = './lstm1620_256.pth'
# model = ModelManager(input_dim=9, hidden_dim=256, output_dim=3, model_path=model_path, learning_rate=0.01)
# model.train(train_features, train_targets, epochs=160000)

# #val_loader = DataLoader(TensorDataset(val_features, val_targets), batch_size=1, shuffle=False)
# #model.validate(val_loader)

# model.save_model()

## DNN

In [None]:
import pandas as pd
import numpy as np
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
from models.lstm import CustomLSTMCell, CustomLSTMModel, ModelManager, DatasetManager

In [None]:
# Build the LSTM and load its checkpoint (presumably the trained weights -
# load_model is defined in models.lstm and not visible here).
lstm_model_path = "./ckpts/lstm1620_256.pth"
#lstm_model_path = './ckpts/test_bilstm256.pth'
# input_dim=9 matches the 9-value input assembled in the prediction loop
# (3 features x 3 windows); output_dim=3 matches the F1-F3 columns it fills.
lstm_model = CustomLSTMModel(input_dim=9, hidden_dim=256, output_dim=3)
# NOTE(review): the model is never moved to `device` in the visible cells,
# while the prediction loop moves its inputs with .to(device) - confirm that
# load_model()/predict() take care of device placement.
lstm_model.load_model(model_path=lstm_model_path)

In [None]:
# Load the feature table from the same path the PCA step uses as save_path.
features = pd.read_csv("../data/processed/features_pca_iv16-20.csv")
features.head()  # notebook preview of the first rows

In [None]:
# Roll the loaded LSTM over the feature history and store its three outputs
# in columns F1-F3 of `features`.
#
# For every row i >= 22 the model input is the concatenation of three
# 3-element vectors built from feature1-feature3 of the preceding rows:
#   ma1  - the previous row (i-1),
#   ma5  - mean of rows i-5 .. i-1,
#   ma22 - mean of rows i-22 .. i-1,
# i.e. 9 values. Rows 0-21 have no full 22-row history and get no prediction,
# so F1-F3 stay NaN there once the columns exist.
# Positions and labels are used interchangeably (iloc for reads, .at for
# writes), which assumes `features` has the default 0..n-1 index.
lstm_input_cols = ['feature1', 'feature2', 'feature3']
lstm_output_cols = ["F1", "F2", "F3"]

# Select the input columns once; the selection is the same on every iteration
# and is independent of the F1-F3 columns written below.
lstm_inputs = features[lstm_input_cols]

# Inference only: disable autograd so no graph is recorded for the thousands
# of forward passes below. Predicted values are unaffected.
# NOTE(review): inputs are moved to `device`; confirm lstm_model lives on the
# same device (or that predict() handles placement).
with torch.no_grad():
    for i in range(22, len(features)):
        ma1 = torch.tensor(lstm_inputs.iloc[i-1].astype(float).values, dtype=torch.float32)
        ma5 = torch.tensor(lstm_inputs.iloc[i-5:i].mean(axis=0).values, dtype=torch.float32)
        ma22 = torch.tensor(lstm_inputs.iloc[i-22:i].mean(axis=0).values, dtype=torch.float32)
        feature = torch.cat((ma1, ma5, ma22), dim=0).to(device)
        out = lstm_model.predict(feature)
        # The output is iterated along its first dimension; if it ever holds
        # more than one entry, the last one is what remains in row i.
        for obj in out:
            for k, col in enumerate(lstm_output_cols):
                features.at[i, col] = obj[k].item()

In [None]:
# Drop every row that contains a NaN in any column - this removes the rows
# that never received F1-F3 values - then renumber the index from 0.
features = features.dropna()
features = features.reset_index(drop=True)
features.head()

In [None]:
# Yearly surface CSVs (predicted_iv16 ... predicted_iv20) to be stacked into
# one frame for DNN training.
df_iv_path_list = [
    f"../data/processed/pca/predicted_iv{yy}.csv"
    for yy in range(16, 21)
]

In [None]:
# Stack the yearly surface CSVs into one frame with a fresh 0..n-1 index.
# All files are read first and concatenated in a single call: pd.concat copies
# its inputs, so concatenating inside the loop re-copies the accumulated rows
# on every pass.
frames = [pd.read_csv(path) for path in df_iv_path_list]
# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the loop-based version produced for an empty path list.
merged_df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
print(len(merged_df))
merged_df.head()

In [None]:
# Attach the F1, F2, F3 values to every surface row by joining on `date`.
# The join is an inner join, so surface rows whose date has no row in
# `features` are dropped.
df = merged_df.merge(features, on='date', how='inner')
df.head()

In [None]:
# Names of the LSTM-output columns handed to IVDataset as feature columns.
feature_cols = [f"F{k}" for k in (1, 2, 3)]
from models.dnn import IVDataset, IVSDNN, train_model, large_moneyness_penalty, butterfly_arbitrage_penalty, calendar_spread_penalty, safe_divide

In [None]:
# Wrap the merged frame for training; feature_cols names the F1-F3 columns.
# NOTE(review): which other columns IVDataset reads from `df` is defined in
# models.dnn and not visible here.
dataset = IVDataset(df, feature_cols)

In [None]:
# Show the input width reported by the dataset; the same value is later
# passed to IVSDNN as input_size.
print(dataset.get_input_size())

In [None]:
from torch.utils.data import DataLoader
# Shuffled mini-batches of 256 samples for training.
train_loader = DataLoader(
    dataset=dataset,
    batch_size=256,
    shuffle=True,
)
# The network's input width is taken from the dataset so the two stay in sync.
dnn = IVSDNN(
    input_size=dataset.get_input_size(),
    hidden_size=128,
)

In [None]:
# Training hyperparameters.
# NOTE(review): the train_model(...) call further down passes the literals
# 10 and 1 instead of these names - confirm which values are intended.
num_epochs = 500
lambda_penalty = 1

In [None]:
import wandb
# Start a Weights & Biases run in the "ivs-dnn" project, then train the DNN.
# The wandb module itself is passed as the last argument to train_model.
# NOTE(review): the positional literals 10, 0.001, 1 are presumably epochs,
# learning rate and penalty weight (train_model's signature is not visible
# here). The num_epochs=500 and lambda_penalty=1 variables defined in the
# previous cell are NOT used by this call - confirm whether 10 or 500 epochs
# is intended.
wandb.init(project="ivs-dnn")
train_model(dnn, train_loader, 10, 0.001, 1, wandb)