In [None]:
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn 
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AdamW
from colorama import Fore , Style
# Terminal colour codes used by the training-progress prints:
# red / green foreground and the reset-to-default style.
r__, g__, st__ = Fore.RED, Fore.GREEN, Style.RESET_ALL

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error.

    Evaluates ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Args:
        y_true: Ground-truth values (array-like supporting NumPy arithmetic).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        The RMSPE as a scalar. Zero entries in ``y_true`` make the ratio
        undefined (NumPy yields inf/nan for them).
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
# Training hyper-parameters read by the data loaders and the optimizer.
config = dict(
    batch_size=16,  # samples per mini-batch
    epoch=10,       # number of training passes per fold
    lr=5e-5,        # optimizer learning rate
    wb=2e-5,        # handed to the optimizer as weight_decay
)

In [None]:
# Load the pre-computed feature table; later cells read its "stock_id",
# "target" and engineered feature columns.
train_df = pd.read_csv(
    "../input/lgbm-with-efficient-features-engineering/train_df"
)

In [None]:
# Columns of train_df fed to the network, in model-input order.
retained_features = [
    "stock_id",
    "realized_t",
    "realized1_t",
    "squared_wap_vol",
    "squared_wap1_vol",
    "realized_supply_demand",
    "diff_bid_price_mean",
    "diff_ask_price_mean",
    "realized_ask_price",
    "realized_bid_price",
    "log_wap3_realized_volatility",
    "wap_balance_mean",
    "price_spread_mean",
    "total_volume_mean",
    "wap_mean",
    "diff_t",
    "diff_t^2",
    "realized_t^2",
    "diff1_t",
    "diff1_t^2",
    "realized1_t^2",
    "vol_price",
    "vol_price^2",
    "size_order_mean",
    "size_mean",
    "diff_price",
    "order_count_sum",
    "seconds_in_bucket_count_unique",
    "balance_wap_price",
]

In [None]:
# Model input frame: the retained feature columns plus the regression target.
# .copy() gives torch_data its own data, so later column assignments on it are
# unambiguous (no SettingWithCopyWarning) and can never write back into train_df.
torch_data = train_df[retained_features + ["target"]].copy()

In [None]:
# Mean-target encoding table: average "target" per stock_id (Series indexed by stock_id).
# Uses the native .mean() reduction; passing the np.mean callable to .agg() is
# deprecated in recent pandas releases.
# NOTE(review): the means are computed over every row of train_df, i.e. before the
# cross-validation split, so validation-fold targets contribute to this feature.
# Consider fitting the encoding inside each fold.
encode_stock_id = train_df.groupby(["stock_id"])["target"].mean()

In [None]:
# Replace the raw stock_id with its mean-target encoding.
# assign() returns a new frame with the column swapped (same position, aligned on
# the index) instead of writing in place into torch_data, which avoids
# SettingWithCopyWarning and the in-place dtype-upcast warning raised when float
# values are written into an existing column (stock_id is presumably integer-typed).
torch_data = torch_data.assign(stock_id=train_df["stock_id"].map(encode_stock_id))

In [None]:
# Number of model inputs: every column of torch_data except the trailing "target".
len_columns = len(torch_data.columns)-1
class NN_model(nn.Module) :
    """Small fully connected regressor with an input skip connection.

    Layout: BatchNorm/Dropout on the input -> Linear -> BatchNorm -> ReLU ->
    Dropout -> concat with the input -> Linear -> BatchNorm -> ReLU -> Dropout
    -> Linear(1).

    The ReLU activations are required: without a non-linearity between the
    Linear layers the whole stack reduces to a single affine map at inference
    time (BatchNorm is affine and Dropout is the identity in eval mode).

    Args:
        n_features: Number of input columns. Defaults to the module-level
            ``len_columns``.
        hidden_size: Width of the two hidden layers.
        dropout_rate: Dropout probability after each hidden layer.
        input_dropout_rate: Dropout probability applied to the normalised input.
    """
    def __init__(self, n_features=None, hidden_size=128, dropout_rate=0.2,
                 input_dropout_rate=0.2) :
        super().__init__()
        if n_features is None:
            n_features = len_columns

        # Input normalisation / regularisation.
        self.batch_norm = nn.BatchNorm1d(n_features)
        self.dropout = nn.Dropout(input_dropout_rate)

        # First hidden layer.
        self.dense1 = nn.Linear(n_features, hidden_size)
        self.batch_norm1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Second hidden layer consumes the hidden activations plus the skip input.
        self.dense2 = nn.Linear(hidden_size + n_features, hidden_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Scalar regression head.
        self.dense3 = nn.Linear(hidden_size, 1)

        # Parameter-free, so existing state_dict keys are unchanged.
        self.activation = nn.ReLU()

    def forward(self,x):
        """Map a ``(batch, n_features)`` float tensor to ``(batch, 1)`` predictions."""
        x0 = self.dropout(self.batch_norm(x))

        h1 = self.dense1(x0)
        h1 = self.activation(self.batch_norm1(h1))
        h1 = self.dropout1(h1)

        # Skip connection uses the raw input x, not the normalised x0.
        # NOTE(review): kept as in the original; confirm that bypassing the
        # input BatchNorm here is intentional.
        h1 = torch.cat([x, h1], 1)

        h2 = self.dense2(h1)
        h2 = self.activation(self.batch_norm2(h2))
        h2 = self.dropout2(h2)

        return self.dense3(h2)

In [None]:
class DataGenerator(Dataset) :
    """Map-style dataset over a 2-D feature table and an optional target vector.

    Inputs are converted to ``torch.double`` tensors once at construction
    instead of building a new tensor on every ``__getitem__`` call. Anything
    ``np.asarray`` understands is accepted (ndarray, DataFrame, Series, nested
    lists).

    Args:
        df: Feature table with one row per sample.
        target: Optional per-row targets. When given, items are
            ``(features, target)`` pairs; otherwise just ``features``.
    """

    def __init__(self,df,target=None):
        # Keep the caller's objects available under their original attribute names.
        self.df = df
        self.target = target

        # torch.tensor copies, so the dataset never aliases (or warns about)
        # read-only NumPy buffers.
        self._features = torch.tensor(np.asarray(df), dtype=torch.double)
        self._targets = (
            None if target is None
            else torch.tensor(np.asarray(target), dtype=torch.double)
        )

    def __len__(self) :
        """Number of samples (rows of the feature table)."""
        return len(self._features)

    def __getitem__(self,ind):
        """Return row ``ind`` as a double tensor, paired with its target if present."""
        if self._targets is None:
            return self._features[ind]
        return self._features[ind], self._targets[ind]

In [None]:
def create_dataloader(df,target = None, batch_size=None, shuffle=False) :
    """Wrap features (and optional targets) in a ``DataLoader``.

    Args:
        df: Feature table passed to ``DataGenerator``.
        target: Optional targets passed to ``DataGenerator``.
        batch_size: Mini-batch size; defaults to ``config["batch_size"]``.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``False`` so row order is preserved (needed for validation and
            inference); pass ``True`` for training loaders.

    Returns:
        A ``DataLoader`` over a ``DataGenerator`` dataset.
    """
    if batch_size is None:
        batch_size = config["batch_size"]
    dataset = DataGenerator(df, target=target)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def loss_func(y_pred,y_true) :
    """RMSPE loss: ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Both tensors are flattened with ``view(-1)`` before the element-wise
    comparison, so a ``(batch, 1)`` prediction lines up with a ``(batch,)`` target.
    """
    truth = y_true.view(-1)
    relative_error = (truth - y_pred.view(-1)) / truth
    return torch.sqrt(torch.mean(torch.square(relative_error)))

In [None]:
# 5-fold stratified splitter; shuffling is seeded so the fold assignment is repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
def train_and_validate_model(tr_loader,val_loader,model,device,loss_func,optimizer,
                             epoch,verbose=True):
    """Run one training epoch followed by one validation pass.

    Args:
        tr_loader: Iterable of ``(features, target)`` training batches; must
            support ``len()``.
        val_loader: Iterable of ``(features, target)`` validation batches; must
            support ``len()``.
        model: Module to optimise. It is left in eval mode on return.
        device: Device the batches are moved to.
        loss_func: Callable ``loss_func(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        epoch: Epoch index, used only in the progress messages.
        verbose: Print the two averaged losses when true.

    Returns:
        ``(train_loss, val_loss)``: the per-batch losses averaged over the number
        of batches (``nan`` for an empty loader). Note this is a mean of batch
        losses, not the loss over the pooled samples.
    """
    model.train()
    total_train_loss = 0.0
    for data, target in tr_loader:
        optimizer.zero_grad()

        # Features are cast to float32 to match the model parameters.
        output = model(data.to(device).float())
        loss = loss_func(output, target.to(device))

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    model.eval()
    total_val_loss = 0.0
    # A single no_grad context for the whole validation pass.
    with torch.no_grad():
        for val_data, val_target in val_loader:
            out = model(val_data.to(device).float())
            total_val_loss += loss_func(out, val_target.to(device)).item()

    n_train_batches = len(tr_loader)
    n_val_batches = len(val_loader)
    train_loss = total_train_loss / n_train_batches if n_train_batches else float("nan")
    val_loss = total_val_loss / n_val_batches if n_val_batches else float("nan")

    if verbose :
        print(f"Epoch : {epoch} | Train Loss:{train_loss}")
        print(f"Epoch : {epoch} | Val Loss : {val_loss}")

    return train_loss, val_loss

In [None]:
# Train one model per stratified fold (stratified on the raw stock_id) and keep
# every trained model for ensembling.
NN_models = []
for i,(tr_ind,ts_ind) in enumerate(skf.split(torch_data.values,train_df["stock_id"].values)) :

    # skf.split yields positional row indices, so select with .iloc; .loc would
    # only agree with it while the frame has a default RangeIndex.
    X_tr = torch_data.iloc[tr_ind][retained_features].values
    X_ts = torch_data.iloc[ts_ind][retained_features].values

    Y_tr = torch_data["target"].iloc[tr_ind].values
    Y_ts = torch_data["target"].iloc[ts_ind].values

    # Seed per fold so weight init, dropout and batch order are repeatable.
    torch.manual_seed(42 + i)

    # Training batches are reshuffled every epoch: the fold indices come back
    # sorted, so unshuffled batches would follow the row order of the table.
    train_dataloader = DataLoader(DataGenerator(X_tr,target=Y_tr),
                                  batch_size=config["batch_size"],shuffle=True)
    # Validation keeps the original row order.
    val_dataloader = create_dataloader(X_ts,target=Y_ts)

    model = NN_model()
    model.to(device)

    # NOTE(review): transformers.AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; consider switching when the import cell is updated.
    optimizer = AdamW(model.parameters(),lr=config["lr"],weight_decay=config["wb"])

    print(f"{r__}training model {i+1} starting...{st__}")
    for ep in range(config["epoch"]) :
        print(f"{g__}="*100)
        print(" " * 46 ,f"Epoch : {ep}")
        print("="*100,f"{st__}")
        train_and_validate_model(train_dataloader,val_dataloader,model,device,loss_func,optimizer,
                                 ep,verbose=True)
    NN_models.append(model)

In [None]:
# Score every fold model on the complete feature table, in original row order.
# Each model also sees the rows it was trained on, so these are in-sample
# (not out-of-fold) predictions.
df_loader = create_dataloader(torch_data[retained_features].values)
for i,model in enumerate(NN_models) :
    # Explicit inference mode: BatchNorm uses running statistics, Dropout is off.
    model.eval()
    batch_outputs = []
    # no_grad avoids building an autograd graph for every batch.
    with torch.no_grad():
        for dt in df_loader :
            batch_outputs.append(model(dt.to(device).float()).cpu().numpy())

    # Flat float64 vector with one value per row, published as the module-level
    # name prediction_<i> that the following cells read.
    globals()[f"prediction_{i}"] = np.concatenate(batch_outputs).ravel().astype(np.float64)

In [None]:
# Stack the per-model prediction vectors into a (n_models, n_rows) matrix.
# The names prediction_0 .. prediction_{n-1} are looked up from the number of
# trained models instead of being hard-coded for exactly five folds.
cible = np.vstack([globals()[f"prediction_{i}"] for i in range(len(NN_models))])

In [None]:
# Ensemble by averaging over the model axis (axis 0) -> one prediction per row.
# np.atleast_2d makes the cell safe to re-run: an already averaged 1-D vector is
# viewed as a single row and comes back unchanged instead of collapsing to a scalar.
cible = np.mean(np.atleast_2d(cible), axis=0)

In [None]:
# Attach the ensembled prediction to train_df as column "cible" so it can be
# compared with the "target" column (mutates train_df in place).
train_df["cible"] = cible

In [None]:
# RMSPE of the ensembled prediction against the target, rounded to 3 decimals.
RMSPE = round(
    rmspe(y_true=train_df["target"], y_pred=train_df["cible"]),
    3,
)
print(f'Performance of pytorch model  RMSPE: {RMSPE}')