# Welcome to Pytorch!
After almost everyone using Keras to get good scores in the competition, I took the challenge to use the beloved Pytorch! <br>
My Previous tries were good, but they took **4 hours to run**
Now, after making changes following the approach of the great Francesco Pochetti, my execution time has **come down to 10 minutes** (WOHOOOO!) <br>
Have a look at Francesco's original work: http://francescopochetti.com/pytorch-for-tabular-data-predicting-nyc-taxi-fares/


#### upvote if you find it useful. Sharing is the best way to learn!

Ideas to improve:
* Add an early stopping callback
* Get learning rate scheduler
* Make Network Deeper

# Simple imports

In [None]:
%matplotlib inline
import pathlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option('display.max_columns', 500)
from collections import defaultdict


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error


# PyTorch imports

In [None]:
from torch.nn import init
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
from torch.optim import lr_scheduler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from tqdm import tqdm # , # tqdm_notebook, # tnrange
from tqdm.notebook import trange as tnrange # will change this to trange later 
from tqdm.notebook import tqdm as tqdm_notebook # will change this to tqdm later
tqdm.pandas(desc='Progress')

In [None]:
import gc
gc.collect()

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Import dataset

In [None]:
df = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
df.head(2)

In [None]:
n_features = 300
features = [f'f_{i}' for i in range(n_features)]

# Setting investment id as categorical feature (just trying out!)

In [None]:
# setting as category feature
df['investment_id'] = df['investment_id'].astype('category')

# Defining some helper functions to make life easy later

In [None]:
def split_features(df):
    """Encode the categorical columns of *df* in place and list both column groups.

    ``investment_id`` is the only categorical column. It is converted to an
    ordered pandas categorical and replaced by its integer codes shifted by
    one, so the smallest category maps to 1 (pandas uses code -1 for missing
    values, which therefore becomes 0).

    Args:
        df: frame to encode; it is modified in place.

    Returns:
        A ``(catf, numf)`` tuple: the categorical column names and every
        other column name of *df*, in column order.
    """
    categorical = ['investment_id']
    numeric = [name for name in df.columns if name not in categorical]

    for name in categorical:
        ordered = df[name].astype('category').cat.as_ordered()
        df[name] = ordered.cat.codes + 1

    return categorical, numeric

In [None]:
def emb_init(x):
    """Re-initialise the weights of *x* in place from a small uniform range.

    The weights are drawn from ``U(-b, b)`` with ``b = 2 / (width + 1)``,
    where ``width`` is the size of the weight matrix's second dimension
    (the embedding dimension when *x* is an ``nn.Embedding``).
    """
    weights = x.weight.data
    bound = 2 / (weights.size(1) + 1)
    weights.uniform_(-bound, bound)

In [None]:
df = df.loc[df['time_id']>400] # filter out old data

## make use of helper functions!

In [None]:
y = df['target']
df = df.drop(columns = ['target'], axis = 1)

In [None]:
catf, numf = split_features(df)

print(len(catf))
print(catf)

print(len(numf))
# numf

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=1)
print(X_train.shape, X_test.shape)

In [None]:
# (column name, number of embedding rows): the largest code plus one, so that
# every code in the column is a valid embedding index.
cat_sz = [(name, df[name].max() + 1) for name in catf]
print(cat_sz)

# (number of embedding rows, embedding width): roughly half the cardinality,
# capped at 50 dimensions.
emb_szs = [(n_codes, min(50, (n_codes + 1) // 2)) for _, n_codes in cat_sz]
print(emb_szs)

# Define the Dataset by subclassing data.Dataset

In [None]:
class RegressionColumnarDataset(data.Dataset):
    """In-memory dataset yielding ``[categorical, continuous, target]`` rows.

    Args:
        df: frame holding both the categorical and the continuous columns.
        cats: names of the categorical columns in *df*; every other column is
            treated as continuous. May be empty.
        y: pandas Series of targets, aligned row-for-row with *df*.

    Attributes:
        cats: ``int64`` array of shape ``(n_rows, len(cats))``.
        conts: ``float32`` array of shape ``(n_rows, n_continuous)``.
        y: ``float32`` array of shape ``(n_rows,)``.
    """

    def __init__(self, df, cats, y):
        self.dfcats = df[cats]
        self.dfconts = df.drop(cats, axis=1)

        self.cats = self._to_matrix(self.dfcats, np.int64)
        self.conts = self._to_matrix(self.dfconts, np.float32)
        self.y = y.values.astype(np.float32)

    @staticmethod
    def _to_matrix(frame, dtype):
        """Stack the columns of *frame* into a ``(n_rows, n_cols)`` array of *dtype*."""
        if frame.shape[1] == 0:
            # np.stack raises on an empty list; a zero-width matrix keeps
            # indexing and batching working when one column group is absent.
            return np.empty((len(frame), 0), dtype=dtype)
        return np.stack([col.values for _, col in frame.items()], axis=1).astype(dtype)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

In [None]:
trainds = RegressionColumnarDataset(X_train, catf, y_train)
valds = RegressionColumnarDataset(X_test, catf, y_test)

In [None]:
del X_train, X_test, y_train, y_test

In [None]:
# Training batches are reshuffled every epoch.
traindl = data.DataLoader(trainds, batch_size = 1024, shuffle = True, num_workers = 2, pin_memory = True)
# Validation only aggregates losses, so sample order is irrelevant and
# shuffling would just be wasted work.
valdl = data.DataLoader(valds, batch_size = 2048, shuffle = False, num_workers = 2, pin_memory = True)

In [None]:
n_cont = len(df.columns)-len(catf)
n_cont

In [None]:
del df,trainds, valds

# Training!

## The Neural Network :)

This may look complex (it does to me!), but it is actually quite simple. Have a read through and check it out. <br>
The model uses an Embedding layer for the categorical variable (`investment_id`) and simple dense layers for everything else.

In [None]:
class MixedInputModel(nn.Module):
    """Feed-forward regressor over embedded categorical and continuous inputs.

    Each categorical column gets its own ``nn.Embedding``. The embedded
    vectors are concatenated with the batch-normalised continuous features
    and passed through a stack of Linear -> BatchNorm -> SiLU -> Dropout
    layers, followed by a final linear output layer.

    Args:
        emb_szs: list of ``(cardinality, embedding_dim)`` pairs, one per
            categorical column; every cardinality must be at least 2.
        n_cont: number of continuous input features.
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: width of the output layer.
        szs: list of hidden layer widths.
        drops: dropout probability for each hidden layer. Layers are paired
            with ``zip``, so this should have the same length as *szs*.
        use_bn: whether to apply batch norm after each hidden linear layer.
            The batch norm on the continuous inputs is applied regardless.
    """

    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops, use_bn=True):
        super().__init__()
        
        for i,(c,s) in enumerate(emb_szs): 
            assert c > 1, f"cardinality must be >=2, got emb_szs[{i}]: ({c},{s})"
        
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont = n_emb, n_cont
        
        # embeddings are done; the first linear layer consumes the
        # concatenation of all embeddings and the continuous features
        szs = [n_emb + n_cont] + szs
        self.lins = nn.ModuleList([nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(sz) for sz in szs[1:]])
        
        # initialise the hidden-layer weights with Kaiming-normal values
        for o in self.lins: nn.init.kaiming_normal_(o.weight.data)
            
        self.outp = nn.Linear(szs[-1], out_sz) # define output layer
        nn.init.kaiming_normal_(self.outp.weight.data)

        # define dropout layers
        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        
        # batch normalisation for the raw continuous inputs
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn = use_bn

    def forward(self, x_cat, x_cont):
        """Run a batch through the network.

        Args:
            x_cat: integer tensor indexed as ``x_cat[:, i]``, i.e. one column
                per embedding, in the order of ``emb_szs``.
            x_cont: float tensor of continuous features, fed to
                ``BatchNorm1d(n_cont)``.

        Returns:
            The output layer's result with its last dimension squeezed: shape
            ``(batch,)`` when ``out_sz == 1``, otherwise ``(batch, out_sz)``.
        """
        # print('initial shape HOW TO GET')
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            # print('embs len', len(x), 'elements like', x[:5])
            x = torch.cat(x, 1)
            # print('cat', x.shape)
            x = self.emb_drop(x)
            # print('emb drop', x.shape)
            
        # print('\n')
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            # print('bn get x2', x2.shape)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
            # print('cat again', x.shape)
            
        # print('\n')
        for l,d,b in zip(self.lins, self.drops, self.bns):
            # changing order to fc - bn - relu - dropouts
            x = l(x)
            # print('linear', x.shape)
            if self.use_bn: x = b(x)
            # print('bn', x.shape)
            x = F.silu(x) # silu activation instead of the usual ReLU
            # print('silu', x.shape)
            x = d(x)
            # print('drops', x.shape)
            # print('\n')
            
        # print('\n')
        x = self.outp(x)
        # print('output', x.shape)

        # Squeeze only the feature dimension. A bare squeeze() would also drop
        # the batch dimension for a single-row batch, yielding a 0-d tensor
        # that no longer matches the target shape and cannot be iterated.
        return x.squeeze(-1)

In [None]:
m = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=n_cont, 
                    emb_drop=0.04, 
                    out_sz=1, 
                    szs=[400, 650, 950, 650, 400, 128, 8], 
                    drops=[0.1, 0.1, 0.1, 0.3, 0.1, 0.01, 0.0001]).to(device)

### check if the model looks good

In [None]:
m

In [None]:
# grab a single batch and keep it on the CPU; it is reused below to sanity-check the model
for cat, cont, y in traindl:
    print(cat.device, cont.device, y.device)
    
    break

# better way to see the network
Uncomment the print statements in the network and run it on one batch; the cell below shows the output this produces. <br>
It helps to see clearly what is going on. **However, I will try my best to convert this to an nn.Sequential model for easier understanding.**

In [None]:
# m(cat, cont)

# initial shape HOW TO GET?? ( will have to see better)
# embs len 1 elements like [tensor([[-6.8909e-03,  3.2817e-02, -2.0786e-02,  ...,  3.0483e-02,
#          -4.5606e-04, -5.0471e-04],
#         [-5.0236e-03, -2.1645e-02, -4.6295e-05,  ...,  8.9745e-03,
#           2.3533e-02,  3.0192e-02],
#         [ 2.5954e-02,  2.4555e-03, -2.6891e-02,  ..., -1.2833e-02,
#           1.6570e-02, -3.5575e-03],
#         ...,
#         [ 1.9291e-02, -2.6346e-02,  4.3786e-03,  ...,  2.5476e-02,
#          -6.8894e-03,  3.7377e-02],
#         [-1.0805e-02, -1.9892e-02, -3.4380e-02,  ..., -3.1371e-02,
#           5.2091e-03,  6.5443e-03],
#         [ 3.8637e-02,  4.4848e-03, -2.4427e-02,  ..., -1.6600e-02,
#           9.7741e-03, -2.1790e-05]], grad_fn=<EmbeddingBackward>)]
# cat torch.Size([1024, 50])
# emb drop torch.Size([1024, 50])


# bn get x2 torch.Size([1024, 301])
# cat again torch.Size([1024, 351])


# linear torch.Size([1024, 400])
# bn torch.Size([1024, 400])
# silu torch.Size([1024, 400])
# drops torch.Size([1024, 400])


# linear torch.Size([1024, 500])
# bn torch.Size([1024, 500])
# silu torch.Size([1024, 500])
# drops torch.Size([1024, 500])


# linear torch.Size([1024, 750])
# bn torch.Size([1024, 750])
# silu torch.Size([1024, 750])
# drops torch.Size([1024, 750])


# linear torch.Size([1024, 500])
# bn torch.Size([1024, 500])
# silu torch.Size([1024, 500])
# drops torch.Size([1024, 500])


# linear torch.Size([1024, 400])
# bn torch.Size([1024, 400])
# silu torch.Size([1024, 400])
# drops torch.Size([1024, 400])


# linear torch.Size([1024, 128])
# bn torch.Size([1024, 128])
# silu torch.Size([1024, 128])
# drops torch.Size([1024, 128])


# linear torch.Size([1024, 8])
# bn torch.Size([1024, 8])
# silu torch.Size([1024, 8])
# drops torch.Size([1024, 8])




# output torch.Size([1024, 1])
# tensor([ 0.1282,  0.3186,  0.1614,  ..., -0.1414, -0.0461,  0.1904],
#        grad_fn=<SqueezeBackward0>)

# overfit on one batch
This is an incredibly important step to make sure that the model 'works'. Highly recommended by a lot of other experts, and I am really happy

In [None]:
# Sanity check: build a small network and fit it repeatedly on the single
# batch (cat, cont, y) fetched above. The loss should fall steadily.
network = MixedInputModel(emb_szs=emb_szs, 
                    n_cont=n_cont, 
                    emb_drop=0.04, 
                    out_sz=1, 
                    szs=[64, 128, 256, 512, 256, 128, 8], 
                    drops=[0.1, 0.1, 0.1, 0.3, 0.1, 0.01, 0.0001])

optimizer = optim.Adam(network.parameters(), lr=1e-2)

total_loss = []

for i in range(101):
    # loss
    loss = F.mse_loss(network(cat, cont), y)
    # Store a plain float: appending the tensor itself keeps every step's
    # autograd graph alive and cannot be plotted (it still requires grad).
    total_loss.append(loss.item())
    if i % 10 == 0:
        print("Step", i," loss:", loss.item())

    optimizer.zero_grad()
    
    # backprop
    loss.backward()  # compute gradients
    optimizer.step() # update weights using gradients to minimize loss

In [None]:
plt.plot(total_loss)

# Looks like working well!

# Fitting loop

In [None]:
def fit(model, train_dl, val_dl, loss_fn, opt, epochs = 3):
    """Train *model* and optionally validate it after every epoch.

    Batches are moved to whichever device the model's parameters live on, so
    the loop works on both CPU and GPU. Gradients are clipped to a total norm
    of 4.0 before each optimiser step.

    Args:
        model: module called as ``model(cat, cont)``.
        train_dl: iterable of ``(cat, cont, y)`` training batches that
            supports ``len()``.
        val_dl: validation batches in the same format; a falsy value
            (``None`` or an empty loader) skips validation.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        opt: optimiser already bound to the model's parameters.
        epochs: number of passes over *train_dl*.

    Returns:
        ``(lr, tloss, vloss)``: three dicts keyed by epoch index that hold,
        per batch, the learning rate of the first parameter group, the
        training loss and the validation loss. Fresh dicts are created on
        every call, so repeated runs do not share or overwrite history.
    """
    # Per-call histories. Writing into module-level dicts would make a second
    # run append to (and return) the first run's records.
    lr = defaultdict(list)
    tloss = defaultdict(list)
    vloss = defaultdict(list)

    # Follow the model's device instead of hard-coding CUDA.
    device = next(model.parameters()).device
    num_batch = len(train_dl)

    for epoch in tnrange(epochs):

        model.train()
        total_loss_train = 0

        t = tqdm_notebook(iter(train_dl), leave=False, total = num_batch)
        t.set_description(f'Epoch {epoch}')

        for cat, cont, y in t:
            cat = cat.to(device)
            cont = cont.to(device)
            y = y.to(device)

            opt.zero_grad()
            pred = model(cat, cont)
            loss = loss_fn(pred, y)
            loss.backward()

            batch_loss = loss.item()
            lr[epoch].append(opt.param_groups[0]['lr'])
            tloss[epoch].append(batch_loss)

            torch.nn.utils.clip_grad_norm_(model.parameters(), 4.0) # gradient clipping

            opt.step()

            t.set_postfix(loss=batch_loss)
            total_loss_train += batch_loss

        train_loss = total_loss_train/num_batch

        if val_dl:
            model.eval()
            total_loss_val = 0

            # No gradients are needed for evaluation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                for cat, cont, y in tqdm_notebook(val_dl, leave=False):
                    cat = cat.to(device)
                    cont = cont.to(device)
                    y = y.to(device)

                    batch_loss = loss_fn(model(cat, cont), y).item()

                    vloss[epoch].append(batch_loss)
                    total_loss_val += batch_loss

            # Average over the loader that was actually passed in.
            valloss = total_loss_val/len(val_dl)

            print(f'Epoch {epoch}: train_loss: {train_loss:.4f}  | val_loss: {valloss:.4f} ')
        else:
            print(f'Epoch {epoch}: train_loss: {train_loss:.4f} ')

    return lr, tloss, vloss

# Loop through training
- I would love to implement **Callbacks** in my later models. Feel free to add in the comments any tips you have!

In [None]:
opt = optim.Adam(m.parameters(), 1e-6)
num_epochs = 8

lr = defaultdict(list)
tloss = defaultdict(list)
vloss = defaultdict(list)

In [None]:
lr, tloss, vloss = fit(model=m, train_dl=traindl, val_dl=valdl, loss_fn=F.mse_loss, opt=opt, epochs=num_epochs)

## Plot the results

In [None]:
t = [np.mean(tloss[el]) for el in tloss]
v = [np.mean(vloss[el]) for el in vloss]
p = pd.DataFrame({'Train Loss': t, 'Validation Loss': v, 'Epochs': range(num_epochs)})

_ = p.plot(x='Epochs', y=['Train Loss', 'Validation Loss'], 
           title='Train and Validation Loss over Epochs')

In [None]:
# lr

# Train some more with lower lr

In [None]:
opt = optim.Adam(m.parameters(), 5e-12)
num_epochs = 8

lr2 = defaultdict(list)
tloss2 = defaultdict(list)
vloss2 = defaultdict(list)

In [None]:
lr2, tloss2, vloss2 = fit(model=m, train_dl=traindl, val_dl=valdl, loss_fn=F.mse_loss, opt=opt, epochs=num_epochs)

In [None]:
# t2 = [np.mean(tloss2[el]) for el in tloss2]
# v2 = [np.mean(vloss2[el]) for el in vloss2]
# p2 = pd.DataFrame({'Train Loss': t2, 'Validation Loss': v2, 'Epochs': range(num_epochs)})

# _ = p2.plot(x='Epochs', y=['Train Loss', 'Validation Loss'], 
#            title='Train and Validation Loss over Epochs')

In [None]:
# opt = optim.Adam(m.parameters(), 5e-12)
# num_epochs = 5

# lr3 = defaultdict(list)
# tloss3 = defaultdict(list)
# vloss3 = defaultdict(list)

In [None]:
# lr3, tloss3, vloss3 = fit(model=m, train_dl=traindl, val_dl=valdl, loss_fn=F.mse_loss, opt=opt, epochs=num_epochs)

In [None]:
# t3 = [np.mean(tloss3[el]) for el in tloss3]
# v3 = [np.mean(vloss3[el]) for el in vloss3]
# p3 = pd.DataFrame({'Train Loss': t3, 'Validation Loss': v3, 'Epochs': range(num_epochs)})

# _ = p.plot(x='Epochs', y=['Train Loss', 'Validation Loss'], 
#            title='Train and Validation Loss over Epochs')

# Save the model

In [None]:
torch.save(m.state_dict(), 'trained_model.pth')

# Make submissions

In [None]:
torch.cuda.empty_cache() # just to clear some GPU cache memory
gc.collect()

In [None]:
cols_order = ['investment_id' , 'time_id'] + features

In [None]:
def predict_for_test_data(test_data):
    """Predict a target for every row of *test_data* with the trained model ``m``.

    Args:
        test_data: frame with the same columns, in the same order, as the
            training features. It is not modified.

    Returns:
        A list of Python floats, one prediction per row, in row order.

    Side effects:
        Puts the module-level model ``m`` into evaluation mode.
    """
    # Encode a copy: split_features rewrites the categorical column in place
    # and the caller's frame may be a slice of another frame.
    frame = test_data.copy()
    catf, _ = split_features(frame)
    # NOTE(review): split_features derives category codes from this frame
    # alone, so an investment_id is not guaranteed to receive the code it had
    # during training. Confirm, and reuse the training mapping if so.

    # The dataset requires a target; ones are used purely as a placeholder.
    placeholder_y = pd.Series(np.ones(len(frame), dtype=np.float32))
    testds = RegressionColumnarDataset(frame, catf, placeholder_y)

    device = next(m.parameters()).device
    # The data is already in memory and each call handles one small frame, so
    # worker processes would only add start-up cost.
    testdl = data.DataLoader(testds, batch_size = 1024, shuffle = False,
                             num_workers = 0, pin_memory = (device.type == 'cuda'))

    m.eval()  # use running batch-norm statistics and disable dropout
    submission_values = []

    with torch.no_grad():
        for cat_test, cont_test, _ in testdl:
            pred = m(cat_test.to(device), cont_test.to(device))
            # reshape(-1) copes with a 0-d result for a single-row batch
            submission_values.extend(pred.reshape(-1).tolist())

    return submission_values

# Use the submission API to make predictions

In [None]:
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

for (test_df, sample_prediction_df) in iter_test:
    
    print("test_df as loaded by the API")
    display(test_df.head(), test_df.shape)
    
    # here you need to modify test_df to match the training data
    test_df['time_id'] = test_df.row_id.str.split("_", expand=True)[0].astype("int16") #re-create time_id
    # Take an explicit copy in training column order: the frame is encoded in
    # place during prediction, and writing into a slice of the API's frame
    # triggers pandas' chained-assignment warning.
    test_df = test_df[cols_order].copy()
    print("test_df after selecting/creating the features the model was trained with")
    display(test_df.head(), test_df.shape)
    
    # Call our function to make predictions
    predictions = predict_for_test_data(test_df)
    sample_prediction_df['target'] = predictions  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions
    print('submission made for this data')
    display(sample_prediction_df)

# Upvote if useful!