# Begin

This notebook is a fork of https://www.kaggle.com/seraphwedd18/pytorch-regression-model-train-by-chunks that uses the custom activation function ALReLU ( https://arxiv.org/abs/2012.07564 ) instead of ReLU.

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available input files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Setting Up the Environment

In [None]:
import time
import torch
import random

# Reference timestamp; the training cell subtracts it to report elapsed time.
s = time.time()

# Draw a new seed for this run and hand the same value to every RNG in use.
seed = int(np.random.randint(0, 1e9))
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Random seed:", seed)
print("Device:", device)

# Columns to load: four bookkeeping columns plus the features f_0 .. f_299.
num_feat = 300
features = list(map('f_{}'.format, range(num_feat)))
basic_cols = ['row_id', 'time_id', 'investment_id', 'target']
cols = [*basic_cols, *features]

In [None]:
# Show the installed PyTorch version in the cell output.
torch.__version__ 

# Model creation and Training

In [None]:
# import pytorch
from IPython.display import clear_output
import torch.optim as optim
import torch

from torch import nn

# import activation functions
#import Mish.Torch.functional as Func




@torch.jit.script
def ALReLU(input: torch.Tensor, alpha: float = 0.01) -> torch.Tensor:
    """Apply the ALReLU activation element-wise.

    ALReLU (https://arxiv.org/abs/2012.07564) is computed as
    ``alrelu(x) = max(|alpha * x|, x)``. With the default ``alpha`` this
    leaves positive inputs unchanged and maps a negative input ``x`` to the
    small positive value ``|alpha * x|``.

    Args:
        input: Tensor to activate.
        alpha: Scale applied to the input before taking the absolute value.
            Defaults to 0.01, the value that used to be hard-coded.

    Returns:
        A tensor with the same shape as ``input``.
    """
    return torch.maximum(torch.abs(alpha * input), input)


In [None]:
class RegressionModel(torch.nn.Module):
    """Two-layer regressor with hand-rolled weights and an ALReLU hidden layer.

    Each layer has a full weight matrix but only a single scalar bias that is
    broadcast over all of its units.

    Args:
        in_shape: Number of input features per row.
        out_shape: Number of output values per row.
        hidden: Width of the hidden layer.
        device: Device on which the parameters are created.
    """

    def __init__(self, in_shape, out_shape, hidden, device='cpu'):
        super().__init__()
        self.in_shape, self.out_shape = in_shape, out_shape
        self.hidden = hidden
        self.device = device
        self.initialize_weights()

    def initialize_weights(self):
        """(Re)create all parameters from a standard normal distribution."""
        def fresh(*shape):
            # Trainable tensor of the requested shape on the model's device.
            return torch.nn.Parameter(
                torch.randn(shape, device=self.device, requires_grad=True)
            )

        # Creation order is w1, w2, b1, b2: it fixes both the order in which
        # random numbers are drawn and the order of model.parameters().
        self.w1 = fresh(self.hidden, self.in_shape)
        self.w2 = fresh(self.out_shape, self.hidden)
        self.b1 = fresh(1)
        self.b2 = fresh(1)

    def forward(self, x):
        """Map ``x`` of shape (n_rows, in_shape) to shape (out_shape, n_rows).

        The result is column-major with respect to the batch: every column
        holds the outputs for one input row.
        """
        # Hidden pre-activation, shape (hidden, n_rows).
        pre_activation = self.w1.mm(x.t()) + self.b1
        activated = ALReLU(pre_activation)
        # Linear read-out of the activated hidden layer.
        return self.w2.mm(activated) + self.b2

class PredModel(torch.nn.Module):
    """Multi-tower regressor: three parallel towers averaged into a head.

    The towers ``t1``, ``t2`` and ``t3`` all read the same input and all emit
    ``hidden // 4`` values per row, but use hidden widths of ``hidden``,
    ``hidden // 2`` and ``hidden // 4`` respectively. Their outputs are
    averaged and passed to the final ``out`` regressor.

    Args:
        in_shape: Number of input features per row.
        out_shape: Number of output values per row.
        hidden: Base hidden width from which the tower widths are derived.
        device: Device on which all parameters are created.
    """

    def __init__(self, in_shape, out_shape, hidden, device='cpu'):
        super().__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden = hidden
        self.device = device

        tower_out = self.hidden // 4
        # Towers of varying hidden size, registered in the order t1, t2, t3.
        tower_widths = (
            ('t1', self.hidden),
            ('t2', self.hidden // 2),
            ('t3', self.hidden // 4),
        )
        for attr, width in tower_widths:
            setattr(
                self,
                attr,
                RegressionModel(self.in_shape, tower_out, width, self.device),
            )
        # Head that turns the averaged tower output into the final prediction.
        self.out = RegressionModel(tower_out, self.out_shape, tower_out, self.device)

    def forward(self, x):
        """Return predictions of shape (out_shape, n_rows) for input ``x``."""
        # Sum the three tower outputs, each of shape (hidden // 4, n_rows).
        tower_sum = self.t1(x) + (self.t2(x) + self.t3(x))
        tower_mean = tower_sum * (1/3)
        # The head expects rows first, so transpose back to (n_rows, hidden // 4).
        return self.out(tower_mean.t())

In [None]:


def loss(y_predicted, y_target):
    """Return the root-mean-square error between predictions and targets.

    The two tensors are subtracted with ordinary broadcasting and the squared
    differences are averaged over all elements, giving a scalar tensor.
    """
    residual = y_predicted - y_target
    return residual.pow(2).mean().sqrt()

model = PredModel(num_feat, 1, 64, device)

verbose = 25      # print a progress line every `verbose` epochs
epochs = 1000     # maximum number of full-batch updates per chunk
chunks = 900000   # number of CSV rows loaded per chunk
tol = 500         # epochs without improvement before early stopping

# Train on the CSV one chunk at a time so the whole file never has to be in
# memory at once. The model is created once above, so its weights carry over
# from chunk to chunk.
for q, data in enumerate(pd.read_csv(
    "../input/ubiquant-market-prediction/train.csv", usecols=cols, chunksize=chunks)):
    # A fresh Adam optimizer per chunk: its running statistics are reset and
    # the learning rate is halved for every successive chunk.
    optimizer = optim.Adam(model.parameters(), lr=0.01/(2**q))

    clear_output(wait=True)
    # Report the rows actually loaded; the last chunk may be shorter than
    # `chunks`.
    first_row = q*chunks
    print(f"Currently training on {first_row} to {first_row + len(data)}:")
    min_loss = np.inf
    cnt = 0

    x_dataset = torch.tensor(data[features].values, dtype=torch.float).to(device)
    y_dataset = torch.tensor(data['target'].values, dtype=torch.float).to(device)

    # Main optimization loop: each epoch is one full-batch step on the chunk.
    for t in range(1, epochs+1):
        # Set the gradients to 0.
        optimizer.zero_grad()
        # Compute the current predicted y's from x_dataset
        y_predicted = model(x_dataset)
        # See how far off the prediction is
        current_loss = loss(y_predicted, y_dataset)
        # Compute the gradient of the loss
        current_loss.backward()
        # Update model W and b accordingly.
        optimizer.step()

        # Read the scalar out once as a Python float so the early-stopping
        # bookkeeping does not keep a graph-attached tensor alive.
        loss_value = current_loss.item()

        # Check for early stopping. Testing for strict improvement (rather
        # than for "not worse") means a NaN loss counts as no improvement
        # instead of overwriting min_loss and disabling early stopping.
        if loss_value < min_loss:
            min_loss = loss_value
            cnt = 0
        else:
            cnt += 1
            if cnt >= tol:
                print("Early stopping!")
                break

        if t%verbose==0:
            print(f"epoch = {t:4}/{epochs}, loss = {loss_value:.8f}, min_loss = {min_loss:.8f}, count = {cnt}")

    print(f"Total time spent: {time.time()-s:.4f} seconds")

# Prediction

In [None]:
import ubiquant
# Create the competition environment object provided by the ubiquant module.
env = ubiquant.make_env()
# Iterator over the test batches; the prediction loop unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Score every test batch handed out by the competition API and submit it.
for (test_df, sample_prediction_df) in iter_test:
    test_x = torch.tensor(test_df[features].values, dtype=torch.float).to(device)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        pred = model(test_x)
    # The model returns one row of predictions with shape (1, n_rows);
    # flatten it to a 1-D array with one value per test row.
    sample_prediction_df['target'] = pred.cpu().numpy().reshape(-1)
    env.predict(sample_prediction_df)
    display(sample_prediction_df)