### Points to take care of from now on
1. Add PCA components
1. Search how to make the NN deeper
1. Get it on the GPU
1. Plot training losses with the validation losses

In [None]:
import os
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
from scipy import stats# Imports
import torch

import torchvision
import torch.nn as nn

import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
gc.collect()

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Import dataset

In [None]:
%%time
# Names of the anonymised feature columns: f_0 ... f_299.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# Load the training frame from the pickled dataset file.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train.head(2)  # preview the first two rows

In [None]:
features_to_choose = train.columns.drop(['target'])
features_to_choose

### Convert from Pandas dataframe to numpy arrays

In [None]:
# Feature matrix: every column except the target.
inputs = train.drop(columns=['target']).to_numpy()
# Target matrix: kept two-dimensional (one column) so it lines up with the model output.
targets = train[['target']].to_numpy()

inputs.shape, targets.shape

### Sizes for the 80 % / 20 % train/validation split

In [None]:
# Number of rows for the training (val_1) and validation (val_2) subsets.
# The validation size is the remainder rather than int(0.2 * n): truncating
# both products can leave the two sizes short of the row count, and
# random_split rejects lengths that do not sum to the dataset length.
val_1 = int(0.8 * inputs.shape[0])
val_2 = inputs.shape[0] - val_1
val_1, val_2

### Hyperparameters

In [None]:
# Mini-batch size for the training loader.
batch_size = 64
learning_rate = 1e-7

TARGET_COLUMN = 'target'
# Width of the input layer, taken from the feature matrix instead of a
# hard-coded 302 so the model keeps matching the data if columns change.
input_size = inputs.shape[1]
output_size = 1

### Convert to PyTorch dataset

In [None]:
# Wrap the numpy arrays as float32 tensors and pair each row with its target.
feature_tensor = torch.tensor(inputs, dtype=torch.float32)
target_tensor = torch.tensor(targets, dtype=torch.float32)
dataset = TensorDataset(feature_tensor, target_tensor)

# Randomly assign val_1 rows to training and val_2 rows to validation.
train_ds, val_ds = random_split(dataset, [val_1, val_2])

# Training batches are not reshuffled (original note: we want to predict the future).
# NOTE(review): random_split already randomises which rows land in each subset,
# so this is not a chronological split — confirm that is intended.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_ds, batch_size=batch_size * 2)

### This is the simplest Neural Network Ever!
**feel free to edit the layers anytime**
NOTE: The loss function was getting out of hand, so I have (for now) multiplied it by 1e-5 to keep it under control. Will get this fixed soon!

In [None]:
class My_Kaggle_Model(nn.Module):
    """Single linear layer regressor trained with a down-scaled MSE loss.

    The layer maps ``input_size`` features to ``output_size`` outputs; both
    names are read from module scope when the model is constructed.
    """

    # Factor applied to every MSE value, in training and validation alike.
    _LOSS_SCALE = 1e-5

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, xb):
        """Apply the linear layer to the batch ``xb``."""
        return self.linear(xb)

    def _scaled_loss(self, batch):
        """Return the scaled MSE between the predictions and targets of ``batch``."""
        features, labels = batch
        predictions = self(features)
        return F.mse_loss(predictions, labels) * self._LOSS_SCALE

    def training_step(self, batch):
        """Return the differentiable loss for one ``(inputs, targets)`` batch."""
        return self._scaled_loss(batch)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one batch, detached from the graph."""
        return {'val_loss': self._scaled_loss(batch).detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch ``val_loss`` tensors into a single float."""
        stacked = torch.stack([entry['val_loss'] for entry in outputs])
        return {'val_loss': stacked.mean().item()}

    def epoch_end(self, epoch, result):
        """Print the validation loss reached at the end of ``epoch``."""
        print("Epoch [{}], val_loss: {:.4f}".format(epoch, result['val_loss']))
    
# Instantiate the model; its layer sizes come from input_size / output_size.
model = My_Kaggle_Model()

### Note that this work is heavily inspired by `jovian.ai`'s notebooks
**I highly recommend checking out their website for starters**

### Simple functions for evaluating and fitting

In [None]:
def evaluate(model, val_loader):
    """Compute the model's aggregated validation result over ``val_loader``.

    Runs ``model.validation_step`` on every batch and combines the per-batch
    results with ``model.validation_epoch_end``.  The pass runs in evaluation
    mode with gradient tracking disabled; the model's previous train/eval
    mode is restored afterwards so a surrounding training loop is unaffected.

    Args:
        model: An ``nn.Module`` exposing ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the batch results.
    """
    was_training = model.training
    model.eval()
    try:
        # No graph is needed for scoring; skipping it saves memory and time.
        with torch.no_grad():
            outputs = [model.validation_step(batch) for batch in val_loader]
    finally:
        model.train(was_training)
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` epochs and collect validation results.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to ``opt_func``.
        model: Model exposing ``training_step``, ``epoch_end`` and the
            validation hooks used by ``evaluate``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        opt_func: Optimizer factory called as ``opt_func(params, lr)``.

    Returns:
        A list with one validation result per epoch, in epoch order.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        # Training phase: one optimizer update per batch.
        for batch in train_loader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase: score the epoch, report it and record it.
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result)
        history.append(epoch_result)
    return history

### I am fitting for only 5 epochs with a certain Learning rate
Go ahead and change it!

In [None]:
%%time
history = fit(5, learning_rate, model, train_loader, val_loader)

In [None]:
history

In [None]:
[r['val_loss'] for r in history]

In [None]:
losses = [r['val_loss'] for r in history]
plt.plot(losses, '-x')
plt.xlabel('epoch')
plt.ylabel('val_loss')
plt.title('val_loss vs. epochs');

In [None]:
result = evaluate(model, val_loader)
result

### important to save the model!

In [None]:
torch.save(model.state_dict(), 'my_trained_model.pth')

In [None]:
test = pd.read_csv('/kaggle/input/ubiquant-market-prediction/example_test.csv')
test.head(2)

In [None]:
test.shape

In [None]:
sample_sub = pd.read_csv('/kaggle/input/ubiquant-market-prediction/example_sample_submission.csv')
sample_sub.head(2)

In [None]:
sample_sub.shape

## time to make predictions!

In [None]:
val_ds[1][0].shape, val_ds[1][1].shape

### Simple function to predict

In [None]:
def predict_single(x, model):
    """Return the model's scalar prediction for a single sample.

    Args:
        x: Feature tensor for one sample, without a batch dimension.
        model: Callable mapping a batch of inputs to one output per sample.

    Returns:
        The prediction as a Python float.
    """
    # Add the batch dimension and actually feed the batched tensor to the
    # model (the original built ``xb`` but passed the un-batched ``x``).
    xb = x.unsqueeze(0)
    with torch.no_grad():  # inference only, no graph needed
        return model(xb).item()

In [None]:
x, target = val_ds[10]
pred = predict_single(x, model)
# print("Input: ", x)
print("Target: ", target.item())
print("Prediction:", pred)

In [None]:
# test_loader = DataLoader(test_ds, batch_size*2) 
# no need to make dataloaders while predicting
# only useful while training

In [None]:
submission_try = []

### use pca components (SKIP for now)

In [None]:
# X = iris.data
# y = iris.target
# #In general a good idea is to scale the data
# scaler = StandardScaler()
# scaler.fit(X)
# X=scaler.transform(X)    

# pca = PCA()
# x_new = pca.fit_transform(X)


In [None]:
# pca.explained_variance_ratio_

In [None]:
# abs( pca.components_ )

### submission

### simple function to predict on the test dataframe

In [None]:
def predict_for_test_data(test_data):
    """Predict the target for every row of a test dataframe.

    Uses the module-level ``model``.  All rows go through the model in one
    batched forward pass with gradient tracking disabled, instead of one
    forward pass and one ``print`` per row.

    Args:
        test_data: DataFrame holding the feature columns the model expects,
            in the order used for training.

    Returns:
        A list with one float prediction per row of ``test_data``.
    """
    batch = torch.tensor(test_data.values, dtype=torch.float32)
    with torch.no_grad():
        predictions = model(batch)
    # One output per row -> flat list of Python floats.
    return predictions.reshape(-1).tolist()

### submit off!
( credits to {} for informing about the submission API. Here is the link to their work )

In [None]:
# Submission loop: the competition API yields one (test frame, sample
# submission frame) pair at a time, and each pair must be answered with
# env.predict() inside the loop.
import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    
    print("test_df as loaded by the API")
    display(test_df.head(), test_df.shape)
    
    # Re-create time_id from the part of row_id before the first "_", then
    # keep only the columns (in the same order) that made up the training inputs.
    test_df['time_id'] = test_df.row_id.str.split("_", expand=True)[0].astype("int16")
    test_df = test_df[features_to_choose]  
    print("test_df after selecting/creating the features the model was trained with")
    display(test_df.head(), test_df.shape)
    
    # One prediction per row of test_df, written into the submission frame.
    predictions = predict_for_test_data(test_df)
    sample_prediction_df['target'] = predictions
    env.predict(sample_prediction_df)   # register your predictions
    