# This is code for Tabular Playground Series - Jul 2021 
## Note: Beginner-friendly


## I have added
### * Early stopping
### * Gradient clipping
### * Weight decay
### * Learning rate scheduler

In [None]:
import torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Report whether PyTorch can see a CUDA device (the result is shown as the cell output).
torch.cuda.is_available()

In [None]:
# Notebook shell escape: print the GPU status reported by the `nvidia-smi` tool.
!nvidia-smi

In [None]:
# Load the competition training table and display its summary statistics.
train_ds=pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
train_ds.describe()

In [None]:
# The three regression targets; every other column except the timestamp is a feature.
cols=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Log-transform all targets in one vectorised step. Building a fresh frame
# (instead of assigning through .loc into a frame sliced from train_ds)
# avoids pandas' SettingWithCopyWarning and yields the same values.
y_train=np.log(train_ds[cols])

# Features: drop the targets and the 'date_time' column.
x_train=train_ds.drop(columns=cols+['date_time'])
y_train

In [None]:
# Convert the pandas objects to NumPy arrays.
# Only AttributeError is caught: that is what happens when the cell is re-run
# and the variable is no longer a pandas object (it has no .to_numpy()). In
# that case the current value is printed and left untouched. Any other error
# is allowed to surface instead of being silently swallowed.
try:
    x_train=x_train.to_numpy()
except AttributeError:
    print(x_train)
print()
print()
try:
    y_train=y_train.to_numpy()
except AttributeError:
    print(y_train)

In [None]:
# Turn the NumPy arrays into float32 tensors and wrap them in a DataLoader.
# The explicit is_tensor guards keep the cell re-runnable (already-converted
# tensors are reused) without a bare `except`, so a genuine conversion error
# is raised here instead of leaving `train_ds` pointing at stale data.
if not torch.is_tensor(x_train):
    x_train = torch.from_numpy(x_train)
if not torch.is_tensor(y_train):
    y_train = torch.from_numpy(y_train)
x_train = x_train.float()
y_train = y_train.float()
train_ds = TensorDataset(x_train, y_train)

# Mini-batches of 128 samples, reshuffled every epoch.
train_dl=DataLoader(train_ds,batch_size=128,shuffle=True)


# MODEL


In [None]:
class MODEL(nn.Module):
    """Fully connected network with three ReLU-activated hidden layers.

    Args:
        in_size: number of input features.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        hidden_size3: width of the third hidden layer.
        out_size: number of outputs (no activation is applied to them).
    """

    def __init__(self, in_size, hidden_size1, hidden_size2, hidden_size3, out_size):
        super().__init__()
        # Layers are created in input-to-output order.
        self.linear1 = nn.Linear(in_size, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, hidden_size3)
        self.linear4 = nn.Linear(hidden_size3, out_size)

    def forward(self, x):
        """Return the raw (linear) outputs of the network for the batch ``x``."""
        hidden = F.relu(self.linear1(x))
        hidden = F.relu(self.linear2(hidden))
        hidden = F.relu(self.linear3(hidden))
        return self.linear4(hidden)
# Size the network from the data: one input per feature column, one output
# per target column, with hidden widths 32 -> 64 -> 128.
model = MODEL(
    in_size=x_train.size(1),
    hidden_size1=32,
    hidden_size2=64,
    hidden_size3=128,
    out_size=y_train.size(1),
)
# Use the GPU when one is available; otherwise stay on the CPU.
model = model.cuda() if torch.cuda.is_available() else model
model

## Training the model

Before we train the model, we're going to make a bunch of small but important improvements to our training loop:

* **Learning rate scheduling**: Instead of using a fixed learning rate, we will use a learning rate scheduler, which will change the learning rate after every batch of training. There are many strategies for varying the learning rate during training, and the one we'll use is called the **"One Cycle Learning Rate Policy"**, which involves starting with a low learning rate, gradually increasing it batch-by-batch to a high learning rate for about 30% of epochs, then gradually decreasing it to a very low value for the remaining epochs. Learn more: https://sgugger.github.io/the-1cycle-policy.html



* **Weight decay**: We also use weight decay, which is yet another regularization technique which prevents the weights from becoming too large by adding an additional term to the loss function. Learn more: https://towardsdatascience.com/this-thing-called-weight-decay-a7cd4bcfccab



* **Gradient clipping**: Apart from the layer weights and outputs, it is also helpful to limit the values of gradients to a small range to prevent undesirable changes in parameters due to large gradient values. This simple yet effective technique is called gradient clipping. Learn more: https://towardsdatascience.com/what-is-gradient-clipping-b8e815cdfb48


Let's define a `training` function to incorporate these changes. We'll also record the learning rate used for each batch.

In [None]:
import time
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']
    
    
def training(epochs,train_dl,loss_fn,max_lr,model,save_file_name,grad_clip=0,weight_decay=0,opt_func=torch.optim.Adam,max_epochs_stop=10):
    """Train ``model`` with a one-cycle LR schedule, checkpointing and early stopping.

    Args:
        epochs: maximum number of passes over ``train_dl``.
        train_dl: iterable of ``(x, y)`` batches; ``len(train_dl)`` must give
            the number of batches per epoch (needed by the scheduler).
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        max_lr: peak learning rate of the one-cycle schedule (also passed to
            the optimizer as its initial learning rate).
        model: the ``nn.Module`` to train; moved to the GPU when CUDA is available.
        save_file_name: path where the best ``state_dict`` is written.
        grad_clip: if truthy, gradients are clamped element-wise to
            ``[-grad_clip, grad_clip]`` before each optimizer step.
        weight_decay: weight-decay coefficient passed to the optimizer.
        opt_func: optimizer class, called as
            ``opt_func(params, max_lr, weight_decay=weight_decay)``.
        max_epochs_stop: training stops once the loss has failed to improve
            for more than this many consecutive epochs.

    Returns:
        ``(model, train_loss, lrs)`` where ``train_loss`` is a NumPy array of
        length ``epochs`` holding the mean training loss of every epoch that
        ran (entries for epochs skipped by early stopping stay 0) and ``lrs``
        lists the learning rate used for every batch. On early stopping the
        best checkpoint is loaded back into ``model``; otherwise the model is
        returned with its final weights.
    """
    start = time.time()

    # Move the model once, before the optimizer is built, instead of per batch.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,
                                                steps_per_epoch=len(train_dl))

    train_loss = np.zeros(epochs)
    lrs = []

    # Early-stopping state. This should ideally track a validation loss; the
    # training loss is used because no validation set is passed in.
    # Everything is initialised up front so that a NaN loss in the first
    # epoch cannot leave these names unbound.
    train_loss_min = float("inf")
    best_epoch = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        running_loss = 0.0
        samples_seen = 0

        for x, y in train_dl:
            if use_cuda:
                x = x.cuda()
                y = y.cuda()

            # Forward pass and loss.
            out = model(x)
            loss = loss_fn(out, y)

            # Clear stale gradients, then back-propagate.
            optimizer.zero_grad()
            loss.backward()

            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()

            # Record the rate that was just used, then advance the schedule.
            lrs.append(optimizer.param_groups[0]['lr'])
            sched.step()

            # Accumulate a plain float so no autograd graph is kept alive.
            batch_size = len(x)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        # The epoch mean is far less noisy than the last mini-batch's loss,
        # which matters because it drives checkpointing and early stopping.
        epoch_loss = running_loss / max(samples_seen, 1)
        train_loss[epoch] = epoch_loss

        print("{}/{} Epochs    | Train Loss={:.4f}  |lr={:.5f}".format(
            epoch+1, epochs, epoch_loss, optimizer.param_groups[0]['lr']))

        if epoch_loss < train_loss_min:
            # New best: checkpoint to the caller-supplied path.
            torch.save(model.state_dict(), save_file_name)
            train_loss_min = epoch_loss
            best_epoch = epoch
            epochs_no_improve = 0
            continue

        epochs_no_improve += 1
        if epochs_no_improve > max_epochs_stop:
            elapsed = time.time() - start
            print("Early Stopping! Total_epochs:", epoch+1, "Best epoch:", best_epoch,
                  "with train loss:", train_loss_min,
                  "Time elapsed: {:.2f} sec".format(elapsed))

            # Restore the best weights, if any checkpoint was ever written.
            if best_epoch is not None:
                model.load_state_dict(torch.load(save_file_name))
            # Attach the optimizer
            model.optimizer = optimizer
            return model, train_loss, lrs

    net_time = time.time() - start
    print("TIME Taken:{:.2f} sec | avg time per epoch:{:.2f} sec".format(net_time, net_time / epochs))
    model.optimizer = optimizer
    return model, train_loss, lrs




def plot_loss(loss_list):
    """Plot the per-epoch training loss as a solid green line and show the figure.

    Note: this changes the global seaborn style and matplotlib rcParams.
    """
    sns.set_style('darkgrid')
    figure_defaults = {'font.size': 18, 'figure.figsize': (12, 8)}
    matplotlib.rcParams.update(figure_defaults)

    plt.plot(loss_list, 'g-')
    plt.xlabel("Epochs")
    plt.ylabel("Training loss")
    plt.show()
def plot_lr(lr_list):
    """Plot the recorded learning rates as a dashed blue line and show the figure.

    ``lr_list`` is expected to hold one value per training batch (that is how
    ``training`` records it), so the x-axis counts batches, not epochs.

    Note: this changes the global seaborn style and matplotlib rcParams.
    """
    sns.set_style('darkgrid')
    matplotlib.rcParams['font.size'] = 18
    matplotlib.rcParams['figure.figsize'] = (12,8)
    plt.plot(lr_list,'b--')
    # One point per optimizer step, so label the axis accordingly.
    plt.xlabel("Batches")
    plt.ylabel("Learning Rates")
    plt.show()

In [None]:
#List of "HYPERPARAMETERS"

# Mean-squared error; the targets were log-transformed, so this is measured in log space.
loss_fn=F.mse_loss
# Maximum number of training epochs (early stopping may end training sooner).
epochs=150
# Peak learning rate handed to the one-cycle scheduler.
max_lr=1e-2
# Gradients are clipped element-wise to [-grad_clip, grad_clip].
grad_clip=1
# Weight-decay coefficient passed to the optimizer.
weight_decay=0.01
# Early-stopping patience: epochs without improvement tolerated before stopping.
max_epochs_stop=50

In [None]:
# Train with RMSprop (overriding the default Adam) using the hyperparameters above.
# Returns the model, the recorded training losses and the recorded learning rates.
model,loss_co,lrs=training(epochs,train_dl,loss_fn,max_lr,model,
                           save_file_name="best.pth",
                           grad_clip=grad_clip,weight_decay=weight_decay,
                           opt_func=torch.optim.RMSprop,
                           max_epochs_stop=max_epochs_stop)

In [None]:
# Plot the training losses returned by training().
plot_loss(loss_co)

In [None]:
# Plot the learning rates returned by training().
plot_lr(lrs)

In [None]:
# Move the model back to the CPU and predict on the full training set.
# The forward pass runs with autograd enabled, so `prediction` still tracks gradients.
model.cpu()
prediction=model(x_train)
prediction

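### The prediction tensor still tracks gradients (`requires_grad=True`), so it must be detached from the autograd graph before it can be converted to a NumPy array.


### We use `.detach().numpy()` for that.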

In [None]:
# Detach the predictions from the autograd graph and convert them to a NumPy array.
preds=prediction.detach().numpy()
# Training-set loss, still in log space (y_train and preds are both log-transformed).
print("loss:",loss_fn(y_train,torch.from_numpy(preds)))
# Undo the log transform that was applied to the targets before training.
preds=np.exp(preds)
# One column per target, in the same order as the target columns of y_train.
df = pd.DataFrame(preds, columns = ['Column_A','Column_B','Column_C'])


In [None]:
# Load the test features and drop the 'date_time' column, which was also
# excluded from the training features.
test_ds=pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
test_ds=test_ds.drop(columns='date_time')

# Convert straight to a float32 tensor. The frame is freshly read in this
# cell, so no try/except is needed; a bare `except` here could only hide a
# real error.
test_ds=torch.from_numpy(test_ds.to_numpy()).float()

In [None]:
# Predict on the test set on the CPU. Inference needs no gradients, so the
# forward pass runs under no_grad: no autograd graph is built and the result
# can be converted to NumPy directly.
model.cpu()
with torch.no_grad():
    prediction=model(test_ds)
# Undo the log transform that was applied to the targets before training.
preds=np.exp(prediction.numpy())
# One column per target, in the same order as the training targets.
df = pd.DataFrame(preds, columns = ['Column_A','Column_B','Column_C'])
df

In [None]:
# Start from the sample submission so the id column and column order are kept,
# then overwrite the three target columns with the model's predictions.
df_predict = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv').assign(
    target_carbon_monoxide=df['Column_A'],
    target_benzene=df['Column_B'],
    target_nitrogen_oxides=df['Column_C'],
)
df_predict.to_csv('Submission.csv', index=False)
df_predict

In [None]:
# Write the same submission frame again under a second file name.
df_predict.to_csv('submissions.csv',index=False)