<div style="background-color:rgba(0, 167, 255, 0.6);border-radius:5px;display:fill">
    <h1><center>Tabular Playground Series - Nov 2021</center></h1>
</div>

<center><a><img src="https://i.ibb.co/PWvpT9F/header.png" alt="header" border="0" width=800 height=400></a></center>

<div align='center'>
    <h1>PyTorch Tutorial</h1>
    <img src='https://pytorch.org/assets/images/pytorch-logo.png' style="width:200px;height:200px;">
</div>

<div style="background-color:rgba(255, 69, 0, 0.5);border-radius:5px;display:fill">
    <h1><center>Importing Libraries and Data</center></h1>
</div>

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils import shuffle
from torch.autograd import Variable

from sklearn.metrics import roc_auc_score

# Import Data

In [None]:
# Load the competition CSV files; the `id` column becomes the index of each frame.
id_column = 'id'
train_data = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
    index_col=id_column,
)
test_data = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    index_col=id_column,
)
submission = pd.read_csv(
    "/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv",
    index_col=id_column,
)

<div style="background-color:rgba(255, 69, 0, 0.5);border-radius:5px;display:fill">
    <h1><center>Basic Data Check</center></h1>
</div>

# Reduce memory

In [None]:
# Down-cast the feature columns of both frames to smaller dtypes.
label = 'target'
# Feature columns are selected by name: every column containing the letter 'f'.
features = [col for col in train_data.columns if 'f' in col]

# Split the features by their dtype in the training frame:
# float64 columns are treated as continuous, everything else as discrete.
cont_features = []
disc_features = []

for col in features:
    if train_data[col].dtype == 'float64':
        cont_features.append(col)
    else:
        disc_features.append(col)

# Apply the same casts to the train AND the test frame. The original cell
# cast `train_data` twice and never touched `test_data`.
# NOTE(review): the uint8 cast assumes discrete values fit in 0..255 - confirm.
for frame in (train_data, test_data):
    frame[cont_features] = frame[cont_features].astype('float32')
    frame[disc_features] = frame[disc_features].astype('uint8')

Collect garbage to reduce memory usage

In [None]:
# The garbage-collector module is imported in the cell that uses it.
import gc

# Run a garbage-collection pass now; the return value is the cell's output.
gc.collect()

In [None]:
# Preview the first five rows of the training frame.
train_data[:5]

In [None]:
# Print the summary produced by DataFrame.info() for the training frame.
train_data.info()

In [None]:
# Count how many rows carry each distinct value of the `target` column.
train_data['target'].value_counts()

In [None]:
# Separate the label column from the feature columns.
y_train = train_data['target']
X_train = train_data.drop(columns=['target'])

<div style="background-color:rgba(255, 69, 0, 0.5);border-radius:5px;display:fill">
    <h1><center>Logistic Regression with PyTorch</center></h1>
</div>

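The model (named `LogisticRegression` here, although with a hidden layer it is really a small neural network) uses:
* One hidden layer with 1500 neurons
* Activation function: leaky ReLU
* Dropout with p = 0.3
* Sigmoid on the output layer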

In [None]:
class LogisticRegression(nn.Module):
    """Feed-forward classifier: linear -> leaky ReLU -> dropout -> linear -> sigmoid.

    Despite the name, the network has one hidden layer of 1500 units.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Size the layers from the constructor arguments. The original read
        # the globals `input_dim` / `output_dim`, so the arguments were ignored.
        self.f1 = nn.Linear(input_size, 1500)
        self.f2 = nn.Linear(1500, output_size)

    def forward(self, x):
        """Return sigmoid-squashed scores with one column per output unit.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor whose last dimension is ``output_size``, values in (0, 1).
        """
        x = self.f1(x)
        x = F.leaky_relu(x)
        # Follow the module's mode: dropout is active after `train()` (the
        # default) and disabled after `eval()`. F.dropout defaults to
        # training=True, which kept it active even for inference.
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.f2(x)
        # torch.sigmoid is the non-deprecated equivalent of F.sigmoid.
        return torch.sigmoid(x)

In [None]:
# Number of rows per mini-batch.
batch_size = 500
# Number of complete mini-batches in the training set (floor division drops any remainder).
batch_no = len(X_train) // batch_size

In [None]:
# Show the (rows, columns) shape of the feature frame.
X_train.shape

In [None]:
def generate_batches(X, y, batch_size, seed=42):
    """Yield shuffled ``(X_batch, y_batch)`` pairs of exactly ``batch_size`` rows.

    The rows are permuted once, then consecutive slices of the permutation
    are yielded. Rows left over after the last complete batch are dropped.

    Args:
        X: Array-like of samples (converted with ``np.array``).
        y: Array-like of labels with the same length as ``X``.
        batch_size: Number of rows per yielded batch.
        seed: Seed for the shuffle. The default of 42 gives the same order on
            every call; pass ``None`` or a per-epoch value to get a different
            order each time.

    Yields:
        Tuples ``(X[ind], y[ind])`` of NumPy arrays.
    """
    assert len(X) == len(y)
    X = np.array(X)
    y = np.array(y)
    # A private RandomState keeps the shuffle reproducible without reseeding
    # NumPy's global generator as a side effect of every call.
    perm = np.random.RandomState(seed).permutation(len(X))

    # range(len(X) // batch_size) only indexes complete batches, so no bounds
    # check is needed. The old `i + batch_size >= len(X)` guard compared a
    # batch index with a row count and dropped valid batches, for example the
    # only batch when batch_size == len(X).
    for i in range(len(X) // batch_size):
        ind = perm[i * batch_size:(i + 1) * batch_size]
        yield X[ind], y[ind]

In the training stage we use:
* Learning rate = 0.0001
* Optimizer: Adam
* Loss: CrossEntropyLoss
* Epochs = 200

In [None]:
# --- Model, loss and optimizer setup ---
# Take the input width from the data instead of hard-coding it.
input_dim = X_train.shape[1]
output_dim = 2
learning_rate = 0.0001
model = LogisticRegression(input_dim, output_dim)
# NOTE(review): CrossEntropyLoss expects raw logits, but the model's forward
# pass already applies a sigmoid. Left unchanged here because removing it
# would also change the values written to the submission - confirm intent.
error = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

loss_list = []  # mean training loss per epoch
roc_list = []   # mean per-batch ROC AUC per epoch
iteration_number = 200

# Make the training mode explicit.
model.train()

for iteration in range(iteration_number):
    batch_loss = 0.0
    batch_roc = 0.0
    size = 0  # number of batches actually processed this epoch

    for (x, y) in generate_batches(X_train, y_train, batch_size):
        # Plain tensors are enough; the Variable wrapper is deprecated.
        inputs = torch.from_numpy(x).float()
        # CrossEntropyLoss needs integer class indices of dtype long.
        labels = torch.from_numpy(y).long()

        optimizer.zero_grad()
        results = model(inputs)
        loss = error(results, labels)

        loss.backward()
        optimizer.step()

        # .item() stores a Python float rather than a tensor.
        batch_loss += loss.item()
        # Column 1 is used as the score of the positive class.
        batch_roc += roc_auc_score(y, results[:, 1].detach().numpy())
        size += 1

    # Average both metrics over the batches that were actually seen.
    epoch_loss = batch_loss / size
    epoch_roc = batch_roc / size
    loss_list.append(epoch_loss)
    roc_list.append(epoch_roc)

    if iteration % 20 == 0:
        print('Epoch {}: loss {}, ROC {}'.format(iteration, epoch_loss, epoch_roc))

# --- Learning curves ---
plt.plot(range(iteration_number), loss_list)
plt.xlabel("Number of Iterations")
plt.ylabel("Loss")
plt.show()
plt.plot(range(iteration_number), roc_list)
plt.xlabel("Number of Iterations")
plt.ylabel("ROC")
plt.show()

<div style="background-color:rgba(255, 69, 0, 0.5);border-radius:5px;display:fill">
    <h1><center>Predictions</center></h1>
</div>

In [None]:
# Evaluation mode: layers that honour the module's training flag switch to
# inference behaviour.
model.eval()
# Convert the test features to a float32 tensor. The inputs do not need
# gradients, so `requires_grad=True` is dropped. This also lets the cell be
# re-run, because a tensor that requires grad cannot be converted back to NumPy.
test_data = torch.tensor(np.asarray(test_data), dtype=torch.float32)
# No autograd graph is needed for inference.
with torch.no_grad():
    predictions = model(test_data)

In [None]:
# Write column 1 of the model output into the submission's `target` column.
submission['target'] = predictions[:, 1].detach().numpy()

In [None]:
# Preview the first five rows of the submission frame.
submission[:5]

In [None]:
# Save the submission (including its index) to submit.csv in the working directory.
submission.to_csv("submit.csv")

<div style="background-color:rgba(255, 69, 0, 0.5);border-radius:5px;display:fill">
    <h1><center>Conclusion</center></h1>
</div>

<div>
    <p>
For better results, you can change the number of hidden layers in the Logistic Regression model and increase the number of epochs. </p>
</div>

*Please upvote if you liked it.*