In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = [os.path.join(dirname, filename) for filename in filenames]
    for input_path in input_paths:
        print(input_path)

In [None]:
# Read the competition training table and preview its first rows.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-jun-2021/train.csv"
)
train.head()

In [None]:
# Remove the `id` column from the training frame.
# The frame is rebound instead of mutated in place, and errors='ignore' lets
# the cell be executed again after the column is already gone (the in-place
# version raised KeyError on a second run).
train = train.drop(columns=['id'], errors='ignore')
train.head()

In [None]:
# Convert labels of the form 'Class_<number>' into zero-based integers
# (strip the prefix, parse as int, subtract 1).
# The dtype guard keeps the cell re-runnable: once the column holds integers,
# `.str` would raise AttributeError and a second "- 1" would shift the labels.
if not pd.api.types.is_integer_dtype(train['target']):
    train['target'] = (
        train['target'].str.replace('Class_', '').astype(int) - 1
    )
train.head()

In [None]:
# Every column except the last is a feature; the last column is the label.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]
X.head()

In [None]:
# First hold out 20% of the rows as the test split (stratified on the label),
# then carve 10% of the remainder off as the validation split.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=60,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.1,
    stratify=y_trainval,
    random_state=20,
)

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to the validation and test features.
scaler = MinMaxScaler()
X_train = np.array(scaler.fit_transform(X_train))
X_val = np.array(scaler.transform(X_val))
X_test = np.array(scaler.transform(X_test))

# Convert the label containers to plain NumPy arrays as well.
y_train, y_val, y_test = (
    np.array(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
def class_dist(obj):
    """Return ``(unique_values, counts)`` for the entries of ``obj``.

    Both elements are the arrays produced by ``np.unique`` with
    ``return_counts=True``; they are returned together as a tuple.
    """
    unique_values, occurrences = np.unique(obj, return_counts=True)
    return unique_values, occurrences
# Show the distinct training labels together with how often each occurs.
train_class_distribution = class_dist(y_train)
print(train_class_distribution)

In [None]:
class ClassifierDataset(Dataset):
    """Dataset pairing a feature container with a label container.

    Indexing returns ``(X_data[index], y_data[index])``; the length is the
    length of ``X_data``.
    """

    def __init__(self, X_data, y_data):
        # Keep references to the inputs as given; nothing is copied here.
        self.X_data, self.y_data = X_data, y_data

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

    def __len__(self):
        n_samples = len(self.X_data)
        return n_samples

# Wrap each split as a ClassifierDataset: features are converted with
# .float() and labels with .long() after torch.from_numpy.
train_dataset = ClassifierDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).long(),
)
val_dataset = ClassifierDataset(
    torch.from_numpy(X_val).float(),
    torch.from_numpy(y_val).long(),
)
test_dataset = ClassifierDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test).long(),
)

In [None]:
# Gather the label of every training sample into one tensor, then put the
# labels in a random order (torch.randperm) and print them.
target_list = torch.tensor([label for _, label in train_dataset])
shuffled_positions = torch.randperm(len(target_list))
target_list = target_list[shuffled_positions]
print(target_list)

In [None]:
# Inverse-frequency class weights for the loss.
# class_dist returns (unique_labels, counts); index [1] selects the counts
# array itself. The previous slice [1:] produced the 1-tuple (counts,), which
# turned into a 2-D tensor of shape (1, n_classes) rather than the 1-D
# per-class weight vector that nn.CrossEntropyLoss expects.
# NOTE(review): weight i lines up with class index i only if every label
# 0..n_classes-1 occurs in y_train — confirm for other datasets.
class_count = class_dist(y_train)[1]
class_weights = 1./torch.tensor(class_count, dtype=torch.float)
print(class_weights)

In [None]:
# Training hyperparameters.
epochs = 300
batch_size = 200
hidden = [64, 32, 16]  # widths of the three hidden layers
learn_rate = 0.01

# m = number of training rows, n = number of input features.
m, n = X_train.shape

# k = number of distinct labels in the training split (network output width).
distinct_labels = np.unique(y_train, return_counts=False)
k = distinct_labels.shape[0]
print(k)

In [None]:
# Training batches are reshuffled every epoch; validation and test samples are
# served one at a time in dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [None]:
# Fully connected classifier: n inputs -> three ReLU hidden layers -> k logits.
# The Linear layers are created in the same order as before so parameter
# initialisation draws from the RNG in the same sequence.
layer_inputs = (n, hidden[0], hidden[1], hidden[2])
layer_outputs = (hidden[0], hidden[1], hidden[2], k)

stacked_layers = []
for in_width, out_width in zip(layer_inputs, layer_outputs):
    stacked_layers.append(nn.Linear(in_width, out_width, bias=True))
    stacked_layers.append(nn.ReLU())

# No activation after the final (logit) layer.
model = nn.Sequential(*stacked_layers[:-1])

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Move the network to the selected device before building the optimizer.
model.to(device)

# Class-weighted cross-entropy; the weight tensor lives on the same device.
weights_on_device = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=weights_on_device)

# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=learn_rate)
print(model)

In [None]:
def multi_acc(y_pred, y_test):
    """Return the accuracy of ``y_pred`` against ``y_test`` as a whole percent.

    ``y_pred`` holds one row of class scores per sample; the predicted class
    is the position of the largest log-softmax value along dim 1. The fraction
    of predictions equal to ``y_test`` is multiplied by 100 and rounded with
    ``torch.round``; the result is returned as a tensor.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted_classes = torch.max(log_probs, dim=1)[1]

    hits = (predicted_classes == y_test).float()
    fraction_correct = hits.sum() / len(hits)

    return torch.round(fraction_correct * 100)

In [None]:
# Per-epoch history for the training and validation splits; each entry starts
# as its own empty list that the training loop appends to.
accuracy_stats = {split_name: [] for split_name in ('train', 'val')}
loss_stats = {split_name: [] for split_name in ('train', 'val')}


In [None]:
print("Begin training.")
for e in tqdm(range(1, epochs+1)):

    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    train_epoch_loss, train_epoch_acc = 0, 0
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        y_train_pred = model(X_train_batch)
        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)

        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

    # ---- Validation pass: evaluation mode, no gradient tracking ----
    model.eval()
    val_epoch_loss, val_epoch_acc = 0, 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_val_pred = model(X_val_batch)
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # ---- Epoch summary: per-batch sums averaged over the batch count ----
    mean_train_loss = train_epoch_loss/len(train_loader)
    mean_val_loss = val_epoch_loss/len(val_loader)
    mean_train_acc = train_epoch_acc/len(train_loader)
    mean_val_acc = val_epoch_acc/len(val_loader)

    loss_stats['train'].append(mean_train_loss)
    loss_stats['val'].append(mean_val_loss)
    accuracy_stats['train'].append(mean_train_acc)
    accuracy_stats['val'].append(mean_val_acc)

    print(f'Epoch {e+0:03}: | Train Loss: {mean_train_loss:.5f} | Val Loss: {mean_val_loss:.5f} | Train Acc: {mean_train_acc:.3f}| Val Acc: {mean_val_acc:.3f}')

In [None]:
import torch.nn.functional as F

# Load the competition test set; the ids become the index of the submission.
test = pd.read_csv("/kaggle/input/tabular-playground-series-jun-2021/test.csv")
test_id = test['id']
test = test.drop(columns='id')

# The network was trained on features transformed by the fitted MinMaxScaler,
# so the test features must go through the same transform before inference
# (they were previously fed to the model unscaled).
test = torch.from_numpy(scaler.transform(test)).float()

# Run inference on whichever device the model's parameters live on.
test = test.to(next(model.parameters()).device)

model.eval()  # make evaluation mode explicit rather than relying on the last epoch
with torch.no_grad():
    # Softmax over dim 1 turns the logits into per-class probabilities.
    test_pred = F.softmax(model(test), dim=1)
    print(test_pred.shape)


test_pred = np.array(test_pred.detach().cpu())

# One probability column per model output, named Class_1 .. Class_<n>.
final = pd.DataFrame(
    test_pred,
    index=test_id,
    columns=["Class_" + str(i + 1) for i in range(test_pred.shape[1])],
)
final.to_csv("final.csv")

In [None]:
 # Submit final.csv; the message interpolates the batch size actually used
 # (IPython expands {batch_size}) instead of a stale hard-coded value.
 !kaggle competitions submit -c tabular-playground-series-jun-2021 -f final.csv -m "Batch size = {batch_size}"