In [None]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import LabelEncoder, StandardScaler

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

This is my first attempt to build a model using the PyTorch library, so for some of you this might be too basic, but we are here to learn, so there you go. Many thanks to Akshaj Verma for a tutorial which helped me understand parts of the PyTorch model architecture: https://towardsdatascience.com/pytorch-tabular-binary-classification-a0368da5bb89


In [None]:
# Read the three competition CSV files (train, test, sample submission)
# with a single unpacking assignment.
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-apr-2021/train.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/test.csv",
        "/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv",
    )
)

In [None]:
# Remove the PassengerId column from both frames.
train_df = train_df.drop(columns="PassengerId")
test_df = test_df.drop(columns="PassengerId")

In [None]:
def cabin_feat(df):
    """Derive cabin features in place and remove the raw ``Cabin`` column.

    Missing cabins are first filled with the string "None". Two columns are
    then added to ``df``:

    * ``has_Cabin`` -- 0 when the filled value equals "None", otherwise 1.
    * ``Deck``      -- first character of the filled value (so "N" for
      missing cabins).

    ``df`` is mutated and nothing is returned.
    """
    cabin = df["Cabin"].fillna("None")
    df["has_Cabin"] = cabin.map(lambda value: 0 if value == "None" else 1)
    df["Deck"] = cabin.map(lambda value: value[0])
    del df["Cabin"]
    
# Derive the cabin features on the training frame, then on the test frame.
for frame in (train_df, test_df):
    cabin_feat(frame)

In [None]:
def fill_nan(df, group_col, col):
    """
    Fill NaN values of ``col`` in place, using the mean of ``col``
    computed within each ``group_col`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col : str
        Numeric column whose missing entries are filled.

    Returns
    -------
    None
        Rows whose group has no non-missing ``col`` value (or whose group
        key is itself missing) remain NaN.
    """
    # Select the target column *before* aggregating: averaging the whole
    # frame also tries to average unrelated string columns, which raises
    # a TypeError on pandas >= 2.0.
    group_means = df.groupby(group_col)[col].mean()
    missing_mask = df[col].isna()
    df.loc[missing_mask, col] = df.loc[missing_mask, group_col].map(group_means)
    
# For each frame: fill missing Age from the per-Pclass mean and missing
# Fare from the per-Deck mean. Each frame only uses its own statistics.
for frame in (train_df, test_df):
    fill_nan(frame, "Pclass", "Age")
    fill_nan(frame, "Deck", "Fare")

In [None]:
def age_feat(x):
    """Map a numeric age to a coarse age-range label.

    The label is the first bracket whose inclusive upper bound is not
    exceeded: <=5 "baby", <=16 "teen", <=30 "yound_adult", <=50 "adult".
    Anything else (including NaN, for which every comparison is False)
    is labelled "elder".
    """
    brackets = ((5, "baby"), (16, "teen"), (30, "yound_adult"), (50, "adult"))
    for upper_bound, label in brackets:
        if x <= upper_bound:
            return label
    return "elder"

In [None]:
# Apply the same feature engineering to the training frame and then to the
# test frame. Every step mutates the frame in place and only uses that
# frame's own values.
for frame in (train_df, test_df):
    # Natural log of the fare; a fare of exactly 0 is kept as 0.
    frame["Fare"] = frame["Fare"].apply(lambda fare: 0 if fare == 0 else np.log(fare))

    # Missing ports are replaced by the frame's most frequent port.
    most_common_port = frame["Embarked"].mode()[0]
    frame["Embarked"] = frame["Embarked"].fillna(most_common_port)

    # Keep only the first two characters of the ticket ("NA" when missing).
    frame["Ticket"] = frame["Ticket"].fillna("NAN").apply(lambda ticket: str(ticket)[:2])

    frame["FamilySize"] = frame["SibSp"] + frame["Parch"]

    # Names are split on commas: piece 0 is the last name, piece 1
    # (whitespace-stripped) is the first name.
    name_parts = frame["Name"].apply(lambda full_name: full_name.split(","))
    last_names = name_parts.apply(lambda parts: parts[0])
    first_names = name_parts.apply(lambda parts: parts[1].strip())
    frame["Name_length"] = (last_names + first_names).apply(len)
    frame["Last_name"] = last_names
    frame["First_name"] = first_names
    frame.drop(columns="Name", inplace=True)

    frame["age_range"] = frame["Age"].apply(age_feat)

In [None]:
# String-typed (object) columns of the training frame; these get encoded.
enc_cols = list(train_df.select_dtypes("object").columns)

def label_encoder():
    """Label-encode every column listed in ``enc_cols`` on both frames.

    For each column a fresh ``LabelEncoder`` is fitted on the combined
    train and test values, so both frames share one mapping and every test
    value has a code. ``train_df`` and ``test_df`` are modified in place;
    nothing is returned.
    """
    for col in enc_cols:
        le = LabelEncoder()
        le.fit(train_df[col].values.tolist() + test_df[col].values.tolist())
        # Assign with df[col] = ... rather than df.loc[:, col] = ...:
        # on pandas >= 2.0 the .loc form writes into the existing object
        # column and keeps its object dtype, whereas plain assignment
        # replaces the column with the encoder's integer output.
        train_df[col] = le.transform(train_df[col].values)
        test_df[col] = le.transform(test_df[col].values)

label_encoder()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Fixed seed so the shuffle, and therefore the fold membership, is the same
# on every run of the notebook.
SEED = 42

skf = StratifiedKFold(n_splits=5)
train_df["kfold"] = -1  # placeholder, overwritten for every row below

# Shuffle the rows once, then reset the index so that the positional indices
# yielded by skf.split() can be used as .loc labels.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Each row is tagged with the fold in which it serves as validation data.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X=train_df, y=train_df["Survived"])):
    train_df.loc[valid_idx, "kfold"] = fold

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Total column count of train_df. At this point that includes the
# "Survived" target and the "kfold" column, not only the model features.
train_df.shape[1]

In [None]:
class binaryClassification(nn.Module):
    """Feed-forward binary classifier with two hidden layers.

    The network returns one raw logit per sample (no sigmoid is applied),
    which is the form expected by ``nn.BCEWithLogitsLoss``.

    Parameters
    ----------
    n_features : int, default 15
        Width of the input layer, i.e. number of feature columns.
    hidden_size : int, default 64
        Width of both hidden layers.
    dropout : float, default 0.1
        Dropout probability applied before the output layer.
    """

    def __init__(self, n_features=15, hidden_size=64, dropout=0.1):
        super(binaryClassification, self).__init__()

        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict keys are unchanged for the defaults.
        self.layer_1 = nn.Linear(n_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Return the logits for ``inputs``; output has one column."""
        # Each hidden layer: linear -> ReLU -> batch norm.
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x



class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its target."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given; they are indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The dataset length is taken from the features alone.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target
    
class testData(Dataset):
    """Map-style dataset that serves feature rows only (no targets)."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        row = self.X_data[index]
        return row
    
    
    
def binary_acc(y_pred, yvalid):
    """Return the accuracy of logits ``y_pred`` against ``yvalid`` in percent.

    The logits are passed through a sigmoid and rounded to obtain 0/1
    predictions. The number of matches is divided by ``yvalid.shape[0]``,
    scaled to a percentage and rounded to a whole number. The result is a
    0-dimensional float tensor.
    """
    predicted = torch.sigmoid(y_pred).round()
    hits = predicted.eq(yvalid).sum().float()
    return torch.round(hits / yvalid.shape[0] * 100)

In [None]:
# Training hyperparameters read by the training loop.
EPOCHS = 25  # epoch-count setting for each fold's training run
BATCH_SIZE = 64  # mini-batch size of the training DataLoader
LEARNING_RATE = 0.001  # learning rate passed to the Adam optimizer

In [None]:
# Total column count of train_df, including the "Survived" and "kfold"
# columns that are dropped before the features are fed to the model.
train_df.shape[1]

In [None]:
# Train one fresh model per fold. After the loop, `model`, `valid_loader`
# and `yvalid` refer to the LAST fold only.
for fold in range(5):
    train = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid = train_df[train_df.kfold == fold].reset_index(drop=True)

    xtrain = train.drop(["Survived", "kfold"], axis=1).values
    xvalid = valid.drop(["Survived", "kfold"], axis=1).values
    ytrain = train.Survived.values
    yvalid = valid.Survived.values

    # Fit the scaler on the training split only, then reuse its statistics
    # for the validation split and the test frame.
    sc = StandardScaler()
    xtrain = sc.fit_transform(xtrain)
    xvalid = sc.transform(xvalid)
    # Pass a plain array, like the other splits, since the scaler was
    # fitted on an array without column names.
    test_scaled = sc.transform(test_df.values)

    train_data = trainData(torch.FloatTensor(xtrain),
                           torch.FloatTensor(ytrain))

    valid_data = testData(torch.FloatTensor(xvalid))

    train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE,
                              shuffle=True)
    valid_loader = DataLoader(dataset=valid_data, batch_size=1)

    model = binaryClassification()
    model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    model.train()
    # Run exactly EPOCHS epochs, numbered from 1.
    for e in range(1, EPOCHS + 1):
        epoch_loss = 0
        epoch_acc = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            # Clear the gradients left over from the previous batch;
            # without this, backward() keeps adding to them and every step
            # uses the sum of all gradients seen so far.
            optimizer.zero_grad()

            y_pred = model(X_batch)

            # Targets are reshaped to a single column to match the logits.
            loss = criterion(y_pred, y_batch.unsqueeze(1))
            acc = binary_acc(y_pred, y_batch.unsqueeze(1))

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        # Averages are taken over the number of batches.
        print(f'Epoch {e:03}: | Loss:{epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# Predict 0/1 labels for every row served by `valid_loader`, using the
# current `model`.
batch_preds = []

model.eval()
with torch.no_grad():
    for X_batch in valid_loader:
        X_batch = X_batch.to(device)
        y_valid_pred = model(X_batch)
        y_valid_pred = torch.sigmoid(y_valid_pred)
        y_pred_tag = torch.round(y_valid_pred)
        # Flatten each batch to 1-D so batches of any size can be joined.
        batch_preds.append(y_pred_tag.cpu().numpy().reshape(-1))

# One flat list with one prediction per validation row. Concatenating the
# per-batch arrays gives the same result as before for batch_size=1 and
# stays correct if the loader's batch size is changed.
y_pred_list = np.concatenate(batch_preds).tolist() if batch_preds else []

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the true labels in `yvalid` with `y_pred_list`.
# `yvalid` holds whatever the training loop assigned last, i.e. the labels
# of the final fold's validation split.
print(classification_report(yvalid, y_pred_list))

There is a lot to improve here, so if you have any suggestions or you see any mistakes, please leave feedback; any constructive criticism is welcome.