<a href="https://colab.research.google.com/github/ramkumarr02/Titanic-Prediction-using-Pytorch/blob/master/Titanic_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages



In [0]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.utils import shuffle
from torch.autograd import Variable

import warnings
warnings.filterwarnings("ignore")

## Mount Drive


In [0]:
# Colab-only step: mount Google Drive under /content/drive, the root used by
# the CSV paths read later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions

## Split Columns

In [0]:
def column_split(df):
    """Derive surname, title and cabin features from the raw passenger columns.

    The frame is modified in place and also returned.

    Added columns:
        SurName       -- text before the first comma of ``Name``.
        Title         -- text between that comma and the next period. The
                         leading space is kept (e.g. ``' Mr'``) because the
                         age-imputation step looks titles up with keys such
                         as ``' Ms'``.
        Cabin_Section -- first character of ``Cabin`` (NaN when cabin is missing).
        Cabin_Nums    -- number of space-separated entries in ``Cabin``
                         (NaN when cabin is missing).

    Removed columns: ``Name``, ``Ticket`` and ``Cabin`` (whichever are present).

    Missing source columns are skipped explicitly, so calling the function on an
    already-processed frame is a no-op. Any other failure now surfaces instead
    of being silently swallowed.

    Args:
        df: pandas DataFrame with the raw passenger columns.

    Returns:
        The same DataFrame object, with the columns described above.
    """
    if 'Name' in df.columns:
        # "Braund, Mr. Owen Harris" -> ["Braund", " Mr. Owen Harris"]
        name_parts = df['Name'].str.split(',', n=1)
        df['SurName'] = name_parts.str[0]
        # " Mr. Owen Harris" -> [" Mr", " Owen Harris"]; keep only the title.
        df['Title'] = name_parts.str[1].str.split('.', n=1).str[0]

    if 'Cabin' in df.columns:
        # Only rows with a cabin get a value; index alignment on assignment
        # leaves the remaining rows as NaN.
        known_cabins = df.loc[df['Cabin'].notna(), 'Cabin']
        df['Cabin_Section'] = known_cabins.astype(str).str[0]
        df['Cabin_Nums'] = known_cabins.str.count(" ") + 1

    df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore', inplace=True)
    return df

## Impute Age by title

In [0]:
def impute_age_by_title(df):
    """Fill missing ``Age`` values using the mean age of rows with the same ``Title``.

    The per-title mean is computed from the rows of ``df`` whose age is known
    and truncated to a whole number. The ``' Ms'`` title is always mapped to 28.
    A row whose title has no known age falls back to the truncated overall mean
    of the known ages (the previous implementation raised ``KeyError`` here).
    If no age is known at all, such rows are left as NaN.

    The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame with ``Age`` and ``Title`` columns.

    Returns:
        The same DataFrame object with ``Age`` filled where possible.
    """
    missing_age = df['Age'].isna()
    if not missing_age.any():
        return df

    known = df.loc[~missing_age]

    # Mean age per title, truncated to an integer.
    age_map = known.groupby('Title')['Age'].mean().astype('int').to_dict()
    age_map[' Ms'] = 28

    imputed = df.loc[missing_age, 'Title'].map(age_map)
    if imputed.isna().any() and not known.empty:
        # Title never seen with a known age: use the overall mean instead.
        imputed = imputed.fillna(int(known['Age'].mean()))

    # Single .loc write instead of per-row chained assignment, which is not
    # guaranteed to reach the original frame. Positional values avoid any
    # index-alignment surprises.
    df.loc[missing_age, 'Age'] = imputed.to_numpy(dtype='float64')
    return df

## Convert all columns to int

In [0]:
def change_dtype(df):
    """Cast every column not selected by the ``'int'`` dtype filter to ``int``.

    The conversion is written back into ``df`` itself, and the same object is
    returned. Values must be castable to int (no NaN).

    Args:
        df: pandas DataFrame to convert.

    Returns:
        The same DataFrame object after conversion.
    """
    non_int_columns = df.select_dtypes(exclude=['int']).columns.tolist()
    as_int = df[non_int_columns].astype('int')
    df[non_int_columns] = as_int
    return df

## Scale Data

In [0]:
def scale_data(df):
    """Standardise all columns of ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: numeric pandas DataFrame.

    Returns:
        A new DataFrame holding the scaled values, with the index and column
        labels of ``df``.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(df.values)
    return pd.DataFrame(standardized, columns=df.columns, index=df.index)

## PreProcess Data

In [0]:
def pre_process(df, prep_flag = None, train_data = None):
    """Run the full feature pipeline on a raw passenger frame.

    Steps, in order: split name/cabin columns, impute missing ages by title,
    one-hot encode, optionally align columns to ``train_data``, replace NaN
    with 0, cast everything to int, and standardise.

    Args:
        df: raw passenger DataFrame. The column-splitting and age-imputation
            steps modify this frame in place.
        prep_flag: any non-None value marks ``df`` as a validation/test set
            whose columns must be aligned to ``train_data``.
        train_data: already pre-processed training frame that supplies the
            reference column layout. Required when ``prep_flag`` is set.

    Returns:
        A new, scaled DataFrame.

    Raises:
        ValueError: if ``prep_flag`` is set but ``train_data`` is missing.
    """
    # Fail early with a clear message, before the input is touched, instead of
    # an AttributeError halfway through the pipeline.
    if prep_flag is not None and train_data is None:
        raise ValueError("train_data is required when prep_flag is set")

    # Feature Engineering : Split Columns
    df = column_split(df)

    # Feature Engineering : Impute Age by Title
    df = impute_age_by_title(df)

    # One Hot Encoding
    df = pd.get_dummies(df)

    if prep_flag is not None:
        # Left join on columns: keep exactly the training columns. Columns that
        # exist only in df are dropped; training-only columns are added as NaN
        # and zero-filled below.
        _, df = train_data.align(df, join='left', axis=1)

    # Remove NaN
    df = df.fillna(0)

    # Change all Data types to Int
    df = change_dtype(df)

    # Scale Data
    # NOTE(review): scaling statistics are fitted on whichever frame is passed
    # in, so validation/test data is not scaled with the training statistics.
    # Confirm whether that is intended.
    scaled_df = scale_data(df)

    return scaled_df

# Code Engine

## Read Data

In [0]:
# Folder on the mounted Drive that holds the Titanic CSV files. Defined once so
# the location only has to be changed in a single place.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/Deep Learning/Titanic/PyTorch'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# PassengerId is removed from both frames before any feature engineering.
del train['PassengerId']
del test['PassengerId']

# Snapshots of the freshly loaded frames, taken before later cells modify
# `train` / `test`.
train_copy = train.copy()
test_copy = test.copy()

train.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Split Data for Hold-out Validation

In [0]:
# Replicate the training rows 20 times. The frame is rebuilt from the
# `train_copy` snapshot so that re-running this cell does not compound the
# replication (20x, 400x, ...).
# NOTE(review): the replicated frame is split into train/validation afterwards,
# so copies of the same passenger can land on both sides of the split and
# inflate the validation accuracy. Consider splitting first and replicating
# only the training part.
train = pd.concat([train_copy] * 20, ignore_index=True)

In [0]:
# 80/20 split with a fixed seed: every column except 'Survived' is a feature,
# 'Survived' is the target.
train_x, valid_x, train_y, valid_y = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    train_size=0.8,
    random_state=1,
)

## Prep data

In [0]:
# The processed training frame is passed along as the column reference for the
# validation frame.
scaled_train_x = pre_process(train_x)
scaled_valid_x = pre_process(valid_x, prep_flag='valid', train_data=scaled_train_x)

In [0]:
# Features become float32 tensors, labels become int64 tensors.
train_x_tens = torch.tensor(scaled_train_x.values.astype('float'), dtype=torch.float32)
train_y_tens = torch.tensor(train_y.values.astype('long'), dtype=torch.long)

valid_x_tens = torch.tensor(scaled_valid_x.values.astype('float'), dtype=torch.float32)
valid_y_tens = torch.tensor(valid_y.values.astype('long'), dtype=torch.long)

# Modelling

In [0]:
class Net(nn.Module):
    """Fully connected classifier: input -> 50 -> 30 -> ``output_size``.

    ReLU is applied after the first two linear layers. The last layer returns
    raw scores (no softmax).
    """

    def __init__(self, input_size, output_size):
        """Build the three linear layers.

        Args:
            input_size: number of input features per sample.
            output_size: number of output scores per sample (previously this
                argument was ignored and the last layer was fixed at 2).
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, 30)
        self.fc3 = nn.Linear(30, output_size)

    def forward(self, x):
        """Return the output scores for a batch ``x`` whose last dimension is ``input_size``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [0]:
batch_size = 500
train_batch_no = len(scaled_train_x) // batch_size
valid_batch_no = len(scaled_valid_x) // batch_size

# Take the network input width from the preprocessed training frame. The number
# of one-hot columns depends on the data, so a hand-written constant breaks the
# model as soon as the feature set changes.
input_size = scaled_train_x.shape[1]
output_size = 2
num_epochs = 10
learning_rate = 0.01


# Per-epoch history filled by the training loop.
loss_list = []
accuracy_list = []
# Number of validation samples; denominator of the accuracy.
n_test = len(scaled_valid_x)

In [0]:
def tens(x):
    """Convert ``x`` to a torch tensor by way of a NumPy array.

    Args:
        x: anything ``np.array`` accepts (list, array, tensor, ...).

    Returns:
        A torch tensor built from the NumPy array of ``x``.
    """
    return torch.tensor(np.array(x))

In [0]:
# Rebuilds the feature/label tensors from the scaled frames (this repeats the
# conversion already done right after preprocessing).
train_x_tens = torch.from_numpy(scaled_train_x.values.astype('float')).float()
train_y_tens = torch.from_numpy(train_y.values.astype('long')).long()

valid_x_tens = torch.from_numpy(scaled_valid_x.values.astype('float')).float()
valid_y_tens = torch.from_numpy(valid_y.values.astype('long')).long()

In [0]:
# The inputs are already tensors, so they are wrapped directly; routing them
# through `tens` only converted tensor -> NumPy -> tensor again.
train_tensor = TensorDataset(train_x_tens, train_y_tens)
valid_tensor = TensorDataset(valid_x_tens, valid_y_tens)

# Only the training batches need shuffling. Validation accuracy is a sum over
# all samples, so its order is irrelevant.
train_loader = DataLoader(dataset = train_tensor, batch_size=batch_size, shuffle = True)
valid_loader = DataLoader(dataset = valid_tensor, batch_size=batch_size, shuffle = False)

In [0]:
# Seed PyTorch's global random generator before the model is created, so that
# weight initialisation (and anything else drawing from the default generator
# afterwards) is repeatable between runs. The value mirrors the random_state
# used for the train/validation split.
torch.manual_seed(1)

model = Net(input_size = input_size, output_size = output_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
for epoch in range(num_epochs):
    # --- training pass -----------------------------------------------------
    model.train()
    for x, y in train_loader:
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        yhat = model(x)
        loss = criterion(yhat, y)
        loss.backward()
        optimizer.step()

    # --- validation pass ---------------------------------------------------
    # No gradients are needed to count correct predictions, so autograd is
    # switched off for the whole pass.
    model.eval()
    correct = 0
    with torch.no_grad():
        for x_test, y_test in valid_loader:
            z = model(x_test)
            _, yhat = torch.max(z, 1)
            correct += (yhat == y_test).sum().item()

    accuracy = correct/n_test
    # Loss of the last training batch of this epoch, stored as a plain float
    # rather than a tensor.
    epoch_loss = loss.item()
    accuracy_list.append(accuracy)
    loss_list.append(epoch_loss)

    if epoch % 2 == 0:
        print(f'Epoch : {epoch}, Loss : {epoch_loss} , Accuracy : {accuracy}')

Epoch : 0, Loss : 0.033243563026189804 , Accuracy : 0.9826038159371493
Epoch : 2, Loss : 0.012733000330626965 , Accuracy : 0.994668911335578
Epoch : 4, Loss : 0.017182018607854843 , Accuracy : 0.9960718294051627
Epoch : 6, Loss : 3.4248419979121536e-05 , Accuracy : 0.9960718294051627
Epoch : 8, Loss : 0.0005603625322692096 , Accuracy : 0.9960718294051627


# Testing

## Prep Test data

In [0]:
# Align the test features to the training column layout, then convert to a
# float32 tensor.
scaled_test_x = pre_process(test, train_data = scaled_train_x, prep_flag = 'Test')
test_x_tens = torch.FloatTensor(scaled_test_x.values.astype('float'))

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    z_test = model(test_x_tens)

# Predicted class = index of the largest score in each row.
_, yhat = torch.max(z_test, 1)
print(len(yhat))
yhat

418


tensor([0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
        1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
        0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
        1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,