<a href="https://colab.research.google.com/github/ramkumarr02/Titanic-Prediction-using-Pytorch/blob/master/Titanic_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Env Setup

## Packages



In [0]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier


import warnings
warnings.filterwarnings("ignore")

## Load Data


In [0]:
# Colab-only: mount Google Drive at /content/drive so files stored there are readable
# through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Single place to change if the CSVs move; both files live in the same Drive folder.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/Deep Learning/Titanic/PyTorch'

# PassengerId is dropped right after loading so it never reaches the feature pipeline.
train = pd.read_csv(DATA_DIR + '/train.csv').drop(columns='PassengerId')
test = pd.read_csv(DATA_DIR + '/test.csv').drop(columns='PassengerId')

# Snapshots of the frames as loaded (minus PassengerId).
# NOTE(review): not referenced by any later cell visible here - confirm they are still needed.
train_copy = train.copy()
test_copy = test.copy()

train.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


# Functions

## Split Columns

In [0]:
def column_split(df):
    """Derive surname, title and cabin features, then drop the raw text columns.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``Name``, ``Cabin`` and ``Ticket`` columns.
        ``Name`` is split as "<surname>,<title>.<rest>".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with
        * ``SurName``       - text before the first comma of ``Name``
        * ``Title``         - text between that comma and the following period
        * ``Cabin_Section`` - first character of ``Cabin`` (NaN when ``Cabin`` is missing)
        * ``Cabin_Nums``    - number of space-separated entries in ``Cabin`` (NaN when missing)
        added, and ``Name``, ``Ticket`` and ``Cabin`` removed.

    Columns that are absent are skipped rather than raising, so calling the
    function on an already-processed frame returns it unchanged.
    """
    df = df.copy()

    if 'Name' in df.columns:
        # Split once on the first comma: [surname, remainder].
        surname_and_rest = df['Name'].str.split(',', n=1)
        df['SurName'] = surname_and_rest.str[0]

        # Split the remainder once on the first period: [title, given names].
        # The title keeps the leading space that followed the comma (e.g. ' Mr');
        # it is not stripped because it ends up in downstream keys and column names.
        title_and_rest = surname_and_rest.str[1].str.split('.', n=1)
        df['Title'] = title_and_rest.str[0]

    if 'Cabin' in df.columns:
        # Only rows with a known cabin get values; index alignment on assignment
        # leaves NaN everywhere else.
        known_cabins = df['Cabin'].dropna().astype(str)
        df['Cabin_Section'] = known_cabins.str[0]
        df['Cabin_Nums'] = known_cabins.str.count(' ') + 1

    # errors='ignore' keeps the function usable when some raw columns are already gone.
    return df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

## Impute Age by title

In [0]:
def impute_age_by_title(df):
    """Fill missing ``Age`` values with the mean age of rows sharing the same ``Title``.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age`` and ``Title`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each missing ``Age`` is replaced by the mean age
        (truncated to an integer) of the rows in ``df`` that have the same
        ``Title`` and a known age. Rows whose title has no known age in ``df``
        keep a missing ``Age``.

    Raises
    ------
    KeyError
        If ``Age`` or ``Title`` is not a column of ``df``.
    """
    df = df.copy()

    # Per-title mean computed only from rows where the age is known.
    known_age = df[df['Age'].notna()]
    age_map = known_age.groupby('Title')['Age'].mean().astype('int').to_dict()

    # Fixed value for the ' Ms' title (note the leading space in the key); it
    # overrides any mean computed above.
    # NOTE(review): 28 is a hard-coded constant - confirm where it comes from.
    age_map[' Ms'] = 28

    # Vectorised fill: only missing ages are touched; titles absent from the
    # map yield NaN and therefore leave the age missing.
    df['Age'] = df['Age'].fillna(df['Title'].map(age_map))

    return df

## Convert all columns to int

In [0]:
def change_dtype(df):
    """Cast every column that is not already ``int`` to ``int``, in place.

    The frame passed in is modified and also returned. Float values are
    truncated by the cast.
    """
    # Columns whose dtype is anything other than the platform 'int'.
    columns_to_cast = df.select_dtypes(exclude=['int']).columns.tolist()

    as_integers = df[columns_to_cast].astype('int')
    df[columns_to_cast] = as_integers

    return df

## Scale Data

In [0]:
def scale_data(df):
    """Standardise every column of ``df`` with a freshly fitted ``StandardScaler``.

    A new scaler is fitted on ``df`` itself at each call. The result is a new
    DataFrame carrying the same index and column labels as the input.
    """
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(df.values)

    return pd.DataFrame(standardized_values, columns=df.columns, index=df.index)

## PreProcess Data

In [0]:
def pre_process(df, prep_flag = None, train_data = None):
    """Run the full feature pipeline on a raw passenger frame.

    Steps, in order: split text columns, impute ages by title, one-hot encode,
    optionally align columns to the training frame, fill NaN with 0, cast to
    int, standardise.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features.
    prep_flag : object, optional
        Any value other than ``None`` switches on column alignment against
        ``train_data``. The value itself is not inspected.
    train_data : pandas.DataFrame, optional
        Already-processed training frame whose columns define the target
        layout. Required when ``prep_flag`` is given.

    Returns
    -------
    pandas.DataFrame
        Standardised feature frame.

    Raises
    ------
    ValueError
        If ``prep_flag`` is given without ``train_data``.
    """
    if prep_flag is not None and train_data is None:
        raise ValueError("train_data must be provided when prep_flag is set")

    # Feature Engineering : Split Columns
    df = column_split(df)

    # Feature Engineering : Impute Age by Title
    df = impute_age_by_title(df)

    # One Hot Encoding
    df = pd.get_dummies(df)

    if prep_flag is not None:
        # Left join on the column axis: keep exactly the training columns.
        # Columns missing from df come back as NaN (zero-filled below) and
        # columns unknown to the training frame are dropped.
        _, df = train_data.align(df, join='left', axis=1)

    # Remove NaN
    df = df.fillna(0)

    # Change all Data types to Int
    df = change_dtype(df)

    # Scale Data
    # NOTE(review): scale_data fits a new scaler on whatever frame it is given,
    # so frames processed in separate calls are each standardised with their
    # own statistics - confirm this is intended.
    return scale_data(df)

# Code Engine

## Split Data for Hold-out Validation

In [0]:
# 80/20 split of the labelled data; 'Survived' is the target, every other column is a feature.
train_x, valid_x, train_y, valid_y = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    train_size=0.8,
    random_state=1,
)

## Prep data

In [0]:
# Column count of the raw training features, for comparison with the processed frames.
print('train_x', train_x.shape[1])

# Training features define the column layout.
scaled_train_x = pre_process(train_x)
print('scaled_train_x', scaled_train_x.shape[1])

# Validation features are aligned to the training columns, so both counts should match.
scaled_valid_x = pre_process(valid_x, prep_flag='valid', train_data=scaled_train_x)
print('scaled_valid_x', scaled_valid_x.shape[1])

train_x 10
scaled_train_x 589
scaled_valid_x 589


# Modelling

In [0]:
# Fit a random forest on the processed training features (seeded for reproducibility).
regr = RandomForestClassifier(random_state = 1)
regr.fit(scaled_train_x, train_y)

# Score on the held-out validation split.
valid_predictions = regr.predict(scaled_valid_x)

# accuracy_score expects (y_true, y_pred).
metrics.accuracy_score(valid_y, valid_predictions)

0.7932960893854749

# Testing

## Prep Test data

In [0]:
# Process the test features with the training column layout, then predict with the fitted model.
scaled_test_x = pre_process(test, prep_flag='Test', train_data=scaled_train_x)
print('scaled_test_x', scaled_test_x.shape[1])

test_predictions = regr.predict(scaled_test_x)
test_predictions

scaled_test_x 589


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,