# Titanic Survival Prediction

### Using machine learning models, to predict the survival of passengers

In [1]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

#Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [2]:
# Importing Data
raw_data = pd.read_csv('./train.csv')
raw_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
raw_data['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

# Creating training and test data set

#### The data would be split in stratified format according to 'Sex columns'

In [4]:
X = raw_data.drop(columns = ['Survived'])
y = raw_data['Survived']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = raw_data['Sex'])

In [5]:
print('Training dataset shape:', X_train.shape)
print('Testing dataset shape :', X_test.shape)

Training dataset shape: (712, 11)
Testing dataset shape : (179, 11)


In [6]:
X_train.Pclass.value_counts()

3    387
1    171
2    154
Name: Pclass, dtype: int64

In [7]:
X_test.Pclass.value_counts()

3    104
1     45
2     30
Name: Pclass, dtype: int64

#### Percentage of Each class in training data set!

In [8]:
print('For Pclass = 3:',395/712*100)
print('For Pclass = 2:',147/712*100)
print('For Pclass = 1:',170/712*100)

For Pclass = 3: 55.47752808988764
For Pclass = 2: 20.646067415730336
For Pclass = 1: 23.876404494382022


#### Percentage of Each class in training data set!

In [9]:
print('For Pclass = 3:',96/179*100)
print('For Pclass = 2:',37/179*100)
print('For Pclass = 1:',46/179*100)

For Pclass = 3: 53.63128491620112
For Pclass = 2: 20.670391061452513
For Pclass = 1: 25.69832402234637


In [10]:
data = X_train.copy()
data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,573.0,712.0,712.0,712.0
mean,447.779494,2.303371,29.82548,0.523876,0.386236,32.13823
std,259.122072,0.832254,14.478635,1.087969,0.817173,46.489112
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.75,2.0,21.0,0.0,0.0,7.925
50%,452.0,3.0,29.0,0.0,0.0,14.5
75%,674.25,3.0,38.0,1.0,0.0,31.275
max,890.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 778 to 714
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          573 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        163 non-null    object 
 10  Embarked     710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


# Preparing data for ML model

In [12]:
#Dropping unwanted columns
unwanted = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
data.drop(columns = unwanted, inplace = True)

In [13]:
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
778,3,male,,0,0,Q
42,3,male,,0,0,C
62,1,male,45.0,1,0,S
796,1,female,49.0,0,0,S
494,3,male,21.0,0,0,S
...,...,...,...,...,...,...
649,3,female,23.0,0,0,S
710,1,female,24.0,0,0,C
872,1,male,33.0,0,0,S
27,1,male,19.0,3,2,S


In [14]:
#Filling in missing values in Embarked and Age
#For Age
median = data['Age'].median()
data['Age'].fillna(median, inplace = True)

#For Embarked
mode = data['Embarked'].mode()
data['Embarked'].fillna(mode, inplace = True)

data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
778,3,male,29.0,0,0,Q
42,3,male,29.0,0,0,C
62,1,male,45.0,1,0,S
796,1,female,49.0,0,0,S
494,3,male,21.0,0,0,S
...,...,...,...,...,...,...
649,3,female,23.0,0,0,S
710,1,female,24.0,0,0,C
872,1,male,33.0,0,0,S
27,1,male,19.0,3,2,S


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 778 to 714
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       712 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Embarked  710 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 38.9+ KB


### Using label encoding to encode data['Sex'] columns

In [16]:
from sklearn.preprocessing import LabelEncoder
l_encode = LabelEncoder()
data['Sex'] = l_encode.fit_transform(data['Sex'])
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
778,3,1,29.0,0,0,Q
42,3,1,29.0,0,0,C
62,1,1,45.0,1,0,S
796,1,0,49.0,0,0,S
494,3,1,21.0,0,0,S
...,...,...,...,...,...,...
649,3,0,23.0,0,0,S
710,1,0,24.0,0,0,C
872,1,1,33.0,0,0,S
27,1,1,19.0,3,2,S


In [17]:
data = pd.get_dummies(data, columns = ['Pclass'], drop_first = True)
data = pd.get_dummies(data, columns = ['Embarked'], drop_first = True)
data

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
778,1,29.0,0,0,0,1,1,0
42,1,29.0,0,0,0,1,0,0
62,1,45.0,1,0,0,0,0,1
796,0,49.0,0,0,0,0,0,1
494,1,21.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
649,0,23.0,0,0,0,1,0,1
710,0,24.0,0,0,0,0,0,0
872,1,33.0,0,0,0,0,0,1
27,1,19.0,3,2,0,0,0,1


In [18]:
data

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
778,1,29.0,0,0,0,1,1,0
42,1,29.0,0,0,0,1,0,0
62,1,45.0,1,0,0,0,0,1
796,0,49.0,0,0,0,0,0,1
494,1,21.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
649,0,23.0,0,0,0,1,0,1
710,0,24.0,0,0,0,0,0,0
872,1,33.0,0,0,0,0,0,1
27,1,19.0,3,2,0,0,0,1


## Creating custom transformers for pipeline transformation

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
class missing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        age_median = X['Age'].median()
        embarked_mode = X['Embarked'].mode()
        X['Age'].fillna(median, inplace = True)
        X['Embarked'].fillna(mode, inplace = True)
        return X

In [20]:
class dropping(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y = None):
        return self
    def transform(self, X, y=None):
        if pd.Series(["PassengerId", "Name", "Ticket", "Fare", "Cabin"]).isin(X.columns).all():
            unwanted = ["PassengerId", "Name", "Ticket", "Fare", "Cabin"]
            X.drop(columns = unwanted, inplace = True)
            return X
        else:
            return X

In [21]:
class encoding(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        l_encode = LabelEncoder()
        X['Sex'] = l_encode.fit_transform(X['Sex'])
        X = pd.get_dummies(X, columns = ['Pclass'], drop_first = True)
        X = pd.get_dummies(X, columns = ['Embarked'], drop_first = True)
        return X

## Combining all the transformers in a pipeline

In [22]:
#Creating Pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('dropping', dropping()),
    ('missing', missing()),
    ('encoding', encoding())
])

In [23]:
trial1 = X_train.copy()

In [24]:
pipeline.fit_transform(trial1)

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
778,1,29.0,0,0,0,1,1,0
42,1,29.0,0,0,0,1,0,0
62,1,45.0,1,0,0,0,0,1
796,0,49.0,0,0,0,0,0,1
494,1,21.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
649,0,23.0,0,0,0,1,0,1
710,0,24.0,0,0,0,0,0,0
872,1,33.0,0,0,0,0,0,1
27,1,19.0,3,2,0,0,0,1


In [25]:
X_train = pipeline.fit_transform(X_train)

In [26]:
X_train

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
778,1,29.0,0,0,0,1,1,0
42,1,29.0,0,0,0,1,0,0
62,1,45.0,1,0,0,0,0,1
796,0,49.0,0,0,0,0,0,1
494,1,21.0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...
649,0,23.0,0,0,0,1,0,1
710,0,24.0,0,0,0,0,0,0
872,1,33.0,0,0,0,0,0,1
27,1,19.0,3,2,0,0,0,1


In [27]:
X_test = pipeline.fit_transform(X_test)
X_test

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
304,1,29.0,0,0,0,1,0,1
667,1,29.0,0,0,0,1,0,1
462,1,47.0,0,0,0,0,0,1
180,0,29.0,8,2,0,1,0,1
274,0,29.0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...
125,1,12.0,1,0,0,1,0,0
397,1,46.0,0,0,1,0,0,1
751,1,6.0,0,1,0,1,0,1
628,1,26.0,0,0,0,1,0,1


# Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train, y_train)
linear_predict = linear.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score
def roundoff(prediction):
    for i in range(0, len(prediction)):
        prediction[i] = round(int(prediction[i]))

def accuracy(true, predicted):
    accuracy = accuracy_score(true, predicted)
    return accuracy

In [30]:
roundoff(linear_predict)
accuracy(y_test, linear_predict)

0.659217877094972

# Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
logistic_predict = logistic.predict(X_test)
roundoff(logistic_predict)
accuracy(y_test, logistic_predict)

0.8044692737430168

# Logistic Regression

In [32]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_predict = tree.predict(X_test)
roundoff(tree_predict)
accuracy(y_test, tree_predict)

0.8044692737430168

# KNeighbors Classifier

In [33]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predict = knn.predict(X_test)
roundoff(knn_predict)
accuracy(y_test, knn_predict)

0.7821229050279329

# Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest_predict = forest.predict(X_test)
roundoff(forest_predict)
accuracy(y_test, forest_predict)

0.8100558659217877

# Result

In [35]:
print(f'''
Linear Regression        : {accuracy(y_test, linear_predict)}\n
Logistic Regression      : {accuracy(y_test, logistic_predict)}\n
Decision Tree Regressor  : {accuracy(y_test, tree_predict)}\n
KNeighbors Classifier    : {accuracy(y_test, knn_predict)}\n
Random Forest Classifier : {accuracy(y_test, forest_predict)}
''')


Linear Regression        : 0.659217877094972

Logistic Regression      : 0.8044692737430168

Decision Tree Regressor  : 0.8044692737430168

KNeighbors Classifier    : 0.7821229050279329

Random Forest Classifier : 0.8100558659217877



# Using joblib to dump the prepared model

In [36]:
import joblib
joblib.dump(forest, 'forest_titanic.pkl')

['forest_titanic.pkl']