# TITANIC SURVIVOR PREDICTION MODEL

In [1]:
#importing the neccessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [4]:
#loading our training dataset
training_data = 'train.csv'
train = pd.read_csv(training_data)
testing_data = 'test.csv'
test = pd.read_csv(testing_data)

1. Exploratory Data Analysis is the first step to be performed in this process of developing a Machine Learning model

In [5]:
#describing the training data on different metrics
train.describe() 

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
train.head() #first five rows of training data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


2. Data Preparation or Data cleaning is the second step in this process where we prepare our data as per our considerations so that we can feed our data into our Machine Learning algorithm.

In [7]:
#checking whether these three columns have null values
train[['Age','Cabin','Embarked']].isnull().any()

Age         True
Cabin       True
Embarked    True
dtype: bool

In [8]:
#Calculating the total null values in different columns or attributes
pd.isnull(train).sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

So we know that 'Cabin' columns is having 687 null values out of total 891 values. So, we drop this column as such it is not relevant to include this column for building our model, as we cannot fill any random value into these rows.

In [9]:
# Filling the 'Age' and 'Embarked' columns with the higher frequency value 
train['Age'] = train['Age'].fillna(train['Age'].value_counts().index[0])
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().index[0])

In [11]:
train = train.drop(['Cabin'], axis=1)

In [13]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Now after checking the training dataset, we now go through our test dataset.

In [14]:
# Checking the null values in testing data
test.isnull().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

As we can see in above cell that in the columns 'Age','Fare' and 'Cabin' are having some null(NaN) values

In [15]:
#check for any other unusable values
pd.isnull(test).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

As you can see that most of the values in column 'Cabin' are null so we will drop it from the dataframe. Since, there is no point of filling such large amount of values by assumption, as it will have no relevance towards the survival prediction.

In [16]:
# Filling the null(NaN) values in the 'Age' column by the most frequent value in the column
test['Age'] = test['Age'].fillna(test['Age'].value_counts().index[0])

In [17]:
test = test.drop(['Cabin'], axis=1)

In [18]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [19]:
# dropping the ticket column from training and testing data
train = train.drop(['Ticket'], axis=1)
test = test.drop(['Ticket'], axis=1)

In [20]:
# dropping the Fare column from training and testing data
train = train.drop(['Fare'], axis=1)
test = test.drop(['Fare'], axis=1)

In [22]:
# mapping the gender of passengers with male as 0 and female as 1 in training and testing dataset
gender_mapping = {"male":0,"female":1}
train['Sex'] = train['Sex'].map(gender_mapping)
test['Sex'] = test['Sex'].map(gender_mapping)

In [23]:
train.head(n=20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,S
5,6,0,3,"Moran, Mr. James",0,24.0,0,0,Q
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,S
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,C


In [24]:
#mapping the values of 'Embarked' column
embarked_mapping = {"S":1,"C":2,"Q":3}
train['Embarked'] = train['Embarked'].map(embarked_mapping)

In [25]:
test['Embarked'] = test['Embarked'].map(embarked_mapping)

In [26]:
#creating a combined dataset
combine = [train, test]
#extract a title for each Name in the train and test datasets
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,182


In [27]:
#replace various titles with more common names
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Capt', 'Col',
    'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.285714
5,Royal,1.0


In [28]:
#Mapping the title column values into numerical values
title_mapping = {"Master":1, "Miss": 2, "Mr":3, "Mrs":4, "Rare":5, "Royal":6}
train['Title'] = train['Title'].map(title_mapping)

In [29]:
#Dropping the Name column from the training and testing dataset
train = train.drop(['Name'], axis=1)
test = test.drop(['Name'], axis=1)

In [30]:
#Dividing the Age of Passengers into different bands
#sort the ages into logical categories
train["Age"] = train["Age"].fillna(-0.5)
test["Age"] = test["Age"].fillna(-0.5)
bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]
labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
train['AgeGroup'] = pd.cut(train["Age"], bins, labels = labels)
test['AgeGroup'] = pd.cut(test["Age"], bins, labels = labels)

In [31]:
# fill missing age with mode age group for each title
mr_age = train[train["Title"] == 1]["AgeGroup"].mode() #Young Adult
miss_age = train[train["Title"] == 2]["AgeGroup"].mode() #Student
mrs_age = train[train["Title"] == 3]["AgeGroup"].mode() #Adult
master_age = train[train["Title"] == 4]["AgeGroup"].mode() #Baby
royal_age = train[train["Title"] == 5]["AgeGroup"].mode() #Adult
rare_age = train[train["Title"] == 6]["AgeGroup"].mode() #Adult

age_title_mapping = {1: "Young Adult", 2: "Student", 3: "Adult", 4: "Baby", 5: "Adult", 6: "Adult"}

for x in range(len(train["AgeGroup"])):
    if train["AgeGroup"][x] == "Unknown":
        train["AgeGroup"][x] = age_title_mapping[train["Title"][x]]
        
for x in range(len(test["AgeGroup"])):
    if test["AgeGroup"][x] == "Unknown":
        test["AgeGroup"][x] = age_title_mapping[test["Title"][x]]

In [32]:
#Mapping the Age column values into numerical values
age_mapping = {"Baby":1, "Child":2, "Teenager":3, "Student":4, "Young Adult":5, "Adult":6, "Senior":7}
train['AgeGroup'] = train['AgeGroup'].map(age_mapping)
test['AgeGroup'] = test['AgeGroup'].map(age_mapping)

In [33]:
title_mapping = {"Master":1, "Miss": 2, "Mr":3, "Mrs":4, "Rare":5, "Royal":6}
test['Title'] = test['Title'].map(title_mapping)

In [34]:
train.head(n=20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Title,AgeGroup
0,1,0,3,0,22.0,1,0,1,3,4
1,2,1,1,1,38.0,1,0,2,4,6
2,3,1,3,1,26.0,0,0,1,2,5
3,4,1,1,1,35.0,1,0,1,4,5
4,5,0,3,0,35.0,0,0,1,3,5
5,6,0,3,0,24.0,0,0,3,3,4
6,7,0,1,0,54.0,0,0,1,3,6
7,8,0,3,0,2.0,3,1,1,1,1
8,9,1,3,1,27.0,0,2,1,4,5
9,10,1,2,1,14.0,1,0,2,4,3


In [35]:
#Finally dropping the Age column as we have formed a new AgeGroup column which is more classified 
train = train.drop(['Age'], axis=1)
test = test.drop(['Age'], axis=1)

In [36]:
train.head(n=20) #training dataset first 20 rows

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeGroup
0,1,0,3,0,1,0,1,3,4
1,2,1,1,1,1,0,2,4,6
2,3,1,3,1,0,0,1,2,5
3,4,1,1,1,1,0,1,4,5
4,5,0,3,0,0,0,1,3,5
5,6,0,3,0,0,0,3,3,4
6,7,0,1,0,0,0,1,3,6
7,8,0,3,0,3,1,1,1,1
8,9,1,3,1,0,2,1,4,5
9,10,1,2,1,1,0,2,4,3


In [37]:
test.head(n=20) #testing datset first 20 rows

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked,Title,AgeGroup
0,892,3,0,0,0,3,3,5
1,893,3,1,1,0,1,4,6
2,894,2,0,0,0,3,3,7
3,895,3,0,0,0,1,3,5
4,896,3,1,1,1,1,4,4
5,897,3,0,0,0,1,3,3
6,898,3,1,0,0,3,2,5
7,899,2,0,1,1,1,3,5
8,900,3,1,0,0,2,4,3
9,901,3,0,2,0,1,3,4
