## Titanic Survival Prediction

![](https://i.imgur.com/aQANpYt.gif)

* I made it into a full-featured web app. You can visit it here [Titanic Survival Prediction](https://dibkb-titanic.herokuapp.com/)
* The code for the web app is available on [Github](https://github.com/dibkb/Titanic-Survival)

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
#import the data
import os
# List every file below the Kaggle input directory so the available inputs show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Read the Titanic training and test CSV files into DataFrames.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

### Data Preprocessing

#### Check for missing values

In [None]:
import missingno as msno
msno.matrix(train)

In [None]:
msno.bar(train)

#### Cabin Missing Column

In [None]:
#analysis of the cabin
# Count survivors and non-survivors among passengers with no recorded cabin.
# Each comparison is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form is evaluated as `(mask & Survived) == value`.
cabin_is_missing = train['Cabin'].isnull()
print(f"Persons with Missing Cabin that survived {len(train[cabin_is_missing & (train['Survived'] == 1)])}")
print(f"Persons with Missing Cabin that didn\'t survived {len(train[cabin_is_missing & (train['Survived'] == 0)])}")

Since more passengers with a missing Cabin died than survived, the value does not appear to be missing at random. Let's explore this further.

In [None]:
#indicator variable
# 1 when the passenger has no recorded cabin, 0 otherwise. The flag is taken
# from the nulls themselves: a NaN never compares equal to a string literal,
# so an equality test cannot detect the missing entries.
train['Cabin_missing'] = np.where(train['Cabin'].isnull(),1,0)
test['Cabin_missing'] = np.where(test['Cabin'].isnull(),1,0)

In [None]:
#Delete 'Cabin' column as it has a a lot of missing values
del train['Cabin']
del test['Cabin']

In [None]:
#analysis of the missing persons
# Count survivors and non-survivors among passengers with no recorded age.
# Each comparison is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form is evaluated as `(mask & Survived) == value`.
age_is_missing = train['Age'].isnull()
print(f"Persons with Missing Age that survived {len(train[age_is_missing & (train['Survived'] == 1)])}")
print(f"Persons with Missing Age that didn\'t survived {len(train[age_is_missing & (train['Survived'] == 0)])}")

Missing Age could be an important feature.

In [None]:
# Flag rows with no recorded age (1 = missing, 0 = present) in both frames.
for frame in (train, test):
    frame['Age_missing'] = np.where(frame['Age'].isnull(), 1, 0)

In [None]:
train['Embarked'].value_counts()

In [None]:
#repalce the missing values 'Embarked' column with the highest occuring frequency.
train['Embarked'] = train['Embarked'].fillna('S') 
test['Embarked'] = test['Embarked'].fillna('S') 

Create a new feature 'Ticket Length'

In [None]:
def extract_ticket_length(x):
    """Return the length of the last space-separated token of a ticket string.

    Tickets are either a bare number ('113803') or a prefix followed by a
    number ('A/5 21171', 'STON/O 2. 3101294'). Using the last token measures
    the trailing number even when the prefix itself contains a space;
    one- and two-token tickets give the same result as indexing the second
    token with a fallback to the first.

    Parameters
    ----------
    x : str
        Raw ticket value.

    Returns
    -------
    int
        Number of characters in the final token.
    """
    return len(x.split(' ')[-1])

In [None]:
# Derive the Ticket_length feature for both frames by applying the helper to every ticket string.
train['Ticket_length'] = train['Ticket'].map(extract_ticket_length)
test['Ticket_length'] = test['Ticket'].map(extract_ticket_length)

### Exploratory Data Analysis

In [None]:
 train.describe().T

#### Column : Survived


In [None]:
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(9, 9))
sns.countplot(x="Survived", data=train, ax=ax)
# label each bar with its count, centred just above the bar
for bar in ax.patches:
    count = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., count + 1, count, ha="center")

In [None]:
fig = px.pie(train,values="Survived",names="Sex",template="seaborn")
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.update(layout_title_text='Sex composition of Survive Passengers',
           layout_showlegend=False)

#### <li>Only about 38.38 % (342) of the total passengers (891) survived.

#### Column : Pclass
<br>

In [None]:
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
# left: passengers per class; right: the same counts split by survival
sns.countplot(x="Pclass", data=train, ax=ax[0])
sns.countplot(x="Pclass", hue='Survived', data=train, ax=ax[1])
# label every bar in both panels with its count
for panel in ax:
    for bar in panel.patches:
        count = bar.get_height()
        panel.text(bar.get_x() + bar.get_width() / 2., count + .3, count, ha="center")

#### <li>About 62.968 % (136) of the 1st classs passengers (216) survived.
#### <li>Only 24.23% (119) of the 3rd classs passengers (491) survived.

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(figsize = (15,6))
sns.countplot(x="Pclass",hue = 'Sex',data=train)
fig.suptitle('Composition of passenger classs', fontsize =15)
#annotations
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

In [None]:
ax = sns.catplot(x ='Pclass', y ='Survived',hue = 'Sex',kind = 'point' ,data = train,height = 6)
ax.fig.suptitle('Survival Rate vs Ticket class ')

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.countplot(x="Pclass",hue = 'Survived',data=train[train['Sex'] == 'male'],ax = ax[0])
sns.countplot(x="Pclass",hue = 'Survived', data=train[train['Sex'] == 'female'],ax = ax[1])
ax[0].set_title('Male')
ax[1].set_title('Female')
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")
fig.suptitle('Sex Composition of passenger classs', fontsize =15)   

#### <li>About 96.81 % (91) of the female 1st class passengers (94) survived.
#### <li>Only 13.54% (47) of the male 3rd class passengers (347) survived.

In [None]:
fig,ax = plt.subplots(figsize = (15,6))
ax = sns.boxplot(y="Pclass", x="Age",orient="h", data=train)
fig.suptitle('Age distribution of passenger classs', fontsize=15)

In [None]:
fig = px.pie(train,
             values="Fare",
             names="Pclass",
             template="seaborn")
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.update(layout_title_text='Percentage of Fare collected through Pclass',
           layout_showlegend=False)
fig.show()

In [None]:
print('Total Passengers by Pclass')
print(train['Pclass'].value_counts())

In [None]:
print('Total Survived Passengers by Pclass')
print(train[train['Survived'] == 1]['Pclass'].value_counts())

In [None]:
print('Percentage of  Survived Passengers by Pclass')
train[train['Survived'] == 1]['Pclass'].value_counts() / train['Pclass'].value_counts()

#### <li>About 55.1 % (491) of the total passengers (891) booked 3rd class ticket.
#### <li>About 62.96 % (136) of the total 1st class passengers (216) survived.
#### <li>Only 24.23% (119) of the total 3rd classs passengers (491) survived.  

#### Column : Sex
<br>

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.countplot(x="Sex", data=train,ax = ax[0])
sns.countplot(x="Sex",hue = 'Survived', data=train,ax = ax[1])
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + 3,height ,ha="center")

In [None]:
fig,ax = plt.subplots(figsize = (15,6))
sns.histplot(x="Age",kde = True,hue = 'Sex',data=train)
fig.suptitle('Distribution of passenger\'s age', fontsize=15)

In [None]:
print('Passengers composition by Sex')
train['Sex'].value_counts()

In [None]:
print('Survived Passengers composition by Sex')
train[train['Survived'] == 1]['Sex'].value_counts()

#### <li>About 74.2 % (233) of the total Female passengers (314) survived.
#### <li>About 18.89 % (109) of the total Male passengers (577) survived.

#### Column : Age
<br>

In [None]:
fig,ax = plt.subplots(figsize = (9,3))
ax = sns.boxplot(x=train['Age'],color = '#6edb00')

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.histplot(x="Age",kde = True ,data=train,ax = ax[0])
sns.histplot(x="Age",kde = True, hue = 'Survived',data=train,ax = ax[1])

#### Analysis of minor-passengers

In [None]:
fig,ax = plt.subplots(figsize = (9,3))
ax = sns.boxplot(x = train[train['Age'] <=18.0]['Age'],color = '#d9003d')
fig.suptitle('Age distribution of minors', fontsize=15)

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.histplot(x="Age",kde = True, hue = 'Survived',data=train[train['Age'] <=18.0],ax = ax[0])
sns.histplot(x="Age",kde = True, hue = 'Sex',data=train[train['Age'] <=18.0],ax = ax[1])
fig.suptitle('Age distribution of minors', fontsize=15)

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[train['Age'] <=18.0],ax = ax[0])
sns.countplot(x="Survived", data=train[train['Age'] <=18.0],hue = 'Sex',ax = ax[1])
fig.suptitle('Survival percenatge of minors', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

In [None]:
print('Total minors Sex-wise')
train[train['Age'] <=18.0]['Sex'].value_counts()

#### <li>About 15.60% (139) of the total passengers (891) were minors. 
#### <li>About 50.35% (70) of the minor passengers (139) survived. 
#### <li>About 67.64% (46) of the female-minor passengers (68) survived.

#### Column : Name
<br>

In [None]:
#extract the initial title of the name
# Names look like 'Surname, Title. Given names': take the text between the
# comma and the first full stop, and strip the space that follows the comma so
# the stored title is 'Mr' rather than ' Mr'.
train['Name_prefix'] = train['Name'].apply(lambda x : x.split(',')[1].split('.')[0].strip())
test['Name_prefix'] = test['Name'].apply(lambda x : x.split(',')[1].split('.')[0].strip())

In [None]:
fig,ax = plt.subplots(figsize = (15,6))
ax = sns.countplot(x = 'Name_prefix',data = train)
#annotations
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2., height + 3,height ,ha="center")
fig.suptitle('Count of Prefix name of Passengers', fontsize=15)

Since some of the name prefixes occur only rarely, let's group them into an umbrella category 'Rare'.

In [None]:
frequent_initial_names = train['Name_prefix'].value_counts().head(4).index

In [None]:
def rare_names(x):
    """Return `x` unchanged when it is one of `frequent_initial_names`, otherwise 'Rare'."""
    return x if x in frequent_initial_names else 'Rare'

In [None]:
#replace all the uncommon names with 'Rare'
# Fold spelling variants into their common title first. rare_names() keeps only
# the most frequent prefixes, so a variant that is not among them would be
# turned into 'Rare' and could no longer be mapped to 'Miss' / 'Mrs' afterwards.
prefix_aliases = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}

def normalise_prefix(prefix):
    """Map 'Ms'/'Mlle' to 'Miss' and 'Mme' to 'Mrs', preserving any surrounding whitespace."""
    core = prefix.strip()
    if core in prefix_aliases:
        return prefix.replace(core, prefix_aliases[core])
    return prefix

train['Name_prefix'] = train['Name_prefix'].apply(lambda x: rare_names(normalise_prefix(x)))
test['Name_prefix'] = test['Name_prefix'].apply(lambda x: rare_names(normalise_prefix(x)))

In [None]:
# Normalise title variants in both frames:
#   'Ms'   -> 'Miss'
#   'Mlle' -> 'Miss'  (French)
#   'Mme'  -> 'Mrs'   (French)
# NOTE(review): this runs after the rare-prefix grouping, so it only changes
# prefixes that survived that step - confirm the intended order.
for frame in (train, test):
    frame['Name_prefix'] = (
        frame['Name_prefix']
        .str.replace('Ms','Miss')
        .str.replace('Mlle','Miss')
        .str.replace('Mme','Mrs')
    )

In [None]:
fig,ax = plt.subplots(figsize = (9,9))
ax = plt.pie(x=train['Name_prefix'].value_counts(), autopct="%.1f%%", labels = train['Name_prefix'].value_counts().index,pctdistance=0.5)
fig.suptitle('Compositon of prefix name', fontsize=15)

#### Column : SibSp (Number of Siblings / Spouses Aboard)
<br>

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.countplot(x="SibSp", data=train,ax = ax[0])
sns.countplot(x="SibSp",hue = 'Survived', data=train,ax = ax[1])
#annotatinos
for p in ax[0].patches:
    height = p.get_height()
    ax[0].text(p.get_x()+p.get_width()/2., height + 1,height ,ha="center")

In [None]:
ax = sns.catplot(x ='SibSp', y ='Survived',hue = 'Sex',kind = 'point' ,data = train,height = 6)
ax.fig.suptitle('Survival Rate vs sibling / spouce abord ')

##### Passengers travelling with one or more siblings / spouce.

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.histplot(x="Age",kde = True, hue = 'Survived',data=train[train['SibSp'] >= 1],ax = ax[0])
sns.histplot(x="Age",kde = True, hue = 'Sex',data=train[train['SibSp'] >= 1],ax = ax[1])
fig.suptitle('Age distribution of Passengers who travelled with one or more siblings / spouce', fontsize=15)

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[train['SibSp'] >= 1],ax = ax[0])
sns.countplot(x="Survived", data=train[train['SibSp'] >= 1],hue = 'Sex',ax = ax[1])
fig.suptitle('Passengers who travelled with at least one siblings / spouce', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

In [None]:
print('Gender distribution of Passengers who travelled with one or more siblings/spouce')
train[(train['SibSp'] >= 1)]['Sex'].value_counts()

#### <li>About 46.64% (132) of the total passengers (283) survived who travelled with one or more siblings / spouce. 
#### <li>About 44.58% (140) of the total female-passengers (314) travelled with one or more siblings / spouce. 
#### <li>About 68.57% (96) of the total female-passengers (140) survived who travelled with one or more siblings / spouce.
#### <li>Only 25.17% (36) of the total male-passengers (143) survived who travelled with one or more siblings / spouce. 

##### Passengers travelling with no siblings/spouce

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.histplot(x="Age",kde = True,hue = 'Survived',data=train[(train['SibSp'] == 0)],ax = ax[0])
sns.histplot(x="Age",kde = True,hue = 'Sex',data=train[(train['SibSp'] == 0)],ax = ax[1])
fig.suptitle('Age distribution of Passengers who travelled with no siblings / spouce', fontsize=15)

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[train['SibSp'] == 0],ax = ax[0])
sns.countplot(x="Survived", data=train[train['SibSp'] == 0],hue = 'Sex',ax = ax[1])
fig.suptitle('Passengers who travelled with at least no siblings / spouce', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

In [None]:
print('Gender distribution of Passengers who travelled with no siblings')
train[(train['SibSp'] == 0)]['Sex'].value_counts()

#### <li>Only 34.54% (210) of passengers (608) survived who travelled with no siblings.
#### <li>About 55.41% (174) of the total female-passengers (314) travelled with no siblings.
#### <li>About 78.74% (137) of female-passengers (174) survived who travelled with no siblings.
#### <li>Only 16.82% (73) of male-passengers (434) survived who travelled with no siblings.

#### Column : Parch (Number of Parents/Children Aboard)
<br>

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
# Left: passenger count per Parch value; right: the same counts split by Survived.
ax[0] = sns.countplot(x="Parch", data=train,ax = ax[0])
ax[1] = sns.countplot(x="Parch",hue = 'Survived', data=train,ax = ax[1])
#annotations
# np.arange(1) yields only index 0, so only the left-hand panel gets count labels.
for i in np.arange(1):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

In [None]:
ax = sns.catplot(x ='Parch', y ='Survived',hue = 'Sex',kind = 'point' ,data = train,height = 6)
ax.fig.suptitle('Survival Rate vs Parent / child abord ')

##### Passengers travelling with one or more parents / children.

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
# Age histograms restricted to passengers with Parch >= 1, coloured by survival (left) and by sex (right).
sns.histplot(x="Age",kde = True, hue = 'Survived',data=train[train['Parch'] >= 1],ax = ax[0])
sns.histplot(x="Age",kde = True, hue = 'Sex',data=train[train['Parch'] >= 1],ax = ax[1])
# The title names parents / children because the filter above is on Parch, not SibSp.
fig.suptitle('Age distribution of Passengers who travelled with one or more parents / children', fontsize=15)

In [None]:
print('Age description of passengers who travelled with one or more parents / children')
train[train['Parch'] >= 1]['Age'].describe()

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[train['Parch'] >= 1],ax = ax[0])
sns.countplot(x="Survived", data=train[train['Parch'] >= 1],hue = 'Sex',ax = ax[1])
fig.suptitle('Passengers who travelled with at least one parents / children', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

#### <li>About 51.17% (109) of the total passengers (213) survived who travelled with one or more parents / children.
#### <li>About 38.22% (120) of the total female-passengers (314) travelled with one or more parents / children.
#### <li>About 66.67% (80) of the total female-passengers (120) survived who travelled with one or more parents / children.
#### <li>About 31.18% (29) of the total male-passengers (93) survived who travelled with one or more parents / children.

##### Passengers travelling with no parent  / child.

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.histplot(x="Age",kde = True,hue = 'Survived',data=train[(train['Parch'] == 0)],ax = ax[0])
sns.histplot(x="Age",kde = True,hue = 'Sex',data=train[(train['Parch'] == 0)],ax = ax[1])
fig.suptitle('Age distribution of Passengers who travelled with no parent / child', fontsize=15)

In [None]:
print('Age description of passengers who travelled with no parent / child')
train[train['Parch'] == 0]['Age'].describe()

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[train['Parch'] == 0],ax = ax[0])
sns.countplot(x="Survived", data=train[train['Parch'] == 0],hue = 'Sex',ax = ax[1])
fig.suptitle('Passengers who travelled with at least no parent / child', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

#### <li>About 34.37% (233) of the total passengers (678) who travelled with no parent / child survived.
#### <li>About 48.73% (153) of the total female-passengers (314) survived while travelling with no parent / child.
 

##### Passengers travelling without family (without parnent / child or sibling / spouce)

In [None]:
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.histplot(x="Age",kde = True,hue = 'Survived',data=train[(train['Parch'] == 0) & (train['SibSp'] == 0)],ax = ax[0])
sns.histplot(x="Age",kde = True,hue = 'Sex',data=train[(train['Parch'] == 0) & (train['SibSp'] == 0)],ax = ax[1])
fig.suptitle('Age distribution of Passengers who travelled without family', fontsize=15)

In [None]:
print('Age description of passengers who travelled without family')
train[(train['Parch'] == 0) & (train['SibSp'] == 0)]['Age'].describe()

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
sns.countplot(x="Survived", data=train[(train['Parch'] == 0) & (train['SibSp'] == 0)],ax = ax[0])
sns.countplot(x="Survived", data=train[(train['Parch'] == 0) & (train['SibSp'] == 0)],hue = 'Sex',ax = ax[1])
fig.suptitle('Passengers who travelled without family', fontsize=15)
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")

#### <li>About 60.26% (537) of the passengers (891) travelled without family.
#### <li>About 84.42% (347) of the male passengers (411) who travelled without family did not survive.

In [None]:
# train['Without_family'] = np.where((train['Parch'] == 0) & (train['SibSp'] == 0),1,0)
# test['Without_family'] = np.where((test['Parch'] == 0) & (test['SibSp'] == 0),1,0)

#### Column: Fare
<br>

In [None]:
fig,ax = plt.subplots(figsize = (9,3))
ax = sns.boxplot(x=train['Fare'],color = '#ff7a70')

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.histplot(x="Fare",bins=30,kde = True, data=train,ax = ax[0])
sns.histplot(x="Fare",bins=30,kde = True,hue = 'Survived', data=train,ax = ax[1])

##### Fare Vs Passengers class

In [None]:
for i in np.arange(1,4):
    fig,ax = plt.subplots(figsize = (15,6))
    sns.histplot(x="Fare",hue = 'Survived',kde = True,data = train[train['Pclass'] == i])
    fig.suptitle(f'Distribution of {i} class Fare', fontsize=15)


##### Analysis of passengers whose fare is more than £ 100

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(figsize = (15,6))
sns.histplot(x="Fare",bins=30,kde = True,hue = 'Survived', data = train[train['Fare'] >= 100 ])
fig.suptitle('Passengers whose Fare is more than £ 100', fontsize=15)

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1,2,figsize = (15,6))
ax[0] = sns.countplot(x="Survived", data = train[train['Fare'] >= 100 ],ax = ax[0])
ax[1] = sns.countplot(x="Survived", data = train[train['Fare'] >= 100 ],hue = 'Sex',ax = ax[1])
#annotatinos
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + .3,height ,ha="center")
        
fig.suptitle('Survival of Passengers whose Fare is more than £ 100', fontsize=15) 

In [None]:
print('Survived passengers whose ticket costs more than £ 100')
train[train['Fare'] >= 100 ]['Survived'].value_counts()

In [None]:
print('Sex composition survived passengers whose ticket costs more than £ 100')
# Both comparisons are parenthesised because `&` binds tighter than `==`.
train[(train['Fare'] >= 100) & (train['Survived'] == 1)]['Sex'].value_counts()

#### <li>About 5.94% (53) of the total passenger's (891) fare was more than £ 100. 
#### <li>About 73.58% (39) of those passengers (53) survived. 
#### <li>About 82.05% (32) of those survived (39) were females.

##### Analysis of passengers whose fare is less than £ 50

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(figsize = (15,6))
sns.histplot(x="Fare",bins=30,kde = True,hue = 'Survived', data = train[train['Fare'] <= 50 ])
fig.suptitle('Passengers whose Fare is less than £ 50', fontsize=15)

In [None]:
fig,ax = plt.subplots(figsize = (9,6))
ax = sns.countplot(x="Survived", data = train[train['Fare'] <= 50 ])
#annotatinos
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2., height + 1,height ,ha="center")
fig.suptitle('Survival of Passengers whose Fare is less than £ 50', fontsize=15)    

#### <li>About 82.04% (731) of the total passenger's (891) fare was less than £ 50. 
#### <li>Only 31.87% (233) of those passengers (731) survived.

#### Column : Embarked (C = Cherbourg; Q = Queenstown; S = Southampton)
<br>

In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.countplot(x="Embarked",data=train,ax = ax[0])
sns.countplot(x="Embarked",hue = 'Survived', data=train,ax = ax[1])
#annnotations
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + 1,height ,ha="center")
#add title
fig.suptitle('Passengers count', fontsize=15)

In [None]:
fig = px.pie(train,
             values="Fare",
             names="Embarked",
             template="seaborn")
fig.update_traces(rotation=90, pull=0.05, textinfo="percent+label")
fig.update(layout_title_text='Percentage of Fare collected through Embarked',
           layout_showlegend=False)
fig.show()


In [None]:
sns.set_theme(style="darkgrid")
fig,ax = plt.subplots(1, 2,figsize = (15,6))
sns.countplot(x="Embarked",hue = 'Pclass',data=train,ax = ax[0])
sns.countplot(x="Embarked",hue = 'Sex', data=train,ax = ax[1])
#annnotations
for i in np.arange(2):
    for p in ax[i].patches:
        height = p.get_height()
        ax[i].text(p.get_x()+p.get_width()/2., height + 1,height ,ha="center")
#add title
fig.suptitle('Ticket Class and Sex composition', fontsize=15)

#### Skewness

In [None]:
# Skewness is only defined for numeric columns; restrict explicitly so the
# object columns still present in `train` do not raise on recent pandas.
train.skew(numeric_only=True)

#### Kurtosis

In [None]:
# Kurtosis is only defined for numeric columns; restrict explicitly so the
# object columns still present in `train` do not raise on recent pandas.
train.kurt(numeric_only=True)

#### IQR (Inter Quartile Range)

In [None]:
# Print the inter-quartile range (75th minus 25th percentile) of every
# non-object column, skipping the first column of the frame.
for col in train.columns[1:]:
    if train[col].dtype == 'object':
        continue
    upper_quartile = train[col].quantile(.75)
    lower_quartile = train[col].quantile(.25)
    print(col)
    print('IQR: ', upper_quartile - lower_quartile)
    print('')


#### Let's See the Correlation among these attributes

In [None]:
# Correlate the numeric columns only; the remaining text columns cannot be
# converted to float and make DataFrame.corr() raise on recent pandas.
train.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(9,9))
# Drop the identifier and engineered columns, then keep the numeric columns only:
# the frame still holds text columns at this point, which corr() cannot handle
# on recent pandas.
heatmap_features = train.drop(['PassengerId','Cabin_missing', 'Age_missing',
       'Ticket_length', 'Name_prefix',],axis = 1).select_dtypes(include='number')
sns.heatmap(heatmap_features.corr(), vmax=1, square=True,annot=True,cmap='RdBu')
plt.title('Correlation between different attributes')
plt.show()

In [None]:
sns.pairplot(train.drop(['PassengerId','Cabin_missing', 'Age_missing',
       'Ticket_length', 'Name_prefix', ],axis = 1), hue="Survived")

#### One-hot encoding

In [None]:
test.columns

In [None]:
train.columns

In [None]:
#grab the ids of the passenger's id of the test data
ids = test['PassengerId']

In [None]:
cols_to_drop = ['PassengerId','Name','Ticket']

In [None]:
# Drop the columns listed in cols_to_drop from both frames (single source of
# truth instead of repeating the literal list).
train = train.drop(cols_to_drop,axis = 1)
test = test.drop(cols_to_drop,axis = 1)

#### Impute the missing values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# separate into train and test set
# The labelled `train` frame is split 70/30; the held-out 30% is used to score the models.

X_train, X_test, y_train, y_test = train_test_split(
    train.drop('Survived', axis=1),  # just the features
    train['Survived'],  # the target
    test_size=0.3,  # the percentage of obs in the test set
    random_state=0)  # for reproducibility

# Display the shapes of the two feature splits.
X_train.shape, X_test.shape

In [None]:
#impute the missing values with median
# The median is taken from the training split only and reused for the hold-out split.
train_age_median = X_train['Age'].median()
X_test['Age'] = X_test['Age'].fillna(train_age_median)
X_train['Age'] = X_train['Age'].fillna(train_age_median)

In [None]:
#impute the missing values with median in the test
# Both fill values are medians of X_train columns, so nothing is computed from the `test` frame itself.
test['Age'] = test['Age'].fillna(X_train['Age'].median())
test['Fare'] = test['Fare'].fillna(X_train['Fare'].median())

In [None]:
print('Check the missing values of test')
test.isnull().sum()

#### One Hot Encoding

In [None]:
pip install feature_engine

In [None]:
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

In [None]:
ohe_enc = fe_OneHotEncoder(
    top_categories=None,
    drop_last=True) 

In [None]:
ohe_enc.fit(X_train)

In [None]:
X_train = ohe_enc.transform(X_train)
X_test = ohe_enc.transform(X_test)
test = ohe_enc.transform(test)

In [None]:
#scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the training split only; the same fitted scaler is then applied to all three data sets.
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
test_std = scaler.transform(test)

### Model Preparation

#### Logistic Regression(Lasso)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [None]:
logisticRegr = LogisticRegression(penalty='l1', solver='liblinear')
#fit the model
logisticRegr.fit(X_train_std, y_train)

In [None]:
#predictions and score
predictions = logisticRegr.predict(X_test_std)
print(f"The score on the Test-dataset is {logisticRegr.score(X_test_std, y_test)}")
print(f"The score on the Train-dataset is {logisticRegr.score(X_train_std, y_train)}")

In [None]:
def plot_condution_metrics(y_test,predictions):
    """Draw the confusion matrix of `predictions` against `y_test` as an annotated heatmap.

    The plot title reports the accuracy, i.e. the fraction of predictions
    that equal the true label. Nothing is returned; the heatmap is drawn with
    seaborn and labelled through pyplot.
    """
    confusion = metrics.confusion_matrix(y_test, predictions)
    accuracy = np.mean([y_test == predictions])
    sns.heatmap(confusion, annot=True, fmt=".0f", linewidths=1, square=True, cbar=False)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy Score: {0}'.format(accuracy), size=15)

In [None]:
#confusion metrics
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
score = cross_val_score(logisticRegr, X_train_std, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
#store the cv-score
model_performance = {}
model_performance['Logistic Regression(Lasso)'] = score.mean()

#### Logistic Regression(Ridge)

In [None]:
logisticRegr = LogisticRegression(penalty='l2', solver='liblinear')
#fit the model
logisticRegr.fit(X_train_std, y_train)

In [None]:
#predictions and score
predictions = logisticRegr.predict(X_test_std)
print(f"The score on the Test-dataset is {logisticRegr.score(X_test_std, y_test)}")
print(f"The score on the Train-dataset is {logisticRegr.score(X_train_std, y_train)}")

In [None]:
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
score = cross_val_score(logisticRegr, X_train_std, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
model_performance['Logistic Regression(Ridge)'] = score.mean()

#### Polynomial Logistic Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)

#transform into polynomial features
X_train_p = poly.fit_transform(X_train)
# Reuse the expansion fitted on the training split; the hold-out split is only transformed, never fitted on.
X_test_p = poly.transform(X_test)

#standard scale
scaler = StandardScaler()
scaler.fit(X_train_p)
X_train_std_poly = scaler.transform(X_train_p)
X_test_std_poly = scaler.transform(X_test_p)

In [None]:
#fit the model
logisticPolyRegr = LogisticRegression(solver='liblinear')
logisticPolyRegr.fit(X_train_std_poly, y_train)

#predictions and score
predictions = logisticPolyRegr.predict(X_test_std_poly)
print(f"The score on the Test-dataset is {logisticPolyRegr.score(X_test_std_poly, y_test)}")
print(f"The score on the Train-dataset is {logisticPolyRegr.score(X_train_std_poly, y_train)}")

In [None]:
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
score = cross_val_score(logisticPolyRegr, X_train_std_poly, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
model_performance['Polynomial Logistic Regression'] = score.mean()

#### Cross-validation and Hyper-parameter tuning

In [None]:
# Compare polynomial degrees by 10-fold cross-validated accuracy on the training split.
poly_degrees = [2,3,4]
degree_loop_values = []
for degree in poly_degrees:
    # Loop-local names are used on purpose: the degree-2 `poly`, `scaler`,
    # `X_*_std_poly` and `logisticPolyRegr` built for the previous section must
    # not be replaced by the objects of the last (highest-degree) iteration.
    degree_poly = PolynomialFeatures(degree = degree)
    X_train_degree = degree_poly.fit_transform(X_train)

    #standard scale
    X_train_degree_std = StandardScaler().fit_transform(X_train_degree)

    # cross_val_score clones and fits the estimator itself, so no separate fit
    # (and no hold-out transform) is needed to obtain the scores.
    candidate = LogisticRegression(solver='liblinear')
    score = cross_val_score(candidate, X_train_degree_std, y_train, cv=10,scoring='accuracy')
    degree_loop_values.append([degree,score.mean(),np.std(score)])

In [None]:
pd.DataFrame(degree_loop_values,columns = ['Degree','Mean_cv','Std_cv'],)

The best-fit polynomial degree is 2

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
gnb = GaussianNB()
predictions = gnb.fit(X_train_std, y_train).predict(X_test_std)

In [None]:
print(f"The score on the Test-dataset is {gnb.score(X_test_std, y_test)}")
print(f"The score on the Train-dataset is {gnb.score(X_train_std, y_train)}")

In [None]:
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
# Cross-validate on the standardised features, i.e. the same representation
# the model was fitted and scored on in the cells above.
score = cross_val_score(gnb, X_train_std, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
model_performance['Naive Bayes (gausian)'] = score.mean()

#### Support Vector Machine

In [None]:
from sklearn.svm import SVC

#### Grid-Search CV to find the optimum parameters

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters for an RBF-kernel SVC: three values of C combined with four values of gamma.
param_grid = [
    {
        "C" : [0.01,1,5],
        "gamma" : ['scale',0.09,0.1,0.5],
        "kernel" : ['rbf']
        
    },
]
# Search over param_grid, scored by accuracy with cv = 9.
optimat_parameters = GridSearchCV(
    SVC(),
    param_grid,
    cv = 9,
    scoring = 'accuracy',
    verbose = 0
)
# Run the search on the standardised training split and show the best parameter combination.
optimat_parameters.fit(X_train_std,y_train)
print(optimat_parameters.best_params_)

#### Building the optimum SVM model

In [None]:
from sklearn import svm
# RBF-kernel SVM with fixed C and gamma: fit on the standardised training split,
# then predict the standardised hold-out split.
clf_svm = svm.SVC(kernel = 'rbf', C = 1, gamma = 0.1, random_state = 42)
clf_svm.fit(X_train_std, y_train)
predictions = clf_svm.predict(X_test_std)

In [None]:
print(f"The score on the Test-dataset is {clf_svm.score(X_test_std, y_test)}")
print(f"The score on the Train-dataset is {clf_svm.score(X_train_std, y_train)}")

In [None]:
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
score = cross_val_score(clf_svm, X_train_std, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
model_performance['SVM (rbf)'] = score.mean()

#### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier,plot_tree

# Preliminary, unpruned decision tree: only random_state is set (for repeatable
# tree construction); every other hyper-parameter keeps its default.
clf_dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Build a preliminary (unpruned) tree on the unscaled features and predict the held-out split
clf_dt.fit(X_train, y_train)
predictions = clf_dt.predict(X_test)

In [None]:
# Draw the preliminary (unpruned) tree on an explicit, wide Axes.
fig, ax = plt.subplots(figsize=(25, 12))
plot_tree(
    clf_dt,
    filled=True,
    rounded=True,
    class_names=['Not Survived', "Survived"],
    # A plain list is accepted by every scikit-learn release; some releases
    # reject a pandas Index here during parameter validation.
    feature_names=list(X_train.columns),
    # Draw on the Axes created above instead of relying on the implicit current
    # Axes, and keep `ax` bound to that Axes (it used to be overwritten with
    # plot_tree's return value).
    ax=ax,
)
plt.show()

In [None]:
# Mean accuracy of the preliminary tree on the held-out and training splits
dt_test_accuracy = clf_dt.score(X_test, y_test)
dt_train_accuracy = clf_dt.score(X_train, y_train)
print(f"The score on the Test-dataset is {dt_test_accuracy}")
print(f"The score on the Train-dataset is {dt_train_accuracy}")

In [None]:
# Compare the true test labels with the preliminary tree's predictions computed above.
# plot_condution_metrics is a notebook helper defined outside this section
# (presumably confusion-matrix style plots -- verify against its definition).
plot_condution_metrics(y_test,predictions)

In [None]:
# 9-fold cross-validated accuracy of the unpruned tree on the unscaled training features
score = cross_val_score(
    estimator=clf_dt,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=9,
)
print(f'The Cross-Valiation Score is {score.mean()}')

#### Cost-complexity pruning the decision tree

In [None]:
# Effective alphas at which cost-complexity pruning changes the tree.
path = clf_dt.cost_complexity_pruning_path(X_train, y_train)
# Drop the last (largest) alpha; per scikit-learn's pruning documentation that
# value prunes the tree down to a single node.
ccp_alphas = path.ccp_alphas[:-1]

# Fit one tree per candidate alpha. The trees are built inside the comprehension
# so that `clf_dt` keeps referring to the preliminary tree fitted earlier instead
# of being silently replaced by the most heavily pruned tree.
cct_dts = [
    DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha).fit(X_train, y_train)
    for ccp_alpha in ccp_alphas
]
    


In [None]:
# Accuracy of every pruned tree on the training and held-out splits
train_scores = [pruned_tree.score(X_train, y_train) for pruned_tree in cct_dts]
test_scores = [pruned_tree.score(X_test, y_test) for pruned_tree in cct_dts]

# Step plot of accuracy against alpha, one line per split
fig, ax = plt.subplots(figsize=(12, 9))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
for split_label, split_accuracies in (("train", train_scores), ("test", test_scores)):
    ax.plot(
        ccp_alphas,
        split_accuracies,
        marker='o',
        label=split_label,
        drawstyle="steps-post",
    )
ax.legend()
plt.show()

#### Cross validation to find the optimal value of alpha

Looking at the figure and 'eye-balling' it, we see that an alpha of 0.00375 could be a better value.

In [None]:
# Eye-balling the figure suggests alpha = 0.00375 could be a better value;
# check how stable it is fold-by-fold with 9-fold cross-validation.
clf_dt = DecisionTreeClassifier(ccp_alpha=0.00375, random_state=42)
scores = cross_val_score(clf_dt, X_train, y_train, scoring='accuracy', cv=9)

# One point per fold
df_cv = pd.DataFrame({'tree': range(len(scores)), 'accuracy': scores})
df_cv.plot(x='tree', y='accuracy', marker='o', linestyle='--')

As we can see, one split returns a very low accuracy of about 73%, so this value of alpha may not be the best.
So we use K-fold CV on all the values of alpha to find the optimum parameter.

In [None]:
# 9-fold CV for every candidate alpha; keep the mean and spread of the fold accuracies
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    scores = cross_val_score(clf_dt, X_train, y_train, scoring='accuracy', cv=9)
    alpha_loop_values.append(
        {
            'alpha': ccp_alpha,
            'mean_Score': scores.mean(),
            'std_score': scores.std(),
        }
    )

# Collect the per-alpha results in a DataFrame
alpha_df = pd.DataFrame(alpha_loop_values, columns=['alpha', 'mean_Score', 'std_score'])

# Mean CV accuracy as a function of alpha
alpha_df.plot(x='alpha', y='mean_Score', marker='o', linestyle='--')

In [None]:
# Shortlist the alphas whose mean CV accuracy exceeds 0.8, best first
print('alpha values with cv score > .8')
above_threshold = alpha_df['mean_Score'] > .8
alpha_df.loc[above_threshold].sort_values('mean_Score', ascending=False)

In [None]:
# Take the alpha with the highest mean CV accuracy directly from alpha_df rather
# than hand-copying a rounded value from the displayed table: a rounded alpha can
# fall on the wrong side of a pruning threshold and goes stale if the data changes.
# NOTE(review): assumes the previously hard-coded 0.008865 was the top-ranked row -- confirm.
ideal_alpha = float(alpha_df.loc[alpha_df['mean_Score'].idxmax(), 'alpha'])

#### Building the best tree

In [None]:
# Fit the pruned tree with the selected alpha on the unscaled features, then predict the held-out split
clf_dt_prune = DecisionTreeClassifier(ccp_alpha=ideal_alpha, random_state=42)
clf_dt_prune.fit(X_train, y_train)
predictions = clf_dt_prune.predict(X_test)

In [None]:
# Draw the pruned tree on an explicit, wide Axes.
fig, ax = plt.subplots(figsize=(25, 9))
plot_tree(
    clf_dt_prune,
    filled=True,
    rounded=True,
    class_names=['Not Survived', "Survived"],
    # A plain list is accepted by every scikit-learn release; some releases
    # reject a pandas Index here during parameter validation.
    feature_names=list(X_train.columns),
    # Draw on the Axes created above instead of relying on the implicit current
    # Axes, and keep `ax` bound to that Axes (it used to be overwritten with
    # plot_tree's return value).
    ax=ax,
)
plt.show()

In [None]:
# Mean accuracy of the pruned tree on the held-out and training splits
pruned_test_accuracy = clf_dt_prune.score(X_test, y_test)
pruned_train_accuracy = clf_dt_prune.score(X_train, y_train)
print(f"The score on the Test-dataset is {pruned_test_accuracy}")
print(f"The score on the Train-dataset is {pruned_train_accuracy}")

In [None]:
# Compare the true test labels with the pruned tree's predictions computed above.
# plot_condution_metrics is a notebook helper defined outside this section
# (presumably confusion-matrix style plots -- verify against its definition).
plot_condution_metrics(y_test,predictions)

In [None]:
#cross-val score
# 9-fold CV accuracy on the unscaled features, i.e. the same representation the
# alpha search ran on and the pruned tree is fitted on (previously X_train_std).
score = cross_val_score(clf_dt_prune, X_train, y_train, cv=9,scoring='accuracy')
print(f'The Cross-Valiation Score is {score.mean()}')

In [None]:
# Record the mean CV accuracy (`score` comes from the cross-validation cell just above)
# so it can be compared with the other models at the end of the notebook.
model_performance['Decison Tree'] = score.mean()

#### Visualizing the model performance

In [None]:
# Display the collected mapping of model name -> mean cross-validation score
model_performance

In [None]:
# One row per model, ranked from best to worst mean CV score
model_df = (
    pd.DataFrame
    .from_dict(model_performance, orient='index', columns=['Mean CV Score'])
    .sort_values(by='Mean CV Score', ascending=False)
)
model_df

In [None]:
# Horizontal bar chart of the mean CV score per model, in model_df's row order.
# The figure handle was previously bound to the typo'd name `gig`, and the Axes
# was not passed to seaborn; both are now explicit.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Mean CV Score", y=model_df.index, data=model_df, color='#fc8a26', ax=ax)
ax.set_title("Mean cross-validation score by model")
plt.show()

#### As we can see the Support Vector Machine with the 'rbf' kernel gives the best result

#### Making Submissions


In [None]:
# Predict survival for the competition test set with the fitted SVM.
# No scaling happens in this cell: test_std is created outside this section
# (presumably the already-standardised test features -- confirm).
predictions = clf_svm.predict(test_std)

In [None]:
# Pair each passenger id with its predicted label and write the submission file
# (without the DataFrame index column).
output = pd.DataFrame(dict(PassengerId=ids, Survived=predictions))
output.to_csv(path_or_buf='submission.csv', index=False)