# Titanic Survival Analysis


In [None]:
# importing required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation and
# convergence warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Use seaborn's 'whitegrid' style for all following plots
sns.set_style('whitegrid')

In [None]:
# Load the Titanic training set from the Kaggle input directory

train = pd.read_csv(filepath_or_buffer='../input/titanicdataset-traincsv/train.csv')

In [None]:
# Preview the first 5 rows of the raw data

train.head(n=5)

**Data Definitions**


* PassengerId - Unique Id of each passenger on the ship
* Survived - '0' for not survived & '1' for survived
* Pclass - Passenger class: '1' for 1st class, '2' for 2nd class & '3' for 3rd class
* Name - Passenger name
* Sex - Passenger gender: 'male' or 'female'
* Age - Passenger age
* SibSp - No. of siblings or spouses aboard the Titanic together with the passenger
* Parch - No. of parents or children aboard the Titanic together with the passenger
* Ticket - Passenger ticket number
* Fare - Passenger ticket fare
* Cabin - Passenger cabin number
* Embarked - Port where the passenger embarked, encoded as a single letter (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# General information on the dataset: column dtypes and non-null counts

train.info()

In [None]:
# Statistical information on the dataset (summary statistics per column)

train.describe()

## **EDA**

In [None]:
# Passenger counts per survival outcome, split by sex
sns.countplot(data=train, x='Survived', hue='Sex')

In [None]:
# Survival rate (mean of Survived) for each sex, highest rate first

(
    train[['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Passenger counts per class, split by survival outcome
sns.countplot(data=train, x='Pclass', hue='Survived')

In [None]:
# Survival rate (mean of Survived) for each passenger class, highest rate first

(
    train[['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Age histogram for each survival outcome, one panel per value of Survived.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).

g = sns.FacetGrid(data=train, col='Survived')
g.map(sns.histplot, 'Age')

In [None]:
# Passenger counts by number of siblings/spouses, split by survival outcome

sns.countplot(data=train, x='SibSp', hue='Survived')

In [None]:
# Survival rate (mean of Survived) per SibSp value, ordered by SibSp

(
    train[['SibSp', 'Survived']]
    .groupby('SibSp', as_index=False)
    .mean()
    .sort_values('SibSp', ascending=True)
)

In [None]:
# Passenger counts by number of parents/children, split by survival outcome

sns.countplot(data=train, x='Parch', hue='Survived')

In [None]:
# Survival rate (mean of Survived) per Parch value, ordered by Parch
(
    train[['Parch', 'Survived']]
    .groupby('Parch', as_index=False)
    .mean()
    .sort_values('Parch', ascending=True)
)

In [None]:
# Summary statistics for the ticket fare
train['Fare'].describe()

In [None]:
# Distribution of ticket fares: density-scaled histogram with a KDE overlay.
# sns.histplot(..., kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(train['Fare'], bins=50, kde=True, stat='density')

In [None]:
# Fare histogram (10 bins) for each survival outcome, one panel per value of Survived.
# sns.histplot replaces the deprecated sns.distplot(..., kde=False).
g = sns.FacetGrid(data=train, col='Survived')
g.map(sns.histplot, 'Fare', bins=10)

In [None]:
# Passenger counts per embarkation port, split by survival outcome
sns.countplot(data=train, x='Embarked', hue='Survived')

In [None]:
# Survival rate (mean of Survived) per embarkation port, highest rate first
(
    train[['Embarked', 'Survived']]
    .groupby('Embarked', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Pairwise relationships between the numeric columns, coloured by survival.
# `bw` is deprecated in seaborn's KDE plots; `bw_method` is its replacement.
sns.pairplot(train, hue='Survived', diag_kws={'bw_method': 0.2})

In [None]:
# Age distribution within each passenger class, drawn on an explicit 15x7 figure
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=train, x='Pclass', y='Age', ax=ax)

### **Observation**

Since the Age column has some null values, we can use the typical age of each passenger class shown in the boxplot above to fill in those null values.

## **Handling Missing values**

In [None]:
# Visualise where the missing values are: one row per passenger, one column per feature
sns.heatmap(data=train.isnull(), cmap='viridis', cbar=False, yticklabels=False)

The dataset has null values in the Age, Cabin and Embarked columns. For each of them we will either fill in the null values or drop the column.

In [None]:
# Rows whose Age is missing
train.loc[train['Age'].isna()]

The Age column has 177 null (NaN) values. Since that is a small number and Age has a good relationship with the Pclass column, we will fill in those values based on Pclass.

def impute_age(cols):
    """Return the passenger's age, substituting a class-based default when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Holds, in order, the Age and the Pclass of one passenger, e.g. one row
        of ``train[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Read the values by position explicitly: `cols[0]` on a Series whose index
    # holds column labels relies on a deprecated positional fallback in pandas.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per passenger class.
    # NOTE(review): presumably read off the Age-by-Pclass boxplot — confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Apply impute_age to each row's (Age, Pclass) pair and store the result back in Age
train['Age'] = train[['Age','Pclass']].apply(impute_age, axis=1)

In [None]:
# Group passengers by every (Sex, Pclass) combination
by_sex_class = train.groupby(['Sex', 'Pclass'])

In [None]:
def impute_median(series):
    """Return a copy of `series` whose missing values are replaced by its median."""
    median_value = series.median()
    filled = series.fillna(median_value)
    return filled

In [None]:
# Fill each missing Age with the median Age of the passenger's (Sex, Pclass) group.
# NOTE(review): if the Pclass-based impute_age step has already been run, Age has
# no nulls left and this transform changes nothing — confirm which imputation is intended.
train['Age'] = by_sex_class['Age'].transform(impute_median)

In [None]:
# Rows whose Embarked value is missing
train.loc[train['Embarked'].isna()]

Now that the Age column has been filled in with values derived from its relationship with the Pclass column, next is the Embarked column.

In [None]:
# Replace missing Embarked values with the most frequent port

most_common_port = train['Embarked'].value_counts().index[0]
train['Embarked'] = train['Embarked'].fillna(most_common_port)

In [None]:
# True if any Embarked value is still missing
train['Embarked'].isna().any()

In [None]:
# Inspect the passenger with PassengerId 830.
# NOTE(review): presumably one of the rows whose Embarked was missing — confirm.
train[train['PassengerId'] == 830]

In [None]:
# Re-draw the missing-value map after the Age and Embarked imputations
sns.heatmap(data=train.isnull(), cmap='viridis', cbar=False, yticklabels=False)

All the null values have been filled in except those in the Cabin column. Since Cabin has a huge number of null values, we will drop the column itself.

In [None]:
# Cabin has too many nulls to impute, so drop it. Reassigning (rather than
# inplace=True) with errors='ignore' keeps this cell safe to re-run once the
# column is already gone.
train = train.drop(columns='Cabin', errors='ignore')

In [None]:
# Preview the first 5 rows after handling the Cabin column
train.head(n=5)

## **Feature selection and converting categorical feature into numerical data**

We will also drop the columns that are neither continuous nor categorical (PassengerId, Name and Ticket).

In [None]:
# Drop the identifier-like columns. Reassigning (rather than inplace=True) with
# errors='ignore' keeps this cell safe to re-run once the columns are gone.
train = train.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

In [None]:
# Preview the first 5 rows of the remaining feature columns
train.head(n=5)

Next we convert the categorical columns into dummy columns.
`pd.get_dummies()` takes a column that holds multiple categories
and creates one indicator column per category; with `drop_first=True` the first category's column is dropped because it is implied by the others.

In [None]:
# Indicator columns for Sex; drop_first=True omits the first category's column
Sex = pd.get_dummies(train['Sex'], drop_first=True)

In [None]:
# Indicator columns for Embarked; drop_first=True omits the first category's column
Embarked = pd.get_dummies(train['Embarked'], drop_first=True)

In [None]:
# Indicator columns for Pclass. The prefix gives string column names
# ('Pclass_2', 'Pclass_3') instead of the bare integers 2 and 3: scikit-learn
# rejects a feature matrix whose column names mix str and int types.
Pclass = pd.get_dummies(train['Pclass'], prefix='Pclass', drop_first=True)

In [None]:
# Now that the dummy variables exist, drop the original categorical columns they
# replace. Reassigning (rather than inplace=True) with errors='ignore' keeps this
# cell safe to re-run once the columns are gone.

train = train.drop(columns=['Sex', 'Embarked', 'Pclass'], errors='ignore')

In [None]:
# Append the dummy-variable columns to the dataset, side by side

train = pd.concat(objs=[train, Sex, Embarked, Pclass], axis='columns')

In [None]:
# Preview the first 5 rows of the fully encoded dataset
train.head(n=5)

## **Training and Testing**

In [None]:
# Target vector: the Survived column; feature matrix: every other column
y = train['Survived']
X = train.drop(columns='Survived')

Splitting the dataset into 70-30 for training and testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# we will use a simple logistic regression model

from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings.
# NOTE(review): the features are unscaled and all warnings are silenced earlier
# in the notebook, so a convergence warning would go unnoticed — consider
# raising max_iter or scaling the features, then re-check the reported metrics.
logmodel = LogisticRegression()

In [None]:
# Fit the model on the training split
logmodel.fit(X_train, y_train)

In [None]:
# Predict survival for the held-out test split
predictions = logmodel.predict(X_test)

Let's see how our model did against the held-out test data.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Per-class precision, recall and F1 on the test split
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Confusion matrix on the test split (rows: actual class, columns: predicted class)
print(confusion_matrix(y_true=y_test, y_pred=predictions))

Predicted No:Actual No: 135

Predicted No:Actual Yes: 38

Predicted Yes:Actual No: 19

Predicted Yes:Actual Yes: 76

In [None]:
# Overall accuracy on the test split, as a percentage
print(100 * accuracy_score(y_true=y_test, y_pred=predictions))