In [None]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the passenger list, using the PassengerId column as the row index.
df = pd.read_csv(
    '../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv',
    index_col='PassengerId',
)
df.head()

# Data Exploration

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

`df.info()` reports no null values in any column, so no missing-value handling is needed.

### Country 

In [None]:
# Horizontal bars (y=...) so the country names stay readable.
sns.countplot(data=df, y='Country')

The majority of the passengers are from Sweden and Estonia.

### Sex-ratio

In [None]:
# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a positional 'Sex' clashes with data=df and raises TypeError.
sns.countplot(x='Sex',data=df)

The counts of male and female passengers are almost the same, with a slight edge to male passengers.

### Age Distribution

In [None]:
# Density estimate of the ages, with the x-axis cropped to the observed range
# (the KDE tails would otherwise extend past the youngest/oldest person).
sns.kdeplot(df['Age'])
x_min, x_max = df['Age'].min(), df['Age'].max()
plt.xlim(x_min, x_max)

Most of the passengers are between 20 and 70 years old.

### Age vs Sex 

In [None]:
# One wide facet with an age density curve per sex.
fig = sns.FacetGrid(df,hue='Sex',aspect=4)
# `fill` is the current name of kdeplot's deprecated `shade` argument.
fig.map(sns.kdeplot,'Age',fill=True)
x_min = df['Age'].min()
x_max = df['Age'].max()
fig.set(xlim=(x_min,x_max))
# Without a legend the hue levels (the two sexes) cannot be told apart.
fig.add_legend()

### Crew vs Passenger

In [None]:
# Column passed by keyword; recent seaborn releases reject it as a positional argument.
sns.countplot(x='Category',data=df)

As expected, there are fewer crew members than passengers.

In [None]:
# Age density per category (crew vs passenger) on one wide facet.
fig2 = sns.FacetGrid(df,hue='Category',aspect=4)
fig2.map(sns.kdeplot,'Age')
# Compute the upper limit here rather than relying on a variable left over
# from an earlier cell.
fig2.set(xlim=(0,df['Age'].max()))
# Label the hue levels so the crew and passenger curves can be identified.
fig2.add_legend()

Crew age varies between 10 and 60 years.

### Survival 

In [None]:
# Column passed by keyword; recent seaborn releases reject it as a positional argument.
sns.countplot(x='Survived',data=df)

More than three-quarters of the passengers did not survive the accident.

### Survived v/s Age

In [None]:
# Age density split by survival outcome, drawn on one wide facet.
ax1 = sns.FacetGrid(data=df, hue='Survived', aspect=4)
ax1.map(sns.kdeplot, 'Age')
# x-axis from 0 up to the oldest age (x_max is set in an earlier cell).
ax1.set(xlim=(0, x_max))
ax1.add_legend()

It seems that people aged between 20 and 50 had a higher survival rate than others.

### Survived v/s Sex

In [None]:
# Column passed by keyword; recent seaborn releases reject it as a positional argument.
sns.countplot(x='Sex',hue='Survived',data=df)

Male passengers had a higher survival rate than female passengers. The plot implies that very few females survived compared to males.

### Survived v/s Category

In [None]:
# Column passed by keyword; recent seaborn releases reject it as a positional argument.
sns.countplot(x='Category',hue='Survived',data=df)

Relative to the number of people in each group, the survival rate of the crew is greater than that of the passengers.

In [None]:
# Head-count for every (Survived, Category) combination.
s_rate = df.groupby(['Survived', 'Category'])['Category'].count()
print(s_rate)

# Total head-count per category; used as the denominator below.
cat = df.groupby('Category')['Category'].count()
print(cat)

# Survivors (Survived == 1) in a category divided by that category's total.
x = s_rate.loc[(1, 'C')] / cat.loc['C']
y = s_rate.loc[(1, 'P')] / cat.loc['P']
print(f'Survival rate of crew is {x}')
print(f'Survival rate of passengers is {y}')

# Feature Engineering

Let's drop the Firstname and Lastname columns, as they have very little significance in the study.

In [None]:
# Work on a copy so the frame used for the exploratory plots stays unmodified.
df_copy = df.copy()

In [None]:
# Build df_copy from the untouched `df` so this cell can be re-run safely:
# dropping from df_copy itself raises KeyError on a second run because the
# columns are already gone. `drop` returns a new frame, so `df` is unchanged.
df_copy = df.drop(columns=['Firstname','Lastname'])

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
def encoder (value):
    """Return integer codes for the labels in ``value``.

    A fresh ``LabelEncoder`` is fitted on ``value`` and immediately used to
    transform the same data, so each distinct label maps to one integer.
    """
    return LabelEncoder().fit_transform(value)

In [None]:
# Replace each categorical column by its integer label codes.
for col in ['Sex', 'Category', 'Country']:
    df_copy[col] = encoder(df_copy[col])
df_copy.head()

### Data Normalisation

In [None]:
"""Normalising to a range of 0=10"""
def normalize(values):
    mn = values.min()
    mx = values.max()
    return(10.0/(mx - mn) * (values - mx)+10)

In [None]:
# Scale only the feature columns. `Survived` is the class label and must keep
# its original values rather than being rescaled along with the features.
feature_cols = df_copy.columns.drop('Survived')
df_copy[feature_cols] = normalize(df_copy[feature_cols])

In [None]:
# Summary statistics of the prepared frame, to check the value ranges.
df_copy.describe()

In [None]:
# Pairwise correlation matrix of all columns, annotated with the coefficients.
sns.heatmap(data=df_copy.corr(), annot=True)

Interestingly, the correlation map implies that, other than Sex, all features are negatively correlated with survival.

# Model

## Logistic Regression

In [None]:
# Split the prepared frame into the target (Y) and the feature matrix (X).
target_col = 'Survived'
Y = df_copy[target_col]
X = df_copy.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the split (and the reported accuracy) is reproducible on
# re-run; stratify keeps the survivor/non-survivor ratio equal in both parts.
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=1,stratify=Y)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
lreg = LogisticRegression()

In [None]:
# Fit the classifier on the training part of the split.
lreg.fit(X=x_train, y=y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred = lreg.predict(X=x_test)

In [None]:
from sklearn.metrics import accuracy_score

### Accuracy Score

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

## Logistic Regression with Cross validation

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Out-of-fold predictions: each row is predicted by a model trained on the
# other four of the five folds.
lreg2 = LogisticRegression()
cross_y_pred = cross_val_predict(estimator=lreg2, X=X, y=Y, cv=5)

### Accuracy Score

In [None]:
# Accuracy of the out-of-fold predictions over the whole data set.
accuracy_score(y_true=Y, y_pred=cross_y_pred)

## SVM

In [None]:
# Fixed seed so the split (and the reported accuracy) is reproducible on
# re-run; stratify keeps the survivor/non-survivor ratio equal in both parts.
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=1,stratify=Y)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings and a fixed random_state.
clf = SVC(random_state=1)

In [None]:
# Fit the SVM on the training part of the split.
clf.fit(X=x_train, y=y_train)

In [None]:
# Predicted class labels for the held-out rows.
y_pred1 = clf.predict(X=x_test)

### Accuracy Score

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred1)

## SVM with Cross Validation

In [None]:
# Out-of-fold SVM predictions from 5-fold cross-validation.
clf2 = SVC(random_state=1)
cross_y_pred1 = cross_val_predict(estimator=clf2, X=X, y=Y, cv=5)

In [None]:
# Accuracy of the out-of-fold predictions over the whole data set.
accuracy_score(y_true=Y, y_pred=cross_y_pred1)

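Analysis: For the two models attempted, the cross-validated accuracy is higher than the accuracy measured on a single train/test split. Note that cross-validation is an evaluation technique: it gives a more stable estimate of accuracy but does not by itself improve the model, so both scores should still be compared against the baseline of always predicting the majority class.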