# Titanic: Machine Learning from Disaster
This Jupyter notebook is a reproduction of this Kaggle notebook: https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy. <br>
All credit goes to [ldfreeman3](https://www.kaggle.com/ldfreeman3).

## How a Data Scientist Beat the Odds
### A Data Science framework
1. Define the problem
2. Gather the data
3. Prepare Data for Consumption
4. Perform exploratory analysis
5. Model Data
6. Validate and implement Data Model
7. Optimize and strategize

## 3. Prepare Data for Consumption

In [1]:
# Relative locations of the Titanic CSV files: the training set and the test set.
TITANIC_TRAIN_PATH = "../datasets/titanic/train.csv"
TITANIC_TEST_PATH = "../datasets/titanic/test.csv"

### 3.1 Import Libraries

In [4]:
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import IPython
from IPython import display
import sklearn
import random
import time

In [6]:
import warnings
warnings.filterwarnings('ignore') # ignore warnings in the jupyter notebook

### 3.11 Load Data Modelling Libraries

In [10]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [12]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# scatter_matrix is exposed by pandas.plotting; the former pandas.tools.plotting
# location was deprecated and no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

In [14]:
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

### 3.2 Meet and Greet Data

In [15]:
# Load the training CSV. data1 is later created as a deep copy of this frame,
# so the cleaning steps never modify data_raw itself.
data_raw = pd.read_csv(TITANIC_TRAIN_PATH)

In [16]:
# Load the test CSV (the frame without a 'Survived' column) into data_val.
data_val = pd.read_csv(TITANIC_TEST_PATH)

In [17]:
# Working copy of the training data; DataFrame.copy() is a deep copy by default,
# so changes to data1 leave data_raw untouched.
data1 = data_raw.copy()

In [18]:
# The list stores references, not copies: loops over data_cleaner modify
# data1 and data_val themselves.
data_cleaner = [data1, data_val]

In [19]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [21]:
data1.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
692,693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
322,323,1,2,"Slayter, Miss. Hilda Mary",female,30.0,0,0,234818,12.35,,Q
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
219,220,0,2,"Harris, Mr. Walter",male,30.0,0,0,W/C 14208,10.5,,S
574,575,0,3,"Rush, Mr. Alfred George John",male,16.0,0,0,A/4. 20589,8.05,,S


### 3.21 The 4 C's of Data Cleaning: Correcting, Completing, Creating, and Converting

In [23]:
data1.isnull().sum() # number of missing values per column in the training data

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [25]:
data_val.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [26]:
data1.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Kent, Mr. Edward Austin",male,,,,1601.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### 3.22 Clean Data

#### Completing

In [27]:
# Fill missing values in both frames; each frame is completed with its own statistics.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on dataset[col]: that chained form works on an intermediate
# Series and is not guaranteed to update the frame (it has no effect under pandas
# Copy-on-Write). Column assignment still mutates the same DataFrame objects that
# data_cleaner refers to.
for dataset in data_cleaner:
    # Numeric columns: replace gaps with the median.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
    # Categorical column: replace gaps with the most frequent value.
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

In [28]:
# Columns removed from the training frame only (data_val keeps them).
# The drop is done in place on purpose: data_cleaner holds a reference to this
# same data1 object, so rebinding data1 to a new frame would detach it from the list.
drop_column = ['PassengerId', 'Cabin', 'Ticket']
data1.drop(labels=drop_column, axis='columns', inplace=True)

In [29]:
data1.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [30]:
data_val.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

#### Creating

In [35]:
# Feature engineering, applied to the training and test frames alike.
for dataset in data_cleaner:
    # Family members aboard, counting the passenger themself.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # 1 when travelling alone, 0 otherwise. A single .loc[rows, column] assignment
    # replaces the chained dataset['IsAlone'].loc[...] = 0, which writes through an
    # intermediate Series and is not guaranteed to reach the frame.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

    # Title is the text between ", " and the next "." in Name,
    # e.g. "Behr, Mr. Karl Howell" -> "Mr".
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    # Fare in 4 quantile bins; Age (cast to int) in 5 equal-width bins.
    # Bin edges are computed per frame, so they can differ between data1 and data_val.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

In [36]:
data1['Title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Col               2
Major             2
Capt              1
Ms                1
the Countess      1
Lady              1
Mme               1
Don               1
Sir               1
Jonkheer          1
Name: Title, dtype: int64

In [37]:
stat_min = 10  # titles that occur fewer than this many times in data1 are grouped as 'Misc'

In [39]:
# Boolean Series indexed by title: True where the title occurs fewer than stat_min times in data1.
title_counts = data1['Title'].value_counts()
title_names = title_counts < stat_min

In [40]:
title_names

Mr              False
Miss            False
Mrs             False
Master          False
Dr               True
Rev              True
Mlle             True
Col              True
Major            True
Capt             True
Ms               True
the Countess     True
Lady             True
Mme              True
Don              True
Sir              True
Jonkheer         True
Name: Title, dtype: bool

In [41]:
# Replace every rare title (flagged True in title_names) with 'Misc'.
# .get(x, False) treats a title that is absent from the lookup as "not rare", so the
# cell can be re-run after 'Misc' has already been written without raising KeyError.
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.get(x, False) else x)

In [42]:
data1['Title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64

In [43]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
Title         891 non-null object
FareBin       891 non-null category
AgeBin        891 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 85.5+ KB


In [44]:
data_val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
FamilySize     418 non-null int64
IsAlone        418 non-null int64
Title          418 non-null object
FareBin        418 non-null category
AgeBin         418 non-null category
dtypes: category(2), float64(2), int64(6), object(6)
memory usage: 46.8+ KB


In [45]:
data1.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
277,0,2,"Parkes, Mr. Francis ""Frank""",male,28.0,0,0,0.0,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
295,0,1,"Lewy, Mr. Ervin G",male,28.0,0,0,27.7208,C,1,1,Mr,"(14.454, 31.0]","(16.0, 32.0]"
598,0,3,"Boulos, Mr. Hanna",male,28.0,0,0,7.225,C,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"
619,0,2,"Gavey, Mr. Lawrence",male,26.0,0,0,10.5,S,1,1,Mr,"(7.91, 14.454]","(16.0, 32.0]"
321,0,3,"Danoff, Mr. Yoto",male,27.0,0,0,7.8958,S,1,1,Mr,"(-0.001, 7.91]","(16.0, 32.0]"


### 3.23 Convert Formats

In [46]:
# A single encoder instance; the next cell re-fits it for every column it encodes.
label = LabelEncoder()

In [47]:
# Integer-encode the categorical columns of both frames.
# The encoder is re-fit on every call, so codes are assigned per frame and per column;
# the same code may therefore stand for different categories in data1 and data_val.
# The bin columns are named 'AgeBin_Code' / 'FareBin_Code' (not '..._Coder') because
# those are the names that the feature lists data1_x_calc and data1_x_bin select.
for dataset in data_cleaner:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])

In [48]:
Target = ['Survived']  # label column, kept as a list so it can be concatenated with the feature lists

In [51]:
# Feature columns in their original, human-readable form (used for charts).
data1_x = [
    'Sex', 'Pclass', 'Embarked', 'Title',
    'SibSp', 'Parch', 'Age', 'Fare',
    'FamilySize', 'IsAlone',
]

In [52]:
# Encoded feature columns used for the algorithm calculations.
data1_x_calc = [
    'Sex_Code',
    'Pclass',
    'Embarked_Code',
    'Title_Code',
    'FamilySize',
    'AgeBin_Code',
    'FareBin_Code',
]

In [53]:
# Label column followed by the readable feature columns.
data1_xy = [*Target, *data1_x]

In [56]:
# Feature columns built from the encoded and binned variables.
# The class column is spelled 'Pclass' in the DataFrame; 'PClass' does not exist
# and would raise KeyError when this list is used to select columns.
data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']

In [57]:
# Label column followed by the encoded/binned feature columns.
data1_xy_bin = [*Target, *data1_x_bin]

In [60]:
# One-hot encode the readable feature set: the string columns (Sex, Embarked, Title)
# expand into indicator columns, the numeric columns are kept as they are.
data1_dummy = pd.get_dummies(data1[data1_x])

# Names of the resulting columns, alone and with the label column in front.
data1_x_dummy = list(data1_dummy.columns)
data1_xy_dummy = [*Target, *data1_x_dummy]

In [61]:
data1_dummy.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,0,1,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,0,1
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,0,1,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,0,1
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,0,1,0
