## Load packages

In [3]:
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

## Load input data

In [5]:
# Read the train and test CSVs (paths are relative to the notebook's working
# directory) and keep both frames in `combine` so later cells can apply the
# same transformation to each of them in a single loop.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
combine = [train_df, test_df]

## Add new columns
1. from column ['Name'] extract title (e.g. Mr, Ms, ...) <br>
2. create new column ['Family'] by adding siblings/spouses and parents on board

In [7]:
# Add two derived columns to every frame in `combine` (mutates the frames in place):
#   Title  - the word that precedes the first '.' in Name (e.g. Mr, Mrs, Miss)
#   Family - SibSp + Parch
for dataset in combine:
    # Raw string: in a plain literal '\.' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    dataset['Family'] = dataset['SibSp'] + dataset['Parch']

## Drop unwanted columns from train and test dataframes

In [9]:
# Columns removed from both frames; PassengerId is additionally removed from the
# training frame only (the test frame keeps it for the submission file).
# NOTE: this cell rebinds train_df/test_df, so re-running it without reloading
# the data raises because the columns are already gone.
shared_unused = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
train_df = train_df.drop(shared_unused + ['PassengerId'], axis=1)
test_df = test_df.drop(shared_unused, axis=1)
# Rebuild the list so later loops operate on the new, reduced frames.
combine = [train_df, test_df]

In [10]:
# Preview the first rows of the training frame after the column drop.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,0,3,male,22.0,7.25,S,Mr,1
1,1,1,female,38.0,71.2833,C,Mrs,1
2,1,3,female,26.0,7.925,S,Miss,0
3,1,1,female,35.0,53.1,S,Mrs,1
4,0,3,male,35.0,8.05,S,Mr,0


## Modify columns
### Modify column ['Title'] so that:
* least common titles are grouped in category 'Rare'
* duplicate titles (e.g. Mlle and Miss) are merged 

In [29]:
# Build one lookup table: uncommon titles collapse to 'Rare', and spelling
# variants are merged into their standard form.
title_aliases = {
    uncommon: 'Rare'
    for uncommon in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# No replacement value is itself a key, so a single pass gives the same result
# as applying the replacements one after another.
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_aliases)

### Modify column ['Embarked'] so that:
* the embarkation ports (C = Cherbourg, Q = Queenstown, S = Southampton) are encoded as integers: 1 = S, 2 = C, 3 = Q
* NaN are filled with the most represented category (S = Southampton)

### Modify column ['Sex'] so that:
* the categories are marked as 0 = male, 1 = female

In [36]:
# List the distinct Embarked values in the training frame; the output includes
# NaN, which has to be filled before the column can be cast to int.
train_df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [37]:
# Integer codes for the two categorical columns.
sex_codes = {'female': 1, 'male': 0}
port_codes = {'S': 1, 'C': 2, 'Q': 3}

for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)
    # Missing ports are filled with 'S' before encoding, so the int cast
    # does not hit NaN.
    embarked_filled = dataset['Embarked'].fillna('S')
    dataset['Embarked'] = embarked_filled.map(port_codes).astype(int)


### Modify column ['Title'] so that:
* a number is assigned to each category: 1 = Mr, 2 = Mrs, 3 = Miss, 4 = Master, 5 = Rare

In [38]:
# Integer code assigned to each (already normalised) title.
title_codes = {'Mr': 1, 'Mrs': 2, 'Miss': 3, 'Master': 4, 'Rare': 5}

for dataset in combine:
    # map() leaves NaN for any title missing from title_codes, in which case
    # astype(int) raises instead of silently producing a bad code.
    dataset['Title'] = dataset['Title'].map(title_codes).astype(int)

In [40]:
# Preview the test frame after Sex, Embarked and Title were integer-encoded.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,892,3,0,34.5,7.8292,3,1,0
1,893,3,1,47.0,7.0,1,2,1
2,894,2,0,62.0,9.6875,3,1,0
3,895,3,0,27.0,8.6625,1,1,0
4,896,3,1,22.0,12.2875,1,2,2


### Modify column ['Age'] so that:
* NaN are filled with the median age of the passengers that share the same ['Sex'] and ['Pclass']
* divide into ranges and assign an integer to each category

In [61]:
# guess_ages[sex, pclass - 1] holds the fill value for each (Sex, Pclass) group.
# It is recomputed for every frame, so each frame is imputed from its own data.
guess_ages = np.zeros((2, 3))

for dataset in combine:
    # Pass 1: median of the known ages in each group, rounded to the nearest 0.5.
    for sex_code in range(2):
        for class_idx in range(3):
            in_group = (dataset['Sex'] == sex_code) & (dataset['Pclass'] == class_idx + 1)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            guess_ages[sex_code, class_idx] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write the group value into the rows whose Age is missing.
    for (sex_code, class_idx), fill_age in np.ndenumerate(guess_ages):
        needs_fill = (
            dataset['Age'].isnull()
            & (dataset['Sex'] == sex_code)
            & (dataset['Pclass'] == class_idx + 1)
        )
        dataset.loc[needs_fill, 'Age'] = fill_age

    # No NaN remains at this point; the cast truncates fractional ages.
    dataset['Age'] = dataset['Age'].astype(int)


In [66]:
# Replace each age by the code of its band:
#   0: <= 16, 1: (16, 32], 2: (32, 48], 3: (48, 64], 4: > 64
# The assignments run in ascending order on purpose: a code written by an
# earlier line (0-3) can never satisfy the condition of a later line.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Fix: this line previously only selected the rows and assigned nothing,
    # so passengers older than 64 kept their raw age instead of a band code.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
train_df.head()
        

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Family
0,0,3,0,1,7.25,1,1,1
1,1,1,1,2,71.2833,2,2,1
2,1,3,1,1,7.925,1,3,0
3,1,1,1,2,53.1,1,2,1
4,0,3,0,2,8.05,1,1,0


### Modify column ['Fare'] so that:
* divide into ranges and assign an integer to each category

In [67]:
# Fill missing fares with the frame's median fare, then replace each fare by
# the code of its band:
#   0: <= 7.91, 1: (7.91, 14.454], 2: (14.454, 31], 3: > 31
for dataset in combine:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # dataset['Fare']: the chained in-place form works on an intermediate object,
    # warns on recent pandas and has no effect under Copy-on-Write, which would
    # leave NaN in the column and make the int cast below fail.
    # median() already skips NaN, so no explicit dropna() is needed.
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    # Ascending order matters: a code written by an earlier line (0-2) can
    # never satisfy the condition of a later line.
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

## Set train and test dataframes for ML model

In [70]:
# Target vector and feature matrix for training.
titanic_target = train_df["Survived"]
titanic_train = train_df.drop(labels=["Survived"], axis=1)

# Test features: PassengerId is an identifier, not a feature, so it is removed
# here; test_df itself keeps it for the submission file.
titanic_test = test_df.drop(labels=["PassengerId"], axis=1)

## Run some optimisation on the Random Forest classifier parameters

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyper-parameter grid explored in every repetition (built once, outside the loop).
grid_values = {'n_estimators': [25, 50, 100],
               'min_impurity_decrease': [0., 0.5, 0.9]
              }

best_param = {}   # repetition index -> best parameter combination found
best_score = []   # best mean cross-validated accuracy of each repetition

# Repeat the search on 10 different train/validation splits to see how stable
# the selected parameters are.
for i in range(10):
    # Seed the split and the forest with the repetition index: the 10 runs
    # still differ from each other, but a re-run of the notebook reproduces them.
    X_train, X_test, y_train, y_test = train_test_split(
        titanic_train, titanic_target, random_state=i)

    clf = RandomForestClassifier(random_state=i)

    # Despite the "_auc" suffix, the search is scored on accuracy.
    grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'accuracy')
    grid_clf_auc.fit(X_train, y_train)
    # Class predictions on the held-out part; not used by the cells below.
    y_decision_fn_scores_auc = grid_clf_auc.predict(X_test)

    best_param[i] = grid_clf_auc.best_params_
    best_score.append(grid_clf_auc.best_score_)

In [85]:
# Best parameter combination selected in each of the 10 repetitions.
best_param

{0: {'min_impurity_decrease': 0.0, 'n_estimators': 50},
 1: {'min_impurity_decrease': 0.0, 'n_estimators': 100},
 2: {'min_impurity_decrease': 0.0, 'n_estimators': 50},
 3: {'min_impurity_decrease': 0.0, 'n_estimators': 25},
 4: {'min_impurity_decrease': 0.0, 'n_estimators': 100},
 5: {'min_impurity_decrease': 0.0, 'n_estimators': 50},
 6: {'min_impurity_decrease': 0.0, 'n_estimators': 100},
 7: {'min_impurity_decrease': 0.0, 'n_estimators': 25},
 8: {'min_impurity_decrease': 0.0, 'n_estimators': 100},
 9: {'min_impurity_decrease': 0.0, 'n_estimators': 25}}

In [86]:
# Best cross-validated accuracy reached in each of the 10 repetitions.
best_score

[0.8068862275449101,
 0.8053892215568862,
 0.8083832335329342,
 0.811377245508982,
 0.8248502994011976,
 0.7889221556886228,
 0.8083832335329342,
 0.8203592814371258,
 0.8143712574850299,
 0.8308383233532934]

## Create the Random Forest classifier and measure its accuracy on a held-out split of the training data

In [88]:
# Hold out part of the labelled data to estimate accuracy. The split and the
# forest are seeded so the reported score and the submission are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    titanic_train, titanic_target, random_state=0)
# The forest is fit on X_train only, not on the full training frame.
clf_forest = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)

# Accuracy on the held-out rows (displayed as the cell output).
y_predict = clf_forest.predict(X_test)
accuracy_score(y_test, y_predict)

0.8251121076233184

## Prepare output for submission

In [89]:
# Predict the Survived label for every row of the test features.
Y_pred = clf_forest.predict(titanic_test)

In [90]:
# Two-column submission frame: the passenger identifiers from the test frame
# (its index is kept) next to the predicted labels.
submission = test_df[["PassengerId"]].assign(Survived=Y_pred)
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [91]:
# Write the submission to 'submission.csv' in the current working directory,
# without the index column.
submission.to_csv('submission.csv', index=False)