# Machine Learning
## Titanic Worked Example 
Author: Andrew Szwec

## Load the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from string import ascii_letters
import seaborn as sns

%matplotlib inline

# Read the preprocessed Titanic passenger table and preview its first rows.
csv_path = 'titanic_preprocessed.csv'
dd = pd.read_csv(csv_path)
dd.head()

## Get the Dummy Variables

In [None]:
# One-hot encode the non-numeric columns of the raw table.
dd_dum = pd.get_dummies(dd)
# Preview only the first rows instead of rendering the whole encoded frame.
dd_dum.head()

## Remove Records with Missing Values

In [None]:
# Drop every row that contains at least one missing value.
# `how` and `thresh` are mutually exclusive in current pandas, so passing
# `thresh=None` next to `how='any'` raises TypeError; only `how` is given here.
# Rebinding the name (instead of inplace=True) keeps the cell safe to re-run.
dd_dum = dd_dum.dropna(axis=0, how='any')

## Train Test Split

In [None]:
?RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in the hold-out set on every run.
seed = 1234

# Target is the 'Survived' column; every remaining column is used as a feature.
y = dd_dum['Survived']
x = dd_dum.drop(columns='Survived')

# Hold out 10% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.10,
    random_state=seed,
)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
# link http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#sphx-glr-auto-examples-ensemble-plot-feature-transformation-py
n_estimator = 20

# Baseline random forest. Seeding the estimator makes the fitted trees, and
# therefore the reported accuracy, identical on every run.
rf = RandomForestClassifier(max_depth=10, n_estimators=n_estimator, random_state=seed)
rf.fit(x_train, y_train)
# Accuracy on the held-out rows, rounded to three decimals.
print('The mean accuracy: {}'.format( np.round(rf.score(x_test, y_test), 3) ))

## Find the best Random Forest model with a grid search

In [None]:
# Exhaustive search over forest size (6-12 trees) and tree depth (2-9).
# NOTE(review): candidates are ranked by their score on x_test, so the winning
# score is an optimistic estimate; a separate validation split or
# cross-validation on x_train would avoid that.
acc = []  # accuracy of each candidate, in evaluation order
log = []  # human-readable description of the candidate at the same position
for est in range(6, 13):
    for depth in range(2,10):
        # Running model number, derived from how many candidates are recorded.
        i = len(acc)
        # Seeded so the ranking of candidates is reproducible between runs.
        rf = RandomForestClassifier(max_depth=depth, n_estimators=est, random_state=seed)
        rf.fit(x_train, y_train)
        scr = rf.score(x_test, y_test)
        details = 'Model {}, n_estimators={}, max-depth={} The mean accuracy={}'.format( i, est, depth, np.round(scr, 3) )
        print(details)
        acc.append(scr)
        log.append(details)

## Best Model

In [None]:
# Position of the highest recorded accuracy, then the matching log entry.
best_idx = int(np.argmax(acc))
log[best_idx]

In [None]:
# NOTE(review): these values are hard-coded and lie outside the ranges explored
# by the grid search (6-12 trees, depth 2-9); confirm they are the intended
# "best" settings.
n_estimator = 20
depth=10

# Seeded so the refit model and its accuracy are reproducible.
rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator, random_state=seed)
rf.fit(x_train, y_train)
print('The mean accuracy: {}'.format( np.round(rf.score(x_test, y_test), 3) ))

In [None]:
# Predicted class label for every row of the feature table
# (rows used for training and held-out rows alike).
preds = rf.predict(x)
# Peek at the first ten predictions.
preds[:10]

In [None]:
# Attach the predicted labels to the encoded table as a 'preds' column.
dd_dum = dd_dum.assign(preds=preds)

In [None]:
# Preview the first rows only; rendering the whole frame adds bulk without information.
dd_dum.head()

In [None]:
# Class-membership probabilities for every row, one column per class.
# Stored under its own name so `preds` keeps holding the predicted labels and
# the cell that writes the 'preds' column can still be re-run.
proba = rf.predict_proba(x)
# Keep the second column, i.e. the probability of the class at index 1 of
# rf.classes_ (presumably Survived == 1 - confirm against rf.classes_).
preds2 = proba[:, 1]

In [None]:
# Attach the per-row probabilities to the encoded table as a 'probs' column.
dd_dum['probs'] = preds2
# Preview the first rows only instead of rendering the whole frame.
dd_dum.head()

## Gradient Boosted Machine

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

n_estimator = 30

# Baseline gradient boosted model. NOTE: the variable is still called `rf`
# although it now holds a GradientBoostingClassifier.
# Seeded so the fitted model and its accuracy are reproducible.
rf = GradientBoostingClassifier(n_estimators=n_estimator, random_state=seed)
rf.fit(x_train, y_train)
# Accuracy on the held-out rows, rounded to three decimals.
print('The mean accuracy: {}'.format( np.round(rf.score(x_test, y_test), 3) ))

## Find the best Gradient Boosted model with a grid search

In [None]:
# Exhaustive search over number of boosting stages (10-19), tree depth (5-9)
# and learning rate.
# NOTE(review): candidates are ranked by their score on x_test, so the winning
# score is an optimistic estimate of generalisation accuracy.
acc = []  # accuracy of each candidate, in evaluation order
log = []  # human-readable description of the candidate at the same position
# Spelled out explicitly: a float-step np.arange produces values such as
# 0.06999999999999999, which then show up verbatim in the log lines.
learning_rates = (0.01, 0.03, 0.05, 0.07, 0.09)
for est in range(10, 20):
    for depth in range(5,10):
        for lr in learning_rates:
            # Running model number, derived from how many candidates are recorded.
            i = len(acc)
            # Seeded so the ranking of candidates is reproducible between runs.
            rf = GradientBoostingClassifier(max_depth=depth, n_estimators=est, learning_rate=lr, random_state=seed)
            rf.fit(x_train, y_train)
            scr = rf.score(x_test, y_test)
            details = 'Model {}, n_estimators={}, max-depth={}, learning_rate={}, The mean accuracy={}'.format( i, est, depth, lr, np.round(scr, 3) )
            print(details)
            acc.append(scr)
            log.append(details)

## Best Model

In [None]:
# Position of the highest recorded accuracy, then the matching log entry.
best_idx = int(np.argmax(acc))
log[best_idx]