### 1. Prepare for work
#### 1.1 Load Libraries

In [None]:
#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))

import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))

import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))

import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))

import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__)) 

import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__)) 

import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))

#misc libraries
import random
import time

#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
####---- from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# from pandas.tools.plotting import scatter_matrix

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

#### 1.2 Configure Viz defaults

In [None]:
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

#### 1.3 Check input data directory

In [None]:
# List the input data directory so the available files are visible in the output.
from subprocess import check_output

data_dir_listing = check_output(["ls", "./data"])
print(data_dir_listing.decode("utf8"))

### 2 Wrangling Data

#### 2.1 Get from GCS bucket

In [None]:
!gsutil cp gs://bbs-2019-aiml4b-base-mldata/gender_submission.csv ./02.titanic/data
!gsutil cp gs://bbs-2019-aiml4b-base-mldata/train.csv ./02.titanic/data
!gsutil cp gs://bbs-2019-aiml4b-base-mldata/test.csv ./02.titanic/data

#### 2.2 Load Data

In [None]:
# Load the training CSV; this raw frame is only inspected, a deep copy of it is cleaned later.
data_raw = pd.read_csv('./data/train.csv')

##### 2.2.1 Data Attributes (structure)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
data_raw.info()

#### An explanation of attributes (features?)

##### PASSENGER ID
An identifier assigned to each passenger in the Titanic's *information system*. It's like the Employee ID in your company!

##### TICKET
The ticket number / code

##### PCLASS
Ordinal variable
 - 1: upper class
 - 2: middle class
 - 3: lower class

##### NAME, SEX, AGE
It's the NAME, AGE and SEX of the passenger!

##### EMBARKED
The port where the passenger embarked

##### FARE
It's the fare paid by the passenger for the ticket

##### SIBSP
Number of siblings/spouses aboard with the passenger

##### PARCH
Number of parents/children aboard with the passenger

##### CABIN
Kind of approximate position on ship when the incident occurred

##### SURVIVED
It's the *label*

##### 2.2.2 Data Preview (10 records)

In [None]:
# Show 10 randomly sampled rows; no random_state is passed, so the rows differ between runs.
data_raw.sample(10)

##### 2.2.3 Statistical Properties Overview

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
data_raw.describe(include = 'all')

### 3 Set Training and Validation Sets

In [None]:
# Work on an independent copy of the raw training data (copy() is deep by default)
# so that data_raw itself stays unmodified.
data_train = data_raw.copy()

# The validation frame is read from the separate test CSV.
data_val = pd.read_csv('./data/test.csv')

#### 3.1 Investigating on NULLs

In [None]:
# Number of missing values per column in the training frame.
train_null_counts = data_train.isna().sum()
print('Train columns with null values:\n', train_null_counts)

### 4 Data Cleaning

#### 4.1 Fill missing AGE with *median* value

In [None]:
# Fill missing ages with each frame's own median.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['Age'] operates on an intermediate object (chained assignment) and is
# not guaranteed to update the DataFrame.
data_train['Age'] = data_train['Age'].fillna(data_train['Age'].median())
data_val['Age'] = data_val['Age'].fillna(data_val['Age'].median())

#### 4.2 Fill missing EMBARKED with *mode* value

In [None]:
# Fill missing embarkation ports with each frame's most frequent value.
# mode() returns a Series (there can be ties); [0] takes its first entry.
# The result is assigned back instead of using inplace=True on the selected
# column, which is chained assignment and may not update the DataFrame.
data_train['Embarked'] = data_train['Embarked'].fillna(data_train['Embarked'].mode()[0])
data_val['Embarked'] = data_val['Embarked'].fillna(data_val['Embarked'].mode()[0])

#### 4.3 Fill missing FARE with *median* value

In [None]:
# Fill missing fares with each frame's own median.
# The result is assigned back instead of using inplace=True on the selected
# column, which is chained assignment and may not update the DataFrame.
data_train['Fare'] = data_train['Fare'].fillna(data_train['Fare'].median())
data_val['Fare'] = data_val['Fare'].fillna(data_val['Fare'].median())

#### 4.4 Discard useless features
 - Passenger ID
 - Cabin (Code)
 - Ticket (Number)

In [None]:
# Columns that are not used as features; they are removed from the training frame only.
drop_column = ['PassengerId', 'Cabin', 'Ticket']
data_train = data_train.drop(columns=drop_column)

#### 4.5 Checking NULLs

In [None]:
# Re-count missing values per column after the cleaning steps.
print(data_train.isna().sum())

### 5 Feature Engineering

#### 5.1 FamilySize
Calculated field obtained by summing the passenger, their siblings/spouse and their parents/children aboard

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger (the +1),
# computed the same way for both frames.
for frame in (data_train, data_val):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

#### 5.2 IsAlone
One more "calculation" to mark which passengers were travelling alone

In [None]:
# IsAlone starts at 1 and is cleared for passengers whose family size exceeds 1.
# A single .loc[rows, column] assignment is used: the previous
# df['IsAlone'].loc[mask] = 0 form is chained assignment, which triggers
# SettingWithCopyWarning and is not guaranteed to write into the DataFrame.
for frame in (data_train, data_val):
    frame['IsAlone'] = 1
    frame.loc[frame['FamilySize'] > 1, 'IsAlone'] = 0

#### 5.3 Title

The NAME variable is formatted as \<Surname\>, \<Title\>. \<Name\>.
Let's isolate the "Title" token.

In [None]:
# Step 1: split on ", " and keep the second piece (what follows the surname).
train_after_surname = data_train['Name'].str.split(", ", expand=True)[1]
val_after_surname = data_val['Name'].str.split(", ", expand=True)[1]

# Step 2: split that piece on "." and keep the first part, i.e. the title.
data_train['Title'] = train_after_surname.str.split(".", expand=True)[0]
data_val['Title'] = val_after_surname.str.split(".", expand=True)[0]

##### 5.3.1 Rare values

They call it the *Long Tail*

In [None]:
# Frequency of each extracted title in the training frame.
print(data_train['Title'].value_counts())

In [None]:
# Histogram of the per-title counts (the distribution of frequencies, not of the titles themselves).
data_train['Title'].value_counts().hist()

10 is a magic number (http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/)

In [None]:
# Titles seen fewer than MIN_TITLE_COUNT times in the training frame are
# collapsed into a single 'Misc' category.
MIN_TITLE_COUNT = 10

# Boolean Series indexed by title: True where the title is rare in training.
title_names = (data_train['Title'].value_counts() < MIN_TITLE_COUNT)
common_titles = title_names.index[~title_names]

# The set of common titles is derived from the training frame and applied to
# both frames, so the validation frame ends up with the same categories
# (titles that never occur in training also become 'Misc').
data_train['Title'] = data_train['Title'].where(data_train['Title'].isin(common_titles), 'Misc')
data_val['Title'] = data_val['Title'].where(data_val['Title'].isin(common_titles), 'Misc')

Let's check our work

In [None]:
# Title frequencies in the training frame after the rare-title replacement.
print(data_train['Title'].value_counts())

#### 5.4 Checkup!

In [None]:
# 10 random rows of the training frame (no random_state, so the rows differ between runs).
data_train.sample(10)

#### 5.5 FARE

FARE variable is continuous but we consider it quantizable into 4 bins to reduce complexity

In [None]:
# Distribution of the Fare column in the training frame.
data_train['Fare'].hist()

In [None]:
# Quartile-based fare bins are learned from the training frame only.
train_fare_bins, fare_bin_edges = pd.qcut(data_train['Fare'], 4, retbins=True)
data_train['FareBin'] = train_fare_bins

# The validation frame is binned with the training edges so that a given fare
# lands in the same bin in both frames (running qcut on each frame separately
# yields different edges). The outer edges are opened to +/-inf so fares
# outside the training range still fall into the first or last bin.
fare_bin_edges[0], fare_bin_edges[-1] = -np.inf, np.inf
data_val['FareBin'] = pd.cut(data_val['Fare'], bins=fare_bin_edges)

#### 5.6 AGE

Doing the same on AGE into 5 bins (we need integer bins here!)

In [None]:
# Distribution of the Age column in the training frame.
data_train['Age'].hist()

In [None]:
# Five equal-width age bins are computed on the training frame (ages truncated to int).
train_age_bins, age_bin_edges = pd.cut(data_train['Age'].astype(int), 5, retbins=True)
data_train['AgeBin'] = train_age_bins

# The validation frame reuses the training edges so that a given age maps to
# the same bin in both frames (cutting each frame separately yields different
# edges). The outer edges are opened to +/-inf so ages outside the training
# range still fall into the first or last bin.
age_bin_edges[0], age_bin_edges[-1] = -np.inf, np.inf
data_val['AgeBin'] = pd.cut(data_val['Age'].astype(int), bins=age_bin_edges)

#### 5.7 Let's check it up again!

In [None]:
# 10 random rows of the training frame, now including the binned columns.
data_train.sample(10)

### 6 Feature Engineering: CATEGORICAL data

LabelEncoder transforms a list of alphanumeric values into a numeric (integer) mapping

In [None]:
# One encoder instance, reused and refit (fit_transform) for every column encoded below.
label = LabelEncoder()

Convert the following attributes to numeric codes
 - SEX
 - Embarked
 - Title
 - Age  (quantized)
 - Fare (quantized)

In [None]:
# Encode each categorical column of the training frame into '<column>_Code'.
# The shared encoder is refit on every column, in this order.
for source_col in ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']:
    data_train[source_col + '_Code'] = label.fit_transform(data_train[source_col])

In [None]:
# Encode the same columns of the validation frame into '<column>_Code'.
# NOTE(review): the encoder is refit on the validation values here, so the
# integer codes are derived from the validation frame alone - confirm they
# line up with the training codes before using data_val for predictions.
for source_col in ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']:
    data_val[source_col + '_Code'] = label.fit_transform(data_val[source_col])

In [None]:
# 10 random rows of the training frame, now including the *_Code columns.
data_train.sample(10)

In [None]:
# Distribution of the encoded fare bins in the training frame.
data_train['FareBin_Code'].hist()

In [None]:
# Distribution of the encoded age bins in the training frame.
data_train['AgeBin_Code'].hist()

### 8 Assessing X-Correlation


In [None]:
def correlation_heatmap(df):
    """
    Draw an annotated heatmap of the pairwise Pearson correlations in `df`.

    Only numeric and boolean columns are correlated. Other columns (strings,
    interval categories, ...) are dropped up front, because DataFrame.corr()
    in pandas 2.x raises on them instead of skipping them silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are compared.

    Returns
    -------
    None. The heatmap is drawn on a newly created 14x12 figure.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])

    _ , ax = plt.subplots(figsize =(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap = True)

    _ = sns.heatmap(
        numeric_df.corr(),
        cmap = colormap,
        square=True,
        cbar_kws={'shrink':.9 },
        ax=ax,
        annot=True,
        linewidths=0.1,vmax=1.0, linecolor='white',
        annot_kws={'fontsize':12 }
    )

    plt.title('Correlation of Features (Pearson)', y=1.05, size=15)

# Plot the feature correlations of the engineered training frame.
correlation_heatmap(data_train)

### 7 Preparing for ML

Mark the LABEL

In [None]:
# Label column name, kept as a list so it can be concatenated with the feature list below.
Target = ['Survived']

#### 7.1 Features selection

In [None]:
# Feature columns used for modelling (encoded / binned variants).
data_train_x_bin = [
    'Sex_Code',
    'Pclass',
    'Embarked_Code',
    'Title_Code',
    'FamilySize',
    'AgeBin_Code',
    'FareBin_Code',
]

# Label first, then the features.
data_train_xy_bin = [*Target, *data_train_x_bin]
print('Bin X Y: ', data_train_xy_bin, '\n')

#### 7.2 Split Train and Test Sets

In [None]:
# Hold out part of the training frame for evaluation; random_state fixes the split.
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(
    data_train[data_train_x_bin],
    data_train[Target],
    random_state=0,
)

Sample

In [None]:
# First rows of the training split's feature matrix.
train1_x_bin.head()

### 9 ML

Support Vector Machines

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: an RBF kernel over C x gamma, and a linear kernel over C.
param_grid = [{'kernel': ['rbf'], 'C': [10, 100, 1000], 'gamma' : [1e-2, 1e-3, 1e-4]},
              {'kernel': ['linear'], 'C': [1, 10, 100]}]

# 5-fold cross-validated grid search over SVC, using 4 parallel jobs.
gcv = GridSearchCV(svm.SVC(), param_grid, cv=5, n_jobs=4)

# train1_y_bin is a one-column DataFrame; scikit-learn expects a 1-D target,
# so it is flattened explicitly instead of relying on an implicit conversion.
gcv.fit(train1_x_bin, train1_y_bin.values.ravel())

In [None]:
# Per-candidate cross-validation summary: mean score, its standard deviation, parameters.
cv_results = gcv.cv_results_
mu = cv_results['mean_test_score']
std = cv_results['std_test_score']
pars = cv_results['params']

for mean_score, score_std, candidate in zip(mu, std, pars):
    print('%.2f (+-%.2f): %s' % (mean_score, score_std, candidate))

In [None]:
# Predict labels for the held-out split with the fitted grid search.
pred = gcv.predict(test1_x_bin)

In [None]:
# Parameter combination selected by the grid search.
print('hyperparams: %s' % gcv.best_params_)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first swaps precision and recall in the report (accuracy is
# symmetric, but the same order is used for consistency).
print(classification_report(test1_y_bin, pred))
print('accuracy %.2f' % accuracy_score(test1_y_bin, pred))

### CONFUSION MATRIX

In [None]:
#Plot Accuracy Summary
#Credit: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows that sum to zero are shown as 0.0 rather than NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows stay at 0.0
        # instead of producing NaN (and a divide-by-zero warning).
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Compute confusion matrix
# Print arrays with two decimals from here on (global NumPy print setting).
np.set_printoptions(precision=2)

# Confusion matrix of the held-out split: true labels first, predictions second.
cnf_matrix = metrics.confusion_matrix(test1_y_bin, pred)
class_names = ['Dead', 'Survived']

# Raw counts on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    title='Confusion matrix, without normalization',
)

In [None]:
# Plot normalized confusion matrix
# Same matrix on a fresh figure, with each row scaled by its row total.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    title='Normalized confusion matrix',
    normalize=True,
)