# Hello World: Kaggle Edition (a.k.a. [the Titanic Competition](https://www.kaggle.com/c/titanic))

*Robert A. Brown*


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from matplotlib  import cm
import matplotlib.pyplot as plt
from sklearn import preprocessing
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.metrics import accuracy_score

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import svm
import xgboost as xgb

# Relative paths of the competition CSV files read by the loading cell.
train_path = 'data/train.csv'
test_path = 'data/test.csv'

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings -- confirm that a blanket filter is intended.
import warnings
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the labelled training set and the unlabelled competition test set
# (kept under the name `validation` for the rest of the notebook).
train = pd.read_csv(train_path)
validation = pd.read_csv(test_path)

# Give rows with no port of embarkation an explicit 'M' category.
# A single .loc assignment writes to the frame itself; chained indexing
# (frame.col[mask] = value) may modify a temporary copy instead.
train.loc[train.Embarked.isnull(), 'Embarked'] = 'M'
validation.loc[validation.Embarked.isnull(), 'Embarked'] = 'M'

# Column groups: the listed columns are treated as categorical, everything
# else in the training frame as numeric.
catagorical = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
numeric = [c for c in list(train) if c not in catagorical]
train.head(15)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# The frame is narrowed to numeric dtypes first because DataFrame.corr() no
# longer skips string columns silently on recent pandas versions.
numeric_corr = train.select_dtypes(include=[np.number]).corr()

f, ax = plt.subplots(figsize=(10, 8))
heat = sns.heatmap(
    numeric_corr,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=False,
    annot=True,
    ax=ax,
)
ax.set_title('Correlation between numeric columns');

In [None]:
# Age histograms per passenger class: one row per class, with columns for
# all passengers / men / women, each overlaying the died and survived groups.
not_nan = np.isfinite(train.Age)
classes = [train[(train.Pclass == c) & not_nan] for c in sorted(train.Pclass.unique())]

# 5-year bins whose last edge reaches the largest recorded age; with a fixed
# last edge of 75 any older passenger would be left out of the histograms.
bin_width = 5
bins = np.arange(0, train.Age.max() + bin_width, bin_width)
title = ['First Class', 'Second Class', 'Third Class']

rows = cols = 3
# Create an empty figure and add exactly rows*cols panels to it, so no
# implicit full-size Axes ends up underneath the grid.
fig = plt.figure(num=None, figsize=(15, 20), dpi=400, facecolor='w', edgecolor='k')
gs = gridspec.GridSpec(rows, cols)
ax = [fig.add_subplot(gs[i]) for i in range(rows * cols)]

# Fixed plotting order: each Axes cycles colours per hist() call, so drawing
# 'died' first everywhere keeps the outcome colours identical in all panels.
outcomes = [('died', 0), ('survived', 1)]
sexes = [('Men', 'male'), ('Women', 'female')]

for i, CLASS in enumerate(classes):
    # (axes, panel title, rows to plot) for the three panels of this class.
    panels = [(ax[cols * i], title[i], CLASS)]
    for j, (sex_label, sex_value) in enumerate(sexes):
        panels.append((
            ax[cols * i + j + 1],
            '{0} {1}'.format(title[i], sex_label),
            CLASS[CLASS.Sex == sex_value],
        ))

    for panel_ax, panel_title, panel_df in panels:
        for label, flag in outcomes:
            panel_ax.hist(panel_df.Age[panel_df.Survived == flag],
                          alpha=0.5, bins=bins, label=label)
        # Decorate once per panel, after both outcome histograms exist.
        panel_ax.set_title(panel_title)
        panel_ax.set_ylabel('Count')
        panel_ax.set_xlabel('Age')
        panel_ax.legend(loc='upper right')

In [13]:
"""
normalizer = preprocessing.MinMaxScaler()

training_df = pd.get_dummies(train.loc[:,features].dropna())
trian_norm = normalizer.fit_transform(training_df)  
training_df = pd.DataFrame(trian_norm, columns=list(training_df))

validation_df = pd.get_dummies(validation.loc[:,features].dropna())
validation_norm = normalizer.fit_transform(validation_df)  
validation_df = pd.DataFrame(validation_norm, columns=list(validation_df))
features = list(set(training_df).intersection(validation_df))
"""

# Model inputs, plus the same list with the target column prepended.
features = ['Age', 'Pclass', 'Fare', 'Sex', 'SibSp', 'Parch']
t_features = np.append(['Survived'], features)


def _encode_sex(value):
    """Return 1 for 'female' (or an already-encoded 1) and 0 for anything else.

    Accepting the encoded value keeps the cell idempotent: running it a
    second time leaves the column unchanged instead of turning it all to 0.
    """
    return 1 if value in ('female', 1) else 0


train['Sex'] = train['Sex'].apply(_encode_sex)
validation['Sex'] = validation['Sex'].apply(_encode_sex)

# Fill gaps in the submission frame with statistics of the training frame.
validation['Age'] = validation['Age'].fillna(train['Age'].mean())
validation['Fare'] = validation['Fare'].fillna(train['Fare'].mean())

# Drop incomplete training rows once and take X and y from the same frame,
# so features and labels are guaranteed to stay row-aligned.
train_complete = train[t_features].dropna()
X = train_complete[features]
y = train_complete.Survived
X_validation = validation[features].dropna()

assert len(X) == len(y), 'features and labels must have the same number of rows'

In [4]:
# Registry of candidate classifiers keyed by display name; later cells add
# further entries to this dict.
M = {
    'Logistic Regression': LogisticRegression(),
    'Perceptron': Perceptron(class_weight='balanced'),
    'SVM': svm.SVC(class_weight='balanced', kernel='sigmoid'),
    'KNN': KNeighborsClassifier(n_neighbors=10, weights='distance'),
}

In [5]:
# Hyper-parameters for the single decision tree registered below.
params = dict(
    max_depth=5,
    class_weight='balanced',
    min_weight_fraction_leaf=0.01,
)
M['Decision Tree'] = tree.DecisionTreeClassifier(**params)

In [6]:
# Hyper-parameters for the gradient-boosted ensemble; random_state pins the
# row subsampling so repeated runs give the same model.
params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.75,
    random_state=0,
)
M['Gradient Boosting'] = GradientBoostingClassifier(**params)

In [7]:
# Hyper-parameters for the XGBoost classifier.
params = {
    'objective': 'binary:logistic',
    'max_depth': 3,
    'learning_rate': 1.0,
    # 0 = no log output. Replaces the deprecated `silent` flag, which current
    # XGBoost releases no longer recognise.
    'verbosity': 0,
    'n_estimators': 5
}
M['XGBoost'] = xgb.XGBClassifier(**params)

In [8]:
# Hyper-parameters for the random forest.
params = {
        'n_estimators':1000,
        'max_depth':None,
        'min_samples_split':10,
        # Seed the bootstrap / feature sampling so the reported score and the
        # written submission are reproducible across runs.
        'random_state':0,
        # Disabled alternatives, kept for reference:
        #'class_weight':"balanced",
        #'min_weight_fraction_leaf':0.02
}

M['Random Forest'] = RandomForestClassifier(**params)

In [9]:
# Soft-voting ensemble over the tree-based models already in the registry.
classifiers = []
for member in ('XGBoost', 'Gradient Boosting', 'Decision Tree', 'Random Forest'):
    classifiers.append((member, M[member]))

M['Ensemble'] = VotingClassifier(voting='soft', estimators=classifiers)

In [10]:
# Fit every registered model. An explicit loop is required: map() is lazy on
# Python 3, so map(lambda m: m.fit(...), ...) on its own never runs.
for model in M.values():
    model.fit(X, y)

# Score each model on the same rows it was fitted on. These are training
# scores, so they overstate how well a model generalises.
models = pd.DataFrame({
    'Model': list(M),
    'Training Score': [model.score(X, y) for model in M.values()],
})

models.sort_values(by='Training Score', ascending=False)

Unnamed: 0,Model,Training Score
0,KNN,0.985994
4,Gradient Boosting,0.985994
5,Ensemble,0.928571
8,Random Forest,0.911765
3,XGBoost,0.855742
2,Decision Tree,0.841737
6,Logistic Regression,0.792717
1,SVM,0.593838
7,Perceptron,0.535014


In [18]:
# Write one submission file per model: '<model name>.csv' with the passenger
# id and the predicted Survived value.
for label, model in M.items():
    predicted = model.predict(validation[features])
    submission = pd.DataFrame(
        {'PassengerId': validation['PassengerId'], 'Survived': predicted},
        columns=['PassengerId', 'Survived'],
    )
    submission.to_csv('{0}.csv'.format(label), index=False)