In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,38.0,0,0,8.6625,0,0,1
1,1,3,1,32.0,0,0,8.05,0,0,1
2,0,1,1,31.0,0,0,50.4958,0,0,1
3,0,3,1,14.0,5,2,46.9,0,0,1
4,1,3,1,3.0,4,2,31.3875,0,0,1


In [3]:
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    
    print 'Test data count:', len(y_test)
    print 'accuracy: ', acc
    print 'precision: ', prec
    print 'recall: ', rec
    print 'accuracy_count:', num_acc
    print ' '

In [5]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2,4,5,7,9,10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=parameters, cv=3, return_train_score=True)


In [7]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 4, 5, 7, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
grid_search.best_params_

{'max_depth': 5}

In [9]:
for i in range(6):
    print 'Parameters: ', grid_search.cv_results_['params'][i]
    
    print 'Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i]
    
    print 'Rank: ', grid_search.cv_results_['rank_test_score'][i]

Parameters:  {'max_depth': 2}
Mean Test Score:  0.804920913884007
Rank:  2
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7978910369068541
Rank:  4
Parameters:  {'max_depth': 5}
Mean Test Score:  0.8137082601054482
Rank:  1
Parameters:  {'max_depth': 7}
Mean Test Score:  0.8031634446397188
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7926186291739895
Rank:  5
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7855887521968365
Rank:  6


In [10]:
decision_tree_model = DecisionTreeClassifier(\
                                            max_depth = grid_search.best_params_['max_depth']).fit(X_train, y_train)

In [11]:
y_pred = decision_tree_model.predict(X_test)

In [12]:
summarize_classification(y_test, y_pred)

Test data count: 143
accuracy:  0.7482517482517482
precision:  0.7755102040816326
recall:  0.6031746031746031
accuracy_count: 107
 


In [13]:
#Hyperparameter tuning for logistic regression model
from sklearn.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'],
             'C': [0.1, 0.4, 0.8, 1, 2, 5]}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid=parameters, cv=3, return_train_score=True)

In [14]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 0.4, 0.8, 1, 2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
grid_search.best_params_

{'C': 1, 'penalty': 'l1'}

In [16]:
for i in range(12):
    print 'Parameters: ', grid_search.cv_results_['params'][i]
    print 'Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i]
    print 'Rank: ', grid_search.cv_results_['rank_test_score'][i]

Parameters:  {'penalty': 'l1', 'C': 0.1}
Mean Test Score:  0.7504393673110721
Rank:  12
Parameters:  {'penalty': 'l2', 'C': 0.1}
Mean Test Score:  0.7644991212653779
Rank:  11
Parameters:  {'penalty': 'l1', 'C': 0.4}
Mean Test Score:  0.7926186291739895
Rank:  7
Parameters:  {'penalty': 'l2', 'C': 0.4}
Mean Test Score:  0.7803163444639719
Rank:  10
Parameters:  {'penalty': 'l1', 'C': 0.8}
Mean Test Score:  0.7943760984182777
Rank:  5
Parameters:  {'penalty': 'l2', 'C': 0.8}
Mean Test Score:  0.789103690685413
Rank:  9
Parameters:  {'penalty': 'l1', 'C': 1}
Mean Test Score:  0.7996485061511424
Rank:  1
Parameters:  {'penalty': 'l2', 'C': 1}
Mean Test Score:  0.7926186291739895
Rank:  7
Parameters:  {'penalty': 'l1', 'C': 2}
Mean Test Score:  0.7978910369068541
Rank:  4
Parameters:  {'penalty': 'l2', 'C': 2}
Mean Test Score:  0.7943760984182777
Rank:  5
Parameters:  {'penalty': 'l1', 'C': 5}
Mean Test Score:  0.7996485061511424
Rank:  1
Parameters:  {'penalty': 'l2', 'C': 5}
Mean Test Sc

In [17]:
logistic_model = LogisticRegression(solver='liblinear', \
                                   penalty = grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \
                                   fit(X_train, y_train)

In [18]:
y_pred = logistic_model.predict(X_test)

In [19]:
summarize_classification(y_test, y_pred)

Test data count: 143
accuracy:  0.7622377622377622
precision:  0.7959183673469388
recall:  0.6190476190476191
accuracy_count: 109
 
