In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,30.0,0,0,8.6625,0,0,1
1,0,3,0,25.0,1,0,7.925,0,0,1
2,0,1,1,36.0,1,0,78.85,0,0,1
3,0,3,1,20.0,0,0,9.8458,0,0,1
4,1,2,1,31.0,0,0,13.0,0,0,1


In [6]:
X = titanic_df.drop('Survived', axis=1)

Y = titanic_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=.2)

In [9]:
#helper function
def summarize_classification(y_test, y_pred):
    acc=accuracy_score(y_test, y_pred, normalize=True)#fraction %
    num_acc = accuracy_score(y_test, y_pred, normalize=False)#raw number
    
    prec=precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    res = {'test data count':len(y_test), 'accuracy count':num_acc, 'accuracy':acc, 'precision':prec, 'recall':recall}
    print(res,"\n")
    return res

In [10]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':[2,4,5,7,9,10]}# 6 parameters = 6 models

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
#cv=3 means use three-fold croass validation (split dataset into 3 parts) 2 of 3 used to train and the other used to test
#default train_score for classification = accuracy
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 5}

In [11]:
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean test score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean test score:  0.7697670101178873
Rank:  6
Parameters:  {'max_depth': 4}
Mean test score:  0.7821312540610786
Rank:  4
Parameters:  {'max_depth': 5}
Mean test score:  0.8137473312911908
Rank:  1
Parameters:  {'max_depth': 7}
Mean test score:  0.785602896129212
Rank:  2
Parameters:  {'max_depth': 9}
Mean test score:  0.7838113803026084
Rank:  3
Parameters:  {'max_depth': 10}
Mean test score:  0.7803118908382066
Rank:  5


In [12]:
decision_tree_model=DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [13]:
y_pred = decision_tree_model.predict(x_test)

In [14]:
summarize_classification(y_test, y_pred)

{'test data count': 143, 'accuracy': 0.7972027972027972, 'precision': 0.8863636363636364, 'recall': 0.6190476190476191, 'accuracy count': 114} 



{'test data count': 143,
 'accuracy': 0.7972027972027972,
 'precision': 0.8863636363636364,
 'recall': 0.6190476190476191,
 'accuracy count': 114}

In [15]:
parameters = { 'penalty': ['l1', 'l2'], 'C': [.1, .4, .8, 1,2,5]}# L1 or L2 and diff penalty values
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
#10 total models
grid_search.fit(x_train, y_train)

grid_search.best_params_
# in the case of a tie , grid search just gives 1 set of parameters

{'C': 1, 'penalty': 'l1'}

In [16]:
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('mean test score: ', grid_search.cv_results_['mean_test_score'][i])
    print('rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
mean test score:  0.7803768680961664
rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
mean test score:  0.7821312540610786
rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
mean test score:  0.7979578576069803
rank:  8
Parameters:  {'C': 0.4, 'penalty': 'l2'}
mean test score:  0.7944676506080014
rank:  10
Parameters:  {'C': 0.8, 'penalty': 'l1'}
mean test score:  0.8032302979671401
rank:  6
Parameters:  {'C': 0.8, 'penalty': 'l2'}
mean test score:  0.7962127541074909
rank:  9
Parameters:  {'C': 1, 'penalty': 'l1'}
mean test score:  0.8067576348278104
rank:  1
Parameters:  {'C': 1, 'penalty': 'l2'}
mean test score:  0.7979671400724033
rank:  7
Parameters:  {'C': 2, 'penalty': 'l1'}
mean test score:  0.805003248862898
rank:  2
Parameters:  {'C': 2, 'penalty': 'l2'}
mean test score:  0.8032395804325629
rank:  5
Parameters:  {'C': 5, 'penalty': 'l1'}
mean test score:  0.805003248862898
rank:  2
Parameters:  {'C': 5, 'penalty': 'l2'}
mean test sco

In [17]:
logistic_model = LogisticRegression(solver='liblinear', penalty=grid_search.best_params_['penalty'],
                                    C=grid_search.best_params_['C']).fit(x_train, y_train)

In [18]:
y_pred = logistic_model.predict(x_test)

In [19]:
summarize_classification(y_test, y_pred)

{'test data count': 143, 'accuracy': 0.7762237762237763, 'precision': 0.7818181818181819, 'recall': 0.6825396825396826, 'accuracy count': 111} 



{'test data count': 143,
 'accuracy': 0.7762237762237763,
 'precision': 0.7818181818181819,
 'recall': 0.6825396825396826,
 'accuracy count': 111}