In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
#Importing different ML Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the pre-processed Titanic data set (the path is relative to the working directory).
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
# Preview the first rows to check the available columns and their encodings.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,38.0,0,0,8.6625,0,0,1
1,1,3,1,32.0,0,0,8.05,0,0,1
2,0,1,1,31.0,0,0,50.4958,0,0,1
3,0,3,1,14.0,5,2,46.9,0,0,1
4,1,3,1,3.0,4,2,31.3875,0,0,1


In [4]:
# Model inputs: every column of the frame except the first one.
FEATURES = titanic_df.columns[1:].tolist()

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [6]:
# Registry of model evaluation results, keyed by a descriptive run label.
result_dict = dict()

In [7]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Args:
        y_test: True class labels.
        y_pred: Labels predicted by a classifier for the same samples.

    Returns:
        dict with the keys 'accuracy' (result of ``accuracy_score`` with
        ``normalize=True``), 'precision', 'recall' and 'accuracy_count'
        (result of ``accuracy_score`` with ``normalize=False``).
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [12]:
def build_model(classifier_fn,
               name_of_y_col,
               name_of_x_cols,
               dataset,
               test_frac = 0.2,
               random_state = None):
    """Train a classifier on a random split of `dataset` and evaluate it.

    Args:
        classifier_fn: Callable taking ``(X_train, y_train)`` and returning a
            fitted model that exposes ``predict``.
        name_of_y_col: Name of the target column in `dataset`.
        name_of_x_cols: List of feature column names in `dataset`.
        dataset: DataFrame holding both the features and the target.
        test_frac: Fraction of the rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous behaviour (a different split on every
            call); pass an integer to make the split reproducible, so that
            different models are compared on the same train/test rows.

    Returns:
        dict with the keys 'training' and 'test' (metric dicts produced by
        ``summarize_classification``) and 'confusion matrix' (a crosstab with
        predicted labels as rows and true labels as columns).
    """
    X = dataset[name_of_x_cols]
    y = dataset[name_of_y_col]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_frac,
                                                        random_state=random_state)

    model = classifier_fn(X_train, y_train)

    # Score on both splits so the gap between training and test is visible.
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # y_pred is a plain array; the frame takes its index from the y_test Series.
    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion matrix': model_crosstab}

In [13]:
def compare_results(results=None):
    """Print the training and test scores of every stored model run.

    Each line is built as a single string and printed with a one-argument
    ``print(...)`` call, so the function runs on Python 3 and prints the same
    text on Python 2 as the original ``print`` statements did.

    Args:
        results: Mapping of run label -> dict with 'training' and 'test'
            metric dicts. Defaults to the notebook-level ``result_dict``.
    """
    if results is None:
        # Fall back to the notebook-level registry, as the original did.
        results = result_dict

    sections = (('Training data', 'training'), ('Test data', 'test'))

    for key in results:
        # Two spaces after the colon reproduce the old `print a, b` separator.
        print('Classification:  {}'.format(key))

        for heading, split in sections:
            print(' ')
            print(heading)
            for score in results[key][split]:
                print('{} {}'.format(score, results[key][split][score]))

        print(' ')

In [14]:
def logistic_fn(X_train, y_train):
    """Fit a logistic-regression classifier (liblinear solver) and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression(solver='liblinear')
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(X_train, y_train)

In [15]:
# Train and evaluate logistic regression on all features, then print every stored result.
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838312829525483
 
Test data
recall 0.7619047619047619
precision 0.8421052631578947
accuracy_count 119
accuracy 0.8321678321678322
 


In [16]:
def linear_discriminant_fn(X_train, y_train, solver = 'svd'):
    """Fit a linear discriminant analysis classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        solver: Solver name forwarded to ``LinearDiscriminantAnalysis``.

    Returns:
        The fitted ``LinearDiscriminantAnalysis`` estimator.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    # scikit-learn's fit() returns the estimator itself.
    return lda.fit(X_train, y_train)

In [17]:
# Train and evaluate LDA on all features, then print every stored result.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7117903930131004
precision 0.7688679245283019
accuracy_count 454
accuracy 0.7978910369068541
 
Test data
recall 0.6440677966101694
precision 0.7755102040816326
accuracy_count 111
accuracy 0.7762237762237763
 
Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838312829525483
 
Test data
recall 0.7619047619047619
precision 0.8421052631578947
accuracy_count 119
accuracy 0.8321678321678322
 




In [19]:
# To avoid the dummy-variable trap (perfect collinearity among the one-hot encoded columns), drop the last feature, which is one of the one-hot encoded 'Embarked' columns.

# Re-run LDA without the last feature column; this overwrites the earlier LDA entry.
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838312829525483
 
Test data
recall 0.7619047619047619
precision 0.8421052631578947
accuracy_count 119
accuracy 0.8321678321678322
 


In [20]:
#Quadratic Discriminant Analysis
def quadratic_discriminant_fn(X_train, y_train):
    """Fit a quadratic discriminant analysis classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``QuadraticDiscriminantAnalysis`` estimator.
    """
    # scikit-learn's fit() returns the estimator itself.
    return QuadraticDiscriminantAnalysis().fit(X_train, y_train)

In [21]:
# Train and evaluate QDA without the last feature column, then print every stored result.
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838312829525483
 
Test data
recall 0.7619047619047619
precision 0.8421052631578947
accuracy_count 119
accuracy 0.8321678321678322
 
Classification:  survived ~ quadratic_discriminant_analysis
 
Training data
recall 0.7381974248927039
precision 0.7713004484304933
accuracy_count 457
accuracy 0.8031634446397188
 
Test data
recall 0.7090909090909091
precision 0.7090909090909091
accuracy_count 111
accuracy 0.7762237762237763
 


In [28]:
#SGD (Stochastic Gradient Descent) Classifier
def sgd_fn(X_train, y_train, max_iter = 10000, tol = 1e-3):
    """Fit a linear classifier trained with stochastic gradient descent.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_iter: Value forwarded to ``SGDClassifier(max_iter=...)``.
        tol: Value forwarded to ``SGDClassifier(tol=...)``.

    Returns:
        The fitted ``SGDClassifier`` estimator (all other settings are left at
        the estimator's defaults).
    """
    sgd = SGDClassifier(tol=tol, max_iter=max_iter)
    # scikit-learn's fit() returns the estimator itself.
    return sgd.fit(X_train, y_train)

In [29]:
# Train and evaluate the SGD classifier on all features, then print every stored result.
result_dict['survived ~ Stochastic_Gradient_Descent_SGD'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ Stochastic_Gradient_Descent_SGD
 
Training data
recall 0.3829787234042553
precision 0.8256880733944955
accuracy_count 405
accuracy 0.7117750439367311
 
Test data
recall 0.33962264150943394
precision 0.6923076923076923
accuracy_count 100
accuracy 0.6993006993006993
 
Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838312829525483
 
Test data
recall 0.7619047619047619
precision 0.8421052631578947
accuracy_count 119
accuracy 0.8321678321678322
 
Classification:  survived ~ quadratic_discriminant_analysis
 
Training data
recall 0.7381974248927039
precision 0.7713004484304933
accuracy_coun

In [42]:
#Support Vector Machines(SVM)
def linear_svc_fn(X_train, y_train, C= 1.0, max_iter = 1000, tol = 1e-3):
    """Fit a linear support vector classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        C: Value forwarded to ``LinearSVC(C=...)``.
        max_iter: Value forwarded to ``LinearSVC(max_iter=...)``.
        tol: Value forwarded to ``LinearSVC(tol=...)``.

    Returns:
        The fitted ``LinearSVC`` estimator; ``dual=False`` is always passed.
    """
    svc_settings = dict(C=C, max_iter=max_iter, tol=tol, dual=False)
    # scikit-learn's fit() returns the estimator itself.
    return LinearSVC(**svc_settings).fit(X_train, y_train)

In [43]:
# Train and evaluate the linear SVC on all features, then print every stored result.
result_dict['survived ~ Linear_SVC'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ Stochastic_Gradient_Descent_SGD
 
Training data
recall 0.3829787234042553
precision 0.8256880733944955
accuracy_count 405
accuracy 0.7117750439367311
 
Test data
recall 0.33962264150943394
precision 0.6923076923076923
accuracy_count 100
accuracy 0.6993006993006993
 
Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ Linear_SVC
 
Training data
recall 0.6949152542372882
precision 0.7592592592592593
accuracy_count 445
accuracy 0.7820738137082601
 
Test data
recall 0.7692307692307693
precision 0.7843137254901961
accuracy_count 120
accuracy 0.8391608391608392
 
Classification:  survived ~ logistic
 
Training data
recall 0.6577777777777778
precision 0.7628865979381443
accuracy_count 446
accuracy 0.7838

In [46]:
#nearest neighbors classification model
def radius_neighbors_fn(X_train, y_train, radius = 60):
    """Fit a radius-based nearest-neighbors classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        radius: Value forwarded to ``RadiusNeighborsClassifier(radius=...)``.

    Returns:
        The fitted ``RadiusNeighborsClassifier`` estimator.
    """
    neighbors_clf = RadiusNeighborsClassifier(radius=radius)
    # scikit-learn's fit() returns the estimator itself.
    return neighbors_clf.fit(X_train, y_train)

In [47]:
# Train and evaluate the radius-neighbors classifier on all features, then print every stored result.
result_dict['survived ~ radius_neighbors_classifier'] = build_model(
    classifier_fn=radius_neighbors_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ quadratic_discriminant_analysis
 
Training data
recall 0.7381974248927039
precision 0.7713004484304933
accuracy_count 457
accuracy 0.8031634446397188
 
Test data
recall 0.7090909090909091
precision 0.7090909090909091
accuracy_count 111
accuracy 0.7762237762237763
 
Classification:  survived ~ radius_neighbors_classifier
 
Training data
recall 0.24336283185840707
precision 0.7236842105263158
accuracy_count 377
accuracy 0.6625659050966608
 
Test data
recall 0.1774193548387097
precision 0.6875
accuracy_count 87
accuracy 0.6083916083916084
 
Classification:  survived ~ Linear_SVC
 
Training data
recall 0.6949152542372882
precision 0.7592592592592593
accuracy_count 445
accuracy 

In [48]:
#decision trees
def decision_tree_fn(X_train, y_train, max_depth=None, max_features=None):
    """Fit a decision-tree classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        max_depth: Value forwarded to ``DecisionTreeClassifier(max_depth=...)``.
        max_features: Value forwarded to
            ``DecisionTreeClassifier(max_features=...)``.

    Returns:
        The fitted ``DecisionTreeClassifier`` estimator.
    """
    tree = DecisionTreeClassifier(max_features=max_features,
                                  max_depth=max_depth)
    # scikit-learn's fit() returns the estimator itself.
    return tree.fit(X_train, y_train)

In [49]:
# Train and evaluate the decision tree on all features, then print every stored result.
result_dict['survived ~ decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ quadratic_discriminant_analysis
 
Training data
recall 0.7381974248927039
precision 0.7713004484304933
accuracy_count 457
accuracy 0.8031634446397188
 
Test data
recall 0.7090909090909091
precision 0.7090909090909091
accuracy_count 111
accuracy 0.7762237762237763
 
Classification:  survived ~ radius_neighbors_classifier
 
Training data
recall 0.24336283185840707
precision 0.7236842105263158
accuracy_count 377
accuracy 0.6625659050966608
 
Test data
recall 0.1774193548387097
precision 0.6875
accuracy_count 87
accuracy 0.6083916083916084
 
Classification:  survived ~ decision_tree
 
Training data
recall 0.9823008849557522
precision 1.0
accuracy_count 565
accuracy 0.9929701230

In [50]:
#Naive Bayes Classifier
def naive_bayes_fn(X_train, y_train, priors=None):
    """Fit a Gaussian naive-Bayes classifier and return it.

    Args:
        X_train: Training features.
        y_train: Training labels.
        priors: Value forwarded to ``GaussianNB(priors=...)``.

    Returns:
        The fitted ``GaussianNB`` estimator.
    """
    bayes = GaussianNB(priors=priors)
    # scikit-learn's fit() returns the estimator itself.
    return bayes.fit(X_train, y_train)

In [51]:
# Train and evaluate Gaussian naive Bayes on all features, then print every stored result.
result_dict['survived ~ naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ linear_discriminant_analysis
 
Training data
recall 0.7272727272727273
precision 0.7813953488372093
accuracy_count 459
accuracy 0.8066783831282952
 
Test data
recall 0.631578947368421
precision 0.7058823529411765
accuracy_count 107
accuracy 0.7482517482517482
 
Classification:  survived ~ quadratic_discriminant_analysis
 
Training data
recall 0.7381974248927039
precision 0.7713004484304933
accuracy_count 457
accuracy 0.8031634446397188
 
Test data
recall 0.7090909090909091
precision 0.7090909090909091
accuracy_count 111
accuracy 0.7762237762237763
 
Classification:  survived ~ radius_neighbors_classifier
 
Training data
recall 0.24336283185840707
precision 0.7236842105263158
accuracy_count 377
accuracy 0.6625659050966608
 
Test data
recall 0.1774193548387097
precision 0.6875
accuracy_count 87
accuracy 0.6083916083916084
 
Classification:  survived ~ naive_bayes
 
Training data
recall 0.6781115879828327
precision 0.7596153846153846
accuracy_count 444
accuracy