In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the Titanic CSV into a DataFrame and preview its first five rows.
titanic_df = pd.read_csv('rachita_titanic_processed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,28.0,1,1,14.4,0,0,1
1,0,2,1,30.0,0,0,13.0,0,0,1
2,1,1,1,0.92,1,2,151.55,0,0,1
3,0,2,1,36.0,0,0,12.875,1,0,0
4,0,3,1,47.0,0,0,7.25,0,0,1


In [3]:
# Every column after the first one is used as a model input.
features = titanic_df.columns[1:].tolist()
features

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Maps a model's display name to the dict returned by build_model().
result_dict = dict()

In [5]:
# Creating a helper function to calculate accuracy, precision, recall

def summarize_classification(y_test, y_pred):
    """Score predicted labels against the true labels.

    Returns a dict holding the accuracy fraction, the precision, the recall
    and the raw count of correct predictions (``accuracy_score`` called with
    ``normalize=False``).
    """
    correct_fraction = accuracy_score(y_test, y_pred)
    correct_count = accuracy_score(y_test, y_pred, normalize=False)

    summary = {
        'Accuracy': correct_fraction,
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Number of accurate results': correct_count,
    }
    return summary


In [6]:
# Creating another helper function, build_model, to build and train a classification model

def build_model(classification_function, y_col, x_cols, dataset, test_frac=0.2, random_state=None):
    """Split the data, fit a classifier and score it on both partitions.

    Parameters
    ----------
    classification_function : callable
        Called as ``classification_function(X_train, y_train)``; must return a
        fitted model exposing ``predict``.
    y_col : str
        Name of the target column in ``dataset``.
    x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, default None
        Passed to ``train_test_split``. The default keeps the previous
        behaviour (a fresh shuffle on every call). Pass the same integer to
        every call to make the split reproducible and to compare all models
        on identical train/test rows.

    Returns
    -------
    dict
        ``'Train summary'`` and ``'Test summary'`` (the outputs of
        ``summarize_classification``) and ``'Confusion matrix'`` (a crosstab
        with predicted labels as rows and true labels as columns).
    """
    X = dataset[x_cols]
    y = dataset[y_col]

    # Without a fixed random_state each call draws a different partition, so
    # scores of different models are not measured on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=random_state
    )

    model = classification_function(X_train, y_train)

    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    # Scoring on seen (train) and unseen (test) data side by side makes
    # over-fitting visible.
    train_summary = summarize_classification(y_train, y_train_pred)
    test_summary = summarize_classification(y_test, y_pred)

    prediction_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    model_crosstab = pd.crosstab(prediction_results.y_pred, prediction_results.y_test)

    return {
        'Train summary': train_summary,
        'Test summary': test_summary,
        'Confusion matrix': model_crosstab,
    }


In [7]:
def compare_results():
    """Print the train and test metric summaries of every model in result_dict."""
    print("Objective : Survival Prediction \n")
    sections = (('Train data', 'Train summary'), ('\nTest data', 'Test summary'))
    for model_name, outcome in result_dict.items():
        print('\nClassification Model - ', model_name, '\n')
        for heading, summary_key in sections:
            print(heading)
            for metric, value in outcome[summary_key].items():
                print(metric, value)
            

## Logistic Regression

In [8]:
def logistic_regression(X_train, y_train):
    """Fit a LogisticRegression (liblinear solver) on the training data and return it."""
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(X_train, y_train)
    return classifier
    

In [9]:
# Train and score logistic regression on all features, then print every result so far.
result_dict['Logistic Regression'] = build_model(
    classification_function=logistic_regression,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102


## Linear Discriminant Analysis

In [10]:
def linear_discriminant(X_train, y_train, solver = 'svd'):
    """Return a LinearDiscriminantAnalysis model fitted with the given solver."""
    return LinearDiscriminantAnalysis(solver=solver).fit(X_train, y_train)
    

In [11]:
# Train and score LDA on all features, then print every result so far.
result_dict['Linear Discriminant Analysis'] = build_model(
    classification_function=linear_discriminant,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114


In [12]:
result_dict['Linear Discriminant Analysis(Dummy Encoded)'] = build_model(linear_discriminant,
                                                                           'Survived', 
                                                                           features[0:-1], #Dummy encoding by dropping last column
                                                                           titanic_df)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113


## Quadratic Discriminant Analysis

In [13]:
def quadratic_discriminant(X_train, y_train):
    """Fit a default QuadraticDiscriminantAnalysis model on the training data and return it."""
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(X_train, y_train)
    return classifier

In [14]:
result_dict['Quadratic Discriminant Analysis(Dummy Encoded)'] = build_model(quadratic_discriminant,
                                                                           'Survived', 
                                                                           features[0:-1], #Dummy encoding by dropping last column
                                                                           titanic_df)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim

## Stochastic Gradient Descent

In [15]:
def sgd(X_train, y_train, max_iter = 100000, tol = 1e-3):
    """Return an SGDClassifier fitted with the given iteration cap and stopping tolerance."""
    return SGDClassifier(max_iter=max_iter, tol=tol).fit(X_train, y_train)

In [16]:
# Train and score the SGD classifier on all features, then print every result so far.
result_dict['Stochastic Gradient Descent'] = build_model(
    classification_function=sgd,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim

## Linear Support Vector Classifier

In [17]:
def linear_svc(X_train, y_train, C=1.0, max_iter = 1000, tol = 1e-3):
    """Fit a LinearSVC (primal formulation, ``dual=False``) on the training data and return it."""
    classifier = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    classifier.fit(X_train, y_train)
    return classifier

In [18]:
# Train and score the linear SVC on all features, then print every result so far.
result_dict['Support Vector Classifier'] = build_model(
    classification_function=linear_svc,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim

## Nearest Neighbours

In [19]:
def radius_neighbors(X_train, y_train, radius=40.0):
    """Return a RadiusNeighborsClassifier with the given radius, fitted on the training data."""
    return RadiusNeighborsClassifier(radius=radius).fit(X_train, y_train)

In [20]:
# Train and score the radius-neighbors classifier on all features, then print every result so far.
result_dict['Radius Nearest Neighbor'] = build_model(
    classification_function=radius_neighbors,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim

## Decision Tree

In [21]:
def decision_tree(X_train, y_train, max_depth=None, max_features=None):
    """Fit a DecisionTreeClassifier with the given depth and feature limits and return it."""
    classifier = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    classifier.fit(X_train, y_train)
    return classifier

In [22]:
# Train and score the decision tree on all features, then print every result so far.
result_dict['Decision Trees'] = build_model(
    classification_function=decision_tree,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim

## Naive Bayes Classifier

In [23]:
def naive_bayes(X_train, y_train, priors=None):
    """Return a GaussianNB model, built with the given class priors, fitted on the training data."""
    return GaussianNB(priors=priors).fit(X_train, y_train)

In [24]:
# Train and score Gaussian naive Bayes on all features, then print every result so far.
result_dict['Naive Bayes Classifier'] = build_model(
    classification_function=naive_bayes,
    y_col='Survived',
    x_cols=features,
    dataset=titanic_df,
)
compare_results()

Objective : Survival Prediction 


Classification Model -  Logistic Regression 

Train data
Accuracy 0.8137082601054482
Precision 0.8009259259259259
Recall 0.7330508474576272
Number of accurate results 463

Test data
Accuracy 0.7132867132867133
Precision 0.6571428571428571
Recall 0.4423076923076923
Number of accurate results 102

Classification Model -  Linear Discriminant Analysis 

Train data
Accuracy 0.7961335676625659
Precision 0.7632850241545893
Recall 0.7022222222222222
Number of accurate results 453

Test data
Accuracy 0.7972027972027972
Precision 0.8148148148148148
Recall 0.6984126984126984
Number of accurate results 114

Classification Model -  Linear Discriminant Analysis(Dummy Encoded) 

Train data
Accuracy 0.7961335676625659
Precision 0.7761904761904762
Recall 0.7025862068965517
Number of accurate results 453

Test data
Accuracy 0.7902097902097902
Precision 0.7407407407407407
Recall 0.7142857142857143
Number of accurate results 113

Classification Model -  Quadratic Discrim