In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [29]:
# Read the Titanic CSV into a DataFrame, then preview ten randomly drawn rows.
titanic_df = pd.read_csv(filepath_or_buffer='../datasets/titanic/titanic_processed.csv')

titanic_df.sample(n=10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
394,1,3,1,32.0,0,0,56.4958,0,0,1
189,1,1,0,29.0,0,0,211.3375,0,0,1
658,0,3,0,21.0,2,2,34.375,0,0,1
590,1,3,1,0.42,0,1,8.5167,1,0,0
350,1,1,1,45.0,0,0,26.55,0,0,1
29,0,3,1,70.5,0,0,7.75,0,1,0
328,1,3,0,24.0,1,0,15.85,0,0,1
49,0,3,0,17.0,0,0,14.4583,1,0,0
504,1,3,1,32.0,0,0,8.05,0,0,1
580,0,2,1,52.0,0,0,13.5,0,0,1


In [8]:
# Select feature columns by name rather than by position: a positional slice
# (`columns[1:]`) silently depends on the label being the first column and
# would leak `Survived` into the features if the column order ever changes.
# Columns starting with 'Unnamed' are what pandas produces for a header-less
# saved index; they are row ids, not features, so they are skipped as well.
FEATURES = [
    column
    for column in titanic_df.columns
    if column != 'Survived' and not str(column).startswith('Unnamed')
]

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [9]:
# Registry of evaluation results, keyed by a human-readable model label.
result_dict = dict()

In [10]:
def summarize_classification(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a dict with four entries: 'accuracy' (fraction of correct
    predictions), 'precision', 'recall', and 'accuracy_count' (the number
    of correct predictions, i.e. accuracy with ``normalize=False``).
    """
    return {
        'accuracy' : accuracy_score(y_test, y_pred, normalize=True),
        'precision' : precision_score(y_test, y_pred),
        'recall' : recall_score(y_test, y_pred),
        'accuracy_count' : accuracy_score(y_test, y_pred, normalize=False),
    }

In [17]:
def build_model(classifier_fn,
               name_of_y_col,
               names_of_x_cols,
               dataset,
               test_frac=0.2,
               random_state=None):
    """Train a classifier on a random split of ``dataset`` and evaluate it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object that exposes ``predict``.
    name_of_y_col : str
        Name of the label column in ``dataset``.
    names_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame containing both the feature columns and the label column.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split unseeded (a different split on every call); pass an int
        to make the split, and therefore the reported metrics, repeatable.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarize_classification`` for the respective partitions;
        ``'confusion_matrix'`` is a crosstab of the test partition with
        predicted labels as rows and actual labels as columns.
    """
    X = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Score both partitions so over-fitting (train >> test) is visible.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Pair each test label with its prediction; the frame takes its index
    # from the y_test Series.
    pred_results = pd.DataFrame({
        'y_test' : y_test,
        'y_pred' : y_pred
    })

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training' : train_summary,
        'test' : test_summary,
        'confusion_matrix' : model_crosstab
    }

In [23]:
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``."""
    for model_label, summary in result_dict.items():
        print('Classification :', model_label)

        print()

        print('Training data')

        for metric, value in summary['training'].items():
            print(metric, value)

        print()
        print('Test data')

        for metric, value in summary['test'].items():
            print(metric, value)

        print()

In [19]:
def logistic_fn(x_train, y_train):
    """Fit a logistic regression (liblinear solver) and return the fitted model."""
    classifier = LogisticRegression(solver='liblinear')
    return classifier.fit(x_train, y_train)

In [20]:
# Train and evaluate logistic regression on all features, then print every
# result recorded so far.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification :  survived - logistic

Training data
accuracy 0.81195079086116
precision 0.7881773399014779
recall 0.7142857142857143
accuracy_count 462

Test data
accuracy 0.7342657342657343
precision 0.75
recall 0.609375
accuracy_count 105



## LDA
**Find axes to best separate the classes such that all instances of a class are in the same quadrant**

The best axes here refer to those axes that best separate the data into different classes

**SVD (singular value decomposition) solver** Finds axes without calculating the covariance matrix of the features; useful when we have many features or many rows in the dataset

In [21]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit a linear discriminant analysis classifier and return it.

    ``solver`` is handed straight to ``LinearDiscriminantAnalysis``.
    """
    return LinearDiscriminantAnalysis(solver=solver).fit(x_train, y_train)

In [24]:
# Train and evaluate LDA on all features (every one-hot Embarked column
# included), then print every result recorded so far.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.81195079086116
precision 0.7881773399014779
recall 0.7142857142857143
accuracy_count 462

Test data
accuracy 0.7342657342657343
precision 0.75
recall 0.609375
accuracy_count 105

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7850467289719626
recall 0.7088607594936709
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.6862745098039216
recall 0.6862745098039216
accuracy_count 111



**Note** One-hot encoding can result in collinearity of features (known as the dummy variable trap). Instead, use *dummy encoding*, where we drop one of the one-hot encoded columns

In [31]:
# Re-run LDA without the last feature column, i.e. with one of the one-hot
# Embarked columns dropped. This overwrites the previous LDA entry in
# result_dict because the same key is reused.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.81195079086116
precision 0.7881773399014779
recall 0.7142857142857143
accuracy_count 462

Test data
accuracy 0.7342657342657343
precision 0.75
recall 0.609375
accuracy_count 105

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7926186291739895
precision 0.7619047619047619
recall 0.7017543859649122
accuracy_count 451

Test data
accuracy 0.8111888111888111
precision 0.7796610169491526
recall 0.7666666666666667
accuracy_count 116



## Quadratic Discriminant Analysis

* Find axes to best separate the classes such that all instances of a class are in the same quadrant but the **decision boundary is Quadratic**

* Useful when the X variables corresponding to different labels have different covariances i.e. **Covariances are different for X for all values of Y**

In [32]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a quadratic discriminant analysis classifier with default settings."""
    classifier = QuadraticDiscriminantAnalysis()
    return classifier.fit(x_train, y_train)

In [None]:
# Train and evaluate QDA with the last feature column dropped, then print
# every result recorded so far.
result_dict['survived - quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()