In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('../datasets/titanic/titanic_processed.csv')

titanic_df.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
73,0,3,0,37.0,0,0,9.5875,0,0,1
597,0,3,1,2.0,4,1,39.6875,0,0,1
97,0,3,1,39.0,0,0,7.925,0,0,1
321,0,1,1,33.0,0,0,5.0,0,0,1
269,1,1,0,31.0,1,0,113.275,1,0,0
542,0,3,0,18.0,1,0,17.8,0,0,1
198,0,3,1,22.0,0,0,8.05,0,0,1
685,0,2,1,54.0,1,0,26.0,0,0,1
124,1,3,1,25.0,0,0,0.0,0,0,1
229,0,3,1,35.0,0,0,8.05,0,0,1


In [4]:
FEATURES = list(titanic_df.columns[1:])

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
result_dict = {}

In [6]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {
        'accuracy' : acc,
        'precision' : prec,
        'recall' : recall,
        'accuracy_count' : num_acc
    }

In [7]:
def build_model(classifier_fn,
               name_of_y_col,
               names_of_x_cols,
               dataset,
               test_frac=0.2):
    X = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]
    
    x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=test_frac)
    
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({
        'y_test' : y_test,
        'y_pred' : y_pred
    })
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {
        'training' : train_summary,
        'test' : test_summary,
        'confusion_matrix' : model_crosstab
    }

In [8]:
def compare_results():
    for key in result_dict:
        print('Classification :', key)
        
        print()
        
        print('Training data')
        
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
            
        print()

In [9]:
def logistic_fn(x_train, y_train):
    
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [10]:
result_dict['survived - logistic'] = build_model(logistic_fn,
                                                'Survived',
                                                FEATURES,
                                                titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122



## LDA
**Find axes to best separate the classes such that all instances of a class are in the same quarant**

The best axes here refers to those axes that best separate the data into different classes

**SVD(singular value decomposition) solver** Find axes without calculating the covariance matric of features, useful when we have many features or many rows in dataset

In [11]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [12]:
result_dict['survived - linear_discriminant_analysis'] = build_model(linear_discriminant_fn,
                                                                    'Survived',
                                                                    FEATURES,
                                                                    titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.8101933216168717
precision 0.775609756097561
recall 0.7194570135746606
accuracy_count 461

Test data
accuracy 0.7412587412587412
precision 0.7419354838709677
recall 0.6865671641791045
accuracy_count 106



**Note** One-hot encoding can result in collinearity of features (Know as dummy trap problem), Instead use *dummy encoding* where we drop one of the one-hot encoded columns

In [13]:
result_dict['survived - linear_discriminant_analysis'] = build_model(linear_discriminant_fn,
                                                                    'Survived',
                                                                    FEATURES[0:-1],
                                                                    titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7996485061511424
precision 0.7737556561085973
recall 0.7276595744680852
accuracy_count 455

Test data
accuracy 0.7622377622377622
precision 0.6666666666666666
recall 0.7169811320754716
accuracy_count 109



## Quadratic Discriminant Analysys

* Find axes to best separate the classes such that all instances of a class are in the same quadrant but the **decision boundary is Quadratic**

* Useful when the X variables corresponding to different labels have different covariances i.e. **Covariances are different for X for all values of Y**

In [14]:
def quadratic_discriminant_fn(x_train, y_train):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [15]:
result_dict['survived - quadratic_discriminant_analysis'] = build_model(quadratic_discriminant_fn,
                                                                        'Survived',
                                                                        FEATURES[0:-1],
                                                                        titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7996485061511424
precision 0.7737556561085973
recall 0.7276595744680852
accuracy_count 455

Test data
accuracy 0.7622377622377622
precision 0.6666666666666666
recall 0.7169811320754716
accuracy_count 109

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.8101933216168717
precision 0.7914691943127962
recall 0.7229437229437229
accuracy_count 461

Test data
accuracy 0.7552447552447552
precision 0.6964285714285714
recall 0.6842105263157895
accuracy_count 108



## Stochastic Gradient Descent Classifier 

* Gradient decent allows Stochastic Gradient Descent Classifier to iteratively converges to the best model
* SGD performs numerical optimization - **One training instance at a time** - to find the best model parameters


* **tol** here is nothing but the stopping criterian for model training. Model will stop training if the loss calculated at a perticular iteration is less than the tolerance value that we have specified as compared with the previous iteration 

* When the change in the loss for perticular iteration falls below your tolerance specification. It means model is not imoroving any further. Stop the training

In [18]:
def sgd_fn(x_train, y_train, max_iter=10000 tol=1e-3):
    
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)
    
    return model

SyntaxError: invalid syntax (<ipython-input-18-17e6f0c2b37a>, line 1)

In [19]:
result_dict['survived - sgd'] = build_model(sgd_fn,
                                           'Survived',
                                           FEATURES,
                                           titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7996485061511424
precision 0.7737556561085973
recall 0.7276595744680852
accuracy_count 455

Test data
accuracy 0.7622377622377622
precision 0.6666666666666666
recall 0.7169811320754716
accuracy_count 109

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.8101933216168717
precision 0.7914691943127962
recall 0.7229437229437229
accuracy_count 461

Test data
accuracy 0.7552447552447552
precision 0.6964285714285714
recall 0.6842105263157895
accuracy_count 108

Classification : survived - sgd

Training data
accuracy 0.7100175746924429
precision 0.6125827814569537
recall 0.7939914163090128
accuracy_count 404

Test

## Support Vector Machine
* Find a hyperplane that separates points so all points on the same side belong to the same class


In [20]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train)
    
    return model

* **Regularization** is used to penalize the complex model
* **C** here is the inverse of the regularization strength, smaller value indicate stronger regularization - penalize points on the wrong side of the margin 

* **SVC or LinearSVC**
* **LinearSVC == SVC(kernel="linear")**

**dual=Flase** is also used for optimization
* Prefer dual=False when n_samples > n_features

In [21]:
result_dict['survived - linear_svc'] = build_model(linear_svc_fn,
                                                  'Survived',
                                                  FEATURES,
                                                  titanic_df)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.7820738137082601
precision 0.7706422018348624
recall 0.6942148760330579
accuracy_count 445

Test data
accuracy 0.8531468531468531
precision 0.7659574468085106
recall 0.782608695652174
accuracy_count 122

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7996485061511424
precision 0.7737556561085973
recall 0.7276595744680852
accuracy_count 455

Test data
accuracy 0.7622377622377622
precision 0.6666666666666666
recall 0.7169811320754716
accuracy_count 109

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.8101933216168717
precision 0.7914691943127962
recall 0.7229437229437229
accuracy_count 461

Test data
accuracy 0.7552447552447552
precision 0.6964285714285714
recall 0.6842105263157895
accuracy_count 108

Classification : survived - sgd

Training data
accuracy 0.7100175746924429
precision 0.6125827814569537
recall 0.7939914163090128
accuracy_count 404

Test