In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the processed Titanic CSV into a DataFrame and preview ten random rows.
titanic_df = pd.read_csv('../datasets/titanic/titanic_processed.csv')

titanic_df.sample(n=10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
579,0,2,1,23.0,0,0,13.0,0,0,1
418,0,1,1,39.0,0,0,0.0,0,0,1
236,0,3,1,19.0,0,0,8.1583,0,0,1
365,0,3,1,39.0,1,5,31.275,0,0,1
638,1,3,1,6.0,0,1,12.475,0,0,1
592,1,3,0,22.0,0,0,7.75,0,1,0
263,1,3,0,16.0,0,0,7.7333,0,1,0
471,1,2,0,40.0,1,1,39.0,0,0,1
248,1,1,0,18.0,1,0,227.525,1,0,0
615,0,3,1,31.0,0,0,7.75,0,1,0


In [4]:
# Every column after the first one is treated as a model input; the first
# column is skipped.
FEATURES = titanic_df.columns[1:].tolist()

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
# Registry of evaluation results, keyed by a descriptive label for each model.
result_dict = dict()

In [6]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        Keys ``'accuracy'`` (``accuracy_score`` with ``normalize=True``),
        ``'precision'``, ``'recall'`` and ``'accuracy_count'``
        (``accuracy_score`` with ``normalize=False``).
    """
    return {
        'accuracy' : accuracy_score(y_test, y_pred, normalize=True),
        'precision' : precision_score(y_test, y_pred),
        'recall' : recall_score(y_test, y_pred),
        'accuracy_count' : accuracy_score(y_test, y_pred, normalize=False)
    }

In [7]:
def build_model(classifier_fn,
               name_of_y_col,
               names_of_x_cols,
               dataset,
               test_frac=0.2,
               random_state=None):
    """Split the data, fit a classifier and evaluate it on both splits.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object exposing ``predict``.
    name_of_y_col : str
        Column of ``dataset`` used as the target.
    names_of_x_cols : list
        Columns of ``dataset`` used as the input features.
    dataset : pandas.DataFrame
        Data holding both the feature columns and the target column.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. ``None`` keeps the original unseeded
        split; pass a fixed value to make the split reproducible and to
        evaluate several models on the same train/test partition.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarize_classification``; ``'confusion_matrix'`` is a crosstab of
        predicted labels (rows) against true test labels (columns).
    """
    X = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Build a frame first so the crosstab axes are named 'y_pred' / 'y_test'.
    pred_results = pd.DataFrame({
        'y_test' : y_test,
        'y_pred' : y_pred
    })

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training' : train_summary,
        'test' : test_summary,
        'confusion_matrix' : model_crosstab
    }

In [8]:
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``.

    For each stored model the label is printed first, followed by one
    'Training data' section and one 'Test data' section listing each metric
    name with its value.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for label, summary in result_dict.items():
        print('Classification :', label)

        print()

        for heading, split in sections:
            print(heading)

            for metric, value in summary[split].items():
                print(metric, value)

            print()

In [9]:
def logistic_fn(x_train, y_train):
    """Fit a ``LogisticRegression`` (``solver='liblinear'``) and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [10]:
# Evaluate logistic regression on every feature, then print all stored results.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110



## LDA
**Find axes to best separate the classes such that all instances of a class are in the same quadrant**

The best axes here are those that best separate the data into different classes

**SVD (singular value decomposition) solver**: finds the axes without calculating the covariance matrix of the features, which is useful when the dataset has many features or many rows

In [11]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit a ``LinearDiscriminantAnalysis`` with the given ``solver`` and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return LinearDiscriminantAnalysis(solver=solver).fit(x_train, y_train)

In [12]:
# Evaluate LDA on every feature, then print all stored results.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7660550458715596
recall 0.7106382978723405
accuracy_count 450

Test data
accuracy 0.8111888111888111
precision 0.76
recall 0.7169811320754716
accuracy_count 116



**Note** One-hot encoding can result in collinearity of features (known as the dummy variable trap). Instead, use *dummy encoding*, where we drop one of the one-hot encoded columns

In [13]:
# Re-run LDA without the last feature column. The result is stored under the
# same key as the previous LDA run, so it replaces that entry.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111



## Quadratic Discriminant Analysis

* Find axes to best separate the classes such that all instances of a class are in the same quadrant but the **decision boundary is Quadratic**

* Useful when the X variables corresponding to different labels have different covariances i.e. **Covariances are different for X for all values of Y**

In [14]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a ``QuadraticDiscriminantAnalysis`` with default settings and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [15]:
# Evaluate QDA without the last feature column, then print all stored results.
result_dict['survived - quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116



## Stochastic Gradient Descent Classifier 

* Gradient descent allows the Stochastic Gradient Descent classifier to iteratively converge to the best model
* SGD performs numerical optimization - **One training instance at a time** - to find the best model parameters


* **tol** is the stopping criterion for model training. The model stops training when the improvement in the loss at a particular iteration, compared with the previous iteration, is smaller than the tolerance value we have specified

* When the change in the loss for a particular iteration falls below your tolerance specification, the model is no longer improving, so training stops

In [17]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Fit an ``SGDClassifier`` with the given ``max_iter`` and ``tol`` and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return SGDClassifier(max_iter=max_iter, tol=tol).fit(x_train, y_train)

In [18]:
# Evaluate the SGD classifier on every feature, then print all stored results.
result_dict['survived - sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116

Classification : survived - sgd

Training data
accuracy 0.7223198594024605
precision 0.6495726495726496
recall 0.6666666666666666
accuracy_count 411

Test 

## Support Vector Machine
* Find a hyperplane that separates points so all points on the same side belong to the same class

SVC(Support Vector Classifier)


In [19]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit a ``LinearSVC`` (``dual=False``) with the given settings and return it."""
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return svc.fit(x_train, y_train)

* **Regularization** is used to penalize the complex model
* **C** is the inverse of the regularization strength: smaller values indicate stronger regularization. It controls how heavily points on the wrong side of the margin are penalized

* **SVC or LinearSVC**
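* **LinearSVC is similar to SVC(kernel="linear")**, but it is implemented with liblinear rather than libsvm, so the two do not always give identical results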

**dual=False** makes the solver optimize the primal problem instead of the dual problem
* Prefer dual=False when n_samples > n_features

In [20]:
# Evaluate the linear SVC on every feature, then print all stored results.
result_dict['survived - linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116

Classification : survived - sgd

Training data
accuracy 0.7223198594024605
precision 0.6495726495726496
recall 0.6666666666666666
accuracy_count 411

Test 

## Radius Neighbor Classifier

Look for neighbors within the specified radius, neighbors vote to determine classification

In [28]:
def radius_neighbor_fn(x_train, y_train, radius=80.0):
    """Fit a ``RadiusNeighborsClassifier`` with the given ``radius`` and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return RadiusNeighborsClassifier(radius=radius).fit(x_train, y_train)

In [29]:
# Evaluate the radius-neighbors classifier on every feature, then print all
# stored results.
result_dict['survived - radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116

Classification : survived - sgd

Training data
accuracy 0.7223198594024605
precision 0.6495726495726496
recall 0.6666666666666666
accuracy_count 411

Test 

## Decision Tree

Decision trees set up a tree structure on training data which helps make decisions based on rules

* Fit knowledge into rules
* Each rule involves a threshold
* Order of decision variables matters
* Rules and Orders found using training process

**CART** Classification and Regression Tree

* Consider all features while building the tree, and keep building until all leaf nodes are pure (belong to just one class)

In [30]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit a ``DecisionTreeClassifier`` with the given limits and return it."""
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return tree.fit(x_train, y_train)

In [31]:
# Evaluate the decision tree on every feature, then print all stored results.
result_dict['survived - decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116

Classification : survived - sgd

Training data
accuracy 0.7223198594024605
precision 0.6495726495726496
recall 0.6666666666666666
accuracy_count 411

Test 

## Naive Bayes Classifier

* Naive Bayes makes naive (strong) assumptions about the independence of features
* Naive Bayes' works on conditional probability
* It uses Bayes' theorem to find which label is most likely, given the attributes observed in the feature vector and given how often the different labels occur in the data

In [32]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit a ``GaussianNB`` with the given ``priors`` and return it."""
    # ``fit`` hands back the estimator itself, so the fitted model is returned.
    return GaussianNB(priors=priors).fit(x_train, y_train)



In [33]:
# Evaluate Gaussian naive Bayes on every feature, then print all stored results.
result_dict['survived - naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.7967914438502673
recall 0.6681614349775785
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7666666666666667
recall 0.7076923076923077
accuracy_count 110

Classification : survived - linear_discriminant_analysis

Training data
accuracy 0.81195079086116
precision 0.7857142857142857
recall 0.7268722466960352
accuracy_count 462

Test data
accuracy 0.7762237762237763
precision 0.8085106382978723
recall 0.6229508196721312
accuracy_count 111

Classification : survived - quadratic_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7633928571428571
recall 0.7339055793991416
accuracy_count 454

Test data
accuracy 0.8111888111888111
precision 0.8181818181818182
recall 0.6545454545454545
accuracy_count 116

Classification : survived - sgd

Training data
accuracy 0.7223198594024605
precision 0.6495726495726496
recall 0.6666666666666666
accuracy_count 411

Test 