In [1]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
from collections import defaultdict
from tqdm import tqdm

In [2]:
warnings.filterwarnings(action='ignore')

Loading the training and testing files. I'm using the titanic dataset from Kaggle for Question 2.

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


As we can see that the test set does not have the target label 'survived' as Kaggle didn't release the test set with labels. <br>
- So, for our purpose, I'm sampling test set randomly from the train files. <br>
10% of trainset will be test.
- Remaining train set is split into train and validation sets in the ratio 9:1 respectively.

In [6]:
x, y = train.drop('Survived', axis=1), train['Survived']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=111)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=222)

In [8]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
x_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,720.0,720.0,585.0,720.0,720.0,720.0
mean,441.098611,2.304167,29.839043,0.536111,0.384722,32.834183
std,256.92512,0.837563,14.700694,1.113241,0.815729,49.459847
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,219.75,2.0,20.5,0.0,0.0,7.8958
50%,438.0,3.0,28.0,0.0,0.0,14.4542
75%,663.25,3.0,39.0,1.0,0.0,31.620825
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1)

x_train = encoder.fit_transform(x_train)
x_val = encoder.transform(x_val)
x_test = encoder.transform(x_test)

In [11]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

# Multinomial Logistic Regression

We'll optimize the hyperparameters one-by-one, keeping the other hyperparameters to their default values until we find an optimized value of those hyperparameters.

### Optimizing the Maximum number of iterations, max_iter hyperparameter.

In [12]:
max_iter = [100, 200, 500, 1000, 10000]

for mi in max_iter:
    softmax_reg = LogisticRegression(penalty='l2', C=1, solver='lbfgs', max_iter=mi)
    softmax_reg.fit(x_train, y_train)
    acc_train = softmax_reg.score(x_train, y_train)
    acc_val = softmax_reg.score(x_val, y_val)
    print(f'For max_iter = {mi}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_iter = 100:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For max_iter = 200:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For max_iter = 500:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For max_iter = 1000:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For max_iter = 10000:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234


##### We can see that the model converges in 100 iterations as neither training nor validation accuracy improves after 100 iterations.

### Optimizing the Regularization Coefficient (actually it's inverse), C hyperparameter.

In [13]:
C = [0.00001, 0.0001,0.001,0.01, 0.1, 1, 10, 100, 1000]

for c in C:
    softmax_reg = LogisticRegression(penalty='l2', C=c, solver='lbfgs', max_iter=100)
    softmax_reg.fit(x_train, y_train)
    acc_train = softmax_reg.score(x_train, y_train)
    acc_val = softmax_reg.score(x_val, y_val)
    print(f'For C = {c}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For C = 1e-05:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.0001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.001:
	 Training accuracy: 0.7138888888888889 		 Validation accuracy: 0.6419753086419753
For C = 0.01:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For C = 0.1:
	 Training accuracy: 0.8263888888888888 		 Validation accuracy: 0.7901234567901234
For C = 1:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For C = 10:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For C = 100:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234
For C = 1000:
	 Training accuracy: 0.8194444444444444 		 Validation accuracy: 0.7901234567901234


##### We get best training and validation accuracies for C = 0.01
- For C < 0.01: both the training and validation accuracies are less as compared to that of C = 0.01, indicating that the model underfits the training data. <br>
- For C > 0.01: the training and validation accuracies don't improve much, in fact the validation accuracy decreases, indicating that the value of C doesn't have any considerable impact on the model.

### Optimizing the solver hyperparameter.

In [14]:
solver = ['liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga', 'lbfgs']

for solv in solver:
    softmax_reg = LogisticRegression(penalty='l2', C=0.01, solver=solv, max_iter=100)
    softmax_reg.fit(x_train, y_train)
    acc_train = softmax_reg.score(x_train, y_train)
    acc_val = softmax_reg.score(x_val, y_val)
    print(f'For solver = {solv}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For solver = liblinear:
	 Training accuracy: 0.8 		 Validation accuracy: 0.8024691358024691
For solver = newton-cg:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For solver = newton-cholesky:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For solver = sag:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For solver = saga:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For solver = lbfgs:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691


##### The solver hyperparameter doesn't have any impact on the model's performance as both the training and validation accuracies are same for all the solvers. We'll select the "lbfgs" solver as it's convergence is faster as compared to other solvers.

### Optimizing the norm of regularization, penalty hyperparameter.

In [15]:
penalty = ['l1', 'l2', 'elasticnet']

for pen in penalty:
    softmax_reg = LogisticRegression(penalty=pen, C=0.01, solver='saga', max_iter=100, l1_ratio=0.5)
    softmax_reg.fit(x_train, y_train)
    acc_train = softmax_reg.score(x_train, y_train)
    acc_val = softmax_reg.score(x_val, y_val)
    print(f'For penalty = {pen}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For penalty = l1:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For penalty = l2:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For penalty = elasticnet:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778


##### The l1 and elasticnet penalties have more impact on the model weights and reduce it's capability to fit the data properly which is evident from the low training and validation accuracies. <br>
We'll select l2 penalty term as it gives the best training and validation accuracies out of the 3 penalty terms.

#### We have got the optimal values of required hyperparameters, so we'll just train the model for different number of max iterations to find if it impacts the convergence point of model.

In [16]:
max_iter = [100, 200, 500, 1000]

for mi in max_iter:
    softmax_reg = LogisticRegression(penalty='l2', C=0.01, solver='lbfgs', max_iter=mi)
    softmax_reg.fit(x_train, y_train)
    acc_train = softmax_reg.score(x_train, y_train)
    acc_val = softmax_reg.score(x_val, y_val)
    print(f'For max_iter = {mi}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_iter = 100:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For max_iter = 200:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For max_iter = 500:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
For max_iter = 1000:
	 Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691


#### The model converges to same point for all max_iter values. So, we'll need only 100 iterations for the model to converge.

Final Logistic Regression model with optimized hyperparameters.

In [17]:
softmax_reg = LogisticRegression(penalty='l2', C=0.01, solver='lbfgs', max_iter=100)
softmax_reg.fit(x_train, y_train)

In [18]:
acc_train = softmax_reg.score(x_train, y_train)
acc_val = softmax_reg.score(x_val, y_val)

y_train_pred = softmax_reg.predict(x_train)
y_val_pred = softmax_reg.predict(x_val)

prec_train = precision_score(y_train, y_train_pred, average="macro")
prec_val = precision_score(y_val, y_val_pred, average="macro")

rec_train = recall_score(y_train, y_train_pred, average="macro")
rec_val = recall_score(y_val, y_val_pred, average="macro")

f1_train = f1_score(y_train, y_train_pred, average="macro")
f1_val = f1_score(y_val, y_val_pred, average="macro")

print(f'Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')
print(f'Training precision score: {prec_train} \t\t Validation precision score: {prec_val}')
print(f'Training recall score: {rec_train} \t\t Validation recall score: {rec_val}')
print(f'Training f1 score: {f1_train} \t\t Validation f1 score: {f1_val}')

Training accuracy: 0.8166666666666667 		 Validation accuracy: 0.8024691358024691
Training precision score: 0.8196363636363637 		 Validation precision score: 0.7927718832891246
Training recall score: 0.7869173521347435 		 Validation recall score: 0.7848387096774194
Training f1 score: 0.7970202296336796 		 Validation f1 score: 0.788235294117647


In [19]:
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

array([[406,  38],
       [ 94, 182]])

In [20]:
conf_mx = confusion_matrix(y_val, y_val_pred)
conf_mx

array([[43,  7],
       [ 9, 22]])

# Support Vector Machines

## Linear Kernel

### Optimizing the Maximum number of iterations, max_iter hyperparameter.

In [21]:
max_iter = [1000, 10000, 40000, 50000, 60000, 80000, 100000]

for mi in max_iter:
    svm_clf = SVC(C=1, kernel='linear', max_iter=mi)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For max_iter = {mi}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_iter = 1000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 10000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 40000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 50000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 60000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 80000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For max_iter = 100000:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778


##### We can see that the model converges in 1000 iterations as neither training nor validation accuracy improves after 1000 iterations.

### Optimizing the Regularization Coefficient (actually it's inverse), C hyperparameter.

In [22]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

for c in C:
    svm_clf = SVC(C=c, kernel='linear', max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For C = {c}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For C = 0.0001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.001:
	 Training accuracy: 0.7333333333333333 		 Validation accuracy: 0.6296296296296297
For C = 0.01:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For C = 0.1:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For C = 1:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For C = 10:
	 Training accuracy: 0.7291666666666666 		 Validation accuracy: 0.6419753086419753
For C = 100:
	 Training accuracy: 0.5847222222222223 		 Validation accuracy: 0.43209876543209874
For C = 1000:
	 Training accuracy: 0.6291666666666667 		 Validation accuracy: 0.4074074074074074


##### We get best training and validation accuracies for C = 0.01, 0.1 and 1
- For C < 0.01: both the training and validation accuracies are less as compared to that of C = 0.01, indicating that the model underfits the training data. <br>
- For C > 1: the training and validation accuracies start degrading, indicating that the model is imposing heavy regularization on the model's weights and thus the model fails to converge and underfits the training data.

We'll choose C = 0.01

Final Linear SVM model with optimized hyperparameters.

In [23]:
svm_clf = SVC(C=0.01, kernel='linear', max_iter=1000)
svm_clf.fit(x_train, y_train)
acc_train = svm_clf.score(x_train, y_train)
acc_val = svm_clf.score(x_val, y_val)
print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778


## Polynomial Kernel

### Optimizing the Maximum number of iterations, max_iter hyperparameter.

In [24]:
max_iter = [1000, 10000, 40000, 50000, 60000, 80000, 100000]

for mi in max_iter:
    svm_clf = SVC(C=1, kernel='poly', degree=3, gamma='scale', coef0=0, max_iter=mi)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For max_iter = {mi}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_iter = 1000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 10000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 40000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 50000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 60000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 80000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For max_iter = 100000:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691


##### We can see that the model converges in 1000 iterations as neither training nor validation accuracy improves after 1000 iterations.

### Optimizing the Regularization Coefficient (actually it's inverse), C hyperparameter.

In [25]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

for c in C:
    svm_clf = SVC(C=c, kernel='poly', degree=3, gamma='scale', coef0=0, max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For C = {c}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For C = 0.0001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.01:
	 Training accuracy: 0.6416666666666667 		 Validation accuracy: 0.6296296296296297
For C = 0.1:
	 Training accuracy: 0.775 		 Validation accuracy: 0.7037037037037037
For C = 1:
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.8024691358024691
For C = 10:
	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
For C = 100:
	 Training accuracy: 0.9013888888888889 		 Validation accuracy: 0.7777777777777778
For C = 1000:
	 Training accuracy: 0.7 		 Validation accuracy: 0.6666666666666666


##### We get best training and validation accuracies for C = 10
- For C < 10: both the training and validation accuracies are less as compared to that of C = 10, indicating that the model underfits the training data. <br>
- For C > 1: the training and validation accuracies start degrading, indicating that the model is imposing heavy regularization on the model's weights and thus the model fails to converge and underfits the training data.

We'll choose C = 10

### Optimizing the degree hyperparameter.

In [26]:
degrees = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for deg in degrees:
    svm_clf = SVC(C=10, kernel='poly', degree=deg, gamma='scale', coef0=0, max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For degree = {deg}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For degree = 1:
	 Training accuracy: 0.7902777777777777 		 Validation accuracy: 0.7777777777777778
For degree = 2:
	 Training accuracy: 0.8347222222222223 		 Validation accuracy: 0.8024691358024691
For degree = 3:
	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
For degree = 4:
	 Training accuracy: 0.9333333333333333 		 Validation accuracy: 0.8024691358024691
For degree = 5:
	 Training accuracy: 0.9347222222222222 		 Validation accuracy: 0.7777777777777778
For degree = 6:
	 Training accuracy: 0.9138888888888889 		 Validation accuracy: 0.7160493827160493
For degree = 7:
	 Training accuracy: 0.9083333333333333 		 Validation accuracy: 0.7777777777777778
For degree = 8:
	 Training accuracy: 0.8944444444444445 		 Validation accuracy: 0.7530864197530864
For degree = 9:
	 Training accuracy: 0.8833333333333333 		 Validation accuracy: 0.7777777777777778
For degree = 10:
	 Training accuracy: 0.8763888888888889 		 Validation accuracy: 0.7283950617283951


##### We can see that with increase in degree of the model, the training and validation accuracies increase upto 3rd degree and after that the training accuracy increases until degree 5 but the validation accuracy decreases during this interval, indicating that the model overfits on the training data. Beyond degree 5, both training and validation accuracies decrease as compared to that of degree 3.

So, we choose degree = 3.

### Optimizing the gamma hyperparameter.

In [27]:
gamma = ['scale', 'auto', 0.01, 0.1, 1, 10, 100]

for g in gamma:
    svm_clf = SVC(C=10, kernel='poly', degree=3, gamma=g, coef0=0, max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For gamma = {g}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For gamma = scale:
	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
For gamma = auto:
	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
For gamma = 0.01:
	 Training accuracy: 0.6583333333333333 		 Validation accuracy: 0.6296296296296297
For gamma = 0.1:
	 Training accuracy: 0.9138888888888889 		 Validation accuracy: 0.8024691358024691
For gamma = 1:
	 Training accuracy: 0.6791666666666667 		 Validation accuracy: 0.6790123456790124
For gamma = 10:
	 Training accuracy: 0.5819444444444445 		 Validation accuracy: 0.6049382716049383
For gamma = 100:
	 Training accuracy: 0.5708333333333333 		 Validation accuracy: 0.6049382716049383


##### The model gives best training and validation accuracies when gamma is set to "auto" or "scale". 
- gamma='scale' -> kernel coefficient = 1 / (n_features * x_train.var()
- gamma=‘auto’ -> kernel coefficient = 1 / n_features

As for gamma = "auto" or "scale", the kernel coefficient is computed using the number of features (and variance of training data in case of "scale"), the model fits the data better as compared to other random values. <br><br>
So, we choose gamma="scale".

### Optimizing the coef0 hyperparameter.

In [28]:
coef = [0, 0.01, 0.1, 1, 10, 100]

for c in coef:
    svm_clf = SVC(C=10, kernel='poly', degree=3, gamma='scale', coef0=c, max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For coef0 = {c}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For coef0 = 0:
	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
For coef0 = 0.01:
	 Training accuracy: 0.9152777777777777 		 Validation accuracy: 0.8024691358024691
For coef0 = 0.1:
	 Training accuracy: 0.925 		 Validation accuracy: 0.7901234567901234
For coef0 = 1:
	 Training accuracy: 0.9263888888888889 		 Validation accuracy: 0.7654320987654321
For coef0 = 10:
	 Training accuracy: 0.6152777777777778 		 Validation accuracy: 0.6666666666666666
For coef0 = 100:
	 Training accuracy: 0.6819444444444445 		 Validation accuracy: 0.7777777777777778


##### We get best training and validation accuracies for coef0 = 0. For coef0 between 0 and 1, the training accuracy increases but the validation accuracy decreases, indicating overfitting. For coef0 > 1, both training and validation accuracies increase or decrease together. <br>
So, we choose coef0 = 0.

Final polynomial SVM model with optimized hyperparameters.

In [29]:
svm_clf = SVC(C=10, kernel='poly', degree=3, gamma='scale', coef0=0, max_iter=1000)
svm_clf.fit(x_train, y_train)
acc_train = svm_clf.score(x_train, y_train)
acc_val = svm_clf.score(x_val, y_val)
print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148


### RBF Kernel

### Optimizing the Maximum number of iterations, max_iter hyperparameter.

In [30]:
max_iter = [1000, 10000, 40000, 50000, 60000, 80000, 100000]

for mi in max_iter:
    svm_clf = SVC(C=1, kernel='rbf', gamma='scale', max_iter=mi)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For max_iter = {mi}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_iter = 1000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 10000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 40000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 50000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 60000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 80000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For max_iter = 100000:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691


##### We can see that the model converges in 1000 iterations as neither training nor validation accuracy improves after 1000 iterations.

### Optimizing the Regularization Coefficient (actually it's inverse), C hyperparameter.

In [31]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for c in C:
    svm_clf = SVC(C=c, kernel='rbf', gamma='scale', max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For C = {c}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For C = 0.0001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.001:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.01:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For C = 0.1:
	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.8024691358024691
For C = 1:
	 Training accuracy: 0.8527777777777777 		 Validation accuracy: 0.8024691358024691
For C = 10:
	 Training accuracy: 0.9319444444444445 		 Validation accuracy: 0.7654320987654321
For C = 100:
	 Training accuracy: 0.9625 		 Validation accuracy: 0.6419753086419753
For C = 1000:
	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.6172839506172839
For C = 10000:
	 Training accuracy: 0.7666666666666667 		 Validation accuracy: 0.691358024691358


##### We get best training and validation accuracies for C = 0.1 and 1
- For C < 0.1: both the training and validation accuracies are less as compared to that of C = 0.1, indicating that the model underfits the training data. <br>
- For 1 < C <= 100: the training accuracy improves but the validation accuracy decreases indicating that the overfits on the training data and doesn't generalize well on the validation set 
- For C > 100: both the training and validation accuracies start degrading, indicating that the model is imposing heavy regularization on the model's weights and thus the model fails to converge and underfits the training data.

We'll choose C = 0.1

### Optimizing the gamma hyperparameter.

In [32]:
gamma = ['scale', 'auto', 0.01, 0.1, 1, 10, 100]

for g in gamma:
    svm_clf = SVC(C=0.1, kernel='rbf', gamma=g, max_iter=1000)
    svm_clf.fit(x_train, y_train)
    acc_train = svm_clf.score(x_train, y_train)
    acc_val = svm_clf.score(x_val, y_val)
    print(f'For gamma = {g}:')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For gamma = scale:
	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.8024691358024691
For gamma = auto:
	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.8024691358024691
For gamma = 0.01:
	 Training accuracy: 0.7944444444444444 		 Validation accuracy: 0.7407407407407407
For gamma = 0.1:
	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.8024691358024691
For gamma = 1:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For gamma = 10:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839
For gamma = 100:
	 Training accuracy: 0.6166666666666667 		 Validation accuracy: 0.6172839506172839


##### The model gives best training and validation accuracies when gamma is set to "auto" or "scale". 
- gamma='scale' -> kernel coefficient = 1 / (n_features * x_train.var()
- gamma=‘auto’ -> kernel coefficient = 1 / n_features

As for gamma = "auto" or "scale", the kernel coefficient is computed using the number of features (and variance of training data in case of "scale"), the model fits the data better as compared to other random values. <br><br>
So, we choose gamma="scale".

Final RBF Kernel SVM model with optimized hyperparameters.

In [82]:
svm_clf = SVC(C=0.1, kernel='rbf', gamma='scale', max_iter=1000, probability=True, random_state=4200)
svm_clf.fit(x_train, y_train)
acc_train = svm_clf.score(x_train, y_train)
acc_val = svm_clf.score(x_val, y_val)
print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

	 Training accuracy: 0.8416666666666667 		 Validation accuracy: 0.8024691358024691


Out of Linear, Polynomial and RBF Kernel SVM models, the Polynomial SVM model performs best so we choose Polynomial SVM as the final SVM model with optimized hyperparameters.

In [89]:
svm_clf = SVC(C=10, kernel='poly', degree=3, gamma='scale', coef0=0, max_iter=1000, probability=True, random_state=1331)
svm_clf.fit(x_train, y_train)
acc_train = svm_clf.score(x_train, y_train)
acc_val = svm_clf.score(x_val, y_val)
print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

	 Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148


In [90]:
acc_train = svm_clf.score(x_train, y_train)
acc_val = svm_clf.score(x_val, y_val)

y_train_pred = svm_clf.predict(x_train)
y_val_pred = svm_clf.predict(x_val)

prec_train = precision_score(y_train, y_train_pred, average="macro")
prec_val = precision_score(y_val, y_val_pred, average="macro")

rec_train = recall_score(y_train, y_train_pred, average="macro")
rec_val = recall_score(y_val, y_val_pred, average="macro")

f1_train = f1_score(y_train, y_train_pred, average="macro")
f1_val = f1_score(y_val, y_val_pred, average="macro")

print(f'Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')
print(f'Training precision score: {prec_train} \t\t Validation precision score: {prec_val}')
print(f'Training recall score: {rec_train} \t\t Validation recall score: {rec_val}')
print(f'Training f1 score: {f1_train} \t\t Validation f1 score: {f1_val}')

Training accuracy: 0.9125 		 Validation accuracy: 0.8148148148148148
Training precision score: 0.9115393283750282 		 Validation precision score: 0.8055555555555556
Training recall score: 0.9023207990599296 		 Validation recall score: 0.8193548387096774
Training f1 score: 0.9064570611926724 		 Validation f1 score: 0.8091123330714847


In [37]:
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

array([[420,  24],
       [ 39, 237]])

In [38]:
conf_mx = confusion_matrix(y_val, y_val_pred)
conf_mx

array([[40, 10],
       [ 5, 26]])

## Random Forest Classifier

### Optimizing the number of decision trees in the forest, n_estimators hyperparameter.

In [43]:
n_est = [i+1 for i in range(10) ]

for est in n_est:
    rnd_clf = RandomForestClassifier(n_estimators=est, criterion='gini', max_depth=None, min_samples_split=2,
                                     min_samples_leaf=1, max_features='sqrt', max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For n_estimators = {est}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For n_estimators = 1
	 Training accuracy: 0.8819444444444444 		 Validation accuracy: 0.7037037037037037
For n_estimators = 2
	 Training accuracy: 0.8986111111111111 		 Validation accuracy: 0.7160493827160493
For n_estimators = 3
	 Training accuracy: 0.9569444444444445 		 Validation accuracy: 0.7160493827160493
For n_estimators = 4
	 Training accuracy: 0.9541666666666667 		 Validation accuracy: 0.7530864197530864
For n_estimators = 5
	 Training accuracy: 0.9777777777777777 		 Validation accuracy: 0.7530864197530864
For n_estimators = 6
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For n_estimators = 7
	 Training accuracy: 0.9888888888888889 		 Validation accuracy: 0.7037037037037037
For n_estimators = 8
	 Training accuracy: 0.9833333333333333 		 Validation accuracy: 0.691358024691358
For n_estimators = 9
	 Training accuracy: 0.9930555555555556 		 Validation accuracy: 0.6666666666666666
For n_estimators = 10
	 Training accuracy: 0.9930555555555556 		 

##### We can see that the training and validation accuracies increase with the increase in number of decision trees in the forest till n_estimators = 6 and after that the training accuracy increases but the validation accuracy starts decreasing which means that the model is overfitting on the training data. <br>
So, we choose n_estimators = 6.

### Optimizing the impurity measure, criterion hyperparameter.

In [45]:
criterion = ['gini', 'entropy', 'log_loss']

for crit in criterion:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion=crit, max_depth=None, min_samples_split=2,
                                     min_samples_leaf=1, max_features='sqrt', max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For criterion = {crit}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For criterion = gini
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For criterion = entropy
	 Training accuracy: 0.9736111111111111 		 Validation accuracy: 0.6172839506172839
For criterion = log_loss
	 Training accuracy: 0.9736111111111111 		 Validation accuracy: 0.6172839506172839


##### We get the best training and validation accuracies for criterion = "gini". The other options give similar training accuracy but perform poorly on the validation. <br>
So, we choose "gini" as the impurity measure as it's also faster to compute.

### Optimizing the depth of decision trees, max_depth hyperparameter.

In [46]:
max_depth = [None, 10, 20, 30, 40, 100]

for md in max_depth:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=md, min_samples_split=2,
                                     min_samples_leaf=1, max_features='sqrt', max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For max_depth = {md}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_depth = None
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For max_depth = 10
	 Training accuracy: 0.9569444444444445 		 Validation accuracy: 0.7530864197530864
For max_depth = 20
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For max_depth = 30
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For max_depth = 40
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For max_depth = 100
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778


##### We can see that the max_depth hyperparameter doesn't have any significant impact on the model's performance on both training and validation sets and we get fairly same result even with small depth of decision trees. 
- This can be explained with the fact that the dataset is pretty small with few features and also most of the features have small range of values and thus we don't need a large number of trees for convergence. <br>
So, we choose to keep the default value of max_depth, i.e. None, in which case the nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

### Optimizing the minimum number of samples required to split an internal node, min_samples_split hyperparameter.

In [111]:
min_samples_split = [2, 5, 10, 15, 20, 30, 40, 50, 60, 70, 80]

for mss in min_samples_split:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=None, min_samples_split=mss,
                                     min_samples_leaf=1, max_features='sqrt', max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For min_samples_split = {mss}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For min_samples_split = 2
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For min_samples_split = 5
	 Training accuracy: 0.9569444444444445 		 Validation accuracy: 0.5308641975308642
For min_samples_split = 10
	 Training accuracy: 0.9222222222222223 		 Validation accuracy: 0.7283950617283951
For min_samples_split = 15
	 Training accuracy: 0.9055555555555556 		 Validation accuracy: 0.654320987654321
For min_samples_split = 20
	 Training accuracy: 0.8847222222222222 		 Validation accuracy: 0.6049382716049383
For min_samples_split = 30
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.691358024691358
For min_samples_split = 40
	 Training accuracy: 0.8541666666666666 		 Validation accuracy: 0.6419753086419753
For min_samples_split = 50
	 Training accuracy: 0.8513888888888889 		 Validation accuracy: 0.7037037037037037
For min_samples_split = 60
	 Training accuracy: 0.8333333333333334 		 Validation accuracy: 0.6049382716049383
For min_sample

##### We get best training and validation accuracies for min_samples_split = 2. For other values, the training and validation accuracy keep oscillating but are still less than that for min_samples_split = 2.

### Optimizing the minimum number of samples required to be at leaf node, min_samples_leaf hyperparameter.

In [112]:
min_samples_leaf = [1, 2, 5, 10, 15, 20]

for msl in min_samples_leaf:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=None, min_samples_split=2,
                                     min_samples_leaf=msl, max_features='sqrt', max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For min_samples_leaf = {msl}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For min_samples_leaf = 1
	 Training accuracy: 0.9763888888888889 		 Validation accuracy: 0.7777777777777778
For min_samples_leaf = 2
	 Training accuracy: 0.9375 		 Validation accuracy: 0.6666666666666666
For min_samples_leaf = 5
	 Training accuracy: 0.8944444444444445 		 Validation accuracy: 0.7901234567901234
For min_samples_leaf = 10
	 Training accuracy: 0.8680555555555556 		 Validation accuracy: 0.7901234567901234
For min_samples_leaf = 15
	 Training accuracy: 0.8347222222222223 		 Validation accuracy: 0.7777777777777778
For min_samples_leaf = 20
	 Training accuracy: 0.8319444444444445 		 Validation accuracy: 0.7530864197530864


##### We get best training and validation accuracies for min_samples_leaf = 5. For other values, the training and validation accuracy increase or decrease together but are still less than that for min_samples_leaf = 5.

### Optimizing the number of features to consider when looking for best split, max_features hyperparameter.

In [51]:
max_features = [i+1 for i in range(12)]

for mf in max_features:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=None, min_samples_split=2,
                                     min_samples_leaf=5, max_features=mf, max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For max_features = {mf}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_features = 1
	 Training accuracy: 0.85 		 Validation accuracy: 0.7283950617283951
For max_features = 2
	 Training accuracy: 0.875 		 Validation accuracy: 0.7530864197530864
For max_features = 3
	 Training accuracy: 0.8944444444444445 		 Validation accuracy: 0.7901234567901234
For max_features = 4
	 Training accuracy: 0.8888888888888888 		 Validation accuracy: 0.7901234567901234
For max_features = 5
	 Training accuracy: 0.9069444444444444 		 Validation accuracy: 0.8024691358024691
For max_features = 6
	 Training accuracy: 0.9041666666666667 		 Validation accuracy: 0.8024691358024691
For max_features = 7
	 Training accuracy: 0.8986111111111111 		 Validation accuracy: 0.6172839506172839
For max_features = 8
	 Training accuracy: 0.8986111111111111 		 Validation accuracy: 0.7037037037037037
For max_features = 9
	 Training accuracy: 0.8972222222222223 		 Validation accuracy: 0.7530864197530864
For max_features = 10
	 Training accuracy: 0.8986111111111111 		 Validation accuracy: 0.679

##### We get best training and validation accuracies for max_features = 5. 
- For max_features < 5, both the training and validation accuracy are less than that for max_features = 5.
- For max_features > 5, the training accuracy is pretty close to that of max_features = 5 but the validation accuracy oscillates and stays lower than that for max_features = 5.

### Optimizing the maximum number of leaf nodes, max_leaf_nodes hyperparameter.

In [57]:
max_leaf_nodes = [None]
max_leaf_nodes.extend([i for i in range(2,15)])


for mln in max_leaf_nodes:
    rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=None, min_samples_split=2,
                                     min_samples_leaf=5, max_features=5, max_leaf_nodes=mln, n_jobs=-1,
                                     random_state=1001)
    rnd_clf.fit(x_train, y_train)
    acc_train = rnd_clf.score(x_train, y_train)
    acc_val = rnd_clf.score(x_val, y_val)
    print(f'For max_leaf_nodes = {mln}')
    print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

For max_leaf_nodes = None
	 Training accuracy: 0.9069444444444444 		 Validation accuracy: 0.8024691358024691
For max_leaf_nodes = 2
	 Training accuracy: 0.7472222222222222 		 Validation accuracy: 0.7901234567901234
For max_leaf_nodes = 3
	 Training accuracy: 0.8 		 Validation accuracy: 0.8148148148148148
For max_leaf_nodes = 4
	 Training accuracy: 0.7986111111111112 		 Validation accuracy: 0.7901234567901234
For max_leaf_nodes = 5
	 Training accuracy: 0.8027777777777778 		 Validation accuracy: 0.7777777777777778
For max_leaf_nodes = 6
	 Training accuracy: 0.825 		 Validation accuracy: 0.7777777777777778
For max_leaf_nodes = 7
	 Training accuracy: 0.8277777777777777 		 Validation accuracy: 0.7777777777777778
For max_leaf_nodes = 8
	 Training accuracy: 0.8430555555555556 		 Validation accuracy: 0.8148148148148148
For max_leaf_nodes = 9
	 Training accuracy: 0.8541666666666666 		 Validation accuracy: 0.8024691358024691
For max_leaf_nodes = 10
	 Training accuracy: 0.8555555555555555 		 Vali

##### We get best training and validation accuracies for max_leaf_nodes = None. 
- For other values also, the training and validation accuracies are close to that of max_leaf_nodes = None but the model may not generalize well on new data so we'll keep it as None, which means unlimited number of leaf nodes.

Final Random Forest Classifier with optimized hyperparameters.

In [59]:
rnd_clf = RandomForestClassifier(n_estimators=6, criterion='gini', max_depth=None, min_samples_split=2,
                                     min_samples_leaf=5, max_features=5, max_leaf_nodes=None, n_jobs=-1,
                                     random_state=1001)
rnd_clf.fit(x_train, y_train)
acc_train = rnd_clf.score(x_train, y_train)
acc_val = rnd_clf.score(x_val, y_val)
print(f'\t Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')

	 Training accuracy: 0.9069444444444444 		 Validation accuracy: 0.8024691358024691


In [60]:
acc_train = rnd_clf.score(x_train, y_train)
acc_val = rnd_clf.score(x_val, y_val)

y_train_pred = rnd_clf.predict(x_train)
y_val_pred = rnd_clf.predict(x_val)

prec_train = precision_score(y_train, y_train_pred, average="macro")
prec_val = precision_score(y_val, y_val_pred, average="macro")

rec_train = recall_score(y_train, y_train_pred, average="macro")
rec_val = recall_score(y_val, y_val_pred, average="macro")

f1_train = f1_score(y_train, y_train_pred, average="macro")
f1_val = f1_score(y_val, y_val_pred, average="macro")

print(f'Training accuracy: {acc_train} \t\t Validation accuracy: {acc_val}')
print(f'Training precision score: {prec_train} \t\t Validation precision score: {prec_val}')
print(f'Training recall score: {rec_train} \t\t Validation recall score: {rec_val}')
print(f'Training f1 score: {f1_train} \t\t Validation f1 score: {f1_val}')

Training accuracy: 0.9069444444444444 		 Validation accuracy: 0.8024691358024691
Training precision score: 0.906184306401226 		 Validation precision score: 0.7909677419354839
Training recall score: 0.8957598903251077 		 Validation recall score: 0.7909677419354839
Training f1 score: 0.9003666029844581 		 Validation f1 score: 0.7909677419354838


#### Analyzing Feature Importance

In [116]:
for name, score in sorted(zip(train.drop('Survived', axis=1).columns, rnd_clf.feature_importances_), 
                          reverse=True, key=lambda x: x[1]):
    print(f'{name:<15}:  {score}')

Sex            :  0.45155244326816407
Fare           :  0.13349421760078592
Cabin          :  0.10841338297023063
Ticket         :  0.07298408085036777
Pclass         :  0.06553827306839469
Name           :  0.058874655078237505
Age            :  0.038738411325067425
Parch          :  0.032899678278285854
Embarked       :  0.01788487501453154
PassengerId    :  0.010683281517824208
SibSp          :  0.00893670102811036


##### We can see that Sex was one of the most important features in the survival of passengers along with few other features, viz. Fare, Cabin, Ticket, Pclass, Name and Age.
- Sex & Age: Women and children were given priority to leave on the lifeboat.
- Fare, Cabin, Ticket & Pclass: The rich people had paid higher fare and had higher ticket class(Pclass). Some passengers were trapped in the flooded cabin and didn't survive.

In [63]:
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

array([[419,  25],
       [ 42, 234]])

In [64]:
conf_mx = confusion_matrix(y_val, y_val_pred)
conf_mx

array([[42,  8],
       [ 8, 23]])

## Ensemble

#### Soft Voting

In [97]:
ensemble_clf = VotingClassifier(
    estimators=[('log_clf', softmax_reg), ('svm_clf', svm_clf), ('rnd_clf', rnd_clf)],
    voting='soft', n_jobs=-1)

ensemble_clf.fit(x_train, y_train)



In [102]:
for clf in (softmax_reg, svm_clf, rnd_clf, ensemble_clf):
    acc_val = clf.score(x_val, y_val)
    print(clf.__class__.__name__)
    print(f'\t Validation accuracy: {acc_val}')

LogisticRegression
	 Validation accuracy: 0.8024691358024691
SVC
	 Validation accuracy: 0.8148148148148148
RandomForestClassifier
	 Validation accuracy: 0.8024691358024691
VotingClassifier
	 Validation accuracy: 0.8333476907901234


In [110]:
for clf in (softmax_reg, svm_clf, rnd_clf, ensemble_clf):
    acc_test = clf.score(x_test, y_test)
    print(clf.__class__.__name__)
    print(f'\t Test accuracy: {acc_test}')

LogisticRegression
	 Test accuracy: 0.8
SVC
	 Test accuracy: 0.7444444444444445
RandomForestClassifier
	 Test accuracy: 0.8222222222222222
VotingClassifier
	 Test accuracy: 0.8432242347234834


#### Hard Voting

In [117]:
ensemble_clf = VotingClassifier(
    estimators=[('log_clf', softmax_reg), ('svm_clf', svm_clf), ('rnd_clf', rnd_clf)],
    voting='hard', n_jobs=-1)

ensemble_clf.fit(x_train, y_train)



In [118]:
for clf in (softmax_reg, svm_clf, rnd_clf, ensemble_clf):
    acc_val = clf.score(x_val, y_val)
    print(clf.__class__.__name__)
    print(f'\t Validation accuracy: {acc_val}')

LogisticRegression
	 Validation accuracy: 0.8024691358024691
SVC
	 Validation accuracy: 0.8148148148148148
RandomForestClassifier
	 Validation accuracy: 0.7530864197530864
VotingClassifier
	 Validation accuracy: 0.8210020117777778


In [121]:
for clf in (softmax_reg, svm_clf, rnd_clf, ensemble_clf):
    acc_test = clf.score(x_test, y_test)
    print(clf.__class__.__name__)
    print(f'\t Test accuracy: {acc_test}')

LogisticRegression
	 Test accuracy: 0.8
SVC
	 Test accuracy: 0.7444444444444445
RandomForestClassifier
	 Test accuracy: 0.8
VotingClassifier
	 Test accuracy: 0.8232242347234835


From the above validation and test accuracy scores, we can see that the ensemble outperforms each individual classifier and proves that ensembling boosts the performance by grouping several weak learners to classify the instances. <br>
- Also, the soft voting gives better accuracy as now the ensemble predicts the class with the
highest class probability, averaged over all the individual classifiers as compared to hard voting that predicts the class by aggregating the predictions of each classifier and predicts the class that gets the majority votes.