## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,  GridSearchCV 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/training-ml/Files/main/titanic_train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
## Unnamed: 0 is not important column so remove this column.

df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df = df.dropna(axis=0)

In [6]:
df['Sex']= df['Sex'].replace({'male':0, 'female':1})

df['Embarked']= df['Embarked'].replace({'C':0, 'S':1, 'Q':2})

In [7]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,1
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",1,4.0,1,1,PP 9549,16.7,G6,1
11,12,1,1,"Bonnell, Miss. Elizabeth",1,58.0,0,0,113783,26.55,C103,1


In [8]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,0.480874,35.674426,0.464481,0.47541,78.682469,0.655738
std,247.052476,0.470725,0.515187,0.501005,15.643866,0.644159,0.754617,76.347843,0.498963
min,2.0,0.0,1.0,0.0,0.92,0.0,0.0,0.0,0.0
25%,263.5,0.0,1.0,0.0,24.0,0.0,0.0,29.7,0.0
50%,457.0,1.0,1.0,0.0,36.0,0.0,0.0,57.0,1.0
75%,676.0,1.0,1.0,1.0,47.5,1.0,1.0,90.0,1.0
max,890.0,1.0,3.0,1.0,80.0,3.0,4.0,512.3292,2.0


### 1. Not null present
### 2. data distribution look like good
### 3. Features like Pclass, Sex, Agw, SlibSp, Parch, Fare & Embarked is important

In [20]:
x = df.drop([('Survived'),('Name'), ('Ticket'), ('Cabin'),('PassengerId')], axis=1)
y = df['Survived']

In [21]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [22]:
x_scaled.shape[1]   # 1 for column selection

7

In [23]:
# finding variance inflaction factor in each scaled column i.e   x_scaled.shape[1]  (1/(1-R^2))

from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [24]:
vif = pd.DataFrame()

vif['vif'] = [variance_inflation_factor
              (x_scaled,i) for i in range
             (x_scaled.shape[1])]

vif['Features'] = x.columns

vif

Unnamed: 0,vif,Features
0,1.320706,Pclass
1,1.057653,Sex
2,1.248467,Age
3,1.168492,SibSp
4,1.321422,Parch
5,1.509773,Fare
6,1.131665,Embarked


### All the VIF values are less than 5 and are very low. that means no multicollinearity. now, we can go ahead with fitting our data to the model. Before that, let's split our data in test and training set.

In [31]:
x_train, x_test,y_train,  y_test = train_test_split(x_scaled, y, test_size = 0.25, random_state = 349)

In [32]:
log_reg = LogisticRegression()

log_reg.fit(x_train, y_train)

In [34]:
y_pred = log_reg.predict(x_test)

In [35]:
y_pred

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1], dtype=int64)

In [36]:
log_reg.predict_proba(x_test)

array([[0.08416481, 0.91583519],
       [0.73331314, 0.26668686],
       [0.67608736, 0.32391264],
       [0.13103971, 0.86896029],
       [0.26527502, 0.73472498],
       [0.02633041, 0.97366959],
       [0.03216878, 0.96783122],
       [0.58595242, 0.41404758],
       [0.01330911, 0.98669089],
       [0.0175782 , 0.9824218 ],
       [0.38788386, 0.61211614],
       [0.11018936, 0.88981064],
       [0.6284902 , 0.3715098 ],
       [0.63291978, 0.36708022],
       [0.05458198, 0.94541802],
       [0.81659983, 0.18340017],
       [0.2394889 , 0.7605111 ],
       [0.76072156, 0.23927844],
       [0.18245307, 0.81754693],
       [0.03431942, 0.96568058],
       [0.42888075, 0.57111925],
       [0.68423236, 0.31576764],
       [0.09819111, 0.90180889],
       [0.03046917, 0.96953083],
       [0.04361825, 0.95638175],
       [0.05386227, 0.94613773],
       [0.69341851, 0.30658149],
       [0.31750879, 0.68249121],
       [0.06123015, 0.93876985],
       [0.51140909, 0.48859091],
       [0.

### model accuracy: 

In [37]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7391304347826086

### 73.91%

In [70]:
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat

array([[ 7,  8],
       [ 6, 25]], dtype=int64)

In [71]:
from sklearn.metrics import classification_report

In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.47      0.50        15
           1       0.76      0.81      0.78        31

    accuracy                           0.70        46
   macro avg       0.65      0.64      0.64        46
weighted avg       0.69      0.70      0.69        46



# KNN model

In [40]:
# write one fuction and call as many as times to check accuracy_score of different models.

def metric_score (clf, x_train, x_test, y_train, y_test, train = True ):
    if train :
        y_pred = clf.predict(x_train)
        
        print("\n ____________TRAIN RESULT______________")
        
        print(f"Accuracy Score : {accuracy_score (y_train, y_pred) * 100:.2f}%")
   
    
    elif train == False:
        
        pred = clf.predict(x_test)
 
        print("\n ____________TEST RESULT______________")
        
        print(f"Accuracy Score : {accuracy_score (y_test, pred) * 100:.2f}%")
    
        print('\n\n TEST CLASSIFICATION REPORT\n', classification_report(y_test, pred, digits=2))  #model confidence/accuracy
    
   

In [41]:
from sklearn.neighbors import KNeighborsClassifier

In [42]:
# initiate KNeighborsClassifier

knn = KNeighborsClassifier()

# model training

knn.fit(x_train, y_train)

In [43]:
# call the fuction and pass dataset to check train and test score

metric_score (knn, x_train, x_test, y_train, y_test, train = True)  # this is for training score

metric_score (knn, x_train, x_test, y_train, y_test, train = False)  # this is for testing score


 ____________TRAIN RESULT______________
Accuracy Score : 82.48%

 ____________TEST RESULT______________
Accuracy Score : 73.91%


 TEST CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       0.60      0.60      0.60        15
           1       0.81      0.81      0.81        31

    accuracy                           0.74        46
   macro avg       0.70      0.70      0.70        46
weighted avg       0.74      0.74      0.74        46



### 73.91%

### cross validation 

In [44]:
# cross validation score to chek if the model is overfitting.

from sklearn.model_selection import cross_val_score

cross_val_score(knn,x_scaled, y, cv=5 )

cross_val_score(knn,x_scaled, y, cv=5).mean()

0.7207207207207207

### Hyperparameter tunning

In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
param_grid = {'algorithm' : ['kd_tree', 'brute'],
             'leaf_size' : [3,5,6,7,8],
             'n_neighbors' : [3,5,7,9,11,13]}

In [47]:
gridsearch = GridSearchCV(estimator = knn, param_grid = param_grid)

In [48]:
gridsearch.fit(x_train, y_train)

In [49]:
gridsearch.best_score_

0.7801587301587302

In [50]:
gridsearch.best_estimator_

In [51]:
# now we will use this best parameter in knn algorithm and check if accuracy is increasing,

knn = KNeighborsClassifier(algorithm='kd_tree', leaf_size=3, n_neighbors=9)
knn.fit(x_train, y_train)

In [52]:
# call the fuction and pass dataset to check train and test score

metric_score (knn, x_train, x_test, y_train, y_test, train = True)  # this is for training score

metric_score (knn, x_train, x_test, y_train, y_test, train = False)  # this is for testing score


 ____________TRAIN RESULT______________
Accuracy Score : 78.83%

 ____________TEST RESULT______________
Accuracy Score : 69.57%


 TEST CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       0.54      0.47      0.50        15
           1       0.76      0.81      0.78        31

    accuracy                           0.70        46
   macro avg       0.65      0.64      0.64        46
weighted avg       0.69      0.70      0.69        46



### 69.57%

In [53]:
# if you want to check type1 and type2 error using cunfusion matrix 
# we required this error as low as possible

y_pred = knn.predict(x_test)

cfm = confusion_matrix(y_test, y_pred)

cfm

array([[ 7,  8],
       [ 6, 25]], dtype=int64)

# Decision Tree

In [56]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

In [57]:
# write one fuction and call as many as times to check accuracy_score of different models.

def metric_score (clf, x_train, x_test, y_train, y_test, train = True ):
    if train :
        y_pred = clf.predict(x_train)
        
        print("\n ____________TRAIN RESULT______________")
        
        print(f"Accuracy Score : {accuracy_score (y_train, y_pred) * 100:.2f}%")
   
    
    elif train == False:
        
        pred = clf.predict(x_test)
 
        print("\n ____________TEST RESULT______________")
        
        print(f"Accuracy Score : {accuracy_score (y_test, pred) * 100:.2f}%")
    
        print('\n\n TEST CLASSIFICATION REPORT\n', classification_report(y_test, pred, digits=2))  #model confidence/accuracy
    

In [58]:
# model initiation

clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

In [59]:
# call the fuction and pass dataset to check train and test score

metric_score (clf, x_train, x_test, y_train, y_test, train = True)  # this is for training score

metric_score (clf, x_train, x_test, y_train, y_test, train = False)  # this is for testing score


 ____________TRAIN RESULT______________
Accuracy Score : 100.00%

 ____________TEST RESULT______________
Accuracy Score : 82.61%


 TEST CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       0.68      0.87      0.76        15
           1       0.93      0.81      0.86        31

    accuracy                           0.83        46
   macro avg       0.81      0.84      0.81        46
weighted avg       0.85      0.83      0.83        46



### 82.61%

In [60]:
# if you want to check type1 and type2 error using cunfusion matrix 
# we required this error as low as possible

y_pred = clf.predict(x_test)

cfm = confusion_matrix(y_test, y_pred)

cfm

array([[13,  2],
       [ 6, 25]], dtype=int64)

In [61]:
# We are tunnig four Imortant Hyperparameters right now, we are passing the different values for both parameters.

param_grid = {
            'criterion':['gini','entropy'],    
            'max_depth': range(10,15),          # The maximum depth of the tree
            'min_samples_leaf': range(2,6),     # The minimum number of samples required to be at a leaf node.
            'min_samples_split': range(3,8),     # The minimum number of samples required to split an internal node.
            'max_leaf_nodes': range(20,50)      # If None then unlimited number of leaf nodes.
}     

In [62]:
gridsearch = GridSearchCV(estimator = clf, 
                           param_grid= param_grid,  
                           cv=5, 
                           n_jobs = -1)    # Use all the cores in your system. for perfomance improvement.

In [63]:
gridsearch.fit(x_train,y_train)

In [64]:
best_parameters = gridsearch.best_params_
print(best_parameters)

{'criterion': 'entropy', 'max_depth': 11, 'max_leaf_nodes': 40, 'min_samples_leaf': 2, 'min_samples_split': 3}


In [65]:
# Initiate DecisionTreeClassifier with new parameters and train

clf = DecisionTreeClassifier(criterion ='entropy',  max_depth = 11,  min_samples_leaf = 2,   min_samples_split = 3)

# Train the model

clf.fit(x_train,y_train)

In [66]:
# call the fuction and pass dataset to check train and test score

metric_score (clf, x_train, x_test, y_train, y_test, train = True)  # this is for training score

metric_score (clf, x_train, x_test, y_train, y_test, train = False)  # this is for testing score


 ____________TRAIN RESULT______________
Accuracy Score : 94.89%

 ____________TEST RESULT______________
Accuracy Score : 73.91%


 TEST CLASSIFICATION REPORT
               precision    recall  f1-score   support

           0       0.59      0.67      0.62        15
           1       0.83      0.77      0.80        31

    accuracy                           0.74        46
   macro avg       0.71      0.72      0.71        46
weighted avg       0.75      0.74      0.74        46



### 73.91%

In [67]:
# if you want to check type1 and type2 error using cunfusion matrix 
# we required this error as low as possible

y_pred = clf.predict(x_test)

cfm = confusion_matrix(y_test, y_pred)

cfm

array([[10,  5],
       [ 7, 24]], dtype=int64)