In [1]:

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
titanic = pd.read_csv('../LogisticRegression/titanic.csv')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
titanic.groupby(['parch']).age.mean()

# creating a function to find the missing value in age .. so I used parch since parch is of 7 types I tried to include 
# them with their age 
def approx_age(cols):
    Age = cols[0]
    Parch = cols[1]
    
    if pd.isnull(Age):
        if Parch == 0 :
            return 32
        elif Parch == 1 :
            return 24
        elif Parch == 2 :
            return 17
        elif Parch == 3 :
            return 33 
        elif Parch == 4 :
            return 44
        else : 
            return 40
    else: 
        return Age
titanic['age'] = titanic[['age', 'parch']].apply(approx_age, axis=1)
titanic.isnull().sum()
## embark has 2 Null value
# first we will drop the table deck
titanic.drop('deck', axis=1, inplace=True)
titanic.dropna(inplace=True)

In [4]:
# let's drop the unnecessary columns
col = ['alive', 'alone', 'adult_male', 'who', 'embark_town', 'class']
data =titanic.drop(col, axis=1)

In [5]:
# let's create one hot encoding  using get dummies
col_to_get_dummies = ['sex', 'embarked']
dummie = []
for i in col_to_get_dummies:
    dummie.append(pd.get_dummies(titanic[i], drop_first=True))
    
data.drop(['sex', 'embarked'], axis=1, inplace=True)
data_dmy = pd.concat([data, dummie[0], dummie[1]], axis=1)
data_dmy.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [7]:
X = data_dmy.drop(['survived'], axis=1)
y = data_dmy['survived']

In [8]:
# train test split 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [9]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# removing warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# creating as instance of logitic regression
Log_R = LogisticRegression()

In [10]:
Log_R.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [11]:
y_pred = Log_R.predict(x_test)

In [14]:
# confusion matrix an classify report 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

def metricsss(y_test, y_pred):
    print('confusion matrix\n',confusion_matrix(y_test, y_pred),'\n')
    print('classificatio report \n',classification_report(y_test, y_pred))

In [15]:
metricsss(y_test, y_pred)

confusion matrix
 [[104  10]
 [ 20  44]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.84      0.91      0.87       114
           1       0.81      0.69      0.75        64

    accuracy                           0.83       178
   macro avg       0.83      0.80      0.81       178
weighted avg       0.83      0.83      0.83       178



* from the matrices it is clear that recall for 0 is good but the recall for 1 is not that good 
* we will try some tree based classification to check how will it perform and also we will do some hyper-parameter tuning as required 

## Decision Tree based Classification 

In [28]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
print(clf)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

metricsss(y_test, y_pred)

DecisionTreeClassifier()
confusion matrix
 [[100  14]
 [ 14  50]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.88      0.88      0.88       114
           1       0.78      0.78      0.78        64

    accuracy                           0.84       178
   macro avg       0.83      0.83      0.83       178
weighted avg       0.84      0.84      0.84       178



In [75]:
# let's do some parameter tuning 
param_grid = {
    "max_depth":[3,5,6,9,10],
    "max_features":[1,2,3,4,5,6,8],
    "random_state":[1,2,3,4,5,6,7,8,9,10,11,100,200,300,400,500,600],
    "min_samples_leaf":[1,2,3,4,5]
}


In [76]:
from sklearn.model_selection import GridSearchCV

In [77]:
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)

In [78]:
grid.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 6, 9, 10],
                         'max_features': [1, 2, 3, 4, 5, 6, 8],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'random_state': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                          100, 200, 300, 400, 500, 600]})

In [79]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 5, 'random_state': 500}
0.8199718706047819


In [80]:
y_pred = grid.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[110   4]
 [ 23  41]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.83      0.96      0.89       114
           1       0.91      0.64      0.75        64

    accuracy                           0.85       178
   macro avg       0.87      0.80      0.82       178
weighted avg       0.86      0.85      0.84       178



* Still after the parameter tunning values doesnot change that much but at some values are worst now than it was previous so it's , I think I should go for more parameters because I am using 2-3 parameters only so it might will help to do more parameter tuning 

### RandomForest Based Classification

In [81]:
from sklearn.ensemble import RandomForestClassifier

In [82]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [83]:
metricsss(y_test, y_pred)

confusion matrix
 [[104  10]
 [ 13  51]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.89      0.91      0.90       114
           1       0.84      0.80      0.82        64

    accuracy                           0.87       178
   macro avg       0.86      0.85      0.86       178
weighted avg       0.87      0.87      0.87       178



* If you compare this results with DT results, you will know now th result is better.
* Random Forest uses multiple trees and then aggregate the results depending on the situation [MAX, VOTE or any other type of aggregation]

## let's do some parameter tuning .. vroooommmmm.... ;p

In [99]:
param_grid = {
    "n_estimators": [20,50,100,150],
    "max_depth": [3, 5,6,7,8 ,10],
    "max_features":[1, 2, 3, 4, 5, 6, 8],
    "min_samples_leaf":[1,2,3]
}

In [100]:
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3) 


In [101]:
grid.fit(x_train, y_train)


GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 6, 7, 8, 10],
                         'max_features': [1, 2, 3, 4, 5, 6, 8],
                         'min_samples_leaf': [1, 2, 3],
                         'n_estimators': [20, 50, 100, 150],
                         'random_state': [500]})

In [102]:
y_pred = grid.predict(x_test)

In [103]:
metricsss(y_test, y_pred)

confusion matrix
 [[109   5]
 [ 19  45]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90       114
           1       0.90      0.70      0.79        64

    accuracy                           0.87       178
   macro avg       0.88      0.83      0.85       178
weighted avg       0.87      0.87      0.86       178



* I think because of random_state it is showing less Recall and F1_score.
* So this result don't look that good but as it took 10 mins for me to get complete I will not update it again ;(

## Gradient Descent Classification

In [122]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=100, max_depth=12)

In [123]:
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[103  11]
 [ 11  53]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       114
           1       0.83      0.83      0.83        64

    accuracy                           0.88       178
   macro avg       0.87      0.87      0.87       178
weighted avg       0.88      0.88      0.88       178



In [126]:
param_grid = {
    "n_estimators": [20,50,100,150],
    "max_depth": [8 ,10,11,12,14,15,19,20,22],
    "max_features":[1, 2, 3, 4, 5, 6, 8],
    "min_samples_leaf":[1,2,3]
    
}

In [127]:
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=3) 
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[104  10]
 [ 12  52]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.90      0.91      0.90       114
           1       0.84      0.81      0.83        64

    accuracy                           0.88       178
   macro avg       0.87      0.86      0.86       178
weighted avg       0.88      0.88      0.88       178



* Now after changing some parameters and adding some values we are able to get accuracy of 0.88 and    
           precision    recall  f1-score   support
           0       0.90      0.91      0.90       114
           1       0.84      0.81      0.83        64
           
* from previous result this may be better result we got.

## XGBoost

In [128]:
import xgboost as xgb 
clf = xgb.XGBRFClassifier()

In [129]:
clf.fit(x_train, y_train)

XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=0, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)

In [130]:
y_pred = clf.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[110   4]
 [ 18  46]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       114
           1       0.92      0.72      0.81        64

    accuracy                           0.88       178
   macro avg       0.89      0.84      0.86       178
weighted avg       0.88      0.88      0.87       178



In [136]:
param_grid = {
    "n_estimators": [20,50,100,150],
    "max_depth": [8 ,10,11,12,14,15,19,20,22],
    "random_state": [10,20,30],
    "colsample_bylevel": [1,0.9,0.7],
    "colsample_bytree" : [1,0.9,0.65],
    "subsample":[1,0.95, 0.9,0.7]
    
    
}
grid = GridSearchCV(xgb.XGBClassifier(), param_grid, cv=3)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[106   8]
 [ 13  51]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       114
           1       0.86      0.80      0.83        64

    accuracy                           0.88       178
   macro avg       0.88      0.86      0.87       178
weighted avg       0.88      0.88      0.88       178



In [137]:
import lightgbm, catboost

In [141]:
lgb = lightgbm.LGBMClassifier()
cat = catboost.CatBoostClassifier(n_estimators=100)

In [142]:
lgb.fit(x_train, y_train)
y_pred = lgb.predict(x_test)
metricsss(y_test, y_pred)

confusion matrix
 [[105   9]
 [ 13  51]] 

classificatio report 
               precision    recall  f1-score   support

           0       0.89      0.92      0.91       114
           1       0.85      0.80      0.82        64

    accuracy                           0.88       178
   macro avg       0.87      0.86      0.86       178
weighted avg       0.88      0.88      0.88       178



In [143]:
cat.fit(x_train, y_train)
y_pred = cat.predict(x_test)
metricsss(y_test, y_pred)

Learning rate set to 0.073567
0:	learn: 0.6693612	total: 1.72ms	remaining: 170ms
1:	learn: 0.6493859	total: 3.06ms	remaining: 150ms
2:	learn: 0.6290900	total: 4.28ms	remaining: 139ms
3:	learn: 0.6104354	total: 5.22ms	remaining: 125ms
4:	learn: 0.5936370	total: 6.19ms	remaining: 118ms
5:	learn: 0.5769224	total: 7.43ms	remaining: 116ms
6:	learn: 0.5649483	total: 8.83ms	remaining: 117ms
7:	learn: 0.5529848	total: 9.6ms	remaining: 110ms
8:	learn: 0.5407434	total: 11.1ms	remaining: 113ms
9:	learn: 0.5302091	total: 13.3ms	remaining: 120ms
10:	learn: 0.5190914	total: 14.7ms	remaining: 119ms
11:	learn: 0.5114576	total: 16ms	remaining: 117ms
12:	learn: 0.5034703	total: 17.2ms	remaining: 115ms
13:	learn: 0.4958064	total: 20.5ms	remaining: 126ms
14:	learn: 0.4890309	total: 21.9ms	remaining: 124ms
15:	learn: 0.4833097	total: 22.8ms	remaining: 120ms
16:	learn: 0.4773902	total: 24.5ms	remaining: 119ms
17:	learn: 0.4723384	total: 25.9ms	remaining: 118ms
18:	learn: 0.4676809	total: 27.6ms	remaining: 1