Subject of notebook : Comment each step of best_model set_matplotlib_close\
Name of the auther : Qadir Shahbaz\
Where to contact : qadir_shahbaz@yahoo.co.uk\
date : 22/01/2023 

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load dataset
### Assign X and y
### used get_dummies to convert data type. This is called label encoding
### deal with missing values

In [2]:
df = sns.load_dataset('titanic')
X = df[["pclass", "sex", "age", "sibsp", "parch", "fare"]]
y = df["survived"]
X = pd.get_dummies(X, columns=["sex"])
X.age.fillna(value= X["age"].mean(), inplace = True)


In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      891 non-null    int64  
 1   age         891 non-null    float64
 2   sibsp       891 non-null    int64  
 3   parch       891 non-null    int64  
 4   fare        891 non-null    float64
 5   sex_female  891 non-null    uint8  
 6   sex_male    891 non-null    uint8  
dtypes: float64(2), int64(3), uint8(2)
memory usage: 36.7 KB


## from sklearn import supervised machine learning algorithm
## from sklearn import evaluation methods for classification

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



## We spit the data to check on the basis of metrics

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## We have assign  variable **models** to all the classification algorithm.
## We have assign  variable **models_names** to all the model names

In [6]:
models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier()]
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest', 'KNN']

### first we have made an empty list and assign variable models_scores to it
### Secondly we used for loop to iterate the variables models and model names. This code will find y_pred and accuracy and will append the values in varaible models_scores for each model.

In [7]:
models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    models_scores.append([model_name,accuracy])

## The below code will sort the result in descending order

In [8]:
sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print("Accuracy Score: ",f'{model[0]} : {model[1]:.2f}')

Accuracy Score:  Logistic Regression : 0.81
Accuracy Score:  Random Forest : 0.79
Accuracy Score:  Decision Tree : 0.75
Accuracy Score:  KNN : 0.69
Accuracy Score:  SVM : 0.66


## This code will find y_pred and precision and will append the values in varaible models_scores for each model.

In [9]:
models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier()]
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest', 'KNN']
models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    Precision = precision_score(y_test, y_pred)
    models_scores.append([model_name,Precision])

sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print("Precision Score: ", f'{model[0]} : {model[1]:.2f}')

Precision Score:  Random Forest : 0.81
Precision Score:  Logistic Regression : 0.80
Precision Score:  SVM : 0.76
Precision Score:  Decision Tree : 0.73
Precision Score:  KNN : 0.66


## This code will find y_pred and recall and will append the values in varaible models_scores for each model.


In [37]:
from sklearn.model_selection import GridSearchCV


models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest', 'KNN']
models_scores = []
Best_parameter = []
Best_score = []

for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    Recall = recall_score(y_test, y_pred)
    models_scores.append([model_name,Recall])
    
for param in zip(model,model_names):
    if param == LogisticRegression:
        param_grid = {'fit_intercept': [True, False]}
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "r2")
        grid_search.fit(X,y)
        Best_parameter.append([grid_search.best_params_])
        Best_score.append([grid_search.best_score_])
        
    if param == SVC:
        param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "r2")
        grid_search.fit(X,y)
        Best_parameter.append([grid_search.best_params_])
        Best_score.append([grid_search.best_score_])
        
    if param == DecisionTreeClassifier:
        param_grid = {'criterion': ['gini', 'entropy']}
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "r2")
        grid_search.fit(X,y)
        Best_parameter.append([grid_search.best_params_])
        Best_score.append([grid_search.best_score_])
        
    if param == RandomForestClassifier:
        param_grid = {'n_estimators': [50, 100, 150, 200, 250]}
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "r2")
        grid_search.fit(X,y)
        Best_parameter.append([grid_search.best_params_])
        Best_score.append([grid_search.best_score_])
    # if param == KNeighborsClassifier:
    #     param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
    #     grid_search = GridSearchCV(model, param_grid, cv=5, scoring = "precision")
    #     grid_search.fit(X,y)
    #     print("Best Parameters: ", grid_search.best_params_)     
sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print("Precision Score: ", f'{model[0]} : {model[1]:.2f}')


sorted_Best_parameter = sorted(Best_parameter, key=lambda x: x[1], reverse=True)
for x in sorted_Best_parameter :
    print("Best_parameter: ",f'{model[0]} : {model[1]:.2f}')
    
print(Best_score)
print(Best_score) 


Precision Score:  Logistic Regression : 0.72
Precision Score:  Random Forest : 0.72
Precision Score:  Decision Tree : 0.70
Precision Score:  SVM : 0.26
[]
[]


## This code will find y_pred and F1 and will append the values in varaible models_scores for each model.

In [12]:
models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier()]
model_names = ['Logistic Regression', 'SVM', 'Decision Tree', 'Random Forest', 'KNN']
models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    F1 = f1_score(y_test, y_pred)
    models_scores.append([model_name,F1])

sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print("F1 Score: ",f'{model[0]} : {model[1]:.2f}')

F1 Score:  Logistic Regression : 0.76
F1 Score:  Random Forest : 0.75
F1 Score:  Decision Tree : 0.72
F1 Score:  KNN : 0.59
F1 Score:  SVM : 0.38
