# Visualization, EDA and Model Metrics Comparison for Car Insurance Cold Calls

**Let's Analyze the Data**

In [None]:
# Importing the required Libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from matplotlib import pyplot

from sklearn.feature_selection import SelectKBest,f_classif  # Feature Engineering
from sklearn.model_selection import train_test_split  # Splitting the dataset into training & testing

# Regression & Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


#Metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,recall_score,precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
# Loading the Dataset.

df_train=pd.read_csv('../input/carinsurance/carInsurance_train.csv')

In [None]:
df_train.head()

In [None]:
df_train.info()

# Handling the Missing Values

In [None]:
# There are missing values in the fields -> "Job", "Education", "Communication" & "Outcome".
# Let's check the percentage of missing values in them.

print("Missing values (Count):-")
print("\n")
print(df_train.isnull().sum())
print("\n")
print("Missing values (Percentage (%)):-")
print("\n")
print((df_train.isnull().sum()/len(df_train))*100)

Highest percentage of missing data (76%) is in the "Outcome" field.
Let's handle the Missing values.

In [None]:
# Analyzing the Categorical values in the Missing value fields.

print("Job Field ->")
print(df_train.Job.value_counts())
print("\n")
print("Education Field ->")
print(df_train.Education.value_counts())
print("\n")
print("Communication Field ->")
print(df_train.Communication.value_counts())

In [None]:
# Let's determine the most commonly occuring values in the fields -> "Job", "Education" & "Communication".

print("Job Field")
print(df_train.Job.mode())
print("\n")
print("Education Field")
print(df_train.Education.mode())
print("\n")
print("Communication Field")
print(df_train.Communication.mode())

In [None]:
# Let's fill the missing values with their respective modes.

for i in ["Job","Education","Communication"]:
    df_train[i]=df_train[i].fillna(df_train[i].mode()[0])

In [None]:
# As 76% of the data is missing in "Outcome" field, it's better to drop the column.

df_train.drop('Outcome',axis=1,inplace=True)

In [None]:
# Converting the type of "CallStart" & "CallEnd" to Datetime.

df_train[['CallStart','CallEnd']]=df_train[['CallStart','CallEnd']].astype('datetime64[ns]')

In [None]:
# Calculating the total Call Duration

call_length = df_train['CallEnd'] - df_train['CallStart']

# Converting the whole timedelta to seconds.
# `dt.total_seconds()` accounts for every component (days, hours, minutes, seconds);
# summing only the `minutes` and `seconds` components would silently drop the hours
# of any call lasting an hour or more.
# "CallStart" and "CallEnd" have no missing values (see the missing-value check above),
# so the cast back to an integer number of seconds is safe.

df_train['Call_Duration'] = call_length.dt.total_seconds().astype('int64')
df_train['Call_Duration'].head()

In [None]:
df_train.head()

In [None]:
# Adding a column with Age Ranges.

def agerange(age):
    """Return the age-bracket label for a customer's age.

    The brackets are inclusive on both ends ("18-20", "21-30", ..., "81-90");
    anything strictly greater than 90 is labelled "Above 90".

    Returns None when the age does not fall inside any bracket, i.e. for ages
    below 18, for non-integer ages lying between two brackets (e.g. 20.5) and
    for NaN.
    """
    brackets = (
        (18, 20, "18-20"),
        (21, 30, "21-30"),
        (31, 40, "31-40"),
        (41, 50, "41-50"),
        (51, 60, "51-60"),
        (61, 70, "61-70"),
        (71, 80, "71-80"),
        (81, 90, "81-90"),
    )
    for lowest, highest, label in brackets:
        if lowest <= age <= highest:
            return label
    if age > 90:
        return "Above 90"
    return None

In [None]:
df_train['Age Range']=df_train['Age'].apply(agerange)

# Data Visualization

In [None]:
# Let's analyze the Age of the Customers w.r.t Jobs

AgeRange_crosstab=pd.crosstab(index=df_train['Age Range'],columns=df_train['Job'])
AgeRange_crosstab

* Almost all customers have been employed from the age of 21.
* The oldest customers are above 90 yrs & both of them are retired.
* Most of the Customers are in blue-collar or management Jobs & some are in technician & admin Jobs.

In [None]:
# Plotting the above values.

AgeRange_crosstab.plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0)
plt.xlabel('Age Ranges',fontsize=16)
plt.ylabel('Job',fontsize=16)
plt.title('Analyzing Age w.r.t Job',fontsize=18)
plt.legend(title='Job',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the age dependancy w.r.t Car Insurance Opting Decisions

Age_crosstab=pd.crosstab(index=df_train['Age Range'],columns=df_train['CarInsurance'])
Age_crosstab['Percentage Enrolled']=round(Age_crosstab[1]/(Age_crosstab[0]+Age_crosstab[1])*100,2)
Age_crosstab

* The largest number of enrollments comes from customers in their 30s, and so does the largest number of rejections.
* Most of the customers who have enrolled are between 21 and 60 years of age, possibly because of job security.

In [None]:
# Plotting the above values.

Age_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Age Ranges',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance policy Decisions w.r.t Age',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the Job dependancy w.r.t Car Insurance Opting Decisions

Job_crosstab=pd.crosstab(df_train['Job'],df_train['CarInsurance'],colnames=['Car Insurance'])
Job_crosstab['Percentage Enrolled']=round(Job_crosstab[1]/(Job_crosstab[0]+Job_crosstab[1])*100,2)
Job_crosstab

* Most of the people who've enrolled for the insurance are working in management or Technician jobs.
* Surprisingly, more unemployed customers have enrolled for the insurance than have rejected it.

In [None]:
# Plotting the above values.

Job_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Jobs',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Job',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of marital status on Car Insurance Opting Decisions

Marital_crosstab=pd.crosstab(df_train['Marital'],df_train['CarInsurance'],colnames=['Car Insurance'])
Marital_crosstab['Percentage Enrolled']=round(Marital_crosstab[1]/(Marital_crosstab[0]+Marital_crosstab[1])*100,2)
Marital_crosstab

* Married customers form the largest group of enrollers, followed by single customers and then divorced customers.

In [None]:
# Plotting the above values.

Marital_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Marital Status',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Marital Status',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of Education on Car Insurance Opting Decisions

Education_crosstab=pd.crosstab(df_train['Education'],df_train['CarInsurance'],colnames=['Car Insurance'])
Education_crosstab['Percentage Enrolled']=round(Education_crosstab[1]/(Education_crosstab[0]+Education_crosstab[1])*100,2)
Education_crosstab

* Customers with a secondary level of education are the largest group of enrollers. However, there are still more rejectors than enrollers.

In [None]:
# Plotting the above values.

Education_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Education',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Education',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of House-Hold Insurance on Car Insurance Opting Decisions

HHInsurance_crosstab=pd.crosstab(df_train['HHInsurance'],df_train['CarInsurance'],colnames=['Car Insurance'])
HHInsurance_crosstab['Percentage Enrolled']=round(HHInsurance_crosstab[1]/(HHInsurance_crosstab[0]+HHInsurance_crosstab[1])*100,2)
HHInsurance_crosstab

* Most of the customers who have a House-Hold Insurance (1380 customers) have rejected the Car Insurance.

In [None]:
# Plotting the above values.

HHInsurance_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('House-Hold Insurance',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t House-Hold Insurance',fontsize=18)
# Only the two count columns (0 = rejected, 1 = accepted) are plotted, because the
# "Percentage Enrolled" column is dropped above, so the legend carries exactly two labels.
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of Loan defaulting on Car Insurance Opting Decisions

Default_crosstab=pd.crosstab(df_train['Default'],df_train['CarInsurance'],colnames=['Car Insurance'])
Default_crosstab['Percentage Enrolled']=round(Default_crosstab[1]/(Default_crosstab[0]+Default_crosstab[1])*100,2)
Default_crosstab

* About 40% of the Non-Defaulters have enrolled in the policy & 14 Defaulters have also enrolled themselves in the policy.

In [None]:
# Plotting the above values.

Default_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Default',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Loan Defaulting',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of Car Loan on Car Insurance Opting Decisions

CarLoan_crosstab=pd.crosstab(df_train['CarLoan'],df_train['CarInsurance'],colnames=['Car Insurance'])
CarLoan_crosstab['Percentage Enrolled']=round(CarLoan_crosstab[1]/(CarLoan_crosstab[0]+CarLoan_crosstab[1])*100,2)
CarLoan_crosstab

* About 42% of them without a car loan have enrolled in the policy.
* So, it's likely that more people without a car loan (about 41%) may enroll in the policy.

In [None]:
# Plotting the above values.

CarLoan_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Car Loan',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Car Loan',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of the Mode of Communication on Car Insurance Opting Decisions

Communication_crosstab=pd.crosstab(df_train['Communication'],df_train['CarInsurance'],colnames=['Car Insurance'])
Communication_crosstab['Percentage Enrolled']=round(Communication_crosstab[1]/(Communication_crosstab[0]+Communication_crosstab[1])*100,2)
Communication_crosstab

* Almost 40% of them who have been contacted through a cellular device have enrolled in the policy.
* So, the probability of getting the people enrolled in the policy by contacting them through a cellular device is high.

In [None]:
# Plotting the above values.

Communication_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Communication',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Communication',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's check the dependancy of the Contact Month on Car Insurance Opting Decisions

LastContactMonth_crosstab=pd.crosstab(df_train['LastContactMonth'],df_train['CarInsurance'],colnames=['Car Insurance'])
LastContactMonth_crosstab['Percentage Enrolled']=round(LastContactMonth_crosstab[1]/(LastContactMonth_crosstab[0]+
                                                                                     LastContactMonth_crosstab[1])*100,2)
LastContactMonth_crosstab

* More of the people who were contacted during the months of "March", "April", "September", "October" & "December" have enrolled in the policy compared with the other months.

In [None]:
# Plotting the above values.

LastContactMonth_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Last Contact Month',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Last Contact Month',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's define a new column containing the categorical values of the days of a month.

def Day_Categories(day):
    """Classify a day of the month as the start, middle or end of the month.

    Days 1-11 map to "Month Starting", 12-21 to "Middle of the Month" and
    22-31 to "Month Ending" (all bounds inclusive). Any value outside those
    ranges yields None.
    """
    periods = (
        (1, 11, "Month Starting"),
        (12, 21, "Middle of the Month"),
        (22, 31, "Month Ending"),
    )
    for first_day, last_day, label in periods:
        if first_day <= day <= last_day:
            return label
    return None

In [None]:
df_train['Day_Categories']=df_train['LastContactDay'].apply(Day_Categories)

In [None]:
# Let's check the dependancy of Last Contacted Day on Car Insurance Opting Decisions

LastContactDay_crosstab=pd.crosstab(df_train['Day_Categories'],df_train['CarInsurance'],colnames=['Car Insurance'])
LastContactDay_crosstab['Percentage Enrolled']=round(LastContactDay_crosstab[1]/(LastContactDay_crosstab[0]+
                                                                                     LastContactDay_crosstab[1])*100,2)
LastContactDay_crosstab

* More people have enrolled in the policy during the starting & ending days of the month compared with the middle of the month.

In [None]:
# Plotting the above values.

LastContactDay_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Last Contact Day',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Last Contact Day',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
df_train.NoOfContacts.value_counts()

In [None]:
# Let's categorize the No of times the bank has contacted the customers regarding the Insurance policy.

def update_contacts(contact):
    """Bucket the number of contacts made to a customer into a readable label.

    Exactly 1 contact -> "Contacted once"; (1, 10] -> "Contacted More than once";
    (10, 20] -> "Contacted more than 10 times"; (20, 30] -> "Contacted more than 20";
    above 30 -> "Contacted more than 30 times".

    Returns None for anything that is neither 1 nor greater than 1
    (e.g. 0, negative values or NaN).
    """
    if contact == 1:
        return "Contacted once"
    # Everything below only applies to counts strictly above one.
    if not contact > 1:
        return None
    if contact <= 10:
        return "Contacted More than once"
    if contact <= 20:
        return "Contacted more than 10 times"
    if contact <= 30:
        return "Contacted more than 20"
    return "Contacted more than 30 times"

In [None]:
df_train['NoOfContacts_Category']=df_train['NoOfContacts'].apply(update_contacts)

In [None]:
df_train.NoOfContacts_Category.value_counts()

In [None]:
# Dependancy of No of contacts by the bank on Car Insurance Opting Decisions

NoOfContacts_Category_crosstab=pd.crosstab(df_train['NoOfContacts_Category'],df_train['CarInsurance'],colnames=['Car Insurance'])
NoOfContacts_Category_crosstab['Percentage Enrolled']=round(NoOfContacts_Category_crosstab[1]/(NoOfContacts_Category_crosstab[0]+
                                                                                     NoOfContacts_Category_crosstab[1])*100,2)
NoOfContacts_Category_crosstab

* When contacted just once, almost 46% of the customers have enrolled in the policy.
* So, there's a better chance of getting people to enroll by contacting & convincing them once or, in some cases, more than once.
* The more often the bank contacts a customer, the less likely that customer is to opt for the policy.

In [None]:
# Plotting the above values.

NoOfContacts_Category_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 90,fontsize=12)
plt.xlabel('No of Contacts made',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Contacts made',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Converting "Call Duration" from seconds to whole minutes.
#
# The previous form passed a stray `2` as a second positional argument to `Series.apply`
# (not to `round`), which is deprecated/removed in recent pandas. The values were always
# rounded to whole minutes, and that is kept here because the crosstab and bar plot below
# group by whole minutes.
# NOTE: this cell overwrites "Call_Duration" in place, so it must be run exactly once per
# kernel session; running it again would divide the minutes by 60 a second time.

df_train['Call_Duration']=(df_train['Call_Duration']/60).round().astype('int64')
df_train['Call_Duration'].head()

In [None]:
# Dependancy of Call Duration on Car Insurance Opting Decisions

Call_Duration_crosstab=pd.crosstab(df_train['Call_Duration'],df_train['CarInsurance'],colnames=['Car Insurance'],
                                   rownames=['Call Duration (in minutes)'])
Call_Duration_crosstab['Percentage Enrolled']=round(Call_Duration_crosstab[1]/(Call_Duration_crosstab[0]+ Call_Duration_crosstab[1])*100,2)
Call_Duration_crosstab

* 1 customer whose call duration rounds to 0 minutes (i.e. a call of about half a minute or less) has enrolled in the policy.
* It can be observed that the longer the call duration, the more customers have enrolled themselves in the policy.
* It may be that during longer calls, the bank officials have had more time to convince the customers to enroll.

In [None]:
# Plotting the above values.

Call_Duration_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Call Duration (in minutes)',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Call Duration',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
# Let's categorize the Call durations for better understanding according to the mean duration.

mean=df_train.Call_Duration.mean()
mean

def update_duration(call, threshold=None):
    """Label a call duration as shorter or longer than a reference duration.

    Parameters
    ----------
    call : number
        Call duration, in the same unit as the reference duration.
    threshold : number, optional
        Reference duration to compare against. When omitted, the notebook-level
        variable `mean` (the mean call duration computed in this cell) is used,
        so existing single-argument calls behave exactly as before.

    Returns
    -------
    str or None
        "Less than Mean Duration" or "More than Mean Duration". None is returned
        when `call` is exactly equal to the reference (or is NaN), because neither
        strict comparison holds.
    """
    if threshold is None:
        threshold = mean
    if call < threshold:
        return "Less than Mean Duration"
    if call > threshold:
        return "More than Mean Duration"
    return None

In [None]:
df_train['Call_Duration_Mean']=df_train['Call_Duration'].apply(update_duration)

In [None]:
# Dependancy of Mean Call Duration on Car Insurance Opting Decisions

Mean_Call_Duration_crosstab=pd.crosstab(df_train['Call_Duration_Mean'],df_train['CarInsurance'],colnames=['Car Insurance'],
                                   rownames=['Call Duration (in minutes)'])
Mean_Call_Duration_crosstab['Percentage Enrolled']=round(Mean_Call_Duration_crosstab[1]/(Mean_Call_Duration_crosstab[0]+ Mean_Call_Duration_crosstab[1])*100,2)
Mean_Call_Duration_crosstab

* When the call duration b/w the customers & the bank is greater than the mean call duration, more customers have enrolled themselves in the policy.

In [None]:
# Plotting the above values.

Mean_Call_Duration_crosstab.drop(['Percentage Enrolled'], axis=1).plot(kind='bar',stacked=False,figsize=(20,6))
plt.xticks(rotation = 0,fontsize=12)
plt.xlabel('Call Duration (in minutes)',fontsize=16)
plt.ylabel('Car Insurance',fontsize=16)
plt.title('Car Insurance Policy Decisions w.r.t Mean Call Duration',fontsize=18)
plt.legend(['Rejected','Accepted'],title='Car Insurance',title_fontsize=15,prop={"size":12})

In [None]:
df_train.head()

In [None]:
plt.figure(figsize=(21,7))
# Correlation is only defined for numeric columns. Selecting them explicitly keeps this
# cell working on pandas >= 2.0, where `DataFrame.corr()` no longer drops string/datetime
# columns silently and raises instead.
res=sns.heatmap(df_train.select_dtypes(include='number').corr(),annot=True)
res.set_xticklabels (res.get_xmajorticklabels (), fontsize = 12,rotation=45)
# Style the y axis with its own tick labels (the x-axis labels were being reused here).
res.set_yticklabels (res.get_ymajorticklabels (), fontsize = 12)

* There is a correlation of 0.5 b/w "DaysPassed" & "PrevAttempts".
* There is a correlation of 0.48 b/w "CarInsurance" & "Call_Duration".

# Data Preparation

In [None]:
# Let's drop the columns "Id", "CallStart" & "CallEnd".

df_train.drop(['Id','CallStart','CallEnd'],axis=1,inplace=True)

In [None]:
# Let's convert all the categorical valued features into numerical values by using get_dummies method in pandas.
# Each categorical column is listed exactly once: repeating a name (as was done with
# 'Call_Duration_Mean') makes get_dummies emit the same dummy columns twice.
# drop_first=True drops the first (alphabetically sorted) level of every feature to avoid
# redundant, perfectly collinear dummy columns.

categorical_columns=['Job','Marital','Education','Communication','LastContactMonth','NoOfContacts_Category',
                     'Call_Duration_Mean','Day_Categories','Age Range']
df_train=pd.get_dummies(data=df_train,columns=categorical_columns,drop_first=True)

In [None]:
# Removing all the duplicate columns if any.

df_train = df_train.loc[:,~df_train.columns.duplicated()]

In [None]:
# evaluating the important features for consideration using SelectKBest method.

X_temp=df_train.drop(['CarInsurance','NoOfContacts_Category_Contacted more than 10 times','NoOfContacts_Category_Contacted more than 20',
               'NoOfContacts_Category_Contacted more than 30 times','NoOfContacts_Category_Contacted once',
                'Call_Duration_Mean_More than Mean Duration','Day_Categories_Month Ending','Day_Categories_Month Starting',
               'Age Range_21-30','Age Range_31-40','Age Range_41-50','Age Range_51-60','Age Range_61-70','Age Range_71-80',
               'Age Range_81-90','Age Range_Above 90'],axis=1)

The cell above drops the target label column "CarInsurance" along with all the columns that were created only for the purpose of visualization.

In [None]:
X_temp.columns

In [None]:
y_temp=df_train['CarInsurance']

# Feature Engineering

In [None]:
# As there are -ve values present in the data, we need to use "f_classif" scoring function.
# Let's select top 35 features.
# NOTE(review): the selector is only fitted here and its scores are printed in the next
# cell. X_new is later built as a full copy of X_temp, so no feature is actually removed
# before training. TODO: confirm whether the top-35 selection was meant to be applied.
# NOTE(review): the fit uses the complete data set, before the train/test split below.

Best_Params=SelectKBest(score_func=f_classif, k=35)
Best_Params.fit(X_temp,y_temp)

In [None]:
df_scores=pd.DataFrame(Best_Params.scores_)                    # Feature Scores
df_columns=pd.DataFrame(X_temp.columns)                        # Feature Names
df_score_evaluation=pd.concat([df_scores,df_columns],axis=1)   # Concatinating both the dataframes
df_score_evaluation.columns=['Scores','Features']              # Renaming the columns
print(df_score_evaluation.nlargest(35,'Scores'))               # Sorting Scores in Descending order

In [None]:
X_temp.columns.sort_values()

In [None]:
import copy
X_new=copy.deepcopy(X_temp)
X_new.head()

In [None]:
y_new=copy.deepcopy(y_temp)

In [None]:
# Let's split the data into Training & Test sets.

X_train,X_test,y_train,y_test=train_test_split(X_new,y_new,test_size=0.20,random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Model Training

In [None]:
# LOGISTIC REGRESSION MODEL

lr_model=LogisticRegression()

# Hyper-parameter tuning
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
lr_c = [100, 10, 1.0, 0.1, 0.01]

lr_grid = dict(solver=solvers,penalty=penalty,C=lr_c)

# cross-validation using Repeated Stratified K-fold method.
lr_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid Search CV method loops through the different hyper parameters determining the optimal values.
lr_grid_search = GridSearchCV(estimator=lr_model, param_grid=lr_grid, n_jobs=-1, cv=lr_cv, scoring='accuracy',error_score=0)

# Fitting the Model to the Dataset.
lr_grid_result=lr_grid_search.fit(X_train,y_train)

# returns the best hyper parameters.
lr_grid_result.best_params_

In [None]:
# Making predictions using our model.
lr_grid_predictions=lr_grid_result.predict(X_test)

In [None]:
# Model Metrics

print("LOGISTIC REGRESSION Model Performance Metrics:")
print(classification_report(y_test,lr_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,lr_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy, so the predicted probability of the positive class (CarInsurance = 1) is used.
lr_scores = lr_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,lr_scores)                       # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='Logistic Regression')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('LOGISTIC REGRESSION: ROC AUC=%.3f' % (lr_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
rc_model=RidgeClassifier()

alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

rc_grid = dict(alpha=alpha)

rc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

rc_grid_search = GridSearchCV(estimator=rc_model, param_grid=rc_grid, n_jobs=-1, cv=rc_cv, scoring='accuracy',error_score=0)

rc_grid_result=rc_grid_search.fit(X_train,y_train)

rc_grid_result.best_params_

In [None]:
rc_grid_predictions=rc_grid_result.predict(X_test)

In [None]:
print("RIDGE CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,rc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,rc_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy. RidgeClassifier has no predict_proba, so its signed distance to the decision
# boundary is used as the ranking score.
rc_scores = rc_grid_result.decision_function(X_test)

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,rc_scores)                    # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='RIDGE CLASSIFIER')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
rc_auc = roc_auc_score(y_test, rc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('RIDGE CLASSIFIER: ROC AUC=%.3f' % (rc_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
# DECISION TREE CLASSIFIER MODEL
# A fixed seed makes the tuned tree (and therefore the reported metrics) reproducible.
dtc_model=DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid.
# min_samples_split must be an integer >= 2 (a node needs at least two samples to be split).
# A value of 1 makes every fit for that candidate fail, and error_score=0 hides the failure
# as a zero score, so the range starts at 2 to avoid wasting a ninth of the search.
dtc_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(1, 10),'min_samples_split':range(2,10),'min_samples_leaf':range(1,5)}

# cross-validation using Repeated Stratified K-fold method.
dtc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

dtc_grid_search = GridSearchCV(estimator=dtc_model, param_grid=dtc_grid, n_jobs=-1, cv=dtc_cv, scoring='accuracy',error_score=0)

# Fitting the Model to the Dataset.
dtc_grid_result=dtc_grid_search.fit(X_train,y_train)

# returns the best hyper parameters.
dtc_grid_result.best_params_

In [None]:
dtc_grid_predictions=dtc_grid_result.predict(X_test)

In [None]:
print("DECISION TREE CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,dtc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,dtc_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy, so the predicted probability of the positive class (CarInsurance = 1) is used.
dtc_scores = dtc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,dtc_scores)                           # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='DECISION TREE CLASSIFIER')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
dtc_auc = roc_auc_score(y_test, dtc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('DECISION TREE CLASSIFIER: ROC AUC=%.3f' % (dtc_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
# BAGGING CLASSIFIER MODEL
# Bagging draws random bootstrap samples; a fixed seed makes the reported metrics reproducible.
bc_model=BaggingClassifier(random_state=42)

# Only one ensemble size is evaluated; the grid search is used here for its cross-validation.
bc_n_estimators = [1000]

bc_grid = dict(n_estimators=bc_n_estimators)

# cross-validation using Repeated Stratified K-fold method.
bc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

bc_grid_search = GridSearchCV(estimator=bc_model, param_grid=bc_grid, n_jobs=-1, cv=bc_cv, scoring='accuracy',error_score=0)

# Fitting the Model to the Dataset.
bc_grid_result=bc_grid_search.fit(X_train,y_train)

# returns the best hyper parameters.
bc_grid_result.best_params_

In [None]:
bc_grid_predictions=bc_grid_result.predict(X_test)

In [None]:
print("BAGGING CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,bc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,bc_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy, so the predicted probability of the positive class (CarInsurance = 1) is used.
bc_scores = bc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,bc_scores)                      # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='BAGGING CLASSIFIER')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
bc_auc = roc_auc_score(y_test, bc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('BAGGING CLASSIFIER: ROC AUC=%.3f' % (bc_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
# K-NEIGHBOURS CLASSIFIER MODEL
knc_model=KNeighborsClassifier()

# Hyper-parameter grid.
n_neighbors = range(1, 21)
weights = ['uniform', 'distance']
# 'minkowski' is intentionally not listed: with the estimator's default p=2 it is the same
# distance as 'euclidean', so including it only repeated a third of the search.
metric = ['euclidean', 'manhattan']

knc_grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

# cross-validation using Repeated Stratified K-fold method.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

knc_grid_search = GridSearchCV(estimator=knc_model, param_grid=knc_grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Fitting the Model to the Dataset.
knc_grid_result=knc_grid_search.fit(X_train,y_train)

# returns the best hyper parameters.
knc_grid_result.best_params_

In [None]:
knc_grid_predictions=knc_grid_result.predict(X_test)

In [None]:
print("K-NEIGHBOURS CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,knc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,knc_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy, so the predicted probability of the positive class (CarInsurance = 1) is used.
knc_scores = knc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,knc_scores)                          # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='K-NEIGHBOURS CLASSIFIER')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
knc_auc = roc_auc_score(y_test, knc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('K-NEIGHBOURS CLASSIFIER: ROC AUC=%.3f' % (knc_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
# RANDOM FOREST CLASSIFIER MODEL
# A random forest is stochastic (bootstrap samples, random feature subsets); a fixed seed
# makes the selected hyper-parameters and the reported metrics reproducible.
rfc_model=RandomForestClassifier(random_state=42)

# Hyper-parameter grid.
max_features = ['sqrt', 'log2']

rfc_n_estimators = [1000]

rfc_grid = dict(n_estimators=rfc_n_estimators,max_features=max_features)

# cross-validation using Repeated Stratified K-fold method.
rfc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

rfc_grid_search = GridSearchCV(estimator=rfc_model, param_grid=rfc_grid, n_jobs=-1, cv=rfc_cv, scoring='accuracy',error_score=0)

# Fitting the Model to the Dataset.
rfc_grid_result=rfc_grid_search.fit(X_train,y_train)

# returns the best hyper parameters.
rfc_grid_result.best_params_

In [None]:
rfc_grid_predictions=rfc_grid_result.predict(X_test)

In [None]:
print("RANDOM FOREST CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,rfc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,rfc_grid_predictions))
print("\n")

# ROC analysis needs a continuous score per sample. Feeding the hard 0/1 predictions
# collapses the curve to a single operating point and turns the "AUC" into the balanced
# accuracy, so the predicted probability of the positive class (CarInsurance = 1) is used.
rfc_scores = rfc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier') # Let's define a no skill (Dummy) Classifier for reference.

fpr, tpr, _ = roc_curve(y_test,rfc_scores)                           # passing the target labels & Model scores to the roc_curve method.

pyplot.plot(fpr, tpr, marker='.', label='RANDOM FOREST CLASSIFIER')  # Plotting the obtained results.

ns_probs = [0 for _ in range(len(y_test))]       # generating predictions from the no skill (Dummy) classifier for reference.

# Calculating Area under Curve for the No Skill & the trained Model.
ns_auc = roc_auc_score(y_test, ns_probs)
rfc_auc = roc_auc_score(y_test, rfc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('RANDOM FOREST CLASSIFIER: ROC AUC=%.3f' % (rfc_auc))

pyplot.xlabel('False Positive Rate',fontsize=16)
pyplot.ylabel('True Positive Rate',fontsize=16)
pyplot.title("ROC Curve",fontsize=18)
pyplot.legend(prop={'size':12})
pyplot.show()

In [None]:
# Tune a support-vector classifier with repeated stratified cross-validation.
svc_model = SVC()

# Alternative candidate values (left commented out):
#kernel = ['poly', 'rbf', 'sigmoid']
#C = [50, 10, 1.0, 0.1, 0.01]

# Values actually searched: a single combination.
kernel = ['rbf']
C = [1000]
gamma = ['scale']

svc_grid = {'kernel': kernel, 'C': C, 'gamma': gamma}

# 10 stratified folds repeated 3 times, with a fixed seed for the splits.
svc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# error_score=0: a candidate whose fit raises is scored 0 instead of aborting the search.
svc_grid_search = GridSearchCV(
    estimator=svc_model,
    param_grid=svc_grid,
    n_jobs=-1,
    cv=svc_cv,
    scoring='accuracy',
    error_score=0,
)

svc_grid_result = svc_grid_search.fit(X_train, y_train)

# Last expression: display the winning parameter combination.
svc_grid_result.best_params_

In [None]:
# Predict class labels for X_test with the fitted SVM grid search.
svc_grid_predictions = svc_grid_result.predict(X_test)

In [None]:
# Thresholded metrics are computed from the hard class predictions.
print("SVM CLASSIFIER Model Performance Metrics:")
print(classification_report(y_test,svc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,svc_grid_predictions))
print("\n")

# ROC/AUC need a continuous ranking score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point and discard the ranking information AUC measures.
# The SVC is built without probability=True, so predict_proba is not available;
# the signed distance to the separating boundary is used as the score instead.
# Assumes a binary target -- TODO confirm.
svc_scores = svc_grid_result.decision_function(X_test)

plt.figure(figsize=(12,6))
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier')  # chance-level reference diagonal

fpr, tpr, _ = roc_curve(y_test, svc_scores)
plt.plot(fpr, tpr, marker='.', label='SVM CLASSIFIER')

# A constant score for every sample serves as the no-skill baseline.
ns_probs = [0 for _ in range(len(y_test))]

# Area under the curve for the no-skill baseline and the trained model.
ns_auc = roc_auc_score(y_test, ns_probs)
svc_auc = roc_auc_score(y_test, svc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('SVM CLASSIFIER: ROC AUC=%.3f' % (svc_auc))

plt.xlabel('False Positive Rate',fontsize=16)
plt.ylabel('True Positive Rate',fontsize=16)
plt.title("ROC Curve",fontsize=18)
plt.legend(prop={'size':12})
plt.show()

In [None]:
# Tune Gaussian Naive Bayes with repeated stratified cross-validation.
gnb_model = GaussianNB()

# 100 log-spaced var_smoothing candidates from 1e0 down to 1e-9.
gnb_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

# 10 stratified folds repeated 3 times, with a fixed seed for the splits.
gnb_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# error_score=0: a candidate whose fit raises is scored 0 instead of aborting the search.
gnb_grid_search = GridSearchCV(
    estimator=gnb_model,
    param_grid=gnb_grid,
    n_jobs=-1,
    cv=gnb_cv,
    scoring='accuracy',
    error_score=0,
)

gnb_grid_result = gnb_grid_search.fit(X_train, y_train)

# Last expression: display the winning parameter combination.
gnb_grid_result.best_params_

In [None]:
# Predict class labels for X_test with the fitted Gaussian NB grid search.
gnb_grid_predictions = gnb_grid_result.predict(X_test)

In [None]:
# Thresholded metrics are computed from the hard class predictions.
print("GAUSSIANNB Model Performance Metrics:")
print(classification_report(y_test,gnb_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,gnb_grid_predictions))
print("\n")

# ROC/AUC need a continuous ranking score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point and discard the ranking information AUC measures,
# so the positive-class probability is used instead.
# Column 1 is taken as the positive class -- assumes a binary 0/1 target; TODO confirm.
gnb_scores = gnb_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier')  # chance-level reference diagonal

fpr, tpr, _ = roc_curve(y_test, gnb_scores)
plt.plot(fpr, tpr, marker='.', label='GAUSSIANNB CLASSIFIER')

# A constant score for every sample serves as the no-skill baseline.
ns_probs = [0 for _ in range(len(y_test))]

# Area under the curve for the no-skill baseline and the trained model.
ns_auc = roc_auc_score(y_test, ns_probs)
gnb_auc = roc_auc_score(y_test, gnb_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('GAUSSIANNB CLASSIFIER: ROC AUC=%.3f' % (gnb_auc))

plt.xlabel('False Positive Rate',fontsize=16)
plt.ylabel('True Positive Rate',fontsize=16)
plt.title("ROC Curve",fontsize=18)
plt.legend(prop={'size':12})
plt.show()

In [None]:
# Tune gradient boosting with repeated stratified cross-validation.
# subsample=0.5 (below) makes every boosting stage draw a random half of the rows, so the
# estimator is seeded to keep reruns of this cell reproducible; the CV splitter is
# already seeded the same way.
gbc_model = GradientBoostingClassifier(random_state=1)

# Alternative candidate values (left commented out):
#n_estimators = [10, 100, 1000]
#gbc_n_estimators = [0.001, 0.01, 0.1]   # NOTE(review): these look like learning rates, not estimator counts -- confirm
#gbc_subsample = [0.5, 0.7, 1.0]
#gbc_max_depth = [3, 7, 9]
#gbc_learning_rate = [0.0001, 0.001, 0.01, 0.1]

# Values actually searched: a single combination.
gbc_n_estimators = [1000]
gbc_learning_rate = [0.01]
gbc_subsample = [0.5]
gbc_max_depth = [7]

gbc_grid = dict(
    learning_rate=gbc_learning_rate,
    n_estimators=gbc_n_estimators,
    subsample=gbc_subsample,
    max_depth=gbc_max_depth,
)

# 10 stratified folds repeated 3 times, with a fixed seed for the splits.
gbc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# error_score=0: a candidate whose fit raises is scored 0 instead of aborting the search.
gbc_grid_search = GridSearchCV(
    estimator=gbc_model,
    param_grid=gbc_grid,
    n_jobs=-1,
    cv=gbc_cv,
    scoring='accuracy',
    error_score=0,
)

gbc_grid_result = gbc_grid_search.fit(X_train, y_train)

# Last expression: display the winning parameter combination.
gbc_grid_result.best_params_

In [None]:
# Predict class labels for X_test with the fitted gradient-boosting grid search.
gbc_grid_predictions = gbc_grid_result.predict(X_test)

In [None]:
# Thresholded metrics are computed from the hard class predictions.
print("GRADIENT BOOSTING Model Performance Metrics:")
print(classification_report(y_test,gbc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,gbc_grid_predictions))
print("\n")

# ROC/AUC need a continuous ranking score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point and discard the ranking information AUC measures,
# so the positive-class probability is used instead.
# Column 1 is taken as the positive class -- assumes a binary 0/1 target; TODO confirm.
gbc_scores = gbc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier')  # chance-level reference diagonal

fpr, tpr, _ = roc_curve(y_test, gbc_scores)
plt.plot(fpr, tpr, marker='.', label='GRADIENT BOOSTING MODEL')

# A constant score for every sample serves as the no-skill baseline.
ns_probs = [0 for _ in range(len(y_test))]

# Area under the curve for the no-skill baseline and the trained model.
ns_auc = roc_auc_score(y_test, ns_probs)
gbc_auc = roc_auc_score(y_test, gbc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('GRADIENT BOOSTING MODEL: ROC AUC=%.3f' % (gbc_auc))

plt.xlabel('False Positive Rate',fontsize=16)
plt.ylabel('True Positive Rate',fontsize=16)
plt.title("ROC Curve",fontsize=18)
plt.legend(prop={'size':12})
plt.show()

In [None]:
# Tune XGBoost with repeated stratified cross-validation.
xgbc_model = XGBClassifier()

# Alternative candidate values (left commented out):
#n_estimators = [10,100,1000]
#xgbc_learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
#xgbc_subsample = [0.3,0.4,0.5,.6,0.7,0.8,0.9]
#xgbc_max_depth = [3, 4, 5, 6, 7, 8, 9]
#colsample_bytree = [0.5,0.6,0.7,0.8,0.9]
#xgbc_min_child_weight = [1, 2, 3, 4]

# Values actually searched: a single combination.
xgbc_n_estimators = [1000]
xgbc_learning_rate = [0.01]
xgbc_subsample = [0.7]
xgbc_max_depth = [8]
xgbc_min_child_weight = [1]

# Named with the xgbc_ prefix like the other models' grids (rfc_grid, svc_grid, ...),
# rather than a generic `grid` global that any other cell could overwrite.
xgbc_grid = dict(
    n_estimators=xgbc_n_estimators,
    learning_rate=xgbc_learning_rate,
    subsample=xgbc_subsample,
    max_depth=xgbc_max_depth,
    min_child_weight=xgbc_min_child_weight,
)

# 10 stratified folds repeated 3 times, with a fixed seed for the splits.
xgbc_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# error_score=0: a candidate whose fit raises is scored 0 instead of aborting the search.
xgbc_grid_search = GridSearchCV(
    estimator=xgbc_model,
    param_grid=xgbc_grid,
    n_jobs=-1,
    cv=xgbc_cv,
    scoring='accuracy',
    error_score=0,
)

xgbc_grid_result = xgbc_grid_search.fit(X_train, y_train)

# Last expression: display the winning parameter combination.
xgbc_grid_result.best_params_

In [None]:
# Predict class labels for X_test with the fitted XGBoost grid search.
xgbc_grid_predictions = xgbc_grid_result.predict(X_test)

In [None]:
# Thresholded metrics are computed from the hard class predictions.
print("EXTREME GRADIENT BOOSTING Model Performance Metrics:")
#print(classification_report(y_test,xgbc_grid_predictions,output_dict=True))
print(classification_report(y_test,xgbc_grid_predictions))

print("CONFUSION MATRIX :")
print(confusion_matrix(y_test,xgbc_grid_predictions))
print("\n")

# ROC/AUC need a continuous ranking score per sample. Hard 0/1 predictions collapse the
# curve to a single operating point and discard the ranking information AUC measures,
# so the positive-class probability is used instead.
# Column 1 is taken as the positive class -- assumes a binary 0/1 target; TODO confirm.
xgbc_scores = xgbc_grid_result.predict_proba(X_test)[:, 1]

plt.figure(figsize=(12,6))
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill Classifier')  # chance-level reference diagonal

fpr, tpr, _ = roc_curve(y_test, xgbc_scores)
plt.plot(fpr, tpr, marker='.', label='EXTREME GRADIENT BOOSTING MODEL')

# A constant score for every sample serves as the no-skill baseline.
ns_probs = [0 for _ in range(len(y_test))]

# Area under the curve for the no-skill baseline and the trained model.
ns_auc = roc_auc_score(y_test, ns_probs)
xgbc_auc = roc_auc_score(y_test, xgbc_scores)

print('NO SKILL CLASSIFIER: ROC AUC=%.3f' % (ns_auc))
print('EXTREME GRADIENT BOOSTING MODEL: ROC AUC=%.3f' % (xgbc_auc))

plt.xlabel('False Positive Rate',fontsize=16)
plt.ylabel('True Positive Rate',fontsize=16)
plt.title("ROC Curve",fontsize=18)
plt.legend(prop={'size':12})
plt.show()

**Let's Display all the Model Metrics in a dataframe for easier analysis**

In [None]:
# Build one table with a column per trained model and a row per metric.

# Model names (column labels), in the same order as Model_Predictions below.
Model_Names=['LOGISTIC REGRESSION', 'RIDGE CLASSIFIER', 'DECISION TREE', 'BAGGING CLASSIFIER', 'K-NEIGHBOURS CLASSIFIER', 'RANDOM FOREST',
            'SVM CLASSIFIER', 'GAUSSIANNB CLASSIFIER', 'GRADIENT BOOSTING', 'EXTREME GRADIENT BOOSTING']

# Row labels, in the same order as model_metrics below.
Index=['Accuracy','Precision','Recall','F1 Score','AUC Score']

# Class predictions on the test set for each model.
Model_Predictions=[lr_grid_predictions,rc_grid_predictions,dtc_grid_predictions,bc_grid_predictions,knc_grid_predictions,
                   rfc_grid_predictions,svc_grid_predictions,gnb_grid_predictions,gbc_grid_predictions,xgbc_grid_predictions]

# Metric functions, each called as metric(y_true, y_pred).
# NOTE(review): roc_auc_score receives hard class predictions here, so the 'AUC Score'
# row reflects a single decision threshold rather than a ranking over probability scores.
model_metrics=[accuracy_score,precision_score,recall_score,f1_score,roc_auc_score]

# Build the whole frame in one construction instead of writing cell-by-cell through
# `df[col].values[i] = ...`; that pattern depends on `.values` being a writable view of
# the frame's storage and fails under pandas Copy-on-Write.
Model_Metrics_Comparison = pd.DataFrame(
    {
        model_name: [metric(y_test, model_prediction) * 100 for metric in model_metrics]
        for model_name, model_prediction in zip(Model_Names, Model_Predictions)
    },
    index=Index,
)

# Metric values in Percentage (%).
Model_Metrics_Comparison

* From the above dataframe, we can see that "Extreme Gradient Boosting Classifier" has an accuracy of 84.3% which is the highest among all
  the trained models & also "Random Forest Classifier" has an accuracy of 84.1%.

* As we cannot always depend only on the accuracy of models, we also need to consider other metrics such as "Precision", "Recall" & 
  "F1 Score" for optimal results.
  
* When a model has high values for both precision & recall, we can say that the model is performing well.

*  So in this case, any of the following models ("BAGGING CLASSIFIER", "RANDOM FOREST CLASSIFIER", "GRADIENT BOOSTING" & 
   "EXTREME GRADIENT BOOSTING") can give good results, as all 4 models have almost the same Precision & Recall values.
   
* In the case of "F1 Score", as it is the harmonic mean of the Precision & Recall metrics, it can be more useful than Accuracy most of the time.
* We can also observe from the AUC Scores that the models "EXTREME GRADIENT BOOSTING", "GRADIENT BOOSTING" & "RANDOM FOREST CLASSIFIER" have good Area under Curve values.

* Moving ahead, I'll try to improve the model performances and Metric results.

**If you like my Kernel, please upvote. Please feel free to provide suggestions in the comments, which will help me improve. Thank you :)**