In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="font-family: 'Poppins', sans-serif; font-size:36px; text-align: center; color: #282B28">Diabetes EDA and Predictions</h1>

<div style="display: flex; flex-direction: column; justify-content: center; align-items: center;">
    <img style="height: 200px; width: auto;" src="https://www.idf.org/images/site1/content/Blue-circle.jpg"/>
</div>

<a href="#section1" style="font-family: 'Poppins', sans-serif; font-size:16px;">1. Introduction</a><br>
<a href="#section2" style="font-family: 'Poppins', sans-serif; font-size:16px;">2. Exploratory Data Analysis</a><br>
<a href="#section3" style="font-family: 'Poppins', sans-serif; font-size:16px;">3. Data Preprocessing</a><br>
<a href="#section4" style="font-family: 'Poppins', sans-serif; font-size:16px;">4. Classification Models</a><br>
<a href="#section5" style="font-family: 'Poppins', sans-serif; font-size:16px;">5. Results</a><br>

<a id="section1"></a>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 24px; text-align: left; color: #176087">Introduction</h1>

<p style="font-family: 'Poppins', sans-serif; font-size:18px;">Diabetes mellitus, commonly known as diabetes, is a metabolic disease that causes high blood sugar. The hormone insulin moves sugar from the blood into your cells to be stored or used for energy. With diabetes, your body either doesn’t make enough insulin or can’t effectively use the insulin it does make.<br><br>Untreated high blood sugar from diabetes can damage your nerves, eyes, kidneys, and other organs.<br><br>A blood sugar level of less than 140 mg/dL is normal.<br>A reading of more than 200 mg/dL after two hours indicates diabetes.<br>A reading between 140 and 199 mg/dL indicates prediabetes.
</p>

<a id="section2"></a>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 28px; text-align: left; color: #176087; background-color: #FDF0D5; padding: 25px; border-radius: 10px;">2. Exploratory Data Analysis</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the Pima Indians diabetes CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of null values in each column.
df.isnull().sum()

In [None]:
# Palette reused by the plots below, rendered as labelled swatches for reference.
colors = ['#0A2239', '#53A2BE', '#1D84B5', '#132E32', '#176087']
sns.palplot(colors, size=3)
plt.text(-0.5, -0.75, "Color Palette for Visualizations",
         {'fontfamily': 'sans-serif', 'size': 21, 'weight': 'semibold'})
# Write each hex code on top of its swatch. The font family is given as a
# fallback list: a single "Poppins, sans-serif" string would be looked up as
# one (non-existent) font name instead of two alternatives.
for idx, hex_code in enumerate(colors):
    plt.text(idx - 0.25, 0, hex_code,
             {'fontfamily': ['Poppins', 'sans-serif'], 'size': 16,
              'weight': 'semibold', 'color': '#fff'},
             alpha=1)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Univariate Analysis</h1>

In [None]:
# 2x3 grid of distribution plots, one panel per attribute.
fig = plt.figure(figsize=(20,12))
gs = fig.add_gridspec(2,3)
gs.update(wspace=0.5, hspace=0.25)
# Row-major order: ax0..ax2 on the first row, ax3..ax5 on the second.
ax0, ax1, ax2, ax3, ax4, ax5 = (fig.add_subplot(gs[r, c]) for r in range(2) for c in range(3))
panels = (ax0, ax1, ax2, ax3, ax4, ax5)

background_color = "#DFFDFF"
fig.patch.set_facecolor(background_color)
for panel in panels:
    panel.set_facecolor(background_color)

# One entry per panel: axes, plotting function, (x, y) position of the in-plot
# title in data coordinates, title text, and the plot-specific keyword arguments.
panel_specs = [
    (ax0, sns.countplot, (0.08, 550), 'Outcome', dict(x='Outcome', palette=colors)),
    (ax1, sns.histplot, (40, 240), 'Age', dict(x='Age', palette=colors, kde=True)),
    (ax2, sns.histplot, (15, 110), 'Blood Pressure', dict(x='BloodPressure', color='#0A2239', kde=True)),
    (ax3, sns.histplot, (100, 420), 'Insulin Levels', dict(x='Insulin', palette=colors, kde=True)),
    (ax4, sns.histplot, (50, 111), 'Glucose Levels', dict(x='Glucose', color='#0A2239', kde=True)),
    (ax5, sns.histplot, (25, 103), 'BMI', dict(x='BMI', kde=True)),
]
for panel, plot_fn, (title_x, title_y), title, plot_kwargs in panel_specs:
    panel.text(title_x, title_y, title, fontsize=25, color='#6D454C', weight='bold')
    panel.grid(axis='y', color="#333", linestyle=':')
    panel.tick_params(axis='both', which='major', labelsize=20)
    # The title text above replaces the axis labels, so both are blanked.
    plot_fn(data=df, ax=panel, edgecolor="black", **plot_kwargs).set(xlabel="", ylabel="")

fig.suptitle('Distribution plots of different attributes', fontsize="28", weight="bold", color="#176087")

# Keep only the bottom spine on every panel.
for panel in panels:
    for s in ["top","right","left"]:
        panel.spines[s].set_visible(False)

<h4 style="font-family: 'Poppins', sans-serif; font-size: 18px; font-weight: 700;">Interpretations from the above graphs.</h4>
<ul style="font-family: 'Poppins', sans-serif; font-size: 16px;">
    <li>The Age and Insulin columns are highly right-skewed, so I'll have to normalize them before using them for model building.</li>
    <li>Most people in the dataset are between 20 and 40 years old.</li>
    <li>A large number of people have a blood pressure between 50 and 100 mmHg.</li>
    <li>A large number of people have an insulin value of 0. Insulin levels of 0 are mostly seen in Type 1 diabetic patients.</li>
    <li>Many people have glucose levels between 100 and 200 mg/dL. People with glucose levels between 140 mg/dL and 199 mg/dL are considered to be prediabetic.</li>
    <li>Many people have a BMI between 20 and 50. A healthy adult should have a BMI between 18.5 and 24.9, so this dataset clearly contains many people who are either overweight or obese.</li>
</ul>

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Multivariate Analysis</h1>

In [None]:
# Lower-triangle heatmap of the pairwise column correlations.
plt.figure(figsize=(12,6))
# Compute the correlation matrix once and reuse it for both the mask and the plot.
corr = df.corr()
# Boolean upper triangle (diagonal included). Building the mask from the shape
# rather than from the correlation values means a correlation of exactly 0 in
# the upper triangle is still hidden.
matrix = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, cmap='Blues', mask=matrix)

In [None]:
# "Positive Correlation" figure: each row has a caption panel on the left and
# the matching scatter plot (coloured by Outcome) on the right.
fig = plt.figure(figsize=(15,14))
gs = fig.add_gridspec(3,2)
gs.update(wspace=0.5, hspace=0.25)
# Row-major order: (axA, axB) is the first row, (axC, axD) the second, (axE, axF) the third.
axA, axB, axC, axD, axE, axF = (fig.add_subplot(gs[r, c]) for r in range(3) for c in range(2))
caption_axes = (axA, axC, axE)
scatter_axes = (axB, axD, axF)

background_color = "#DFFDFF"
fig.patch.set_facecolor(background_color)
for ax in (axA, axB, axC, axD, axE, axF):
    ax.set_facecolor(background_color)

# Left column: axes used purely as text captions (no ticks, no tick labels).
captions = (
    "Age vs Pregnancies\n____________",
    "Skin Thickness vs Insulin\n____________",
    "Skin Thickness vs BMI\n____________",
)
for ax, caption in zip(caption_axes, captions):
    ax.tick_params(axis='both',left=False, bottom=False)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.text(0.6,0.4, caption,horizontalalignment = 'center',verticalalignment = 'center',
            fontsize = 24,fontweight='bold',fontfamily='sans-serif', color='#437F97')

# Right column: (axes, x column, y column, x label, y label).
scatter_specs = (
    (axB, "Age", "Pregnancies", 'Age', 'Pregnancies'),
    (axD, "SkinThickness", "Insulin", 'Skin Thickness', 'Insulin'),
    (axF, "SkinThickness", "BMI", 'Skin Thickness', 'BMI'),
)
for ax, x_col, y_col, x_label, y_label in scatter_specs:
    ax.grid(axis='y', color="#333", linestyle=':')
    ax.tick_params(axis='both', which='major', labelsize=12)
    # Labels are set before plotting, as in the original statement order.
    ax.set_xlabel(x_label,fontsize=12)
    ax.set_ylabel(y_label,fontsize=12)
    sns.scatterplot(x=x_col,y=y_col,data=df, ax=ax, hue="Outcome")

fig.suptitle('Positive Correlation', fontsize="28", 
             weight="bold", color="#176087")

# Caption panels lose every spine; scatter panels keep only the bottom one.
for ax in caption_axes:
    for s in ["top","right","left", "bottom"]:
        ax.spines[s].set_visible(False)

for ax in scatter_axes:
    for s in ["top","right","left"]:
        ax.spines[s].set_visible(False)

<h4 style="font-family: 'Poppins', sans-serif; font-size: 18px; font-weight: 700;">Interpretations from the above graphs.</h4>
<ul style="font-family: 'Poppins', sans-serif; font-size: 16px;">
    <li>Age vs Pregnancies shows, to a minor extent, that younger people with fewer pregnancies tend not to have diabetes, while the chance of having diabetes increases with age and the number of pregnancies.</li>
    <li>Skin Thickness vs Insulin shows that larger skin thickness and higher insulin values contribute to diabetes.</li>
    <li><b>Some people may wonder why diabetes isn't kept in check when the insulin level is high.</b> This happens when the body doesn't respond properly to the insulin it produces, which leads to both diabetes and increased insulin levels.</li>
    <li>Skin Thickness vs BMI shows that higher BMI and larger skin thickness contribute to diabetes. A BMI greater than 25 is already defined as overweight, and many people in this dataset have a BMI greater than 30.</li>
</ul>

In [None]:
# "Negative Correlation" figure: each row has a caption panel on the left and
# the matching scatter plot (coloured by Outcome) on the right.
fig = plt.figure(figsize=(15,10))
gs = fig.add_gridspec(2,2)
gs.update(wspace=0.5, hspace=0.25)
axA = fig.add_subplot(gs[0,0])
axB = fig.add_subplot(gs[0,1])
axC = fig.add_subplot(gs[1,0])
axD = fig.add_subplot(gs[1,1])
caption_axes = (axA, axC)
scatter_axes = (axB, axD)

background_color = "#DFFDFF"
fig.patch.set_facecolor(background_color)
for ax in (axA, axB, axC, axD):
    ax.set_facecolor(background_color)

# Left column: axes used purely as text captions (no ticks, no tick labels).
captions = (
    "Age vs Skin Thickness\n____________",
    "Age vs Insulin\n____________",
)
for ax, caption in zip(caption_axes, captions):
    ax.tick_params(axis='both', left=False, bottom=False)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.text(0.6, 0.4, caption, horizontalalignment='center', verticalalignment='center',
            fontsize=24, fontweight='bold', fontfamily='sans-serif', color='#437F97')

# Right column: (axes, y column, y label); x is always Age.
# The first panel plots SkinThickness so that the data matches its caption and
# axis label (it previously plotted Pregnancies).
scatter_specs = (
    (axB, "SkinThickness", 'Skin Thickness'),
    (axD, "Insulin", 'Insulin'),
)
for ax, y_col, y_label in scatter_specs:
    ax.grid(axis='y', color="#333", linestyle=':')
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.set_xlabel('Age', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    sns.scatterplot(x="Age", y=y_col, data=df, ax=ax, hue="Outcome")

fig.suptitle('Negative Correlation', fontsize="28",
             weight="bold", color="#176087")

# Caption panels lose every spine; scatter panels keep only the bottom one.
for ax in caption_axes:
    for s in ["top", "right", "left", "bottom"]:
        ax.spines[s].set_visible(False)

for ax in scatter_axes:
    for s in ["top", "right", "left"]:
        ax.spines[s].set_visible(False)

<h4 style="font-family: 'Poppins', sans-serif; font-size: 18px; font-weight: 700;">Interpretations from the above graphs.</h4>
<ul style="font-family: 'Poppins', sans-serif; font-size: 16px;">
    <li>Age vs Skin Thickness shows, to a minor extent, that younger people with lower skin thickness tend not to have diabetes, while the chance of having diabetes increases with age and skin thickness.</li>
    <li>Age vs Insulin shows that older people with higher insulin levels tend to have diabetes.</li>
</ul>

<a id="section3"></a>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 28px; text-align: left; color: #176087; background-color: #FDF0D5; padding: 25px; border-radius: 10px;">3. Data Preprocessing</h1>

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Missing Values</h1>

<ul style="font-family: 'Poppins', sans-serif; font-size: 16px;"><li>After some digging I found that the zeroes in columns such as Insulin, BMI and Glucose are actually missing values.</li> <li>This is also fairly obvious, since glucose and other such vital measurements of the human body can never be zero.</li><li>So I'll first replace all the zeroes in these columns with NaN and then impute them with the median of the corresponding Outcome group.</li></ul>

In [None]:
# A zero in these measurement columns stands for a missing reading; convert it
# to NaN so it can be imputed. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
def median_target(data, var):
    """Return the median of column `var` for each value of 'Outcome'.

    Rows where `var` is null are excluded before grouping.

    Parameters
    ----------
    data : DataFrame containing `var` and an 'Outcome' column.
    var : name of the column to summarise.

    Returns
    -------
    DataFrame with the columns 'Outcome' and `var`, one row per Outcome value
    that has at least one non-null `var`.
    """
    observed = data.loc[data[var].notnull(), [var, 'Outcome']]
    return observed.groupby(['Outcome'])[[var]].median().reset_index()

In [None]:
def replace_median(data, columns):
    """Fill missing values in `columns` with the median of the row's Outcome group.

    `data` is modified in place. For each column the per-Outcome medians are
    computed with `median_target`, displayed, and then written into the rows
    of the matching Outcome whose value is null.

    Because the fill value depends on 'Outcome', this can only be applied to
    rows whose label is known.

    Parameters
    ----------
    data : DataFrame with an 'Outcome' column and every column in `columns`.
    columns : iterable of column names to impute.
    """
    for col in columns:
        medians = median_target(data, col)
        display(medians)
        # Pair each median with its Outcome label instead of relying on row
        # position: if one Outcome group has no observed values its row is
        # absent from `medians`, and a positional lookup would either pick the
        # other group's median or raise IndexError.
        for outcome, median in zip(medians['Outcome'], medians[col]):
            data.loc[(data['Outcome'] == outcome) & (data[col].isnull()), col] = median

In [None]:
# Columns to impute: the measurement columns whose zeros were converted to NaN earlier.
null_cols = ['Glucose', 'BloodPressure','SkinThickness','Insulin', 'BMI']

In [None]:
# Fill the missing values of df in place, using the per-Outcome median of each column.
replace_median(df, null_cols)

In [None]:
# Re-count the null values per column after imputation.
df.isnull().sum()

In [None]:
# Re-inspect dtypes and non-null counts after imputation.
df.info()

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Binning of Columns</h1>

In [None]:
# Replace Age with quantile-based bins (10 requested; bins with duplicate edges are dropped).
df['Age'] = pd.qcut(df['Age'], 10, duplicates='drop')

In [None]:
# Replace BMI with quantile-based bins (5 requested; bins with duplicate edges are dropped).
df['BMI'] = pd.qcut(df['BMI'], 5, duplicates='drop')

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Preparing data for Models</h1>

In [None]:
# One-hot encode the categorical columns (presumably the binned Age and BMI intervals created above).
df = pd.get_dummies(df)

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# Separate the target from the features and hold out 30% of the rows for
# testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
y = df['Outcome']
X = df.drop(columns='Outcome')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same scaling to
# both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the training split only.
# The sampler is named `smote` rather than `os` so that it does not shadow the
# `os` module imported at the top of the notebook.
smote = SMOTE(random_state=42)
# np.asarray replaces the deprecated Series.ravel() and also works if this cell
# is re-run after y_train has already become an array.
X_train, y_train = smote.fit_resample(X_train, np.asarray(y_train))

<p style="font-family: 'Poppins', sans-serif; font-size: 16px;">I observed that there is more data with Outcome 0. This may lead our models to predict more often that a person does not have diabetes, so I use SMOTE.
SMOTE is an oversampling technique in which synthetic samples are generated for the minority class. This algorithm helps to overcome the overfitting problem posed by random oversampling.</p>

<a id="section4"></a>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 28px; text-align: left; color: #176087; background-color: #FDF0D5; padding: 25px; border-radius: 10px;">4. Classification Models</h1>

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Logistic Regression</h1>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Fit logistic regression on the resampled training data (fit returns the
# estimator itself) and predict the held-out test set.
logmodel = LogisticRegression(max_iter=200).fit(X_train, y_train)
prediction1 = logmodel.predict(X_test)

In [None]:
# Test-set evaluation of the logistic-regression predictions.
log_cm = confusion_matrix(y_test, prediction1)
log_report = classification_report(y_test, prediction1)
print('Confusion Matrix:\n', log_cm)
print('\n')
print('Classification Report:\n', log_report)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">K Nearest Neighbors</h1>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline nearest-neighbour model with a single neighbour.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
prediction2 = knn.predict(X_test)

In [None]:
# Test-set evaluation of the k=1 nearest-neighbour predictions.
knn_cm = confusion_matrix(y_test, prediction2)
knn_report = classification_report(y_test, prediction2)
print('Confusion Matrix:\n', knn_cm)
print('\n')
print('Classification Report:\n', knn_report)

In [None]:
# Misclassification rate on the test set for every k from 1 to 39.
# NOTE(review): k is compared on the test set itself, so the test score of the
# chosen k is optimistic; consider cross-validating on the training data.
error_rate = []

for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    error_rate.append(np.mean(knn.predict(X_test) != y_test))

In [None]:
# Plot the test-set error rate against the number of neighbours.
plt.figure(figsize=(10,6))
plt.plot(range(1,40), error_rate, color='blue', linestyle='--', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K value')
# Call the labelling functions: assigning to plt.xlabel / plt.ylabel would
# replace the pyplot functions with strings and leave the axes unlabelled.
plt.xlabel('K')
plt.ylabel('Error Rate');  # trailing semicolon hides the returned Text repr

In [None]:
# Refit the nearest-neighbour model with 4 neighbours and predict the test set.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
prediction2 = knn.predict(X_test)

In [None]:
# Test-set evaluation of the k=4 nearest-neighbour predictions.
knn_cm = confusion_matrix(y_test, prediction2)
knn_report = classification_report(y_test, prediction2)
print('Confusion Matrix:\n', knn_cm)
print('\n')
print('Classification Report:\n', knn_report)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Random Forest</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees. The seed matches the one used for the split
# and for SMOTE so that the reported scores can be reproduced on re-run.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train, y_train)
prediction3 = rfc.predict(X_test)

In [None]:
# Test-set evaluation of the random-forest predictions.
rfc_cm = confusion_matrix(y_test, prediction3)
rfc_report = classification_report(y_test, prediction3)
print('Confusion Matrix:\n', rfc_cm)
print('\n')
print('Classification Report:\n', rfc_report)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; ">Hyperparameter Tuning for Random Forest</h1>

In [None]:
# Candidate values for the random-forest grid search.
# Ten tree counts evenly spaced between 10 and 80, truncated to integers.
n_estimators = np.linspace(10, 80, num=10).astype(int).tolist()
# 'auto' was an alias of 'sqrt' for classifiers and was removed in
# scikit-learn 1.3 (it now raises an error), so it is replaced by 'log2',
# which gives the grid a second, genuinely different option.
max_features = ['sqrt', 'log2']
max_depth = [2, 4]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
bootstrap = [True, False]

In [None]:
# Collect the candidate lists into the mapping expected by GridSearchCV
# (keys are RandomForestClassifier parameter names).
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over param_grid with 10-fold cross-validation on 4 parallel jobs.
grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, verbose=3, cv=10, n_jobs = 4)

In [None]:
# Run the grid search on the resampled training data.
grid_search.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted grid search.
grid_predictions = grid_search.predict(X_test)

In [None]:
# Test-set evaluation of the tuned random forest.
grid_cm = confusion_matrix(y_test, grid_predictions)
grid_report = classification_report(y_test, grid_predictions)
print("Confusion Matrix: \n", grid_cm)
print("\n")
print(grid_report)

Tuning the random forest did not bring much benefit.

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Support Vector Machines</h1>

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with the library's default settings.
svc_model = SVC().fit(X_train, y_train)
predictions4 = svc_model.predict(X_test)

In [None]:
# Test-set evaluation of the default SVM predictions.
svm_cm = confusion_matrix(y_test, predictions4)
svm_report = classification_report(y_test, predictions4)
print('Confusion Matrix:\n', svm_cm)
print('\n')
print('Classification Report:\n', svm_report)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; ">Hyperparameter Tuning for SVM</h1>

In [None]:
# Candidate C / gamma values for the SVM.
param_grid_svm = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
from sklearn.model_selection import RandomizedSearchCV
# Randomized search samples only some of the C/gamma combinations, so it is
# seeded (same seed as the split and SMOTE) to make the selected model and
# the reported scores reproducible.
rndm_cv = RandomizedSearchCV(estimator=svc_model, param_distributions=param_grid_svm, cv=10, verbose=2,
                             random_state=42)
rndm_cv.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted randomized search.
rndm_preds = rndm_cv.predict(X_test)

In [None]:
# Test-set evaluation of the tuned SVM predictions.
tuned_svm_cm = confusion_matrix(y_test, rndm_preds)
tuned_svm_report = classification_report(y_test, rndm_preds)
print('Confusion Matrix:\n', tuned_svm_cm)
print('\n')
print('Classification Report:\n', tuned_svm_report)

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">XG Boost</h1>

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted tree classifier: 180 trees of depth 5 with learning rate 0.1.
xgb = XGBClassifier(
    booster='gbtree',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=180,
)
xgb.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted XGBoost model.
xgb_preds = xgb.predict(X_test)

In [None]:
# Test-set evaluation of the XGBoost predictions.
xgb_cm = confusion_matrix(y_test, xgb_preds)
xgb_report = classification_report(y_test, xgb_preds)
print(f"Confusion Matrix :- \n{xgb_cm}\n")
print(f"Classification Report :- \n {xgb_report}")

<h1 style="font-family: 'Poppins', sans-serif; font-size: 20px; text-align: left; color: #0a2239; background-color: #C0FDFB; padding: 15px; border-radius: 40px">Voting Classifier</h1>

In [None]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs combined by the voting ensemble; `knn` is whichever
# nearest-neighbour model was assigned last.
classifiers = [
    ('XGboost', xgb),
    ('Random Forest', rfc),
    ('Logistic', logmodel),
    ('KNN', knn),
    ('SVM', svc_model),
]

In [None]:
# Ensemble over the listed estimators; no voting mode is given, so the library default applies.
vc = VotingClassifier(estimators = classifiers)

In [None]:
# Fit the ensemble on the resampled training data.
vc.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted ensemble.
vc_preds = vc.predict(X_test)

In [None]:
# Test-set evaluation of the voting-ensemble predictions.
vc_cm = confusion_matrix(y_test, vc_preds)
vc_report = classification_report(y_test, vc_preds)
print(f"Confusion Matrix :- \n{vc_cm}\n")
print(f"Classification Report :- \n {vc_report}")

<a id="section5"></a>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 28px; text-align: left; color: #176087; background-color: #FDF0D5; padding: 25px; border-radius: 10px;">5. Results</h1>

In [None]:
# Print the test-set accuracy (in percent) of every model.
# The SVM row uses the tuned (randomized-search) predictions; the random-forest
# row uses the untuned model's predictions.
accuracy_rows = (
    ('The accuracy score of Logistic Regression Model is: ', prediction1),
    ('The accuracy score of K Nearest Neighbors Model is: ', prediction2),
    ('The accuracy score of Random Forests Model is: ', prediction3),
    ('The accuracy score of SVM Model is: ', rndm_preds),
    ('The accuracy score of XG Boost  is: ', xgb_preds),
    # "Classifer" typo in the printed label corrected.
    ('The accuracy score of Voting Classifier  is: ', vc_preds),
)
for label, preds in accuracy_rows:
    print(label, accuracy_score(y_test, preds)*100,'%')

In [None]:
# Test-set accuracy of each model, in the same order as the predictions tuple.
lr_acc, knn_acc, rfc_acc, SVM_acc, xgb_acc, vc_acc = (
    accuracy_score(y_test, preds)
    for preds in (prediction1, prediction2, prediction3, rndm_preds, xgb_preds, vc_preds)
)

In [None]:
# Display names and accuracies, kept in matching order for the bar chart.
model = [
    'Logistic Regression',
    'K Nearest Neighbors',
    'Random Forests',
    'Support Vector Machines',
    'XGBoost',
    'Voting Classifier',
]
score = [
    lr_acc,
    knn_acc,
    rfc_acc,
    SVM_acc,
    xgb_acc,
    vc_acc,
]

In [None]:
# Horizontal bar chart comparing the test-set accuracy of the models.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=score, y=model, palette='magma', ax=ax)
# Title and axis label so the figure can be read on its own.
ax.set(title='Test-set accuracy by model', xlabel='Accuracy')
plt.show()

<h1 style="font-family: 'Poppins', sans-serif; font-size: 24px; text-align: center; color: #fff; background-color: #1D84B5; padding: 15px; border-radius: 40px">XG Boost performed the best with accuracy of 88.3%</h1>
<h1 style="font-family: 'Poppins', sans-serif; font-size: 16px; text-align: center; color: #000;">Random Forests is just behind with accuracy of 86.14%</h1>

<h1 style="font-family: 'Poppins', sans-serif; font-size: 32px; text-align: center; color: #fff; background-color: #000; padding: 15px; border-radius: 5px">Please share your valuable feedback, and if you like the notebook, do UPVOTE!!</h1>