# More complex and accurate voting model

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

# Setting up the data

In [None]:
# Read the Pima Indians diabetes dataset into the notebook-wide frame `df`.
csv_path = "../input/pima-indians-diabetes-database/diabetes.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
# Keep the preview as the last expression so the notebook renders it.
df.head()

In [None]:
# The data in this dataset has no null entries and is very easy to work with, so no missing-value handling is needed at this point
# (zeros that stand in for impossible measurements are counted and replaced further down in the notebook)
# There also isn't any categorical data
# We are working with a dataset of 768 entries, which can be enough considering there are no null entries

df.info()

In [None]:
# Summary statistics for every column; used to spot implausible minimum values (such as 0) before cleaning.
df.describe()

In [None]:
# From the pairplot we've surprisingly found that there is almost no correlation between age and the other values (apart from pregnancies, as is to be expected)
# Unsurprisingly, skin thickness and BMI are related (both grow with increasing weight), and rising glucose levels force the body to produce more insulin
# From the observation, insulin levels decrease much more with the number of pregnancies than with age

# It's important to remove the outliers, which would mislead the ML models

sns.pairplot(data=df)

# Removing outliers

In [None]:
# Replacing the outliers that have impossible values with the mean or median can lead to more accurate models as well as better overall analytics

# Report, for every column, how many entries are exactly zero.
for column in df:
    count = int(df[column].eq(0).sum())
    print(f'{column}: {count}')

In [None]:
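# The columns where a zero is an impossible value and has to be replaced are:
# Glucose, BloodPressure, SkinThickness, Insulin and BMI
# (a zero in Pregnancies or Outcome is a legitimate value, so those columns are left untouched)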

In [None]:
# Choosing between the median and the mean can be very important, as it can improve the model

# Print max, min, mean and median of every column to inform that choice.
for column in df:
    series = df[column]
    print(f'{column}:')
    for label, value in (
        ("Max = ", series.max()),
        ("Min = ", series.min()),
        ("Mean = ", series.mean()),
        ("Median = ", series.median()),
    ):
        print(label, end="")
        print(value)

In [None]:
# We've decided to use the mean for all of the values as it will be more accurate when training the models.
# Each impossible reading is replaced by the mean of the *valid* readings that share its Outcome class.
#
# NOTE(review): the fill value depends on Outcome (the prediction target) and is computed on the
# whole frame before the train/test split, so label information leaks into the features of the
# rows later used for testing. Consider imputing after the split, from the training rows only.

# Columns to repair, each with the boolean mask of rows whose reading is treated as impossible.
invalid_readings = {
    "Glucose": df.Glucose == 0,
    "BloodPressure": df.BloodPressure == 0,
    "SkinThickness": df.SkinThickness < 5,
    "Insulin": df.Insulin == 0,
    "BMI": df.BMI == 0,
}

for column, is_invalid in invalid_readings.items():
    for outcome in (0, 1):
        in_class = df.Outcome == outcome
        to_fill = in_class & is_invalid
        if not to_fill.any():
            continue  # nothing to repair for this class (also keeps the cell safe to re-run)
        # Average only the valid readings: the placeholder values themselves would drag the mean down.
        fill_value = df.loc[in_class & ~is_invalid, column].mean()
        if pd.api.types.is_integer_dtype(df[column]):
            # Truncate so an integer column keeps its dtype; float columns keep the exact mean.
            fill_value = int(fill_value)
        df.loc[to_fill, column] = fill_value

# Data visualization

In [None]:
# There is a really high probability of having diabetes with a higher number of pregnancies (starting with 5)
# For glucose, the safe limit seems to be around 100; above this level the probability of having diabetes rapidly increases
# There doesn't seem to be a safe level of blood pressure, but people with lower blood pressure tend to have diabetes less often
# The same goes for the skin thickness
# As for insulin, the range is large (hitting every value), but around 70 the probability of diabetes rises
# Statistically, BMI has an impact on the probability of diabetes, but as increasing weight doesn't necessarily mean increasing fat, BMI is not that valid (apart from the higher values); it is possible that the FMI (fat mass index) could provide better results, but it is harder to calculate
# As is to be expected, the Pedigree function correlates with the probability of the outcome
# As for age, after 27 the probability of having diabetes rapidly increases

# One violin plot per feature, split by Outcome.
# Outcome itself is skipped: it is the grouping variable, so plotting it against itself shows nothing.
for column in df.columns.drop("Outcome"):
    fig, ax = plt.subplots()
    sns.violinplot(x=df.Outcome, y=df[column], ax=ax)
    ax.set_title(f"{column} by Outcome")
    plt.show()

# Simple model using Decision tree classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the split (and every accuracy reported below) reproducible across kernel
# restarts; stratifying on Outcome keeps the class ratio the same in both partitions.
train, test = train_test_split(df, random_state=42, stratify=df["Outcome"])

# Training features and target.
x = train[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]]
y = train["Outcome"]

# Baseline model: a single decision tree. It is seeded because scikit-learn permutes the
# candidate features at every split, so an unseeded tree can differ from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(x, y)

In [None]:
# Held-out features and target, used by this cell and by the model comparison cells below.
testX = test[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]]
testY = test["Outcome"]

predictions = clf.predict(testX)

# Accuracy as a percentage: share of predictions equal to the true label.
# The comparison is positional (to_numpy() drops the shuffled index of testY), and the counter
# gets its own name so the training feature matrix `x` is not overwritten by this cell.
n_total = len(predictions)
n_correct = int((predictions == testY.to_numpy()).sum())
acc = n_correct / n_total * 100

print("Accuracy:", f"{acc}%")

# More accurate and complex voting model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# One seed shared by every stochastic estimator so that repeated runs report the same scores.
seed = 42

# max_iter is raised above the default of 100 because the features are unscaled.
# NOTE(review): confirm this is enough for the solver to converge without a ConvergenceWarning.
log_clf = LogisticRegression(max_iter=1000)
rnd_clf = RandomForestClassifier(random_state=seed)
svm_clf = SVC()
tree_clf = DecisionTreeClassifier(random_state=seed)
knn_clf = KNeighborsClassifier()
bgc_clf = BaggingClassifier(random_state=seed)
gbc_clf = GradientBoostingClassifier(random_state=seed)
abc_clf = AdaBoostClassifier(random_state=seed)

# Training features and target (same columns as the decision-tree baseline).
x = train[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]]
y = train["Outcome"]

# Hard voting: the ensemble predicts the class chosen by the majority of its members.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
        ('tree', tree_clf),
        ('knn', knn_clf),
        ('bg', bgc_clf),
        ('gbc', gbc_clf),
        ('abc', abc_clf),
    ],
    voting='hard',
)

# Fit and score every individual model, then the ensemble (last in the tuple).
# The ensemble is fitted here exactly once; a separate fit before the loop would only repeat the work.
# The loop variable is named `model` so it does not overwrite the earlier decision-tree `clf`.
for model in (log_clf, rnd_clf, svm_clf, tree_clf, knn_clf, bgc_clf, gbc_clf, abc_clf, voting_clf):
    model.fit(x, y)
    predictions = model.predict(testX)
    print(model.__class__.__name__, accuracy_score(testY, predictions))

In [None]:
# Across multiple runs the gradient boosting classifier showed the best accuracy (even better than the voting classifier), so we've decided to use this ML model in the end

from sklearn.ensemble import GradientBoostingClassifier

# Fit the chosen model on the training data and report its accuracy on the held-out rows.
gbc = GradientBoostingClassifier(random_state=0).fit(x, y)
pred = gbc.predict(testX)
test_accuracy = gbc.score(testX, testY)
print("Accuracy for GradientBoosting data: ", test_accuracy)