# Diabetes Classification Model

### A classification model to predict whether a patient has diabetes or not.

# Importing Data and Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.stats import iqr
sns.set()

In [None]:
path='../input/diabetes-data-set/diabetes-dataset.csv'
df=pd.read_csv(path)
df.head()

# Understanding the Data

Independent Variables
    1. Pregnancies
    2. Glucose
    3. BloodPressure
    4. SkinThickness
    5. Insulin
    6. BMI
    7. DiabetesPedigreeFunction
    8. Age

Dependent Variable
    1. Outcome

In [None]:
df.shape

In [None]:
df.describe()

    1. Some attributes contain 0 values, which is not possible in practice.
    2. Attributes like Pregnancies, SkinThickness and Insulin contain some extreme values compared to their mean.
    3. No attribute contains NaN/null values.
    4. No attribute is of type object/string.

In [None]:
df.info()

<b>Continuous and categorical variables</b>

In [None]:
df.nunique()

All attributes are continuous except Pregnancies and the target (dependent) variable Outcome, which are treated as categorical.

# Data Visualization

### Univariate Analysis

In [None]:
# Plot the distribution (histogram + KDE curve) of every continuous feature.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is
# the replacement that draws the same histogram-plus-density view.
fig = plt.figure(figsize=(20, 20))
continuous_cols = df.drop(['Pregnancies', 'Outcome'], axis=1).columns
for i, col in enumerate(continuous_cols):
    ax = fig.add_subplot(4, 2, i + 1)
    # Draw on the subplot explicitly instead of relying on the current axes.
    sns.histplot(df[col], kde=True, stat='density', ax=ax)

    1. Insulin, DiabetesPedigreeFunction and Age are positively skewed.
    2. Glucose, BloodPressure and BMI are roughly symmetrically distributed (close to a normal distribution).
    3. Glucose, BloodPressure, SkinThickness, Insulin and BMI contain zero values, which in some cases act like outliers.

In [None]:
# Count plots for the two variables treated as categorical.
fig = plt.figure(figsize=(15, 5))
for i, col in enumerate(['Pregnancies', 'Outcome']):
    ax = fig.add_subplot(1, 2, i + 1)
    # Pass the column as keyword `x`: a bare positional vector is no longer
    # interpreted as `x` by seaborn >= 0.12.
    sns.countplot(x=col, data=df, ax=ax)

    1. Out of 2000 records, more than 1200 have an Outcome of 0, meaning these people don't have diabetes, and more than 600 have an Outcome of 1, meaning these people have diabetes.
    2. Most records have 0, 1 or 2 pregnancies.

### Bivariate Analysis 

In [None]:
# One box plot per continuous feature, split by Outcome, on a 3x3 grid.
fig = plt.figure(figsize=(15, 15))
feature_frame = df.drop(['Pregnancies', 'Outcome'], axis=1)
for position, feature in enumerate(feature_frame, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(x=df['Outcome'], y=df[feature], ax=ax)

    1. The box plots show that attributes like Glucose, BMI, DiabetesPedigreeFunction and Age have a stronger effect on Outcome compared to the other attributes.

In [None]:
# Overlay the distribution of each continuous feature for positive and
# negative outcomes. sns.distplot is deprecated, so histplot (histogram +
# KDE, normalised to a density) is used instead.
fig = plt.figure(figsize=(20, 20))
continuous_cols = df.drop(['Pregnancies', 'Outcome'], axis=1).columns
for i, col in enumerate(continuous_cols):
    ax = fig.add_subplot(4, 2, i + 1)
    for outcome, label in ((1, 'Positive'), (0, 'Negative')):
        sns.histplot(df.loc[df['Outcome'] == outcome, col], kde=True,
                     stat='density', label=label, ax=ax)
    # Attach the legend to this subplot explicitly rather than to whatever
    # axes happens to be current.
    ax.legend()

    1. People with Glucose approximately in the range 120 to 200 have a high chance of a positive Outcome.
    2. People with BloodPressure approximately in the range 70 to 120 have a high chance of a positive Outcome.
    3. People with SkinThickness of more than 30-35 have a high chance of a positive Outcome.
    4. People with very low or very high Insulin have a high risk of a positive Outcome.
    5. People with a BMI of more than 30-35 have a high chance of having diabetes.
    6. After the age of 30, people usually have a higher chance of having diabetes.

In [None]:
sns.barplot(x='Pregnancies',y='Outcome',data=df,ci=None)

    1. The graph clearly indicates that a high number of pregnancies is associated with a higher risk of diabetes (positive Outcome).

In [None]:
# Scatter Glucose against every other continuous feature, coloured by Outcome.
fig = plt.figure(figsize=(15, 15))
other_cols = df.drop(['Pregnancies', 'Outcome', 'Glucose'], axis=1).columns
for i, col in enumerate(other_cols):
    ax = fig.add_subplot(3, 3, i + 1)
    # x and y must be keywords: seaborn >= 0.12 rejects them positionally.
    sns.scatterplot(x='Glucose', y=col, hue='Outcome', data=df, ax=ax)

    1. These plots clearly show that a high level of glucose is one of the factors most strongly associated with diabetes.
    2. A high level of Glucose combined with a high level of Insulin, Age or DiabetesPedigreeFunction contributes to a positive Outcome.

In [None]:
# Scatter BMI against the remaining continuous features, coloured by Outcome.
fig = plt.figure(figsize=(15, 15))
other_cols = df.drop(['Pregnancies', 'Outcome', 'Glucose', 'BMI'], axis=1).columns
for i, col in enumerate(other_cols):
    ax = fig.add_subplot(3, 3, i + 1)
    # x and y must be keywords: seaborn >= 0.12 rejects them positionally.
    sns.scatterplot(x='BMI', y=col, hue='Outcome', data=df, ax=ax)

    BMI alone doesn't have much effect on the Outcome, but in combination with other factors such as Age and Insulin it does affect the Outcome.

In [None]:
plt.figure(figsize=(8,5))
sns.countplot(x='Pregnancies',hue='Outcome',data=df)

    With a low number of pregnancies there is a low risk of diabetes, but as the number of pregnancies increases the risk also increases.

# Feature Engineering

### Data Cleaning

In [None]:
df.duplicated().sum()

The dataframe contains duplicated rows; we need to drop them to prevent data leakage between the training and test sets.

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

<b>Outlier Removal</b>

In [None]:
def iqr_outliers(df):
    """Count and report the values lying outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.Series
        Numeric column to inspect. Despite the name (kept so existing
        callers keep working) this is a single column, not a whole
        DataFrame.

    Returns
    -------
    int
        Number of values strictly below ``Q1 - 1.5 * IQR`` or strictly
        above ``Q3 + 1.5 * IQR``. The count is also printed.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    # Not named `iqr`, which is also the function imported from scipy.stats.
    spread = q3 - q1
    lower_tail = q1 - 1.5 * spread
    upper_tail = q3 + 1.5 * spread
    # Vectorised comparison instead of a Python-level loop over the values.
    n_outliers = int(((df < lower_tail) | (df > upper_tail)).sum())
    print("Outliers:", n_outliers)
    return n_outliers
# Report the outlier count of every feature column. The column name is
# printed first so each "Outliers: N" line can be attributed to its feature.
for col in df.drop('Outcome',axis=1).columns:
    print(col, end=" - ")
    iqr_outliers(df[col])

    The data contains outliers, so we will replace them with the median, as removing the rows may lead to the loss of important data in other attributes.
    

In [None]:
# Replace every value outside the 1.5 * IQR fences with the column median.
for col in df.drop('Outcome', axis=1).columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    # Deliberately not named `iqr`: at module level that would overwrite
    # the `iqr` function imported from scipy.stats.
    iqr_width = q3 - q1
    lower_tail = q1 - 1.5 * iqr_width
    upper_tail = q3 + 1.5 * iqr_width

    is_outlier = (df[col] < lower_tail) | (df[col] > upper_tail)
    # The median is taken from the column as it is before replacement.
    df[col] = np.where(is_outlier, df[col].median(), df[col])

<b>Finding correlation among attributes</b>

In [None]:
# Flag attribute pairs whose correlation is strong enough to be redundant.
corr = df.corr()
plt.figure(figsize=(10, 10))
plt.title('Correlation')
# Compare the absolute value so strong negative correlations are flagged
# too; a plain `corr > 0.90` silently ignores them.
sns.heatmap(corr.abs() > 0.90, annot=True, square=True)

<b>Scaling</b>

In [None]:
scaler=StandardScaler()

In [None]:
scaled_df=scaler.fit_transform(df.drop('Outcome',axis=1))

In [None]:
# Wrap the scaled array back into a DataFrame. The original index is kept
# because `df` has gaps after drop_duplicates(); a fresh RangeIndex would no
# longer line up label-wise with `df.Outcome`.
scaled_df = pd.DataFrame(
    scaled_df,
    columns=df.drop('Outcome', axis=1).columns,
    index=df.index,
)

In [None]:
scaled_df.head()

# Model Development

In [None]:
x=scaled_df
y=df.Outcome

In [None]:
# Split the *unscaled* features, then fit the scaler on the training rows
# only. Fitting StandardScaler on the full dataset before splitting lets
# test-set mean/variance leak into preprocessing and inflates the scores.
# The split itself is unchanged (same rows, same random_state).
features = df.drop('Outcome', axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=2
)
split_scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(
    split_scaler.transform(x_train), columns=features.columns, index=x_train.index
)
x_test = pd.DataFrame(
    split_scaler.transform(x_test), columns=features.columns, index=x_test.index
)

In [None]:
model_list=[]
model_f1_score=[]
model_accuracy_score=[]

### LogisticRegression

In [None]:
model_list.append('LogisticRegression')
lm=LogisticRegression()

In [None]:
lm.fit(x_train,y_train)

In [None]:
yhat_lm=lm.predict(x_test)

In [None]:
lm_score=f1_score(y_test,yhat_lm)
model_f1_score.append(lm_score)
lm_score

In [None]:
lm_accuracy=accuracy_score(y_test,yhat_lm)
model_accuracy_score.append(lm_accuracy)
lm_accuracy

In [None]:
print(classification_report(y_test,yhat_lm))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_lm),annot=True,fmt='',cmap='YlGnBu')

### DecisionTreeClassifier

In [None]:
model_list.append('DecisionTreeClassifier')
# Seed the tree so the reported scores are reproducible between runs
# (same seed as the train/test split).
tree = DecisionTreeClassifier(random_state=2)

In [None]:
tree.fit(x_train,y_train)

In [None]:
yhat_tree=tree.predict(x_test)

In [None]:
tree_score=f1_score(y_test,yhat_tree)
model_f1_score.append(tree_score)
tree_score

In [None]:
tree_accuracy=accuracy_score(y_test,yhat_tree)
model_accuracy_score.append(tree_accuracy)
tree_accuracy

In [None]:
print(classification_report(y_test,yhat_tree))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_tree),annot=True,fmt='',cmap='YlGnBu')

### RandomForestClassifier

In [None]:
model_list.append('RandomForestClassifier')
# Seed the forest so the reported scores are reproducible between runs
# (same seed as the train/test split).
forest = RandomForestClassifier(random_state=2)

In [None]:
forest.fit(x_train,y_train)

In [None]:
yhat_forest=forest.predict(x_test)

In [None]:
forest_score=f1_score(y_test,yhat_forest)
model_f1_score.append(forest_score)
forest_score

In [None]:
forest_accuracy=accuracy_score(y_test,yhat_forest)
model_accuracy_score.append(forest_accuracy)
forest_accuracy

In [None]:
print(classification_report(y_test,yhat_forest))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_forest),annot=True,fmt='',cmap='YlGnBu')

### SVC

In [None]:
model_list.append('SVC')
svc=SVC()

In [None]:
svc.fit(x_train,y_train)

In [None]:
yhat_svc=svc.predict(x_test)

In [None]:
svc_score=f1_score(y_test,yhat_svc)
model_f1_score.append(svc_score)
svc_score

In [None]:
svc_accuracy=accuracy_score(y_test,yhat_svc)
model_accuracy_score.append(svc_accuracy)
svc_accuracy

In [None]:
print(classification_report(y_test,yhat_svc))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_svc),annot=True,fmt='',cmap='YlGnBu')

### KNeighborsClassifier

In [None]:
model_list.append('KNeighborsClassifier')
neighbour=KNeighborsClassifier()

In [None]:
neighbour.fit(x_train,y_train)

In [None]:
yhat_neighbour=neighbour.predict(x_test)

In [None]:
neighbour_score=f1_score(y_test,yhat_neighbour)
model_f1_score.append(neighbour_score)
neighbour_score

In [None]:
neighbour_accuracy=accuracy_score(y_test,yhat_neighbour)
model_accuracy_score.append(neighbour_accuracy)
neighbour_accuracy

In [None]:
print(classification_report(y_test,yhat_neighbour))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_neighbour),annot=True,fmt='',cmap='YlGnBu')

### GaussianNB

In [None]:
model_list.append('GaussianNB')
naive=GaussianNB()

In [None]:
naive.fit(x_train,y_train)

In [None]:
yhat_naive=naive.predict(x_test)

In [None]:
naive_score=f1_score(y_test,yhat_naive)
model_f1_score.append(naive_score)
naive_score

In [None]:
naive_accuracy=accuracy_score(y_test,yhat_naive)
model_accuracy_score.append(naive_accuracy)
naive_accuracy

In [None]:
print(classification_report(y_test,yhat_naive))

In [None]:
sns.heatmap(confusion_matrix(y_test,yhat_naive),annot=True,fmt='',cmap='YlGnBu')

# Conclusion

### F1-Score

In [None]:
# Bar chart comparing the test-set F1 score of every model.
fig, ax = plt.subplots(figsize=(10, 8))
# x and y must be keywords: seaborn >= 0.12 rejects them positionally.
sns.barplot(x=model_list, y=model_f1_score, ax=ax)
ax.set_title("F1 Score of  Test Data", pad=20)
ax.set_xlabel("Models", labelpad=20)
ax.set_ylabel("F1_Score", labelpad=20)
plt.xticks(rotation=90)

# Write each score as a percentage just above its bar, centred on the bar
# instead of using a hard-coded horizontal offset.
for p in ax.patches:
    height = p.get_height()
    ax.annotate('{:.0%}'.format(height),
                (p.get_x() + p.get_width() / 2, p.get_y() + height + 0.01),
                ha='center')

### Accuracy Score

In [None]:
# Bar chart comparing the test-set accuracy of every model.
fig, ax = plt.subplots(figsize=(10, 8))
# x and y must be keywords: seaborn >= 0.12 rejects them positionally.
sns.barplot(x=model_list, y=model_accuracy_score, ax=ax)
ax.set_title("Accuracy of Models on Test Data", pad=20)
ax.set_xlabel("Models", labelpad=20)
ax.set_ylabel("Accuracy", labelpad=20)
plt.xticks(rotation=90)

# Write each score as a percentage just above its bar, centred on the bar
# instead of using a hard-coded horizontal offset.
for p in ax.patches:
    height = p.get_height()
    ax.annotate('{:.0%}'.format(height),
                (p.get_x() + p.get_width() / 2, p.get_y() + height + 0.01),
                ha='center')

Among all the models that have been implemented, SVC gives the highest F1-score and accuracy score.