In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # for aesthetic visualization 
import pandasql.sqldf as pdsql  # to execute SQL queries on DataFrame
import matplotlib.pyplot as plt # for basic visualization


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Pima Indians diabetes CSV from the Kaggle input directory into a DataFrame.
pima_data = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
pima_data.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
pima_data.shape

In [None]:
# Check the data type of each column.
# Author's observation from the output: every column is of a numerical type.
pima_data.dtypes

In [None]:
# Count the null values in each column.
# NOTE: this only detects real nulls; the later cells show that missing measurements
# in this dataset appear as zeros instead, which this check does not catch.
pima_data.isnull().sum()

In [None]:
# Let's check the number of healthy people and diabetic patients in the data set.
fig = plt.figure(figsize = (10,5))

# Bar plot of the class counts.
# countplot is used because barplot with y=pima_data.index would plot the *mean row
# index* of each class, which says nothing about how many subjects each class has.
ax1 = fig.add_subplot(121)
sns.countplot(data = pima_data, x = 'Outcome', ax = ax1)
ax1.set_ylabel('Number of subjects')

# Pie chart of the same counts.
# ORDER BY guarantees row 0 is Outcome = 0 (healthy) and row 1 is Outcome = 1
# (diabetic), so the hard-coded labels below always line up with the right slice.
ax2 = fig.add_subplot(122)
outcome_count = pdsql('SELECT COUNT(Outcome) as count FROM pima_data GROUP BY Outcome ORDER BY Outcome')
ax2.pie(outcome_count['count'],
        explode = (0,0),labels =['Healthy ','Diabetic'],
        autopct='%1.1f%%',
        shadow =True)

In [None]:
# Separate the records by outcome: rows with Outcome = 1 are the diabetic patients,
# rows with Outcome = 0 are the healthy subjects.
# Each SQL query is executed against the pima_data DataFrame via pdsql.
pima_patient = pdsql('SELECT * FROM pima_data where Outcome =1')
pima_healthy = pdsql('SELECT * FROM pima_data where Outcome =0')

# [What is Diabetes?](https://www.niddk.nih.gov/health-information/diabetes/overview/what-is-diabetes)
**Diabetes is a disease that occurs when your blood glucose, also called blood sugar, *is too high*. Blood glucose is your main source of energy and comes from the food you eat. Insulin, a hormone made by the pancreas, helps glucose from food get into your cells to be used for energy. Sometimes your body *doesn’t make enough—or any—insulin or doesn’t use insulin well*. Glucose then stays in your blood and doesn’t reach your cells.**



In [None]:
# From the definition above we can expect that a diabetic patient will have high blood sugar (in our case, Glucose) levels
# and low Insulin levels, and the opposite for healthy subjects.
# Let's see whether the data shows the same pattern.

In [None]:
# From the distribution of glucose levels we can see that diabetic patients have a
# higher range of glucose levels and healthy subjects have a lower range.
# This shows that the data agrees strongly with the definition we got earlier.
# We can also see that some glucose levels are zero. These have to be erroneous samples.

fig = plt.figure(figsize = (16,8))
fig.suptitle('Distribution of glucose levels')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['Glucose'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['Glucose'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('Glucose level')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'Glucose', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('Glucose level')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'Glucose', ax = ax3)

# Sample count for the first 20 distinct glucose values returned by the query.
ax4 = fig.add_subplot(224)
glu_data = pdsql("SELECT Glucose, count(Glucose) as Count FROM pima_data GROUP BY Glucose limit 20")
sns.barplot(data = glu_data, x = 'Glucose', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# Through the plots we can see that many insulin levels are zero.
# These have to be erroneous samples.
# NOTE(review): the density curve may extend below zero; that is presumably a
# smoothing artefact of the KDE rather than negative measurements - confirm with
# pima_data['Insulin'].min().
fig = plt.figure(figsize = (16,8))
fig.suptitle('Distribution of Insulin levels')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['Insulin'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['Insulin'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('Insulin level')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'Insulin', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('Insulin level')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'Insulin', ax = ax3)

# Sample count for the first 20 distinct insulin values returned by the query.
ax4 = fig.add_subplot(224)
insu_data = pdsql("SELECT Insulin, count(Insulin) as Count FROM pima_data GROUP BY Insulin limit 20")
sns.barplot(data = insu_data, x = 'Insulin', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# Here, through the plots, we can see that the maximum number of pregnancies
# for any subject is 17.
fig = plt.figure(figsize = (12,8))
fig.suptitle('Distribution of Pregnancies')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['Pregnancies'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['Pregnancies'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('Pregnancies')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'Pregnancies', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('Pregnancies')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'Pregnancies', ax = ax3)

# Sample count for every distinct number of pregnancies.
ax4 = fig.add_subplot(224)
preg_data = pdsql("SELECT Pregnancies, count(Pregnancies) as Count FROM pima_data GROUP BY Pregnancies")
sns.barplot(data = preg_data, x = 'Pregnancies', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# Further analysis of Pregnancies: number of pregnancies plotted against age.
# Author's reading of the plot: a few subjects aged about twenty are recorded
# with 4, 6 and 8 pregnancies.
sns.scatterplot(x='Pregnancies', y='Age', data=pima_data)

In [None]:
# Blood pressure is the force of your blood pushing against the walls of your arteries.
# In a few samples blood pressure is given as zero, which cannot be true.
# These samples can also be considered erroneous.

fig = plt.figure(figsize = (14,8))
fig.suptitle('Distribution of BloodPressure levels')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['BloodPressure'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['BloodPressure'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('BloodPressure')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'BloodPressure', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('BloodPressure')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'BloodPressure', ax = ax3)

# Sample count for the first 20 distinct blood-pressure values returned by the
# query, sorted by value before plotting.
ax4 = fig.add_subplot(224)
bp_data = pdsql("SELECT BloodPressure, count(BloodPressure) as Count FROM pima_data GROUP BY BloodPressure LIMIT 20")
sns.barplot(data = bp_data.sort_values(by = 'BloodPressure'), x = 'BloodPressure', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# SkinThickness is the triceps skin fold thickness (mm). In a few samples the
# thickness is given as zero. These samples can also be considered erroneous.
fig = plt.figure(figsize = (14,8))
fig.suptitle('Distribution of SkinThickness')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['SkinThickness'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['SkinThickness'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('SkinThickness')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'SkinThickness', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('SkinThickness')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'SkinThickness', ax = ax3)

# Sample count for the first 20 distinct skin-thickness values returned by the
# query, sorted by value before plotting.
ax4 = fig.add_subplot(224)
st_data = pdsql("SELECT SkinThickness, count(SkinThickness) as Count FROM pima_data GROUP BY SkinThickness LIMIT 20")
sns.barplot(data = st_data.sort_values(by = 'SkinThickness'), x = 'SkinThickness', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

# What is BMI ?
**Body mass index (BMI) is a measure of body fat based on height and weight that applies to adult men and women. The lower threshold is 15 and the higher threshold is 40.**

[BMI Wikipedia](https://en.wikipedia.org/wiki/Body_mass_index)

In [None]:
# From the above definition we can at least be sure that BMI cannot be zero.
# In the plots we can see a few samples where BMI is zero, and a few around 60-70.
# The author treats these as erroneous samples.
fig = plt.figure(figsize = (14,8))
fig.suptitle('Distribution of BMI')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['BMI'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['BMI'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('BMI')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'BMI', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('BMI')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'BMI', ax = ax3)

# Sample count for the first 20 distinct BMI values returned by the query,
# sorted by value before plotting.
ax4 = fig.add_subplot(224)
bmi_data = pdsql("SELECT BMI, count(BMI) as Count FROM pima_data GROUP BY BMI LIMIT 20")
sns.barplot(data = bmi_data.sort_values(by = 'BMI'), x = 'BMI', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# All subjects in the dataset are above age 20.
# The plots also show that most of the subjects lie in the 20-40 age band.
fig = plt.figure(figsize = (14,8))
fig.suptitle('Distribution of Age')

# Overlaid per-class distributions. The axes is passed explicitly and a legend is
# drawn; without the legend the two curves cannot be told apart.
ax1 = fig.add_subplot(221)
sns.distplot(pima_patient['Age'], label = 'Diabetic', ax = ax1)
sns.distplot(pima_healthy['Age'], label = 'Healthy', ax = ax1)
ax1.set_xlabel('Age')
ax1.legend()

# Horizontal box plot per outcome class.
ax2 = fig.add_subplot(222)
sns.boxplot(data = pima_data, x = 'Age', y = 'Outcome', ax = ax2, orient = 'h')
ax2.set_xlabel('Age')

# Every individual sample, grouped by outcome.
ax3 = fig.add_subplot(223)
sns.swarmplot(data = pima_data, x = 'Outcome', y = 'Age', ax = ax3)

# Sample count for the first 20 distinct ages returned by the query,
# sorted by value before plotting.
ax4 = fig.add_subplot(224)
age_data = pdsql("SELECT Age, count(Age) as Count FROM pima_data GROUP BY Age LIMIT 20")
sns.barplot(data = age_data.sort_values(by = 'Age'), x = 'Age', y = 'Count', ax = ax4)

fig.tight_layout(pad=4.0)

In [None]:
# Columns in which a zero is a placeholder for a missing measurement (see the plots above).
err_col=['Glucose','Insulin','BloodPressure','SkinThickness','BMI']

# Replace each zero with the mean of the *valid* (non-zero) values of the same class.
# The zeros are excluded from the mean: including them would bias the replacement
# value downwards by exactly the placeholders it is supposed to replace.
# pima_patient / pima_healthy are updated in place, as before.
# NOTE(review): imputing per outcome class uses the target label to build features,
# and it happens before the train/validation/test split - this can leak label
# information into the model evaluation. Consider imputing from training data only.
for col in err_col:
    for group in (pima_patient, pima_healthy):
        valid_mean = group.loc[group[col] != 0, col].mean()
        group[col] = group[col].replace({0: valid_mean})

# ignore_index avoids duplicate row labels (both class frames are numbered from 0).
correct_pima_data = pd.concat([pima_patient,pima_healthy], ignore_index=True)

In [None]:
# These plots show the average values of 'Glucose','Insulin','BloodPressure',
# 'SkinThickness' and 'BMI' for healthy and diabetic subjects.
# The corrected data is used so that the zero placeholders found earlier do not
# drag the averages down.
fig = plt.figure(figsize= (12,12))

# Two plots per row; enough rows to hold every column in err_col.
n_rows = -(-len(err_col) // 2)
for position, col in enumerate(err_col, start=1):
    ax = fig.add_subplot(n_rows, 2, position)
    ax.set_title('Average value of '+col +' in Diabetic and Healthy')
    d = correct_pima_data.groupby(by= 'Outcome')[[col]].mean()
    sns.barplot(data = d, x = d.index, y = col, ax = ax)
    # groupby sorts by Outcome, so the first bar is Outcome 0 (healthy) and the
    # second is Outcome 1 (diabetic). The labels were previously the wrong way round.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Healthy','Diabetic'])
fig.tight_layout(pad=4.0)


In [None]:
# Let's also check the correlation matrix.
# Author's reading: all features show some correlation with Outcome, with
# BloodPressure and the pedigree function having the lowest and Glucose the highest.

# Create the figure first, then title it. Calling fig.suptitle before creating the
# figure would title whatever figure the previous cell left in `fig`.
fig, ax = plt.subplots(figsize= (10,8))
fig.suptitle('Correlation Matrix')

# Compute the correlations once and reuse them for both the mask and the heatmap.
corr = correct_pima_data.corr()
# Hide the upper triangle (including the diagonal); it mirrors the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,annot =True,linewidths=.8,cbar =False,mask = mask,cmap = 'YlOrBr',ax = ax)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 

In [None]:
# Predictor columns the model is trained on.
features = ['Glucose','Insulin','SkinThickness','BMI']

# Two-stage split giving 60% training, 20% validation and 20% test data:
# first hold out 40% of the rows, then cut that hold-out set in half.
predictors = correct_pima_data[features]
target = correct_pima_data['Outcome']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    predictors, target, test_size=0.40, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=42)

In [None]:
# Train a logistic regression model and evaluate it on the validation set.
model = LogisticRegression(max_iter = 200)
model.fit(X_train,y_train)
y_hat_val = model.predict(X_val)
print('F1_score : ',f1_score(y_val,y_hat_val))
print('Precision_score : ',precision_score(y_val,y_hat_val))
print('Recall_score : ',recall_score(y_val,y_hat_val))

# Confusion matrix: rows are the true classes, columns the predicted classes.
# fmt='d' prints the counts as integers; the default format would render any
# count of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(confusion_matrix(y_val,y_hat_val),annot = True,fmt = 'd',linewidths=2,cbar =False)
ax.set_xlabel('Predicted Outcome')
ax.set_ylabel('Actual Outcome')

In [None]:
# Evaluate the model fitted in the previous cell on the held-out test set.
y_hat_test = model.predict(X_test)
print('F1_score : ',f1_score(y_test,y_hat_test))
print('Precision_score : ',precision_score(y_test,y_hat_test))
print('Recall_score : ',recall_score(y_test,y_hat_test))

# Confusion matrix: rows are the true classes, columns the predicted classes.
# fmt='d' prints the counts as integers; the default format would render any
# count of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(confusion_matrix(y_test,y_hat_test),annot = True,fmt = 'd',linewidths=2,cbar =False)
ax.set_xlabel('Predicted Outcome')
ax.set_ylabel('Actual Outcome')