# **Import the dataset and the libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the diabetes CSV into the working frame and keep an untouched copy
# (`pima`) of the data exactly as loaded.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
pima_data = pd.read_csv(DATA_PATH)
pima = pima_data.copy()
pima_data.head()

# **Statistical Overview of dataset** 

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
pima_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
pima_data.describe()

In [None]:
# (number of rows, number of columns) of the dataset.
pima_data.shape

In [None]:
# Count the rows whose value is exactly 0 in each measurement column.
# The label is built from the column name, which keeps it in sync with the
# data (the hand-written label previously misspelled "Glucose").
zero_check_columns = ['Glucose', 'BloodPressure', 'BMI', 'SkinThickness', 'Insulin']
for column in zero_check_columns:
    zero_count = (pima_data[column] == 0).sum()
    print('Count of people having {} level = 0 : '.format(column), zero_count)

From the counts above we can see that several columns contain 0 values. A value of 0 is not plausible for these measurements, so it most likely represents a missing value. We therefore replace these zeros with np.nan before continuing the analysis.

In [None]:
# Treat 0 in these measurement columns as "missing" by swapping it for NaN.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
pima_data[zero_as_missing] = pima_data[zero_as_missing].replace(0, np.nan)

In [None]:
# Number of missing (NaN) values in each column.
pima_data.isnull().sum()

In [None]:
# Share of each Outcome class.
# value_counts() orders its result by frequency, not by class value, so the
# wedge labels are derived from the class found in the index (1 = diabetic,
# 0 = not diabetic) instead of being assigned by position.
outcome_counts = pima_data['Outcome'].value_counts()
labels = outcome_counts.index.map({0: 'Not Diabetic', 1: 'Diabetic'})
plt.pie(outcome_counts, labels=labels, autopct='%0.02f%%')
plt.legend()
plt.show()

In [None]:
# Per-Outcome median of each column that contains missing values.
by_outcome = pima_data.groupby('Outcome')
glucose = by_outcome['Glucose'].median()
bloodpress = by_outcome['BloodPressure'].median()
bmi = by_outcome['BMI'].median()
skinthickness = by_outcome['SkinThickness'].median()
insulin = by_outcome['Insulin'].median()

In [None]:
# Turn each median Series into a one-column DataFrame (indexed by Outcome).
glucose_df = glucose.to_frame()
bloodpress_df = bloodpress.to_frame()
bmi_df = bmi.to_frame()
skinthickness_df = skinthickness.to_frame()
insulin_df = insulin.to_frame()

In [None]:
# Place the per-Outcome medians side by side: one row per Outcome class,
# one column per feature.
median_frames = [glucose_df, bloodpress_df, bmi_df, skinthickness_df, insulin_df]
pima_df = pd.concat(median_frames, axis='columns')
pima_df

In [None]:
# Fill each missing value with the median of its column among rows that share
# the same Outcome.
# The medians are computed here, directly from pima_data, so this cell does not
# rely on a lookup table held in another variable, and a single vectorised
# fillna replaces one hand-written assignment per column/class pair.
imputed_columns = ['Insulin', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
# transform('median') returns a frame shaped like the input in which every row
# holds the (NaN-skipping) median of its own Outcome group.
class_medians = pima_data.groupby('Outcome')[imputed_columns].transform('median')
pima_data[imputed_columns] = pima_data[imputed_columns].fillna(class_medians)
pima_data.head(20)

In [None]:
# Re-check the number of missing values per column after the fill step.
pima_data.isnull().sum()

Now that the missing values in each column have been replaced with that column's median for the matching Outcome class, we'll draw a boxplot for every feature to inspect its spread and outliers.

In [None]:
# One boxplot per feature on a 4x2 grid.
# The column is passed as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional column name together with `data=` fails.
boxplot_columns = [
    'Pregnancies', 'Glucose',
    'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age',
]
fig, ax0 = plt.subplots(4, 2, figsize=(20, 15))
# ax0.flat walks the grid row by row: [0][0], [0][1], [1][0], ...
for column, axis in zip(boxplot_columns, ax0.flat):
    sns.boxplot(x=column, data=pima_data, ax=axis)


From the plots above, DiabetesPedigreeFunction, Insulin and SkinThickness show more outliers than the other features that have outliers (Age, Pregnancies, BMI, BloodPressure). The boxplots also help to visualize how the values of each feature are spread out.

In [None]:
# Histogram with a KDE overlay for every feature on a 4x2 grid.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement seaborn recommends for the same density histogram + KDE picture.
distribution_columns = [
    'Pregnancies', 'Glucose',
    'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age',
]
fig, ax1 = plt.subplots(4, 2, figsize=(20, 15))
# ax1.flat walks the grid row by row: [0][0], [0][1], [1][0], ...
for column, axis in zip(distribution_columns, ax1.flat):
    sns.histplot(pima_data[column], kde=True, stat='density', ax=axis)


BloodPressure, BMI and Glucose are approximately normally distributed, whereas DiabetesPedigreeFunction, Age, Pregnancies and the remaining features are positively skewed.

In [None]:
# Pregnancies against Age, coloured and marked by Outcome.
# The seaborn style is set before the figure is created: style settings are
# picked up when the axes are built, so setting them after plotting would
# leave this chart without the grid style.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,10))
sns.scatterplot(x='Pregnancies',y='Age',data=pima_data,hue='Outcome',style='Outcome')
plt.title('Pregnancies vs Age')
plt.show()

In [None]:
# Glucose against Age, coloured and marked by Outcome.
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own; the commented-out plot call that
# referenced an undefined axes grid has been removed.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,10))
sns.scatterplot(x='Glucose',y='Age',data=pima_data,hue='Outcome',style='Outcome')
plt.title('Glucose vs Age')
plt.show()

From the plot above, we can infer that people with Glucose<=120 and Age<=30 can mostly be considered healthy.

In [None]:
# BloodPressure against Age, coloured and marked by Outcome.
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,10))
sns.scatterplot(x='BloodPressure',y='Age',data=pima_data,hue='Outcome',style='Outcome')
plt.title('BloodPressure vs Age')
plt.show()

BloodPressure<=80 and Age<=30 can be considered healthy.

In [None]:
# BMI against Age, coloured and marked by Outcome.
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,10))
sns.scatterplot(x='BMI',y='Age',data=pima_data,hue='Outcome',style='Outcome')
plt.title('BMI vs Age')
plt.show()

People within the age of 30-35 with a BMI of 30-40 can be considered as healthy.

In [None]:
# Insulin against Age, coloured and marked by Outcome.
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,8))
sns.scatterplot(x='Insulin',y='Age',data=pima_data,hue='Outcome',style='Outcome')
plt.title('Insulin vs Age')
plt.show()

Most people whether diabetic or not, lie within the insulin range of 0-200.

In [None]:
# SkinThickness against BMI, coloured and marked by Outcome (slightly
# transparent markers so overlapping points stay visible).
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,8))
sns.scatterplot(x='SkinThickness',y='BMI',data=pima_data,hue='Outcome',style='Outcome',alpha=0.8)
plt.title('SkinThickness vs BMI')
plt.show()

Healthy people tend to have a skin thickness of around 25 and a BMI of 30-35.

In [None]:
# BloodPressure against Glucose, coloured and marked by Outcome.
# The style is set before the figure is created so that it applies to this
# chart even when the cell is run on its own.
sns.set(style='whitegrid')
fig=plt.figure(figsize=(15,10))
sns.scatterplot(x='BloodPressure',y='Glucose',data=pima_data,hue='Outcome',style='Outcome')
plt.title('BloodPressure vs Glucose')
plt.show()

From here also, we can infer that a healthy person's blood pressure should be 80 and glucose should be 120.

# **Distribution of diabetic people age-wise**

In [None]:
# Bucket every row by age and count the rows in each bucket.
# pd.cut bins the whole column in one vectorised call. This replaces a
# row-by-row loop built on Series.append (removed in pandas 2.0) and
# Series.bool() (deprecated).
age_edges = [-np.inf, 30, 40, 50, 60, np.inf]
age_labels = ['21-30', '31-40', '41-50', '51-60', '>60']
# Intervals are closed on the right, i.e. Age<=30 -> '21-30', Age<=40 -> '31-40', ...
# astype(str) stores plain string labels rather than a categorical column.
age = pd.cut(pima_data['Age'], bins=age_edges, labels=age_labels).astype(str)
pima_data['Age_Range'] = age

# Number of rows per age bucket.
df = pima_data.groupby('Age_Range')[['Outcome']].count().reset_index()
df

In [None]:
# Diabetic count and diabetic rate per age bucket.
# Both figures come from a single groupby, so every rate is computed against
# its own bucket. Dividing two separately grouped frames row by row pairs the
# wrong buckets as soon as one bucket contains no diabetic rows.
age_df = (
    pima_data['Outcome'].eq(1)
    .groupby(pima_data['Age_Range'])
    .agg(['sum', 'mean'])          # sum = diabetic rows, mean = diabetic fraction
    .reset_index()
    .rename(columns={'sum': 'Outcome', 'mean': '%diabetic'})
)
age_df['%diabetic'] = age_df['%diabetic'] * 100
age_df['%non-diabetic'] = 100 - age_df['%diabetic']

# NOTE(review): a pie rescales its inputs to sum to 100, so the percentages
# printed on the wedges are each bucket's share of the summed rates, not the
# rate itself - consider a bar chart if the per-bucket rate should be read off.
fig,(ax5,ax6)=plt.subplots(1,2,figsize=(15,10))
ax5.pie(x=age_df['%diabetic'].values,labels=age_df['Age_Range'],autopct='%0.02f%%')
ax5.set_title('% of diabetic person based on age')
ax6.pie(x=age_df['%non-diabetic'].values,labels=age_df['Age_Range'],autopct='%0.02f%%')
ax6.set_title('% of non-diabetic person based on age')
plt.legend()
plt.show()

# **Distribution of diabetic people pregnancy-wise**

In [None]:
# Bucket every row by number of pregnancies and count the rows in each bucket.
# pd.cut bins the whole column in one vectorised call. This replaces a
# row-by-row loop built on Series.append (removed in pandas 2.0) and
# Series.bool() (deprecated).
pregnancy_edges = [-np.inf, 4, 9, 14, np.inf]
pregnancy_labels = ['0-4', '5-9', '10-14', '>14']
# Intervals are closed on the right, i.e. Pregnancies<=4 -> '0-4', <=9 -> '5-9', ...
# astype(str) stores plain string labels rather than a categorical column.
pregancy = pd.cut(
    pima_data['Pregnancies'], bins=pregnancy_edges, labels=pregnancy_labels
).astype(str)
pima_data['Pregancy_Range'] = pregancy

# Number of rows per pregnancy bucket.
preg_df = pima_data.groupby('Pregancy_Range')[['Outcome']].count().reset_index()
preg_df

In [None]:
# Diabetic count and diabetic rate per pregnancy bucket.
# Both figures come from a single groupby, so every rate is computed against
# its own bucket. Dividing two separately grouped frames row by row pairs the
# wrong buckets as soon as one bucket contains no diabetic rows.
pregancy_df = (
    pima_data['Outcome'].eq(1)
    .groupby(pima_data['Pregancy_Range'])
    .agg(['sum', 'mean'])          # sum = diabetic rows, mean = diabetic fraction
    .reset_index()
    .rename(columns={'sum': 'Outcome', 'mean': '%diabetic'})
)
pregancy_df['%diabetic'] = pregancy_df['%diabetic'] * 100
pregancy_df['Non-Diabetic%'] = 100 - pregancy_df['%diabetic']

# NOTE(review): a pie rescales its inputs to sum to 100, so the percentages
# printed on the wedges are each bucket's share of the summed rates, not the
# rate itself - consider a bar chart if the per-bucket rate should be read off.
fig,(ax3,ax4)=plt.subplots(1,2,figsize=(15,10))
ax3.pie(x=pregancy_df['%diabetic'].values,labels=pregancy_df['Pregancy_Range'],autopct='%0.02f%%')
ax3.set_title('% of diabetic Pregant women')
ax4.pie(x=pregancy_df['Non-Diabetic%'].values,labels=pregancy_df['Pregancy_Range'],autopct='%0.02f%%')
ax4.set_title('% of non-diabetic Pregant women')
plt.legend()
plt.show()

In [None]:
# Pairwise correlation heatmap.
# pima_data also holds the string bucket columns (Age_Range, Pregancy_Range);
# corr() raises on non-numeric columns in pandas >= 2.0, so the correlation is
# computed on the numeric columns only.
fig = plt.figure(figsize=(15, 10))
numeric_corr = pima_data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)

1. Pregnancies and Age are correlated to each other.
1. Glucose, BMI and Age are somewhat correlated to Outcome.
1. SkinThickness is correlated to Insulin and BMI.
1. Glucose and BloodPressure are positively correlated to Age

# **Standardization of Data**

Standardization rescales every feature so that it has a mean of 0 and a standard deviation of 1. This can be done with the StandardScaler class from the sklearn.preprocessing module, but here I have done it by creating my own function.

In [None]:
def z_scores(pima_df):
    """Return the feature columns of ``pima_df`` standardized to z-scores.

    The ``Outcome`` column is dropped; every remaining column is centred on its
    mean and divided by its sample standard deviation (pandas' default,
    ``ddof=1``). The input frame is not modified.

    Parameters
    ----------
    pima_df : pandas.DataFrame
        Numeric feature columns plus an ``Outcome`` column.

    Returns
    -------
    pandas.DataFrame
        Standardized features with the same index as the input and without
        the ``Outcome`` column.

    Raises
    ------
    KeyError
        If ``pima_df`` has no ``Outcome`` column.
    """
    features = pima_df.drop('Outcome', axis=1)
    spread = features.std()
    # A constant column has zero spread (and a single row has an undefined
    # one). Dividing by 1 in those cases maps the column to 0 instead of
    # filling it with NaN/inf.
    spread = spread.where(spread > 0, 1.0)
    return (features - features.mean()) / spread

In [None]:
# Standardize the feature columns and show the result.
# NOTE(review): this standardizes `pima`, the copy taken immediately after
# loading, not `pima_data`. The zero -> NaN replacement and the median fill
# performed on `pima_data` are therefore not reflected in these features.
# Confirm whether that is intended; the median fill is computed per Outcome
# class, so feeding it to the model would let the label influence the features.
# This also rebinds `pima_df`, which earlier held the per-Outcome median table.
pima_df=z_scores(pima)
pima_df

# **Splitting the data into training and test set.**

In [None]:
# Features are the standardized columns; the target is the Outcome label.
X, y = pima_df, pima_data['Outcome']
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print('X_train : {} and y_train : {}'.format(*train_shapes))
print('X_test : {} and y_test : {}'.format(*test_shapes))

# **Model Creation and Evaluation**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier, allowed up to 200 solver iterations,
# fitted on the training split.
MAX_ITERATIONS = 200
logregressor = LogisticRegression(max_iter=MAX_ITERATIONS)
logregressor.fit(X_train, y_train)

In [None]:
# Predicted class label for every row of the test split.
y_pred=logregressor.predict(X_test)
y_pred

In [None]:
# Class labels the fitted model distinguishes between.
logregressor.classes_

It is a binary classification problem

In [None]:
# Report the fitted coefficients (one per feature) and the intercept.
slope, intercept = logregressor.coef_, logregressor.intercept_
print('Slope : {} and intercept : {}'.format(slope, intercept))

# **Confusion Matrix**

To know about the accuracy we'll use confusion matrix. In the case of binary classification, the confusion matrix shows the numbers of the following:

* True negatives in the upper-left position
* False negatives in the lower-left position
* False positives in the upper-right position
* True positives in the lower-right position

In [None]:
# Confusion matrix of actual (rows) vs predicted (columns) labels on the test split.
cm=confusion_matrix(y_test, y_pred)
cm

Plot the confusion matrix

In [None]:
# Draw the 2x2 confusion matrix as an image and write each count in its cell.
fig, ax7 = plt.subplots(figsize=(8, 8))
ax7.imshow(cm)
ax7.grid(False)
ax7.set_xticks((0, 1))
ax7.set_xticklabels(('Predicted 0s', 'Predicted 1s'))
ax7.set_yticks((0, 1))
ax7.set_yticklabels(('Actual 0s', 'Actual 1s'))
# Explicit y-limits with the larger value first keep row 0 at the top and
# leave half a cell of margin on both sides.
ax7.set_ylim(1.5, -0.5)
# text() takes (x, y), i.e. (column, row).
for row, col in np.ndindex(2, 2):
    ax7.text(col, row, cm[row, col], ha='center', va='center', color='red')
plt.show()

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Write the working frame to submission.csv in the current directory.
# NOTE(review): this saves pima_data itself (including the Age_Range and
# Pregancy_Range columns added above, plus the index as the first column),
# not the model predictions - confirm this is the intended output.
pima_data.to_csv('submission.csv')