In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="background-color:#c9c8c8;
            border-radius:10px;
            letter-spacing:0.5px;
            display:fill;
            font-size:110%;
            margin: 5px;">
<p style="padding:5px;">
    
**Context:** This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

</p>
</div>

In [None]:
# import library 

import seaborn as sns
import matplotlib.pyplot as plt
import scipy 

# Use seaborn's "whitegrid" theme (white background with grid lines) for the plots below
sns.set_style("whitegrid")

## Selection of data

In [None]:
# Load the Pima Indians diabetes CSV. header=None tells pandas the file has
# no header row, so columns get integer labels until they are renamed.
DATA_PATH = "../input/pima-indians-diabetes/pima-indians-diabetes.csv"
datadf = pd.read_csv(DATA_PATH, header=None)

In [None]:
# Descriptive labels for the nine columns, listed in file order.
datacol = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

# Replace the default integer column labels with the descriptive names.
datadf.columns = datacol

In [None]:
# Display the column index to confirm the labels now on the frame
datadf.columns

# Column explanation :

1. Pregnancies : No. of times pregnant
2. Glucose : Plasma glucose concentration at 2 hours in an oral glucose tolerance test
3. BloodPressure : Diastolic blood Pressure(mm Hg)
4. SkinThickness : Triceps skin fold thickness (mm)
5. Insulin : 2 hours serum insulin (muU/ml)
6. BMI : Body mass index(weight in kg/(height in m)^2)
7. DiabetesPedigreeFunction : Diabetes pedigree function
8. Age : Age(years)
9. Outcome : Class variable(0 or 1)



In [None]:
# Preview the first five rows to eyeball the values under the new column names
datadf.head()

In [None]:
# Dataset dimensions as (number of rows, number of columns)
datadf.shape

**So the data has 768 rows and 9 columns**

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
# (note: info() does not report unique-value counts)
datadf.info()

In [None]:
# Number of missing (NaN) entries in each column
datadf.isna().sum()

**There are no NA (missing) values in any column**

In [None]:
# Per-column minimum, used as a sanity check for implausible values

datadf.min()

**For the variables Glucose, BloodPressure, SkinThickness, Insulin and BMI the observed minimum is zero, which is not physiologically possible, so these zeros are treated as missing values**

# Missing Data Impute


In [None]:
# Presence of zeros, as a percentage of rows, for every column

zero_counts = datadf.eq(0).sum()
zero_counts / len(datadf) * 100

* For Glucose, BloodPressure and BMI
    * The zeros will be imputed with a measure of location, the median
* For SkinThickness & Insulin
    * A high share of the data is missing
    * We will check the correlations for multicollinearity
    * and then build a model to impute the data

In [None]:
# Missing data impute: a reading of 0 in these columns marks a missing value.
# The replaced column is assigned back to the frame instead of calling
# `datadf.<col>.replace(..., inplace=True)`: that chained in-place form acts on a
# temporary Series, warns on pandas 2.x and leaves `datadf` untouched under
# Copy-on-Write (pandas 3).
# The median is taken over the column as it stands, i.e. with the zero
# placeholders still included, which keeps the imputed value unchanged.
for col in ["Glucose", "BloodPressure", "BMI"]:
    datadf[col] = datadf[col].replace(0, datadf[col].median())


In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap
corr_fig, corr_ax = plt.subplots(figsize=(15, 8))
sns.heatmap(datadf.corr(), annot=True, ax=corr_ax)

1. For the columns Insulin & SkinThickness we do not see any correlation > 0.7
2. A linear regression model will be used to predict the missing data
3. Insulin & SkinThickness will be dropped for the EDA

In [None]:
# Drop the two columns that are excluded from the EDA.
# Reassigning the result (rather than inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises a KeyError.
datadf = datadf.drop(columns=['Insulin', 'SkinThickness'], errors="ignore")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each remaining column
datadf.describe()

# Understanding Variables in Data

Based on the descriptive statistics, our understanding of the variables is:

1. Pregnancies 
 * Missing value - No Value
 * Data type - Discrete 
 * Mean and median are fairly close, which implies the distribution is not heavily skewed
 * The mean is greater than the median, so the data is slightly right / positively skewed.
 * Interquartile range for number of pregnancies is 5 (6 - 1), i.e. (Q3 - Q1)


2. Glucose 
 *  Missing value - Replace with Median
 *  Data type - Continuous
 *  Mean is 121.65 & Median is 117.00,as Mean > Median , so it indicate it is right skew / positive skewed.
 *  Interquartile range for Glucose in  (140.25 - 99.75) ie (Q3 - Q1)


3. BloodPressure
 *  Missing value - Replace with Median
 *  Data type - Continuous
 *  Mean is 72.38 & Median is 72.00,as Mean > Median , so it indicate it is right skew / positive skewed.
 *  Interquartile range for BloodPressure in  (80.00 - 64.00) ie (Q3 - Q1)

4. BMI
 *  Missing value - Replace with Median
 *  Data type - Continuous
 *  Mean is 32.45 & Median is 32.00,as Mean > Median , so it indicate it is right skew / positive skewed.
 *  Interquartile range for BMI in  (36.00 - 27.00) ie (Q3 - Q1)
 
 
5. DiabetesPedigreeFunction   
 *  Missing value - No Value
 *  Data type - Continuous 
 *  Mean is 0.47 & Median is 0.37,as Mean > Median , so it indicate it is right skew / positive skewed.
 *  Interquartile range for DiabetesPedigreeFunction in  (0.62 - 0.24) ie (Q3 - Q1)
 
6. Age    
 *  Missing value - No Value
 *  Data type - Discrete 
 *  Mean is 33.24 & Median is 29.00,as Mean > Median , so it indicate it is right skew / positive skewed.
 *  Interquartile range for Age in  (41.00 - 24.00) ie (Q3 - Q1)
 
7.  Outcome
 *  Missing Value -  No Value
 *  Data Type - Nominal & Boolean
 *  five point summary is not valid for Nominal data type

# Univariate Analysis of Variable

## Discrete Variable Analysis 
### Pregnancies

In [None]:
# Frequency of each pregnancy count. The figure title previously read
# "Pregnacies"; the spelling is corrected here. Labels are set through the
# returned Axes rather than the implicit pyplot state.
preg_ax = sns.countplot(x=datadf.Pregnancies, palette="pastel")
preg_ax.set_title('No. of Pregnancies')
preg_ax.set_xlabel("Pregnancy count")
preg_ax.set_ylabel("Total count")


### Age

In [None]:
# Histogram of Age (figure title corrected from the misspelled "Histograph")
age_ax = sns.histplot(x=datadf.Age)
age_ax.set_title("Histogram of Age")

### Outcome

In [None]:
# Absolute number of rows in each Outcome class
datadf.Outcome.value_counts()

In [None]:
# Outcome class proportions: each class count divided by the total number of rows
datadf.Outcome.value_counts()/len(datadf)

**Out of 768 data points 65% do not have diabetes**

In [None]:
# Bar chart of how many rows fall in each Outcome class
outcome_ax = sns.countplot(x=datadf.Outcome)
outcome_ax.set_title("BarPlot on Outcome")

### Continous Variable Analysis

In [None]:
# One histogram per continuous variable on a 2x2 grid
hist_columns = ["Glucose", "BloodPressure", "BMI", "DiabetesPedigreeFunction"]

fig, axes = plt.subplots(2, 2, sharey=False, figsize=(18, 10))
fig.suptitle('Distribution of Continuous Variable of data')

# axes.flat walks the grid row by row, matching the column order above
for panel, column in zip(axes.flat, hist_columns):
    sns.histplot(ax=panel, x=datadf[column])


In [None]:
# Skewness of every feature column (the Outcome label is excluded)
datadf.drop(columns="Outcome").skew()

**The skewness of every variable is greater than zero, which indicates they are right / positively skewed, as we observed in the histograms and in the five-point summary**

In [None]:
# Box plot of each continuous variable on a 2x2 grid
box_columns = ["Glucose", "BloodPressure", "BMI", "DiabetesPedigreeFunction"]

fig, axes = plt.subplots(2, 2, sharey=False, figsize=(18, 10))
fig.suptitle('BoxPlot Continuous Variable of data')

# axes.flat walks the grid row by row, matching the column order above
for panel, column in zip(axes.flat, box_columns):
    sns.boxplot(ax=panel, x=datadf[column])


<div style="background-color:#a89e32;
            border-radius:10px;
            letter-spacing:0.5px;
            display:fill;
            font-size:110%;
            margin: 5px;">
<p style="padding:5px;">
    
  <b>Observation : </b>
    
* Other than Glucose, we observe outliers in the other variables (Blood Pressure, BMI, Diabetes Pedigree Function)

</p>
</div>

# Bivariate Analysis

In [None]:
# Pairwise relationships between the columns, coloured by Outcome;
# corner=True draws only the lower triangle of the grid.
sns.pairplot(datadf,hue="Outcome",corner=True)

## Correlation Plot

In [None]:
# Annotated correlation heatmap of the columns currently in the frame
corr_fig, corr_ax = plt.subplots(figsize=(15, 8))

sns.heatmap(datadf.corr(), annot=True, ax=corr_ax)

### Pregnancy - Outcome

In [None]:
# Left: Pregnancies histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="Pregnancies", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="Pregnancies", ax=box_ax)
box_ax.set_title("Boxplot for Pregnancies by Outcome")

# Summary statistics of Pregnancies within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["Pregnancies"]].agg(summary_stats)

<div style="background-color:#a89e32;
            border-radius:10px;
            letter-spacing:0.5px;
            display:fill;
            font-size:110%;
            margin: 5px;">
<p style="padding:5px;">
    
<b>Observation : </b>

* Women with a higher pregnancy count are more likely to be diabetic
* There are a few outliers in the non-diabetic cases

</p>
</div>



### Glucose - Outcome

In [None]:
# Left: Glucose histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="Glucose", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="Glucose", ax=box_ax)
box_ax.set_title("Boxplot for Glucose by Outcome")

# Summary statistics of Glucose within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["Glucose"]].agg(summary_stats)


<div style="background-color:#a89e32;
            border-radius:10px;
            letter-spacing:0.5px;
            display:fill;
            font-size:110%;
            margin: 5px;">
<p style="padding:5px;">

 <b>Observation:</b>

* Glucose levels are higher for diabetic women

</p>
</div>

### Blood Pressure  - Outcome

In [None]:
# Left: BloodPressure histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="BloodPressure", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="BloodPressure", ax=box_ax)
box_ax.set_title("Boxplot for BloodPressure by Outcome")

# Summary statistics of BloodPressure within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["BloodPressure"]].agg(summary_stats)


<div style="background-color:#a89e32;
            border-radius:10px;
            letter-spacing:0.5px;
            display:fill;
            font-size:110%;
            margin: 5px;">
<p style="padding:5px;">
    
<b>Observation: </b>
* There is little difference in BP level between diabetic & non-diabetic cases.
</p>
</div>    

### BMI - Outcome

In [None]:
# Left: BMI histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="BMI", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="BMI", ax=box_ax)
box_ax.set_title("Boxplot for BMI by Outcome")

# Summary statistics of BMI within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["BMI"]].agg(summary_stats)

Observation : 
* Women with diabetes seem to have a higher BMI
* A few outlier cases are observed in both groups

### DiabetesPedigreeFunction - Outcome

In [None]:
# Left: DiabetesPedigreeFunction histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="DiabetesPedigreeFunction", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="DiabetesPedigreeFunction", ax=box_ax)
box_ax.set_title("Boxplot for DiabetesPedigreeFunction by Outcome")

# Summary statistics of DiabetesPedigreeFunction within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["DiabetesPedigreeFunction"]].agg(summary_stats)



Observation : 
* There is a slight difference in the location values, so the Diabetes Pedigree Function might affect diabetes status
* Outliers can be seen in both cases (Diabetic and NonDiabetic)

### Age - Outcome

In [None]:
# Left: Age histogram split by Outcome. Right: box plot per Outcome class.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(data=datadf, x="Age", hue="Outcome", ax=hist_ax)

box_ax = sns.boxplot(data=datadf, x="Outcome", y="Age", ax=box_ax)
box_ax.set_title("Boxplot for Age by Outcome")

# Summary statistics of Age within each Outcome class
summary_stats = ["mean", "median", "min", "max", "skew", "count"]
datadf.groupby("Outcome")[["Age"]].agg(summary_stats)

Observation:
* Diabetic cases are more common in the higher age groups, which also have a higher mean age