In [1]:
# Importing the Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as  plt

%matplotlib inline

In [2]:
# Load the diabetes dataset from the plotly datasets repository on GitHub
# (this cell needs network access).

DATA_URL = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes, and memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Keep three predictors (Glucose, BMI, Age) together with the Outcome column.
df2 = df.loc[:, ["Glucose", "BMI", "Age", "Outcome"]]

In [6]:
# Preview the reduced frame before checking its columns for zero values.
df2.head(n=5)

Unnamed: 0,Glucose,BMI,Age,Outcome
0,148,33.6,50,1
1,85,26.6,31,0
2,183,23.3,32,1
3,89,28.1,21,0
4,137,43.1,33,1


In [7]:
# Descriptive statistics for each column: count, mean, std, min, quartiles, max.
# Check the `min` row: a minimum of 0 for Glucose or BMI is treated as missing
# data in the following cells.

df2.describe()

Unnamed: 0,Glucose,BMI,Age,Outcome
count,768.0,768.0,768.0,768.0
mean,120.894531,31.992578,33.240885,0.348958
std,31.972618,7.88416,11.760232,0.476951
min,0.0,0.0,21.0,0.0
25%,99.0,27.3,24.0,0.0
50%,117.0,32.0,29.0,0.0
75%,140.25,36.6,41.0,1.0
max,199.0,67.1,81.0,1.0


In [8]:
# Glucose and BMI contain zeros, which stand for missing (NA) values here,
# so the affected rows are removed further down.

# Flag zeros in every column except the last one (Outcome).
feature_columns = df2.columns[:-1]
zero_mask = df2[feature_columns].eq(0)

# Number of zeros per column, followed by the element-wise boolean mask.
print(zero_mask.sum())
zero_mask

Glucose     5
BMI        11
Age         0
dtype: int64


Unnamed: 0,Glucose,BMI,Age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,True,False


In [9]:
# Row-wise flag: True when at least one column other than the last (Outcome)
# holds a zero in that row.

df2.iloc[:, :-1].eq(0).any(axis=1)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9       True
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
738    False
739    False
740    False
741    False
742    False
743    False
744    False
745    False
746    False
747    False
748    False
749    False
750    False
751    False
752    False
753    False
754    False
755    False
756    False
757    False
758    False
759    False
760    False
761    False
762    False
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [10]:
# If a row is missing even one data point (a zero in any column other than
# the last, Outcome), drop the whole row.

has_no_zero = df2.iloc[:, :-1].ne(0).all(axis=1)
df3 = df2[has_no_zero]

In [11]:
# Re-check the summary statistics after the rows containing zeros were dropped
# (the minimum of Glucose and BMI should no longer be 0).
df3.describe()

Unnamed: 0,Glucose,BMI,Age,Outcome
count,752.0,752.0,752.0,752.0
mean,121.941489,32.454654,33.3125,0.351064
std,30.601198,6.928926,11.709395,0.477621
min,44.0,18.2,21.0,0.0
25%,99.75,27.5,24.0,0.0
50%,117.0,32.3,29.0,0.0
75%,141.0,36.6,41.0,1.0
max,199.0,67.1,81.0,1.0


In [12]:
# Compare the two Outcome groups by the mean of every remaining column.

df3.groupby(by="Outcome").agg("mean")

Unnamed: 0_level_0,Glucose,BMI,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,110.82582,30.876434,31.309426
1,142.488636,35.37197,37.015152


In [13]:
# From the group means we can infer that, on average, people with diabetes (Outcome = 1) are older and have higher BMI and glucose than those without (Outcome = 0).

In [14]:
# Passing a dictionary to .agg() applies a different aggregate to each column.

per_column_aggregates = {
    "Glucose": "mean",
    "BMI": "median",
    "Age": "sum",
}
df3.groupby("Outcome").agg(per_column_aggregates)

Unnamed: 0_level_0,Glucose,BMI,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,110.82582,30.1,15279
1,142.488636,34.25,9772


In [15]:
# Passing a list to .agg() computes several aggregates for every column.

aggregates = ["mean", "median"]
df3.groupby("Outcome").agg(aggregates)

Unnamed: 0_level_0,Glucose,Glucose,BMI,BMI,Age,Age
Unnamed: 0_level_1,mean,median,mean,median,mean,median
Outcome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,110.82582,107.5,30.876434,30.1,31.309426,27
1,142.488636,140.5,35.37197,34.25,37.015152,36


In [16]:
# Split the cleaned frame by label: Outcome == 1 -> positive, Outcome == 0 -> negative.

is_positive = df3["Outcome"].eq(1)
is_negative = df3["Outcome"].eq(0)
positive = df3[is_positive]
negative = df3[is_negative]

In [17]:
# (rows, columns) of the positive and the negative subset, in that order.
tuple(frame.shape for frame in (positive, negative))

((264, 4), (488, 4))

In [None]:
# Outliers

