In [244]:
import numpy as np
import pandas as pd

df = pd.read_csv('/home/heisenberg/Downloads/population_diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### 1. Check if all the features are numeric

In [245]:
df.shape[1] == df.select_dtypes(include=np.number).shape[1] #will return true if all the columns are numeric

True

### 2. Given dataset contains many 0(zero) values, replace 0 with NaN value

In [246]:
# we will replace 0 with NaN value of all columns except Outcome since it is dependent feature having 2 classes of 0 and 1. 

df_1 = df.iloc[:,:-1]
df_2 = df.iloc[:,-1]

df_1.replace(0, np.nan, inplace = True)
df_1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,,33.6,0.627,50
1,1.0,85.0,66.0,29.0,,26.6,0.351,31
2,8.0,183.0,64.0,,,23.3,0.672,32
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21
4,,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2.0,122.0,70.0,27.0,,36.8,0.340,27
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1.0,126.0,60.0,,,30.1,0.349,47


### 3. Replace NaN with mean , median, mode.

In [247]:
import plotly.express as px

In [248]:
# Pregnancies column

fig = px.histogram(df_1['Pregnancies'], x="Pregnancies",marginal="box")
fig.show()

In [249]:
# As 'Pregnancies' column data is highly skewed, we can replace NaN values of 'Pregnancies' column with median or mode. Here we replacing NaN with median

df_1['Pregnancies'].fillna(df_1['Pregnancies'].median(),inplace = True)
df_1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,,33.6,0.627,50
1,1.0,85.0,66.0,29.0,,26.6,0.351,31
2,8.0,183.0,64.0,,,23.3,0.672,32
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21
4,4.0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2.0,122.0,70.0,27.0,,36.8,0.340,27
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1.0,126.0,60.0,,,30.1,0.349,47


In [250]:
# Glucose column

fig = px.histogram(df_1['Glucose'], x="Glucose",marginal="box")
fig.show()

In [251]:
# As 'Glucose' column data is highly skewed, we can replace NaN values of 'Glucose' column with median or mode. Here we replacing NaN with median

df_1['Glucose'].fillna(df_1['Glucose'].median(),inplace = True)
df_1['Glucose']

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64

In [252]:
# BloodPressure column

fig = px.histogram(df_1['BloodPressure'], x="BloodPressure", marginal="box")
fig.show()

In [253]:
# Replacing NaN with mean
df_1['BloodPressure'].fillna(df_1['BloodPressure'].mean(),inplace = True)
df_1['BloodPressure']

0      72.0
1      66.0
2      64.0
3      66.0
4      40.0
       ... 
763    76.0
764    70.0
765    72.0
766    60.0
767    70.0
Name: BloodPressure, Length: 768, dtype: float64

In [254]:
# SkinThickness Column

fig = px.histogram(df_1['SkinThickness'], x="SkinThickness", marginal="box")
fig.show()

In [255]:
# Replacing NaN with median

df_1['SkinThickness'].fillna(df_1['SkinThickness'].median(),inplace = True)
df_1['SkinThickness']

0      35.0
1      29.0
2      29.0
3      23.0
4      35.0
       ... 
763    48.0
764    27.0
765    23.0
766    29.0
767    31.0
Name: SkinThickness, Length: 768, dtype: float64

In [256]:
# Insulin Column

fig = px.histogram(df_1['Insulin'], x="Insulin", marginal="box")
fig.show()

In [257]:
# Replacing NaN with median

df_1['Insulin'].fillna(df_1['Insulin'].median(),inplace = True)
df_1['Insulin']

0      125.0
1      125.0
2      125.0
3       94.0
4      168.0
       ...  
763    180.0
764    125.0
765    112.0
766    125.0
767    125.0
Name: Insulin, Length: 768, dtype: float64

In [258]:
# BMI Column

fig = px.histogram(df_1['BMI'], x="BMI", marginal="box")
fig.show()

In [259]:
# Replacing NaN with mode

df_1['BMI'].fillna(df_1['BMI'].mode()[0],inplace = True)
df_1['BMI']

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64

In [260]:
# DiabetesPedigreeFunction column

fig = px.histogram(df_1['DiabetesPedigreeFunction'], x="DiabetesPedigreeFunction", marginal="box")
fig.show()

In [261]:
# Replacing NaN with median

df_1['DiabetesPedigreeFunction'].fillna(df_1['DiabetesPedigreeFunction'].median(),inplace = True)
df_1['DiabetesPedigreeFunction']

0      0.627
1      0.351
2      0.672
3      0.167
4      2.288
       ...  
763    0.171
764    0.340
765    0.245
766    0.349
767    0.315
Name: DiabetesPedigreeFunction, Length: 768, dtype: float64

In [262]:
# Age column

fig = px.histogram(df_1['Age'], x="Age", marginal="box")
fig.show()

In [263]:
# Replacing NaN with median

df_1['Age'].fillna(df_1['Age'].median(),inplace = True)
df_1['Age']

0      50
1      31
2      32
3      21
4      33
       ..
763    63
764    27
765    30
766    47
767    23
Name: Age, Length: 768, dtype: int64

 ### 4. Correlation matrix of each column

In [264]:
df = pd.concat([df_1, df_2],axis=1, join='inner')
df.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.153525,0.256151,0.126082,0.032397,0.100746,-0.009198,0.522303,0.248172
Glucose,0.153525,1.0,0.218749,0.192615,0.419451,0.2314,0.137327,0.266909,0.492782
BloodPressure,0.256151,0.218749,1.0,0.191853,0.045087,0.281063,-0.002763,0.324595,0.166074
SkinThickness,0.126082,0.192615,0.191853,1.0,0.15561,0.543275,0.102188,0.126107,0.214873
Insulin,0.032397,0.419451,0.045087,0.15561,1.0,0.180373,0.126503,0.097101,0.20379
BMI,0.100746,0.2314,0.281063,0.543275,0.180373,1.0,0.153506,0.025744,0.312249
DiabetesPedigreeFunction,-0.009198,0.137327,-0.002763,0.102188,0.126503,0.153506,1.0,0.033561,0.173844
Age,0.522303,0.266909,0.324595,0.126107,0.097101,0.025744,0.033561,1.0,0.238356
Outcome,0.248172,0.492782,0.166074,0.214873,0.20379,0.312249,0.173844,0.238356,1.0


### 5. Descriptive statistics of each column.

In [265]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,4.423177,121.65625,72.405184,29.108073,140.671875,32.450911,0.471876,33.240885,0.348958
std,2.980481,30.438286,12.096346,8.791221,86.38306,6.875366,0.331329,11.760232,0.476951
min,1.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,2.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,4.0,117.0,72.202592,29.0,125.0,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### 6. Inferential Statistics

#### a. Normal Deviate Z Test

In [266]:
from statsmodels.stats import weightstats as stests

# sample data
df_sample = pd.read_excel('/home/heisenberg/Downloads/sample.xlsx')
df_sample.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [267]:
# Removing Zeros with NaN and then imputing NaN value with mean, median, mode

df_sample1 = df_sample.iloc[:,:-1]

df_sample2 = df_sample.iloc[:,-1]

df_sample1.replace(0, np.nan, inplace = True)

df_sample1.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148,72.0,35.0,,33.6,0.627,50
1,1.0,85,66.0,29.0,,26.6,0.351,31
2,8.0,183,64.0,,,23.3,0.672,32
3,1.0,89,66.0,23.0,94.0,28.1,0.167,21
4,,137,40.0,35.0,168.0,43.1,2.288,33


In [268]:
df_sample1['Pregnancies'].fillna(df_sample1['Pregnancies'].median(),inplace = True)

df_sample1['Glucose'].fillna(df_sample1['Glucose'].median(),inplace = True)

df_sample1['BloodPressure'].fillna(df_sample1['BloodPressure'].median(),inplace = True)

df_sample1['SkinThickness'].fillna(df_sample1['SkinThickness'].median(),inplace = True)

df_sample1['Insulin'].fillna(df_sample1['Insulin'].median(),inplace = True)

df_sample1['BMI'].fillna(df_sample1['BMI'].median(),inplace = True)

df_sample1['DiabetesPedigreeFunction'].fillna(df_sample1['DiabetesPedigreeFunction'].median(),inplace = True)

df_sample1['Age'].fillna(df_sample1['Age'].median(),inplace = True)


In [269]:
df_sample = pd.concat([df_sample1, df_sample2],axis=1, join='inner')

In [291]:
df_sample.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148,72.0,35.0,146.0,33.6,0.627,50,1
1,1.0,85,66.0,29.0,146.0,26.6,0.351,31,0
2,8.0,183,64.0,32.5,146.0,23.3,0.672,32,1
3,1.0,89,66.0,23.0,94.0,28.1,0.167,21,0
4,5.5,137,40.0,35.0,168.0,43.1,2.288,33,1


### Z-test

In [289]:
print('Mean:\n')
print(df.mean())
print('\nStandard Deviation:\n')
print(df.std())
print(df_sample.mean())

Mean:

Pregnancies                   4.423177
Glucose                     121.656250
BloodPressure                72.405184
SkinThickness                29.108073
Insulin                     140.671875
BMI                          32.450911
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64

Standard Deviation:

Pregnancies                  2.980481
Glucose                     30.438286
BloodPressure               12.096346
SkinThickness                8.791221
Insulin                     86.383060
BMI                          6.875366
DiabetesPedigreeFunction     0.331329
Age                         11.760232
Outcome                      0.476951
dtype: float64
Pregnancies                   5.761905
Glucose                     126.857143
BloodPressure                73.404762
SkinThickness                31.738095
Insulin                     171.309524
BMI                          32.416667
DiabetesPedigr

***z-test***

Population mean and std is known in this case

the mean and std value of the population from descriptive stats
table.

Hypothesis

H0 - Samples are likely drawn from the same distributions.
H1 - Samples are likely drawn from different distributions.

taking 95% of confidence interval

for Null hypothesis to be accepted z value sholud be in range of -1.96 to 1.96 and p value should be equal or greater then 0.05 for two tail significant value of 0.05 otherwise we can we can reject our null hypothesis


z = ( x̄−μ) / (σ / √n)

x̄ = Mean of the Sample,
μ = Mean of the Population,
s = Standard Deviation of Population,
n = Sample Size


for sample data column 'Pregnancies'

x̄ = 5.761905,
μ = 4.423177, n = 42, 
σ = 2.98

z = (5.761905−4.423177)÷(2.98÷(√42))
z = 2.91139229319

pval = 2*(1 - norm.cdf(abs(z)))
pval = 0.0036

as z is not in range of -1.96 to 1.96 we can reject our null hypothesis and accept the alternate hypothesis. Also pval is less then 0.5 hence we can reject the null hypothesis and can accept the alternate hypothesis

Hence we can say that 'Pregnancies' column of sample is drawn from the different distribution.  



In [314]:
# ztest for each samples

from math import sqrt
from scipy.stats import norm

for j in df.columns:
    ztest = ((df_sample[j].mean()-df[j].mean())/(df[j].std()/(sqrt(len(df_sample)))))
    pval = 2*(1 - norm.cdf(abs(ztest)))
    print(f"\nztest and p value for column {j} are :{ztest} , {pval}\n")
    if -1.96 <= t_test <= 1.96:
        print("Null hyphothesis rejected , Alternative hyphothesis accepted")
    else:
        print("Null hyphothesis accepted , Alternative hyphothesis rejected")


ztest and p value for column Pregnancies are :2.9109213882561904 , 0.003603646718801201

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column Glucose are :1.1073435016673372 , 0.2681454430292205

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column BloodPressure are :0.5355339520184804 , 0.5922806865425048

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column SkinThickness are :1.9388083465808372 , 0.05252467941869754

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column Insulin are :2.2985369845504757 , 0.021531245410633337

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column BMI are :-0.032279241554883244 , 0.9742493634167759

Null hyphothesis accepted , Alternative hyphothesis rejected

ztest and p value for column DiabetesPedigreeFunction are :0.5962009566874271 , 0.5510409865960519

Null hyph

if Null hypothesis is accepted or Alternative hyphothes is rejected we can say that column of sample is drawn from the same distribution.
if Null hypothesis is rejected or Alternative hyphothes is accepted we can say that column of sample is drawn from the different distribution.

## t-test

In [309]:
df_sample.std()

Pregnancies                   3.331736
Glucose                      31.740235
BloodPressure                13.584531
SkinThickness                 7.444296
Insulin                     128.946479
BMI                           6.222576
DiabetesPedigreeFunction      0.408390
Age                          11.487432
Outcome                       0.505487
dtype: float64

***t-test***

Hypothesis
H0 - Samples are likely drawn from the same distributions.
H1 - Samples are likely drawn from different distributions.

for One paired t-test

t = ( x̄ – μ) / (s / √n)
where

x̄ = Observed Mean of the Sample
μ = Theoretical Mean of the Population
s = Standard Deviation of the Sample
n = Sample Size

Also 

df(degree of freedom) = n-1
n = 42
for confidence level of 95% and df(degree of freedom = 41)

t value should +/-2.0181

for Null hypothesis to be accepted t value sholud be in range of -2.0195 to 2.0195 for two tail significant value of 0.05 otherwise we can we can reject our null hypothesis

for sample data column 'Pregnancies'

x̄ = 5.761905
μ = 4.423177
s = 3.331736
n = 42

t = (5.761905−4.423177)÷(3.331736 ÷ (√42))
t  = 2.60403256251

as t is not in range of -2.0195 to 2.0195 we can reject our null hypothesis and accept the alternate hypothesis. 

Hence we can say that 'Pregnancies' column of sample is drawn from the different distribution.

In [313]:
# ztest for each samples

for j in df.columns:
    t_test = ((df_sample[j].mean()-df[j].mean())/(df_sample[j].std()/(sqrt(len(df_sample)))))
    print(f"\nt-test for column {j} are :{t_test} \n")
    if -2.0195 <= t_test <=2.0195:
        print("Null hyphothesis rejected , Alternative hyphothesis accepted")
    else:
        print("Null hyphothesis accepted , Alternative hyphothesis rejected")


t-test for column Pregnancies are :2.6040319604650293 

Null hyphothesis accepted , Alternative hyphothesis rejected

t-test for column Glucose are :1.0619215062621719 

Null hyphothesis rejected , Alternative hyphothesis accepted

t-test for column BloodPressure are :0.4768662210899816 

Null hyphothesis rejected , Alternative hyphothesis accepted

t-test for column SkinThickness are :2.2896044829694158 

Null hyphothesis accepted , Alternative hyphothesis rejected

t-test for column Insulin are :1.5398222465444285 

Null hyphothesis rejected , Alternative hyphothesis accepted

t-test for column BMI are :-0.035665555417278644 

Null hyphothesis rejected , Alternative hyphothesis accepted

t-test for column DiabetesPedigreeFunction are :0.48369989588048634 

Null hyphothesis rejected , Alternative hyphothesis accepted

t-test for column Age are :2.6177365898267917 

Null hyphothesis accepted , Alternative hyphothesis rejected

t-test for column Outcome are :2.2417308788292574 

Null h

if Null hypothesis is accepted or Alternative hyphothes is rejected we can say that column of sample is drawn from the same distribution. if Null hypothesis is rejected or Alternative hyphothes is accepted we can say that column of sample is drawn from the different distribution.