In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statsmodels.api as sm
import scipy.stats.distributions as dist
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the diabetes CSV from the Kaggle input directory and preview the first rows.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Count missing entries per column (the original author observed none).
dataset.isna().sum()

In [None]:
# Summary statistics for the columns of the data frame.
dataset.describe()

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
correlation_matrix = dataset.corr()

plt.figure(figsize=(15, 8))
sns.heatmap(correlation_matrix, annot=True)
plt.show()

The distribution of the number of pregnancies is right-skewed, centered near 0 with most of the data between 0 and 5, a range of roughly 0-15, and some outliers at the higher end.

In [None]:
# Distribution of the number of pregnancies: density histogram with a KDE
# overlay on the left, box plot on the right.
# `sns.distplot` is deprecated, so `histplot(kde=True, stat='density')` is used
# instead; the box plot passes the series as `x=` so that its orientation does
# not depend on how a positional argument is interpreted by the seaborn version.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(15, 6))
sns.histplot(x=dataset.Pregnancies, kde=True, stat='density', ax=ax_hist)
ax_hist.set_ylabel("Distributions", fontsize=15)
sns.boxplot(x=dataset.Pregnancies, ax=ax_box)
plt.show()

In [None]:
# Bin ages into groups and compare blood pressure across the groups.
# The upper edge is 90 (not 80) to match the bins used by the later stratified
# cells; with an upper edge of 80 any age above 80 falls outside every bin and
# is silently assigned NaN.
age_bin_edges = [18, 30, 40, 50, 60, 70, 80, 90]
dataset['agegrp'] = pd.cut(dataset.Age, age_bin_edges)

plt.figure(figsize=(15, 7))
sns.boxplot(x=dataset.agegrp, y=dataset.BloodPressure)
plt.show()

In [None]:
# Readable outcome label, used as the hue of the grouped box plot.
outcome_labels = {1: 'Diabetic', 0: 'Non_Diabetic'}
dataset['Outcomex'] = dataset.Outcome.replace(outcome_labels)

plt.figure(figsize=(15, 7))
sns.boxplot(x=dataset.agegrp, y=dataset.BloodPressure, hue=dataset.Outcomex)
plt.show()

## Question 1

 Research Question : What proportion of people reported that they have diabetes?
 
 Target Population: Whole Population
 
 Parameter of Interest: Proportion

In [None]:
# Number of rows with Outcome == 1 (diabetic) and the total number of rows.
n_diabetic = dataset[dataset['Outcome'] == 1]['Outcome'].count()
total_size = dataset.shape[0]

# NOTE: `sample_size_pregnant` is a misnomer kept only because later cells
# reference it; it holds the count of diabetic respondents, not pregnancies.
sample_size_pregnant = n_diabetic

# Sample proportion p-hat. The original wrapped this in np.round(..., 100),
# which rounds to 100 decimals and therefore changes nothing; it is dropped.
unbiased_point_estimate = n_diabetic / total_size
unbiased_point_estimate

So according to our sample the unbiased point estimate is about 0.35, which means that roughly 35% of the people in our sample are reported as diabetic.

Margin of error = z * standard error

standard error = sqrt(p-hat * (1 - p-hat) / total_size)

In [None]:
# 95% margin of error for a proportion: z* times sqrt(p_hat * (1 - p_hat) / n).
z_star = 1.96
p_hat = unbiased_point_estimate
Margin_of_error = z_star * np.sqrt(p_hat * (1 - p_hat) / total_size)
Margin_of_error

With 95% confidence, the population proportion of females who are diabetic is estimated to be between roughly 32% and 38%, based on our sample of 768 observations.

In [None]:
# Lower / upper confidence bounds: point estimate minus / plus the margin of error.
lcb, ucb = (
    unbiased_point_estimate - Margin_of_error,
    unbiased_point_estimate + Margin_of_error,
)
(lcb, ucb)

Using the statsmodels library we get the same result:

In [None]:
# Cross-check the hand-computed interval with statsmodels (default settings).
sm.stats.proportion_confint(count=sample_size_pregnant, nobs=total_size)

## Question 2

*2.1* Research Question: What is the average number of pregnancies for a diabetic patient?

Target Population: Females having diabetes

Parameter of Interest: Average number of pregnancies

In [None]:
# Sample mean and standard deviation of Pregnancies among diabetic rows (Outcome == 1).
diabetic_pregnancies = dataset.loc[dataset.Outcome == 1, 'Pregnancies']
unbiased_point_estimate = diabetic_pregnancies.mean()
std = diabetic_pregnancies.std()
(unbiased_point_estimate, std)

In [None]:
# 95% margin of error for the mean: z* * s / sqrt(n), where n is the
# diabetic-row count held in `sample_size_pregnant`.
z_star = 1.96
Margin_of_error = z_star * std / np.sqrt(sample_size_pregnant)
Margin_of_error

We estimated with 95% confidence interval that the average number of pregnancies for a diabetic patient is in between 4.41 to 5.31

In [None]:
# Confidence bounds for the mean number of pregnancies: estimate -/+ margin of error.
lcb, ucb = (
    unbiased_point_estimate - Margin_of_error,
    unbiased_point_estimate + Margin_of_error,
)
(lcb, ucb)

In [None]:
# Same interval computed by statsmodels from the diabetic rows' Pregnancies values.
diabetic_pregnancies = dataset.loc[dataset.Outcome == 1, 'Pregnancies']
sm.stats.DescrStatsW(diabetic_pregnancies).zconfint_mean()

*2.2* Research Question: What is the average number of pregnancies for a non-diabetic patient?

Target Population: Females (Non-Diabetic)

Parameter of Interest: Average number of pregnancies

In [None]:
# Sample mean and standard deviation of Pregnancies among non-diabetic rows (Outcome == 0).
non_diabetic_pregnancies = dataset.loc[dataset.Outcome == 0, 'Pregnancies']
unbiased_point_estimate = non_diabetic_pregnancies.mean()
std = non_diabetic_pregnancies.std()
(unbiased_point_estimate, std)

In [None]:
# 95% margin of error for the mean, using the number of non-diabetic rows as n.
z_star = 1.96
n_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Outcome'].count()
Margin_of_error = z_star * std / np.sqrt(n_non_diabetic)
Margin_of_error

In [None]:
# Confidence bounds: estimate -/+ margin of error.
lcb, ucb = (
    unbiased_point_estimate - Margin_of_error,
    unbiased_point_estimate + Margin_of_error,
)
(lcb, ucb)

With 95% confidence, the population mean number of pregnancies for a non-diabetic patient is estimated to be between 3.03 and 3.56. This interval does not overlap the interval for diabetic patients (4.41, 5.31), which suggests that diabetic patients have, on average, more pregnancies. Note that these are intervals for group means, so on their own they do not let us classify an individual patient as diabetic from her number of pregnancies.

Below is the Python code that reproduces the above result with statsmodels.

In [None]:
# Same interval computed by statsmodels from the non-diabetic rows' Pregnancies values.
non_diabetic_pregnancies = dataset.loc[dataset.Outcome == 0, 'Pregnancies']
sm.stats.DescrStatsW(non_diabetic_pregnancies).zconfint_mean()

To solidify our above observation, we can now find the average difference of pregnancies for diabetic and non diabetic patients

Research Question: Find the average pregnancy difference for diabetic and non-diabetic patient

Population of Interest: Pregnant Females

Parameter of Interest: (*U<sub>1</sub>* - *U<sub>2</sub>*):Pregnancies

Note that 1 = Diabetic,2 = Non-Diabetic


In [None]:
# Per-group sample standard deviations of Pregnancies (1 = diabetic, 2 = non-diabetic);
# the displayed tuple holds the corresponding variances.
std1 = dataset.loc[dataset.Outcome == 1, 'Pregnancies'].std()
std2 = dataset.loc[dataset.Outcome == 0, 'Pregnancies'].std()
(std1**2, std2**2)

### Unpooled Approach

In [None]:
# Group means and group sizes for Pregnancies (1 = diabetic, 2 = non-diabetic).
preg_diabetic = dataset.loc[dataset.Outcome == 1, 'Pregnancies']
preg_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Pregnancies']

mean1, mean2 = preg_diabetic.mean(), preg_non_diabetic.mean()
print(mean1 - mean2)

n1, n2 = preg_diabetic.count(), preg_non_diabetic.count()
(n1, n2)

In [None]:
# Unpooled margin of error for the difference in means:
#   t* * sqrt(s1^2 / n1 + s2^2 / n2)
# The second variance term must use `std2` (non-diabetic group). The original
# used the global `std`, which only holds the right value if the non-diabetic
# cell happened to be the last one to assign it.
t_star = 1.98  # multiplier the notebook uses for its 95% intervals
Margin_of_error = t_star * np.sqrt(std1**2 / n1 + std2**2 / n2)
Margin_of_error

In [None]:
# Confidence bounds for the difference in means (diabetic minus non-diabetic).
mean_difference = mean1 - mean2
lcb, ucb = mean_difference - Margin_of_error, mean_difference + Margin_of_error
(lcb, ucb)

### Pooled Approach


In [None]:
# Pooled margin of error: t* * sqrt(pooled variance) * sqrt(1/n1 + 1/n2),
# where the pooled variance weights each group's variance by (n - 1).
pooled_variance = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
Margin_of_error = t_star * np.sqrt(pooled_variance) * np.sqrt(1 / n1 + 1 / n2)
Margin_of_error

Here we can see that both approaches give almost the same result, which is a good sign that our conclusion is robust. So, with 95% confidence, the population mean number of pregnancies for diabetic patients is estimated to be 1.04 to 2.09 higher than for non-diabetic patients.

In [None]:
# Confidence bounds for the difference in means using the pooled margin of error.
mean_difference = mean1 - mean2
lcb, ucb = mean_difference - Margin_of_error, mean_difference + Margin_of_error
(lcb, ucb)

### We can also confirm the above results using hypothesis testing

Research Question: Is there a significant difference between the number of pregnancies for diabetic and non-diabetic patients?

$H_0$ : mu1 - mu2 = 0

$H_1$ : mu1 - mu2 $\neq$ 0

significance level = 0.05

Test-statistic = (Best estimate - Hypothesised value) / standard error

Test-statistic: A measure of how far our sample statistic is from our hypothesized population parameter, in terms of estimated standard errors. The farther away our sample statistic is, the less confident we will be in our null hypothesised value

Assumptions:
1. Data collected using simple random sampling.
2. Distribution is approximately normal, or we have large enough sample size so that we can rely on central limit theorem

#### Unpooled Approach

In [None]:
# Two-sided z-test of H0: mu1 - mu2 = 0 using the UNPOOLED standard error
#   sqrt(s1^2 / n1 + s2^2 / n2)
# The second term uses `std2`; the original referenced the unrelated global
# `std`, whose value depends on which cell was executed last.
best_estimate = mean1 - mean2
std_error = np.sqrt(std1**2 / n1 + std2**2 / n2)
test_statistic = best_estimate / std_error
# Two-sided p-value from the standard normal distribution.
p_val = 2 * dist.norm.cdf(-np.abs(test_statistic))
(test_statistic, p_val)

#### Pooled Approach


In [None]:
# Two-sided z-test of H0: mu1 - mu2 = 0 using the pooled standard error
# (pooled standard deviation times sqrt(1/n1 + 1/n2)).
best_estimate = mean1 - mean2
pooled_variance = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
std_error = np.sqrt(pooled_variance) * np.sqrt(1 / n1 + 1 / n2)
test_statistic = best_estimate / std_error
p_val = 2 * dist.norm.cdf(-np.abs(test_statistic))
(test_statistic, p_val)

We can observe that our p-value is less than significance level, that means we have enough evidence to reject our null hypothesis. Based on our estimated difference in sample means we have enough evidence to support that there is a difference between the average number of pregnancies for diabetic and non diabetic patient

In [None]:
# Cross-check with statsmodels' two-sample z-test on the same two groups.
preg_diabetic = dataset.loc[dataset.Outcome == 1, 'Pregnancies']
preg_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Pregnancies']
sm.stats.ztest(preg_diabetic, preg_non_diabetic)

We can calculate the average difference in number of pregnancies stratified by age 

In [None]:
# Difference in mean Pregnancies (diabetic minus non-diabetic) within each age
# group, plotted with approximate 95% bounds (difference -/+ 1.96 * SE).
dataset['agegrp'] = pd.cut(dataset.Age, [18, 30, 40, 50, 60, 70, 80, 90])
dataset['Outcomex'] = dataset.Outcome.replace({1: 'Diabetic', 0: 'Non-Diabetic'})

# Group once and reuse. String-free built-in reducers replace the numpy
# callables passed to `agg`, and the outcome column is renamed by label rather
# than by position so the mapping cannot silently flip.
# observed=False keeps every age bin in the result, including empty ones.
grouped = dataset.groupby(['agegrp', 'Outcomex'], observed=False)['Pregnancies']
relabel = {'Non-Diabetic': 'Non_Diabetic'}

dx_mean = grouped.mean().unstack().rename(columns=relabel)   # group means
dx_std = grouped.std().unstack().rename(columns=relabel)     # group standard deviations
dx_size = grouped.size().unstack().rename(columns=relabel)   # group sizes

mean_diff = dx_mean.Diabetic - dx_mean.Non_Diabetic
se = dx_std / np.sqrt(dx_size)                               # standard error of each group mean
se_diff = np.sqrt(se.Diabetic**2 + se.Non_Diabetic**2)       # unpooled SE of the difference

# x and y are passed by keyword: positional x/y are rejected by seaborn >= 0.12.
x = np.arange(dx_size.shape[0])
pp = sns.pointplot(x=x, y=mean_diff, color='black')
pp.set(xlabel='Age group', ylabel="Diabetic-Non Diabetic Pregnancies Mean Difference")
sns.pointplot(x=x, y=mean_diff - 1.96 * se_diff)
sns.pointplot(x=x, y=mean_diff + 1.96 * se_diff)
pp.set_xticklabels(dx_size.index)
plt.grid(alpha=0.3)
plt.show()


Now Let's Look at Blood Pressure of Diabetic and Non Diabetic

**Research Question**: What is the average blood pressure of a diabetic patient?

Population: Diabetic Patients

Parameter of interest: Average Blood Pressure

In [None]:
# Sample mean and standard deviation of BloodPressure among diabetic rows (Outcome == 1).
bp_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure']
unbiased_point_estimate = bp_diabetic.mean()
std = bp_diabetic.std()
(unbiased_point_estimate, std)

In [None]:
# 95% margin of error for the mean: z* * s / sqrt(n) with n = number of diabetic rows.
z_star = 1.96
n_bp_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure'].count()
Margin_of_error = z_star * std / np.sqrt(n_bp_diabetic)
Margin_of_error

We estimated with 95% confidence that the average Blood Pressure for a diabetic patient is in between 68.25 to 73.39

In [None]:
# Confidence bounds: estimate -/+ margin of error.
lcb, ucb = (
    unbiased_point_estimate - Margin_of_error,
    unbiased_point_estimate + Margin_of_error,
)
(lcb, ucb)

In [None]:
# Same interval computed by statsmodels from the diabetic rows' BloodPressure values.
bp_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure']
sm.stats.DescrStatsW(bp_diabetic).zconfint_mean()

Research Question: What's the average Blood Pressure of non diabetic patient?

Population: Non-Diabetic Patients

Parameter of interest: Average Blood Pressure

In [None]:
# Sample mean and standard deviation of BloodPressure among non-diabetic rows (Outcome == 0).
bp_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure']
unbiased_point_estimate = bp_non_diabetic.mean()
std = bp_non_diabetic.std()
(unbiased_point_estimate, std)

In [None]:
# 95% margin of error for the mean: z* * s / sqrt(n) with n = number of non-diabetic rows.
z_star = 1.96
n_bp_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure'].count()
Margin_of_error = z_star * std / np.sqrt(n_bp_non_diabetic)
Margin_of_error

In [None]:
# Confidence bounds: estimate -/+ margin of error.
lcb, ucb = (
    unbiased_point_estimate - Margin_of_error,
    unbiased_point_estimate + Margin_of_error,
)
(lcb, ucb)

In [None]:
# Same interval computed by statsmodels from the non-diabetic rows' BloodPressure values.
bp_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure']
sm.stats.DescrStatsW(bp_non_diabetic).zconfint_mean()

Research Question: Is there a significant difference of average Blood Pressure for Diabetic and Non-Diabetic patients?

Population: Diabetic and Non-Diabetic Patients

Parameter of Interest: (*U<sub>1</sub>* - *U<sub>2</sub>*):Blood Pressure

Note that 1 = Diabetic,2 = Non-Diabetic

In [None]:
# Group means and sizes for BloodPressure (1 = diabetic, 2 = non-diabetic).
bp_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure']
bp_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure']

mean1, mean2 = bp_diabetic.mean(), bp_non_diabetic.mean()
# Recompute the group sizes from this column rather than relying on the n1/n2
# left over from the Pregnancies section.
n1, n2 = bp_diabetic.count(), bp_non_diabetic.count()

# Only the last expression of a cell is displayed, so the means are printed
# explicitly (the original's bare `(mean1 , mean2)` was never shown).
print(mean1, mean2)
(n1, n2)

In [None]:
# Per-group sample standard deviations of BloodPressure.
# Index 1 is the diabetic group and index 2 the non-diabetic group, matching
# mean1/mean2 and n1/n2. The original had the two filters swapped, which pairs
# each variance with the other group's sample size in the formulas that follow.
std1 = dataset[dataset.Outcome == 1]['BloodPressure'].std()
std2 = dataset[dataset.Outcome == 0]['BloodPressure'].std()
(std1**2, std2**2)

#### Pooled Approach


In [None]:
# Pooled margin of error: t* * sqrt(pooled variance) * sqrt(1/n1 + 1/n2).
pooled_variance = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
Margin_of_error = t_star * np.sqrt(pooled_variance) * np.sqrt(1 / n1 + 1 / n2)
Margin_of_error

In [None]:
# Confidence bounds for the difference in mean blood pressure (pooled margin of error).
mean_difference = mean1 - mean2
lcb, ucb = mean_difference - Margin_of_error, mean_difference + Margin_of_error
(lcb, ucb)

#### Unpooled Approach


In [None]:
# Unpooled margin of error for the difference in mean blood pressure:
#   t* * sqrt(s1^2 / n1 + s2^2 / n2),  1 = diabetic, 2 = non-diabetic.
# Both group variances are computed here from the data so the result does not
# depend on the globals `std`, `std1` or `std2` (the original mixed `std1` with
# the unrelated `std`, pairing variances with the wrong group sizes).
t_star = 1.98  # multiplier the notebook uses for its 95% intervals
var_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure'].std() ** 2
var_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure'].std() ** 2
Margin_of_error = t_star * np.sqrt(var_diabetic / n1 + var_non_diabetic / n2)
Margin_of_error

In both the approaches we used, we have zero as a possible value for our parameter, so we cannot say that there is a significant difference of average blood pressure for diabetic and non diabetic patient

In [None]:
# Confidence bounds for the difference in mean blood pressure (unpooled margin of error).
mean_difference = mean1 - mean2
lcb, ucb = mean_difference - Margin_of_error, mean_difference + Margin_of_error
(lcb, ucb)

Next we can do hypothesis testing to confirm our above results

Research Question: Is there a significant difference of average blood pressure for a diabetic and non diabetic patient

$H_0$ : mu1 - mu2 = 0

$H_1$ : mu1 - mu2 $\neq$ 0

significance level = 0.05

Test-statistic = (Best estimate - Hypothesised value) / standard error

Test-statistic: A measure of how far our sample statistic is from our hypothesized population parameter, in terms of estimated standard errors. The farther away our sample statistic is, the less confident we will be in our null hypothesised value

Assumptions:
1. Data collected using simple random sampling.
2. Distribution is approximately normal, or we have large enough sample size so that we can rely on central limit theorem

#### Unpooled Approach

In [None]:
# Two-sided z-test of H0: mu1 - mu2 = 0 for blood pressure using the UNPOOLED
# standard error sqrt(s1^2 / n1 + s2^2 / n2), 1 = diabetic, 2 = non-diabetic.
# Group variances are computed here from the data; the original combined
# `std1` with the unrelated global `std`, so each variance could be paired
# with the wrong group size.
best_estimate = mean1 - mean2
var_diabetic = dataset.loc[dataset.Outcome == 1, 'BloodPressure'].std() ** 2
var_non_diabetic = dataset.loc[dataset.Outcome == 0, 'BloodPressure'].std() ** 2
std_error = np.sqrt(var_diabetic / n1 + var_non_diabetic / n2)
test_statistic = best_estimate / std_error
# Two-sided p-value from the standard normal distribution.
p_val = 2 * dist.norm.cdf(-np.abs(test_statistic))
(test_statistic, p_val)

In [None]:
# Pooled approach: two-sided z-test of H0: mu1 - mu2 = 0 using the pooled
# standard error (pooled standard deviation times sqrt(1/n1 + 1/n2)).
best_estimate = mean1 - mean2
pooled_variance = ((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2)
std_error = np.sqrt(pooled_variance) * np.sqrt(1 / n1 + 1 / n2)
test_statistic = best_estimate / std_error
p_val = 2 * dist.norm.cdf(-np.abs(test_statistic))
(test_statistic, p_val)

We conclude from the above results that we don't have enough evidence to reject the NULL hypothesis as our p-val is not significant at 5% level


We can calculate the average BloodPressure difference stratified by age

In [None]:
# Difference in mean BloodPressure (diabetic minus non-diabetic) within each
# age group, plotted with approximate 95% bounds (difference -/+ 1.96 * SE).
dataset['agegrp'] = pd.cut(dataset.Age, [18, 30, 40, 50, 60, 70, 80, 90])
dataset['Outcomex'] = dataset.Outcome.replace({1: 'Diabetic', 0: 'Non-Diabetic'})

# Group once and reuse. Built-in reducers replace the numpy callables passed to
# `agg`, and the outcome column is renamed by label rather than by position.
# observed=False keeps every age bin in the result, including empty ones.
grouped = dataset.groupby(['agegrp', 'Outcomex'], observed=False)['BloodPressure']
relabel = {'Non-Diabetic': 'Non_Diabetic'}

dx_mean = grouped.mean().unstack().rename(columns=relabel)   # group means
dx_std = grouped.std().unstack().rename(columns=relabel)     # group standard deviations
dx_size = grouped.size().unstack().rename(columns=relabel)   # group sizes

mean_diff = dx_mean.Diabetic - dx_mean.Non_Diabetic
se = dx_std / np.sqrt(dx_size)                               # standard error of each group mean
se_diff = np.sqrt(se.Diabetic**2 + se.Non_Diabetic**2)       # unpooled SE of the difference

# x and y are passed by keyword: positional x/y are rejected by seaborn >= 0.12.
x = np.arange(dx_size.shape[0])
pp = sns.pointplot(x=x, y=mean_diff, color='black')
pp.set(xlabel='Age group', ylabel="Diabetic-Non Diabetic BloodPressure Mean Difference")
sns.pointplot(x=x, y=mean_diff - 1.96 * se_diff)
sns.pointplot(x=x, y=mean_diff + 1.96 * se_diff)
pp.set_xticklabels(dx_size.index)
plt.show()


**Research Question**: What is the average glucose level for a diabetic patient?

Population: Diabetic Patient

Parameter of Interest: Average Glucose level

In [None]:
# Sample mean of Glucose among diabetic rows (Outcome == 1).
glucose_diabetic = dataset.loc[dataset.Outcome == 1, 'Glucose']
unbiased_point_estimate = glucose_diabetic.mean()
unbiased_point_estimate

In [None]:
# Sample standard deviation of Glucose among diabetic rows.
glucose_diabetic = dataset.loc[dataset.Outcome == 1, 'Glucose']
std = glucose_diabetic.std()
std

In [None]:
# Standard error of the mean: s / sqrt(n), n = number of diabetic Glucose values.
n_glucose_diabetic = dataset.loc[dataset.Outcome == 1, 'Glucose'].count()
std_error = std / np.sqrt(n_glucose_diabetic)
std_error
                     

In [None]:
# 95% confidence bounds: estimate -/+ 1.96 standard errors.
half_width = 1.96 * std_error
lcb, ucb = unbiased_point_estimate - half_width, unbiased_point_estimate + half_width
(lcb, ucb)

With 95% confidence, the population average glucose level for diabetic patient is estimated to be in between 138 to 145

In [None]:
# Same interval computed by statsmodels from the diabetic rows' Glucose values.
glucose_diabetic = dataset.loc[dataset.Outcome == 1, 'Glucose']
sm.stats.DescrStatsW(glucose_diabetic).zconfint_mean()

**Research Question**: What is the average glucose level for a non-diabetic patient?

Population: Non-Diabetic Patient

Parameter of Interest: Average Glucose level

In [None]:
# Mean, standard deviation, standard error and 95% confidence bounds of Glucose
# among non-diabetic rows (Outcome == 0).
glucose_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Glucose']

unbiased_point_estimate = glucose_non_diabetic.mean()
std = glucose_non_diabetic.std()
print((unbiased_point_estimate, std))

std_error = std / np.sqrt(glucose_non_diabetic.count())
print(std_error)

half_width = 1.96 * std_error
lcb, ucb = unbiased_point_estimate - half_width, unbiased_point_estimate + half_width
(lcb, ucb)

We can see that there is a clear distinction in glucose level between diabetic and non-diabetic patients. With 95% confidence, the population average glucose level for non-diabetic patients is estimated to be between 108 and 112.

In [None]:
# Same interval computed by statsmodels from the non-diabetic rows' Glucose values.
glucose_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Glucose']
sm.stats.DescrStatsW(glucose_non_diabetic).zconfint_mean()

**Research Question**: Is there a significant difference in average glucose level between diabetic and non-diabetic patients?

Population: Patients

Parameter of Interest: (*U<sub>1</sub>* - *U<sub>2</sub>*):Glucose

Note that 1 = Diabetic,2 = Non-Diabetic

In [None]:
# Group means, sizes and standard deviations for Glucose
# (1 = diabetic, 2 = non-diabetic).
glucose_diabetic = dataset.loc[dataset.Outcome == 1, 'Glucose']
glucose_non_diabetic = dataset.loc[dataset.Outcome == 0, 'Glucose']

mean1, mean2 = glucose_diabetic.mean(), glucose_non_diabetic.mean()
# Recompute the group sizes from this column instead of printing the n1/n2
# left over from an earlier section.
n1, n2 = glucose_diabetic.count(), glucose_non_diabetic.count()
print(mean1, mean2)
print(n1, n2)

std1, std2 = glucose_diabetic.std(), glucose_non_diabetic.std()
# Displayed value: the two group variances.
(std1**2, std2**2)



 The variance is not equal
 
 ### Unpooled Approach 
 
 With 95% confidence, the population average glucose level is 26 to 35 units more in diabetic patients as compared to non diabetic patients.

In [None]:
# Unpooled confidence interval for the difference in mean glucose
# (diabetic minus non-diabetic): difference -/+ t* * sqrt(s1^2/n1 + s2^2/n2).
# The second variance term uses `std2`; the original referenced the global
# `std`, whose value depends on which cell assigned it last. The bare
# mid-cell `Margin_of_error` expression displayed nothing and is removed.
t_star = 1.98  # multiplier the notebook uses for its 95% intervals
Margin_of_error = t_star * np.sqrt(std1**2 / n1 + std2**2 / n2)

mean_difference = mean1 - mean2
lcb = mean_difference - Margin_of_error
ucb = mean_difference + Margin_of_error
(lcb, ucb)

In [None]:
# Difference in mean Glucose (diabetic minus non-diabetic) within each age
# group, plotted with approximate 95% bounds (difference -/+ 1.96 * SE).
dataset['agegrp'] = pd.cut(dataset.Age, [18, 30, 40, 50, 60, 70, 80, 90])
dataset['Outcomex'] = dataset.Outcome.replace({1: 'Diabetic', 0: 'Non-Diabetic'})

# Group once and reuse. Built-in reducers replace the numpy callables passed to
# `agg`, and the outcome column is renamed by label rather than by position.
# observed=False keeps every age bin in the result, including empty ones.
grouped = dataset.groupby(['agegrp', 'Outcomex'], observed=False)['Glucose']
relabel = {'Non-Diabetic': 'Non_Diabetic'}

dx_mean = grouped.mean().unstack().rename(columns=relabel)   # group means
dx_std = grouped.std().unstack().rename(columns=relabel)     # group standard deviations
dx_size = grouped.size().unstack().rename(columns=relabel)   # group sizes

mean_diff = dx_mean.Diabetic - dx_mean.Non_Diabetic
se = dx_std / np.sqrt(dx_size)                               # standard error of each group mean
se_diff = np.sqrt(se.Diabetic**2 + se.Non_Diabetic**2)       # unpooled SE of the difference

# x and y are passed by keyword: positional x/y are rejected by seaborn >= 0.12.
x = np.arange(dx_size.shape[0])
pp = sns.pointplot(x=x, y=mean_diff, color='black')
pp.set(xlabel='Age group', ylabel="Diabetic-Non Diabetic Glucose Mean Difference")
sns.pointplot(x=x, y=mean_diff - 1.96 * se_diff)
sns.pointplot(x=x, y=mean_diff + 1.96 * se_diff)
pp.set_xticklabels(dx_size.index)
plt.show()

As we can observe from the above plot, the difference in estimated average glucose level is larger for higher age groups, one reason for that might be because we have less data in these age groups, as shown below 

In [None]:
# Share of observations in each age group, per outcome label and overall.
# The built-in `count()` replaces `.apply(lambda x: x.count())`, and empty
# (age group, outcome) combinations are shown as 0 instead of being left missing.
dx = (
    dataset.groupby(['agegrp', 'Outcomex'], observed=False)['Outcome']
    .count()
    .unstack(fill_value=0)
)
dx['Total'] = dx.sum(axis=1)
# Normalise every column by its own total so each column sums to 1.
dx = dx / dx.sum(axis=0)
dx

**Important Note**: So far we have got two possible predictors one is Pregnancies and the other is Glucose level.

Let's do some model fitting based on that

In [None]:
# Binomial GLM of Outcome on Pregnancies alone.
model = sm.GLM.from_formula(
    formula='Outcome ~ Pregnancies',
    data=dataset,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

### Add additional covariate

In [None]:
# Binomial GLM of Outcome on Pregnancies with Glucose added as a second covariate.
model = sm.GLM.from_formula(
    formula='Outcome ~ Pregnancies + Glucose',
    data=dataset,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

Adding Glucose to the model leads to a very small shift in the Pregnancies coefficient (it changed from .1372 to .1233). In general, regression coefficients can change a lot when adding or removing other variables from a model, but in this case the change is quite minimal. The log odds for diabetes increase by 0.026 for each unit increase in glucose. This effect is additive, so
when comparing two people whose glucose levels differ by 20 units, the log odds
of the person with the higher glucose level being diabetic will be around 0.52 units greater than the
log odds for the person with the lower glucose level, and the odds for the person with the higher glucose level will be around `exp(0.52) = 1.68` times greater than
the odds for the person with the lower glucose level.



The following plot shows the fitted log odds (logit)
of the diabetes outcome as a function of the number of pregnancies.  The grey band is a
simultaneous 95% confidence band.


In [None]:
from statsmodels.sandbox.predict_functional import predict_functional

# Values at which the other data-frame columns are held while Pregnancies varies.
# NOTE(review): columns that are not in the model formula are presumably listed
# only so that every column has a fixed value - confirm against the
# predict_functional documentation.
values = {"Glucose":120,'BloodPressure':80,'SkinThickness':30,'Insulin':0,'BMI':30,'DiabetesPedigreeFunction':0.627,'Age':50,'Outcomex':'Diabeties','agegrp':'[18,30)'}

# pr: fitted values, cb: two-column array of band limits, fv: focus-variable grid.
pr, cb, fv = predict_functional(result, 'Pregnancies', values=values, ci_method='simultaneous')

# x and y are passed by keyword: positional x/y are rejected by seaborn >= 0.12.
ax = sns.lineplot(x=fv, y=pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.5)
ax.set_xlabel('Pregnancies')
_ = ax.set_ylabel('Diabeties')

In [None]:
# Values at which the other data-frame columns are held while Glucose varies.
# NOTE(review): columns that are not in the model formula are presumably listed
# only so that every column has a fixed value - confirm against the
# predict_functional documentation.
values = {"Pregnancies":3,'BloodPressure':80,'SkinThickness':30,'Insulin':0,'BMI':30,'DiabetesPedigreeFunction':0.627,'Age':50,'Outcomex':'Diabeties','agegrp':'[18,30)'}

# pr: fitted values, cb: two-column array of band limits, fv: focus-variable grid.
pr, cb, fv = predict_functional(result, 'Glucose', values=values, ci_method='simultaneous')

# x and y are passed by keyword: positional x/y are rejected by seaborn >= 0.12.
ax = sns.lineplot(x=fv, y=pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.5)
ax.set_xlabel('Glucose')
_ = ax.set_ylabel('Diabeties')

We can see that beyond a glucose level of about 125 units there is a sharp nonlinearity in the curve below.

In [None]:
from statsmodels.graphics.regressionplots import add_lowess

# CERES residual plot for Glucose from the fitted model, with the first plotted
# line faded and a lowess smooth added on top.
fig = result.plot_ceres_residuals("Glucose")
ax = fig.axes[0]
first_line = ax.lines[0]
first_line.set_alpha(0.2)
_ = add_lowess(ax)