In [None]:
import numpy as np
import pandas as pd
import scipy.stats as st
import math
import matplotlib.pyplot as plt
import seaborn as sns


# import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# import 'stats' package from scipy library
from scipy import stats

# import the functions to perform Chi-square tests
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from scipy.stats import chisquare

# function to perform post-hoc test
import statsmodels.stats.multicomp as mc

# import function to perform post-hoc
# install scikit_posthocs using "!pip install scikit_posthocs" 
!pip install scikit_posthocs
import scikit_posthocs

# <center style='color:brown'>EDA

**Diabetes dataset**<br>
I will be studying the various factors and if they contribute to Diabetes in women

In [None]:
# Load the Pima Indians diabetes CSV into a DataFrame.
# NOTE(review): the relative "../input/..." path looks Kaggle-specific — adjust when running elsewhere.
diabetes_df=pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Preview the first five rows of the raw data.
diabetes_df.head(5)

<b>Understanding the various columns:</b><br>
 - **Pregnancies:** Number of times pregnant
 - **Glucose:** Plasma glucose concentration at 2 hours in an oral glucose tolerance test
 - **BloodPressure:** Diastolic blood pressure (mm Hg)
 - **SkinThickness:** Triceps skin fold thickness (mm)
 - **Insulin:** 2-Hour serum insulin (mu U/ml)
 - **BMI:** Body mass index (weight in kg/(height in m)^2)
 - **DiabetesPedigreeFunction:** Diabetes pedigree function
 - **Age:** in years
 - **Outcome:** Whether patient is diabetic (1) or not (0)

In [None]:
# (rows, columns) of the loaded frame.
diabetes_df.shape

Dataframe has 9 columns and 768 rows

In [None]:
# Column dtypes and non-null counts of the raw data.
diabetes_df.info()

We can see that although Pregnancies and Outcome are categorical variables, they are classified as numerical.<br>
We shall now change the variables to categorical

In [None]:
# Re-type Pregnancies and Outcome as 'object' so they are treated as categorical columns;
# all other columns keep their dtype.
diabetes_df=diabetes_df.astype({'Pregnancies':'object','Outcome':'object'})

In [None]:
# Re-check the dtypes after the cast above.
diabetes_df.info()

In [None]:
# Summary statistics of the data before any imputation.
diabetes_df.describe()

Minimum value for Glucose, Blood Pressure, Skin Thickness, Insulin, BMI are all 0<br>
We know that it cannot be possible to have such values<br>
Below we have plotted a number of violin plots to show the 0 values in each column

In [None]:
# One violin plot per measurement, split by Outcome, each drawn on its own numbered figure.
# Each entry is (figure number, column to plot, figure title).
violin_specs=[
    (1,'Glucose','Glucose vs Outcome'),
    (2,'BloodPressure','Blood Pressure vs. Outcome'),
    (3,'SkinThickness','Skin Thickness vs Outcome'),
    (4,'Insulin','Insulin vs Outcome'),
    (5,'BMI','BMI vs Outcome'),
    (6,'DiabetesPedigreeFunction','Diabetes Pedigree Function vs. Outcome'),
]
for fig_number,column,fig_title in violin_specs:
    plt.figure(fig_number)
    sns.violinplot(x='Outcome',y=column,data=diabetes_df)
    plt.title(fig_title)

The above graphs have been plotted to show 0 values in each column.<br>
We shall now proceed to change 0 values to the median values of the positive or negative outcomes respectively<br>
**Insulin** column has median at 0. Therefore Insulin column will not change

In [None]:
# Split the data by Outcome: 1 -> positive_df (diabetic), 0 -> negative_df (non-diabetic).
positive_df=diabetes_df.loc[diabetes_df['Outcome']==1]
negative_df=diabetes_df.loc[diabetes_df['Outcome']==0]

# Replace 0 in Glucose with the per-class median.
# The median is taken over the whole column, so the 0 placeholders themselves are included.
glucose_median_pos=np.median(positive_df['Glucose'])
glucose_median_neg=np.median(negative_df['Glucose'])
positive_df=positive_df.replace({'Glucose':0},glucose_median_pos)
negative_df=negative_df.replace({'Glucose':0},glucose_median_neg)

# Rebuild the full frame: positive rows first, then negative rows.
df=[positive_df,negative_df]
diabetes_df=pd.concat(df)

In [None]:
# Replace 0 in BloodPressure with the per-class median (median computed with the zeros included).
bp_median_pos=np.median(positive_df['BloodPressure'])
bp_median_neg=np.median(negative_df['BloodPressure'])
positive_df=positive_df.replace({'BloodPressure':0},bp_median_pos)
negative_df=negative_df.replace({'BloodPressure':0},bp_median_neg)

# Rebuild the full frame from the two updated halves.
df=[positive_df,negative_df]
diabetes_df=pd.concat(df)

In [None]:
# Replace 0 in SkinThickness with the per-class median (median computed with the zeros included).
skin_median_pos=np.median(positive_df['SkinThickness'])
skin_median_neg=np.median(negative_df['SkinThickness'])
positive_df=positive_df.replace({'SkinThickness':0},skin_median_pos)
negative_df=negative_df.replace({'SkinThickness':0},skin_median_neg)

# Rebuild the full frame from the two updated halves.
df=[positive_df,negative_df]
diabetes_df=pd.concat(df)

In [None]:
# Replace 0 in Insulin with the per-class median.
# The median is computed with the zeros included, so when it is itself 0 the column is unchanged.
insulin_median_pos=np.median(positive_df['Insulin'])
insulin_median_neg=np.median(negative_df['Insulin'])
positive_df=positive_df.replace({'Insulin':0},insulin_median_pos)
negative_df=negative_df.replace({'Insulin':0},insulin_median_neg)

# Rebuild the full frame from the two updated halves.
df=[positive_df,negative_df]
diabetes_df=pd.concat(df)

In [None]:
# Replace 0 in BMI with the per-class median (median computed with the zeros included).
bmi_median_pos=np.median(positive_df['BMI'])
bmi_median_neg=np.median(negative_df['BMI'])
positive_df=positive_df.replace({'BMI':0},bmi_median_pos)
negative_df=negative_df.replace({'BMI':0},bmi_median_neg)

# Rebuild the full frame from the two updated halves.
df=[positive_df,negative_df]
diabetes_df=pd.concat(df)

In [None]:
# Summary statistics after the replacements above, to inspect the new column minimums.
diabetes_df.describe()

All the minimum values (except for **Insulin**) are now greater than 0.<br>

In [None]:
# Annotated heatmap of the pairwise correlations returned by DataFrame.corr().
corr_matrix=diabetes_df.corr()
ax=sns.heatmap(corr_matrix,annot=True,fmt='.2g',cmap='Blues_r')

# Widen the y-axis by half a cell at each end.
# NOTE(review): presumably a workaround for matplotlib versions that clip the first/last heatmap row — confirm it is still needed.
bottom,top=plt.ylim()
plt.ylim(bottom+0.5,top-0.5)

plt.title('Correlation between various numerical parameters in dataset')
plt.show()

The Heatmap shows the Pearson's correlations between various parameters. <br>
Most of the variables do not show a very strong correlation.<br>
BMI and Skin Thickness are the only 2 variables to show a correlation >0.5. However it still is not a very strong correlation number.

In [None]:
# Non-null row count of every column, per Outcome class (0 and 1).
diabetes_df.groupby('Outcome').count()

Number of women who do not have diabetes (Outcome = 0) = 500<br>
Number of women who have diabetes (Outcome = 1) = 268

In [None]:
# Pie chart of the class balance: diabetic (Positive) vs non-diabetic (Negative) women.
l1=len(positive_df)   # number of diabetic women
l2=len(negative_df)   # number of non-diabetic women
label=['Positive','Negative']
# autopct renders the percentage of each slice, so no separate percentage list is needed.
plt.pie(x=[l1,l2],labels=label,autopct='%.2f',colors=['maroon','crimson'])
plt.title('Share of diabetic (Positive) and non-diabetic (Negative) women')
plt.show()

34.9% of women have diabetes.<br>
65.1% of women do not have diabetes

# <center style='color:brown'> Hypothesis testing<br>
In this section we will be checking if the numerical variables are affected by Outcome. The procedure is as follows:
 - We first conduct the Shapiro test. If the datasets are not normally distributed we shall proceed with the MannWhitneyU test, else we shall proceed with the two-sample independent t-test.
 - Following this we shall conduct a one-sample test for the women who have been diagnosed with diabetes to check the threshold values. This will help us in determining the value above which the risk of contracting diabetes is higher.
 - We shall then check if the number of pregnancies a woman has had will impact the different variables. 
    
First we create datasets for pregnancies for one-way ANOVA test

In [None]:
# One sub-frame of diabetic women per pregnancy count 0..17, bound to preg_0 .. preg_17.
(
    preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,
    preg_6,preg_7,preg_8,preg_9,preg_10,preg_11,
    preg_12,preg_13,preg_14,preg_15,preg_16,preg_17,
)=(
    positive_df[positive_df['Pregnancies']==n_pregnancies]
    for n_pregnancies in range(18)
)

### <center style='color:purple'>Blood Pressure vs. Outcome

**H0: $(Mu_{bloodpressure})_{diabetic}-(Mu_{bloodpressure})_{non-diabetic} \leq 0$** <br>
**H1: $(Mu_{bloodpressure})_{diabetic}-(Mu_{bloodpressure})_{non-diabetic} > 0$** <br>

In [None]:
# Blood pressure of diabetic vs non-diabetic women.
bp_diabetic=positive_df['BloodPressure']
bp_non_diabetic=negative_df['BloodPressure']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print(st.shapiro(bp_diabetic)[1],st.shapiro(bp_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(bp_non_diabetic,bp_diabetic,alternative='less'))

**Shapiro Test result**<br>
The pvalue<<0.05 the data is not normally distributed. We shall now continue with the MannWhitneyU test.<br>
**Result of MannWhitneyU Test**<br>
pvalue<0.05. Test statistic falls in rejection region.We now reject H0.<br>
***We can now conclude that Blood Pressure is higher for Diabetic women***

In [None]:
# Letter-value (boxen) plot of BloodPressure for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='BloodPressure')
plt.title('Effect of Outcome on Blood Pressure')
plt.show()

From the above graph we can see that the median of Blood Pressure for diabetic women is slightly higher than for non-diabetic women

**Diabetic patients tend to have a BP of 80 or above**<br>
**H0: $(M)_{bloodpressure} \leq 80$**<br>
**H1: $(M)_{bloodpressure} > 80$**<br>

Since we have already ascertained that the dataset is not normally distributed we shall proceed with the Wilcoxon Signed Rank Test

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's BloodPressure against mu.
mu=80
# Differences from the hypothesised value; the test is run on these.
bp=[reading-mu for reading in positive_df['BloodPressure']]

# alternative='greater': H1 is that the differences are shifted above zero.
st.wilcoxon(bp,alternative='greater')

pvalue>0.95, which is far above the 0.05 significance level. For this right-tailed test the statistic therefore does not fall in the rejection region. <br>
Therefore we fail to reject H0.<br>
***The test does not show that the median BP of diabetic women is above 80***

**We shall further check if diabetic women tend to have varying blood pressure based on pregnancies**

### <center style='color:purple'>Blood Pressure vs. Pregnancies<center>

We shall start the process with a shapiro test to check for normality of datasets. <br>
Since the datasets are too many in number we shall be using a for loop to assign a data with a particular pregnancy value to the temporary dataframe, p.<br> 
Following this we shall conduct a shapiro test on p.<br>
If the pvalue of the test<0.05 we shall break out of the loop.<br>
If any one of the datasets are not normally distributed, we shall then proceed with the kruskal test which is a non-parametric equivalent of one way ANOVA.<br>
Same procedure is followed for all the numerical values

In [None]:
# Shapiro-Wilk test of BloodPressure within each pregnancy-count group of diabetic women.
# Stops at the first group whose p-value is below 0.05 or above 0.95 and prints it.
for n_preg in range(18):
    group_values=positive_df.loc[positive_df['Pregnancies']==n_preg,'BloodPressure']
    shapiro_result=st.shapiro(group_values)
    p_value=shapiro_result[1]
    if p_value>0.95 or p_value<0.05:
        print(n_preg)
        print(shapiro_result)
        break

Pregnancy 5 dataset is not normally distributed thus we shall proceed with Kruskal test to determine if the means of all values are equal.<br>
**Hypothesis:**<br>
**H0: Average Blood Pressure of all the pregnancies are same**<br>
**H1: Average Blood Pressure of at least one pregnancy is different**

In [None]:
# Kruskal-Wallis test of BloodPressure across the pregnancy-count groups (preg_16 is not included).
bp_by_pregnancy=[
    group['BloodPressure']
    for group in (
        preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,
        preg_9,preg_10,preg_11,preg_12,preg_13,preg_14,preg_15,preg_17,
    )
]
print(st.kruskal(*bp_by_pregnancy))

The pvalue of the test is >0.05. Statistic falls in the non rejection region.<br>
We fail to reject H0.<br>
<b>Conclusion: Average BP of all pregnancies in diabetic women are the same. (or) Blood Pressure does not change with pregnancy</b><br>

In [None]:
# BloodPressure against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='BloodPressure',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies on Blood Pressure')
plt.show()

<p style='color:blue'>The graph above shows us that blood pressure of diabetic women is generally higher than for non-diabetic women. However, Blood Pressure does not depend on number of Pregnancies a woman has had as no clear pattern can be seen.

### <center style='color:purple'>BMI vs Outcome

***BMI of diabetic women tend to be higher than for non-diabetic women***<br>
**H0: $(Mu_{BMI})_{diabetic}-(Mu_{BMI})_{non-diabetic} \leq 0$**<br>
**H1: $(Mu_{BMI})_{diabetic}-(Mu_{BMI})_{non-diabetic}>0$**

In [None]:
# BMI of diabetic vs non-diabetic women.
bmi_diabetic=positive_df['BMI']
bmi_non_diabetic=negative_df['BMI']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro: ",st.shapiro(bmi_diabetic)[1],st.shapiro(bmi_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(bmi_non_diabetic,bmi_diabetic,alternative='less'))

**Result of Shapiro test**<br>
Since pvalue for both datasets <<0.05 we can conclude that both the datasets are not normal. We shall now continue with the MannWhitneyU test. <br>
**Result of MannWhitneyU test**<br>
pvalue<0.05. The test statistic falls in the rejection region. We can therefore reject H0.<br>
***Thus we can conclude that BMI is higher for Diabetic women***

In [None]:
# Letter-value (boxen) plot of BMI for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='BMI')
plt.title('Effect of Outcome on BMI')
plt.show()

The above graph proves our hypothesis that diabetic women tend to have a higher BMI

We shall now study the threshold value of BMI for diabetic patients

**People with diabetes tend to have a BMI>36**<br>
**H0: $(Mu_{BMI})_{diabetic} \leq 36$**<br>
**H1: $(Mu_{BMI})_{diabetic} > 36$**<br>

We shall proceed with the Wilcoxon signed-rank test since our dataset is not normally distributed, as shown earlier.

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's BMI against mu.
mu=36
# Differences from the hypothesised value; the test is run on these.
bmi=[value-mu for value in positive_df['BMI']]
# alternative='greater': H1 is that the differences are shifted above zero.
print(st.wilcoxon(bmi,alternative='greater'))

From the above result we can see that women who are obese are at a greater risk of getting diabetes. 

**We shall see if pregnancies actually impact BMI in diabetic women**

### <center style='color:purple'>BMI vs Pregnancies</center>


In [None]:
# Shapiro-Wilk test of BMI within each pregnancy-count group of diabetic women.
# Stops at the first group whose p-value is below 0.05 or above 0.95 and prints it.
for n_preg in range(18):
    group_values=positive_df.loc[positive_df['Pregnancies']==n_preg,'BMI']
    shapiro_result=st.shapiro(group_values)
    p_value=shapiro_result[1]
    if p_value>0.95 or p_value<0.05:
        print(n_preg)
        print(shapiro_result)
        break

Pregnancy 0 dataset is not normally distributed thus we shall proceed with Kruskal test to determine if the means of all values are equal.<br>
**Hypothesis:**<br>
**H0: Average BMI of all the pregnancies are same**<br>
**H1: Average BMI of atleast one pregnancy is different**

In [None]:
# Kruskal-Wallis test of BMI across the pregnancy-count groups (preg_16 is not included).
bmi_by_pregnancy=[
    group['BMI']
    for group in (
        preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,
        preg_9,preg_10,preg_11,preg_12,preg_13,preg_14,preg_15,preg_17,
    )
]
print(st.kruskal(*bmi_by_pregnancy))

pvalue<0.05 test statistic falls in the rejection region<br>
<b> We shall then conclude that BMI is dependent on the number of pregnancies a woman has had</b><br>
We shall conduct a posthoc test to check which combinations of pregnancies does not have the same average BMI

In [None]:
# Conover post-hoc test on BMI of diabetic women, grouped by Pregnancies.
# The result is displayed as a table of pairwise p-values between the groups.
df_bmi=scikit_posthocs.posthoc_conover(a=positive_df,val_col='BMI',group_col='Pregnancies')
df_bmi

Since it is difficult to interpret the result from the above table, I have automated the process. Using a for loop, I have only saved the pregnancy combinations which have a pvalue<0.05 or pvalue>0.95.<br>
*Note: I have however skipped the pvalues=1*<br>
I have then used a for loop to filter out the repeating combinations.

In [None]:
# Collect the pregnancy-count pairs whose Conover p-value is < 0.05 or > 0.95
# (p-values equal to 1 are skipped), keeping each unordered pair only once.
t=[]
seen_pairs=set()
for col_label in df_bmi.columns:
    # .items() yields the row *label* rather than a running position, so the pair stays
    # correct even when the group labels are not a gap-free 0..n-1 sequence.
    for row_label,p_value in df_bmi[col_label].items():
        if (p_value<0.05 or p_value>0.95) and p_value!=1:
            pair=frozenset((col_label,row_label))
            # Track seen pairs in a set instead of removing from the list while iterating over it.
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                t.append((col_label,row_label))
print(t)

The above results give us the combinations of pregnancies that do not have the same average BMI

In [None]:
# BMI against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='BMI',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies on BMI')
plt.show()

<p style='color:blue'><b>With regard to Outcome</b><br>
BMI tends to be higher for diabetic women as compared to non-diabetic women.<br>
    <b>With respect to Pregnancies</b><br>
For women with 0 pregnancies, BMI tends to be on the higher side for both Outcomes. Following this we see a drop in BMI with increase in Pregnancies, regardless of Outcome, up to the 3rd pregnancy. </p>

### <center style='color:purple'>Diabetes Pedigree Function vs Outcome

***Diabetes Pedigree Function for diabetic women is higher than for non-diabetic women***<br>
**H0: $(Mu_{DiabetesPedigreeFunction})_{diabetic}-(Mu_{DiabetesPedigreeFunction})_{non-diabetic} \leq 0$**<br>
**H1: $(Mu_{DiabetesPedigreeFunction})_{diabetic}-(Mu_{DiabetesPedigreeFunction})_{non-diabetic}>0$**<br>

In [None]:
# Diabetes Pedigree Function of diabetic vs non-diabetic women.
dpf_diabetic=positive_df['DiabetesPedigreeFunction']
dpf_non_diabetic=negative_df['DiabetesPedigreeFunction']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro test: ",st.shapiro(dpf_diabetic)[1],st.shapiro(dpf_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(dpf_non_diabetic,dpf_diabetic,alternative='less'))

**Result of Shapiro test**<br>
pvalue for both datasets<0.05. We can therefore conclude that both datasets are not normally distributed. We shall now proceed with the mannwhitneyU test. <br>
**Result for MannwhitneyU test**<br>
pvalue<0.05. Test statistic falls in the rejection region. We therefore reject H0. <br>
**We can then conclude that Diabetes Pedigree Function for diabetic women is greater than for non-diabetic women**

In [None]:
# Letter-value (boxen) plot of DiabetesPedigreeFunction for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='DiabetesPedigreeFunction')
plt.title('Effect of diabetic pedigree function on Outcome')
plt.show()

Thus we can see that diabetic women tend to have a higher Diabetes Pedigree Function

**People with diabetes tend to have a Diabetic Pedigree Function >0.6**<br>
**H0: $(Mu_{diabeticpedigreefunction})_{diabetic} \leq 0.6$**<br>
**H1: $(Mu_{diabeticpedigreefunction})_{diabetic} > 0.6$**

We shall proceed with wilcoxon signed rank test since dataset is not normally distributed. <br>

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's DiabetesPedigreeFunction against mu.
mu=0.6
# Differences from the hypothesised value; the test is run on these.
dpf=[value-mu for value in positive_df['DiabetesPedigreeFunction']]
# alternative='greater': H1 is that the differences are shifted above zero.
print(st.wilcoxon(dpf,alternative='greater'))

For a Diabetes Pedigree Function value above 0.6, the tendency to have diabetes is higher in women.<br>
Diabetic Pedigree function is an inherent property of genetics. Thus number of pregnancies will not change this value.

In [None]:
# DiabetesPedigreeFunction against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='DiabetesPedigreeFunction',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies and Diabetes Pedigree Function on Outcome')
plt.show()

### <center style='color:purple'>Glucose vs. Outcome

**Glucose level is higher in diabetic women than non-diabetic women**<br>
**H0: $(Mu_{glucose})_{diabetic}-(Mu_{glucose})_{nondiabetic}\leq 0$**<br>
**H1: $(Mu_{glucose})_{diabetic}-(Mu_{glucose})_{nondiabetic}>0$**

In [None]:
# Glucose of diabetic vs non-diabetic women.
glucose_diabetic=positive_df['Glucose']
glucose_non_diabetic=negative_df['Glucose']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro test: ",st.shapiro(glucose_diabetic)[1],st.shapiro(glucose_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(glucose_non_diabetic,glucose_diabetic,alternative='less'))

**Result of Shapiro test**<br>
Both datasets are not normally distributed, therefore we proceed with the MannWhitneyU test. <br>
**Result of MannWhitneyU test**<br>
pvalue<0.05. Test statistic falls in the rejection region. Thus we reject H0.<br>
***Thus we conclude that Glucose level for diabetic women is higher than for non-diabetic women***

In [None]:
# Letter-value (boxen) plot of Glucose for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='Glucose')
plt.title('Outcome vs. Glucose levels')
plt.show()

**Threshold value for Glucose**<br>
**H0: $(Mu_{glucose})_{diabetic} \leq 150$**<br>
**H1: $(Mu_{glucose})_{diabetic} > 150$**

We shall proceed with wilcoxon signed rank test since dataset is not normally distributed. <br>

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's Glucose against mu.
mu=150
# Differences from the hypothesised value, stored under their own name so the
# Diabetes Pedigree Function differences held in `dpf` are not overwritten.
glucose_diff=[value-mu for value in positive_df['Glucose']]
# alternative='greater': H1 is that the differences are shifted above zero.
print(st.wilcoxon(glucose_diff,alternative='greater'))

Women who have a glucose level>150 are at a higher risk of getting diabetic

**We shall proceed to check if diabetic women actually have different glucose levels based on pregnancies**

### <center style='color:purple'>Glucose vs. Pregnancies

In [None]:
# Shapiro-Wilk test of Glucose within each pregnancy-count group of diabetic women.
# Stops at the first group whose p-value is below 0.05 or above 0.95 and prints it.
for n_preg in range(18):
    group_values=positive_df.loc[positive_df['Pregnancies']==n_preg,'Glucose']
    shapiro_result=st.shapiro(group_values)
    p_value=shapiro_result[1]
    if p_value>0.95 or p_value<0.05:
        print(n_preg)
        print(shapiro_result)
        break

<b>Kruskal Wallis test</b><br>
Since atleast one dataset is not normally distributed we shall proceed with Kruskal test.<br>
<b>Hypothesis:<br>
    H0: Average Glucose levels is the same for all pregnancies.<br>
    H1: Average Glucose levels is not the same for all pregnancies</b>

In [None]:
# Kruskal-Wallis test of Glucose across the pregnancy-count groups (preg_16 is not included).
glucose_by_pregnancy=[
    group['Glucose']
    for group in (
        preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,
        preg_9,preg_10,preg_11,preg_12,preg_13,preg_14,preg_15,preg_17,
    )
]
print(st.kruskal(*glucose_by_pregnancy))

Since pvalue>0.05 the test statistic falls in the non-rejection region.<br>
<b> Conclusion: Average glucose levels does not change with pregnancies if a woman is diabetic</b><br>

In [None]:
# Glucose against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='Glucose',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies on Glucose')
plt.show()

<p style='color:blue'>Generally Glucose level is higher for diabetic women but there is no such visible pattern for pregnancy

### <center style='color:purple'>Skin Thickness vs Outcome

**Effect of outcome on Skin Thickness**<br>
**H0: $(Mu_{st})_{non-diabetic}-(Mu_{st})_{diabetic} \geq 0$**<br>
**H1: $(Mu_{st})_{non-diabetic}-(Mu_{st})_{diabetic}<0$**

In [None]:
# Skin thickness of diabetic vs non-diabetic women.
skin_diabetic=positive_df['SkinThickness']
skin_non_diabetic=negative_df['SkinThickness']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro test: ",st.shapiro(skin_diabetic)[1],st.shapiro(skin_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(skin_non_diabetic,skin_diabetic,alternative='less'))

**Result of Shapiro test**<br>
pvalue for both datasets<0.05. Both datasets are not normally distributed. We shall then continue on with MannWhitneyU test<br>
**Result of MannWhitneyU test**<br>
pvalue <0.05. Test Statistic falls in the rejection region. Thus we reject H0.<br>
***We can now conclude that Skin Thickness for diabetic women is greater than that of non-diabetic women.***

In [None]:
# Letter-value (boxen) plot of SkinThickness for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='SkinThickness')
plt.title('Outcome vs. Skin Thickness')
plt.show()

**Threshold value for skin thickness**<br>
**H0: $(Mu)_{skinthickness} \leq 35$** <br>
**H1: $(Mu)_{skinthickness} > 35$**

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's SkinThickness against mu.
mu=35
# Differences from the hypothesised value; the test is run on these.
sth=[value-mu for value in positive_df['SkinThickness']]
# alternative='greater': H1 is that the differences are shifted above zero.
print(st.wilcoxon(sth,alternative='greater'))
       

From the above test we can see that skin thickness >35 for women who have diabetes

**Check if skin thickness of diabetic women change with number of pregnancies**

### <center style='color:purple'>Skin Thickness vs Pregnancies

In [None]:
# Shapiro-Wilk test of SkinThickness within each pregnancy-count group of diabetic women.
# Stops at the first group whose p-value is below 0.05 or above 0.95 and prints it.
for i in range(18):
    # This section is about skin thickness, so test that column (not BloodPressure).
    p=positive_df[positive_df['Pregnancies']==i]['SkinThickness']
    # Shapiro-Wilk needs at least 3 observations; skip empty or tiny groups.
    if len(p)<3:
        continue
    shapiro_result=st.shapiro(p)
    if shapiro_result[1]>0.95 or shapiro_result[1]<0.05:
        print(i)
        print(shapiro_result)
        break

pvalue of pregnancy 5 dataset<0.05. Thus at least 1 dataset is not normally distributed.<br>
<b>Kruskal Wallis test<br>
    Hypothesis:<br>
    H0: Average skin thickness is same for all pregnancies in diabetic women<br>
    H1: Average skin thickness is different for at least 1 pregnancy in diabetic women.</b>


In [None]:
# Kruskal-Wallis test of SkinThickness across the pregnancy-count groups (preg_16 is not included).
skin_by_pregnancy=[
    group['SkinThickness']
    for group in (
        preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,
        preg_9,preg_10,preg_11,preg_12,preg_13,preg_14,preg_15,preg_17,
    )
]
print(st.kruskal(*skin_by_pregnancy))

Since p value>0.05 the test statistic falls in the non-rejection region.<br>
We fail to reject H0.<br>
<b>Conclusion: Pregnancies does not change the skin thickness in diabetic women</b>

In [None]:
# SkinThickness against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='SkinThickness',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies on Skin Thickness')
plt.show()

<p style='color:blue'>Skin Thickness for diabetic women is greater than non-diabetic women but there is no clear pattern for number of pregnancies

### <center style='color:purple'>Insulin vs Outcome

**H0: $(Mu_{insulin})_{diabetic}-(Mu_{insulin})_{non-diabetic} \geq 0$**<br>
**H1: $(Mu_{insulin})_{diabetic}-(Mu_{insulin})_{non-diabetic} < 0$**

In [None]:
# Insulin of diabetic vs non-diabetic women.
insulin_diabetic=positive_df['Insulin']
insulin_non_diabetic=negative_df['Insulin']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro test: ",st.shapiro(insulin_diabetic)[1],st.shapiro(insulin_non_diabetic)[1])
# One-sided Mann-Whitney U with the diabetic sample first:
# alternative='less' tests whether diabetic values tend to be lower.
print(st.mannwhitneyu(insulin_diabetic,insulin_non_diabetic,alternative='less'))

**Result of Shapiro test**<br>
pvalue for both datasets<0.05. Both datasets are not normally distributed. We shall then continue on with MannWhitneyU test<br>
**Result of MannWhitneyU test**<br>
pvalue <0.05. Test Statistic falls in the rejection region. Thus we reject H0.<br>
***We can now conclude that Insulin for diabetic women is lesser than that of non-diabetic women.***

In [None]:
# Box plot of Insulin for each Outcome class.
sns.boxplot(data=diabetes_df,x='Outcome',y='Insulin')
plt.title('Outcome vs. Insulin levels')
plt.show()

<b>Hypothesis:<br>
    H0: $(Mu_{positive})_{insulin} \geq 95$<br>
    H1: $(Mu_{positive})_{insulin} < 95$

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's Insulin against mu.
mu=95
# Differences from the hypothesised value; the test is run on these.
insulin=[value-mu for value in positive_df['Insulin']]
# alternative='less': H1 is that the differences are shifted below zero.
print(st.wilcoxon(insulin,alternative='less'))
       

We can see that the pvalue<0.05 thus statistic falls in the rejection region.<br>
We shall reject H0.<br>
<b>Thus women with diabetes tend to have an Insulin level<95</b>

### <center style='color:purple'>Insulin vs Pregnancies</center>
To further check if insulin value is dependant on pregnancies for diabetic women we shall conduct a one-way ANOVA test/ Kruskal Wallis test

In [None]:
# Shapiro-Wilk test of Insulin within each pregnancy-count group of diabetic women.
# Stops at the first group whose p-value is below 0.05 or above 0.95 and prints it.
for n_preg in range(18):
    group_values=positive_df.loc[positive_df['Pregnancies']==n_preg,'Insulin']
    shapiro_result=st.shapiro(group_values)
    p_value=shapiro_result[1]
    if p_value>0.95 or p_value<0.05:
        print(n_preg)
        print(shapiro_result)
        break

Since atleast 1 of the shapiro test fails we shall proceed with Kruskal Wallis test:<br>
<b>Hypothesis:<br>
    H0: Average Insulin level remains same for all pregnancies<br>
    H1: Average insulin values is different for atleast 1 pregnancies</b><br>

In [None]:
# Kruskal-Wallis test of Insulin across the pregnancy-count groups (preg_16 is not included).
insulin_by_pregnancy=[
    group['Insulin']
    for group in (
        preg_0,preg_1,preg_2,preg_3,preg_4,preg_5,preg_6,preg_7,preg_8,
        preg_9,preg_10,preg_11,preg_12,preg_13,preg_14,preg_15,preg_17,
    )
]
print(st.kruskal(*insulin_by_pregnancy))

pvalue<0.05. Test statistic falls in the rejection region. We shall reject H0.<br>
<b>Conclusion: Average insulin levels depend on number of Pregnancies</b>

In [None]:
# Conover post-hoc test on Insulin of diabetic women, grouped by Pregnancies.
# The result is displayed as a table of pairwise p-values between the groups.
df_i=scikit_posthocs.posthoc_conover(a=positive_df,val_col='Insulin',group_col='Pregnancies')
df_i

In [None]:
# Collect the pregnancy-count pairs whose Conover p-value is < 0.05 or > 0.95
# (p-values equal to 1 are skipped), keeping each unordered pair only once.
t=[]
seen_pairs=set()
for col_label in df_i.columns:
    # .items() yields the row *label* rather than a running position, so the pair stays
    # correct even when the group labels are not a gap-free 0..n-1 sequence.
    for row_label,p_value in df_i[col_label].items():
        if (p_value<0.05 or p_value>0.95) and p_value!=1:
            pair=frozenset((col_label,row_label))
            # Track seen pairs in a set instead of removing from the list while iterating over it.
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                t.append((col_label,row_label))
print(t)

The above results give us the pregnancies that do not have the same average Insulin.<br> From the graph below we cannot draw a clear conclusion.

In [None]:
# Insulin against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(x='Pregnancies',y='Insulin',data=diabetes_df,hue='Outcome',style='Outcome',markers=True)
# Tick every pregnancy count 0..17, matching the other "vs Pregnancies" plots
# (the ticks previously stopped at 14).
plt.xticks(list(range(18)))
plt.title('Effect of Pregnancies on Insulin')
plt.show()

### <center style='color:purple'>Age vs Outcome

**Effect of outcome on Age**<br>
**H0: $(Mu_{age})_{diabetic}-(Mu_{age})_{non-diabetic} \leq 0$**<br>
**H1: $(Mu_{age})_{diabetic}-(Mu_{age})_{non-diabetic}>0$**

In [None]:
# Age of diabetic vs non-diabetic women.
age_diabetic=positive_df['Age']
age_non_diabetic=negative_df['Age']

# Shapiro-Wilk p-values (diabetic, non-diabetic) as a normality check.
print("Shapiro test: ",st.shapiro(age_diabetic)[1],st.shapiro(age_non_diabetic)[1])
# One-sided Mann-Whitney U: alternative='less' tests whether non-diabetic values tend to be lower.
print(st.mannwhitneyu(age_non_diabetic,age_diabetic,alternative='less'))

**Result of Shapiro test**<br>
pvalue for both datasets<0.05. Both datasets are not normally distributed. We shall then continue on with MannWhitneyU test<br>
**Result of MannWhitneyU test**<br>
pvalue <0.05. Test Statistic falls in the rejection region. Thus we reject H0.<br>
***We can now conclude that age of diabetic women is greater than that of non-diabetic women.***

In [None]:
# Letter-value (boxen) plot of Age for each Outcome class.
sns.boxenplot(data=diabetes_df,x='Outcome',y='Age')
plt.title('Outcome vs. Age')
plt.show()

In [None]:
# One-sample Wilcoxon signed-rank test on diabetic women's Age against mu.
mu=40
# Differences from the hypothesised value; the test is run on these.
age=[value-mu for value in positive_df['Age']]
# alternative='greater': H1 is that the differences are shifted above zero.
print(st.wilcoxon(age,alternative='greater'))

From the above test we can conclude that women above 40 years of age are at a higher risk of getting diabetes.

In [None]:
# Age against number of pregnancies, one line per Outcome class.
plt.figure(figsize=(10,10))
ax=sns.lineplot(data=diabetes_df,x='Pregnancies',y='Age',hue='Outcome',style='Outcome',markers=True)
plt.xticks(list(range(18)))  # one tick per pregnancy count 0..17
plt.title('Effect of Pregnancies on Age')
plt.show()

## <p style='color:purple'>Is number of Pregnancies and Outcome related?

**H0: Pregnancies and Outcome are independent of each other**<br>
**H1: Pregnancies and Outcome are dependent on each other**

In [None]:
# Contingency table: rows are pregnancy counts, columns are Outcome classes, cells are row counts.
preg_out=pd.crosstab(index=diabetes_df['Pregnancies'],columns=diabetes_df['Outcome'])
preg_out

The counts for pregnancies >10 are all <5. For the Chi-Square test it is necessary that the values be $\geq 5$, so only pregnancies up to 10 are kept.

In [None]:
# Keep only the rows for pregnancy counts up to and including 10.
keep_rows=preg_out.index<=10
preg_out=preg_out.loc[keep_rows]
preg_out

In [None]:
# Degrees of freedom and observed counts for the Chi-square test of independence.
# Rows and columns are read from the table itself so they always match its current shape
# instead of relying on hard-coded 11 and 2.
r,c=preg_out.shape
df=(r-1)*(c-1)
obs_val=preg_out.values
obs_val

In [None]:
# Critical value of the chi-square distribution at alpha = 0.05 for `df` degrees of freedom.
chi2_crit=st.chi2.isf(0.05,df)
print(chi2_crit)

# Chi-square test of independence on the observed table, without Yates' correction.
# The first two returned values are the statistic and the p-value.
chi_statistic,chi_pvalue=st.chi2_contingency(obs_val,correction=False)[:2]
print("Chi_statistic: {}\nPvalue: {}".format(chi_statistic,chi_pvalue))

Chi critical value< chi statistic value.<br>
Also, pvalue<0.05.<br>
***Thus we can conclude that pregnancies and outcome are dependent on each other.***

In [None]:
# Number of women per pregnancy count, split by Outcome.
# The column is passed as x= because recent seaborn releases accept only `data` positionally.
sns.countplot(x='Pregnancies',hue='Outcome',data=diabetes_df)
plt.title('Number of women per pregnancy count, by Outcome')
plt.show()

# <center style='color:brown'>Conclusions Drawn</center>

1. **Glucose:** Rise in glucose levels could lead to increasing risk of diabetes. However pregnancy does not impact the glucose levels.
2. **Blood Pressure:** High blood pressure indicates an increased risk of Diabetes. However, number of pregnancies that a woman has had does not have an impact on her blood pressure.
3. **BMI:** Increasing BMI could increase the risk of diabetes. Pregnancies could cause a change in BMI leading to change in risk of getting diabetes. Surprisingly, women who have not been pregnant even once tend to have a higher BMI.
4. **Skin Thickness:** Increase in thickness of the triceps skin fold can also cause increase in risk of getting diabetes. This is a means of measuring the body fat percentage. In the same case as BMI, increase in body fat percentage could lead to higher risk of getting diabetes. Number of pregnancies does not impact the skin thickness of a woman
5. **Diabetes Pedigree Function:** This is a genetic function that gives us an idea of our risk of getting diabetes. Naturally, higher the value higher the risk. 
6. **Insulin:** Decrease in insulin level could again lead to increasing risk of contracting diabetes. Insulin level changes with the number of pregnancies a woman has had.
7. **Age:** Naturally number of pregnancies increase with age. Also older women are at a higher risk of getting diabetes.