In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt # plotting
from math import * # sqrt() etc
import seaborn as sns
# with %matplotlib inline you turn on the immediate display.
# %matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Gather Data

In [None]:
data_dictionary_loc = '../input/CAB_data_dictionary.xlsx'
data_dic = pd.read_excel(data_dictionary_loc, dtype = object)
data_dic['File Content Description'] #well, how to import the correct column width? can be viewed using other programs
data_dic

In [None]:
data_u_pradesh = pd.read_csv('../input/CAB_09_UP.csv', low_memory = False) 
#needed to specify low_memory because columns (14, 43 had mixed types)
data_u_pradesh.head()

# Cleaning data

Subset of adults has 299 570 individuals

In [None]:
data = data_u_pradesh[(data_u_pradesh['age_code']=='Y')&(data_u_pradesh['age']>=18)]
len(data)

Original data had -1 for missing values

In [None]:
data = data.replace([-1, '-1'], np.nan)

Dropping columns that are only applicable to children under 5 (and under 3) years old

In [None]:
cols_under5 = ['illness_type', 'illness_duration', 'treatment_type']
cols_under3 = ['first_breast_feeding', 'is_cur_breast_feeding', 'day_or_month_for_breast_feeding_', 'day_or_month_for_breast_feeding', 'water_month', 'ani_milk_month', 'semisolid_month_or_day', 'solid_month', 'vegetables_month_or_day']

In [None]:
data = data.drop(cols_under5, axis = 1)
data = data.drop(cols_under3, axis = 1)

Dropping unnecessary features
 - 'state_code'
 - 'PSU_ID' - This is a seven digit number to uniquely identify each record.
 - 'ahs_house_unit' - House Number
 - 'house_hold_no' - Household Number
 - 'record_code_iodine_reason' - Why was iodine testing refused
 - 'sl_no' - Each record of the Household has a serial no. 
 - 'usual_residence' - Whether the member usually lives here
 - 'usual_residence_reason' - Reason for member not being usual resident
 - 'identification_code' - Each member of a PSU is assigned a unique number
 - 'v54' ?

In [None]:
data = data.drop(['state_code', 'psu_id', 'ahs_house_unit', 'house_hold_no', 'record_code_iodine_reason', 'sl_no', 'usual_residance', 'usual_residance_reason', 'identification_code', 'v54'], axis = 1)

From data dictionary:
- 'rural_urban' - Rural-1; Urban-2
- 'stratum' - 1 or 2 when 'rural_urban'=1, 0 when 'rural_urban'=2

dropping feature 'rural_urban', since 'stratum' contains the same information

I guess 'stratum' feature values:
- 0 - urban
- 1 - rural  
- 2 - very rural?

not specified in dictionary

In [None]:
data = data.drop('rural_urban', axis = 1)

## Age related
From data dictionary:
- 'age_code' - unit of recording age
- 'age'
- 'date_of_birth' - DD
- 'month_of_birth' - MM
- 'year_of_birth' - YYYY

Dropping feature age_code(values: Y, M, D for years, months, days), since age always recorded in years for adults

In [None]:
display(np.unique(data['age_code']))
data = data.drop('age_code', axis = 1)

In [None]:
# Histogram of the recorded ages (missing values excluded).
plt.hist(data.age.dropna(), bins = 50)
plt.title('Age')
# plt.show must be *called*; a bare `plt.show` only echoes the function object.
plt.show()

## Iodine
From data dictionary:
- 'test_salt_iodine' - Salt used by the Household has been tested for Iodine content[Recorded as Parts Per Million(PPM)]
- 'record_code_iodine' - No iodine – 1; Less than 15 PPM – 2; More than or equal to 15 PPM – 3; No salt in Household – 4; Salt not tested  – 5

In [None]:
# Frequency of each iodine test outcome code (the top-level pd.value_counts helper is deprecated).
data['record_code_iodine'].value_counts()

## Height/weight
From data dictionary:
- 'weight_measured' - Measured-1;  Member - not present-2, Refused-3, Other-4
- 'weight_in_kg' - outcome
- 'length_height_measured' - Measured-1;  Member not present-2, Refused-3, Other-4
- 'length_height_code' - L- Length, H-Height
- 'length_height_cm' - outcome

Dropping the unnecessary status columns: the weight/height column is already NA whenever the measurement was not conducted

In [None]:
data = data.drop(['weight_measured', 'length_height_measured', 'length_height_code'], axis = 1)

In [None]:
# Only the column labels are renamed; the row index is left as it is
# (the previous `index=str` argument converted every row label to a string).
data = data.rename(columns={"weight_in_kg": "weight", "length_height_cm": "height"})

In [None]:
# Box plot of the raw weights, before any outlier handling.
plt.boxplot(data['weight'].dropna())
plt.title('Weight with outliers')
plt.show()

In [None]:
# Box plot of the raw heights, before any outlier handling.
plt.boxplot(data['height'].dropna())
plt.title('Height with outliers')
plt.show()

In [None]:
# exclude any measurements where difference from median is larger than 3 standard deviations
def remove_outliers(data, feature, n_std=3):
    """Blank out values of ``feature`` that lie too far from the column median.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to clean. It is not modified.
    feature : str
        Name of the numeric column to filter.
    n_std : float, default 3
        A value is kept only when its absolute distance from the median is
        strictly smaller than ``n_std`` population standard deviations.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column ``feature``, same index as ``data``) in
        which every rejected value has been replaced by NaN.

    The number of non-missing values that were rejected is printed, so the
    reported count always matches what is actually blanked out.
    """
    values = data[feature]
    # Population standard deviation (ddof=0) and median, both ignoring NaN.
    spread = values.std(ddof=0)
    centre = values.median()
    # Build the keep-mask once and reuse it for both the report and the filter.
    keep = (values - centre).abs() < n_std * spread
    discarded = int((values.notna() & ~keep).sum())
    print("number of discarded measurements")
    print(discarded)
    return data[[feature]].where(keep, other = np.nan)

In [None]:
data['height'] = remove_outliers(data, 'height')

Removing weight outliers. NA for anything under 20kg

In [None]:
# Weights under 20 kg are replaced by NA. One mask drives both the report and
# the filter, so a weight of exactly 20 kg is kept and the printed count is
# exactly the number of values blanked out.
too_light = data['weight'] < 20
print('number of discarded measurements')
display(int(too_light.sum()))
data['weight'] = data['weight'].where(~too_light, other=np.nan)

In [None]:
# Box plot of the weights after the low-weight filter.
plt.boxplot(data['weight'].dropna())
plt.title('Weight without outliers')
plt.show()

In [None]:
# Box plot of the heights after outlier removal.
plt.boxplot(data['height'].dropna())
plt.title('Height without outliers')
plt.show()

Body mass index: weight(kg)/(height(m) * height(m))

In [None]:
data['bmi'] = data['weight']/(data['height']/100)**2

In [None]:
plt.hist(data['weight'].dropna(), bins = 50)
plt.title('Weight without outliers')
plt.show()

In [None]:
plt.hist(data['height'].dropna(), bins = 50)
plt.title('Height without outliers')
plt.show()

A lot of individuals with 130, 140, 150cm height

In [None]:
plt.hist(data['bmi'].dropna(), bins = 50)
plt.title('BMI')
plt.show()

Data cleaning steps for height/weight related data: 
- Discarded any height measurements where difference from median was further than 3 standard deviations. Looking at distribution of height/weight as normally distributed.
- Discarded any weight measurements under 20kg
- Calculated BMI

Discarded ~800 values for height, ~460 values for weight. Out of ~200 000

## Pulse, blood pressure(heart disease)
From data dictionary:
- 'bp_systolic'
- 'bp_systolic_2_reading'
- 'bp_diastolic'
- 'bp_diastolic_2reading'
- 'pulse_rate',
- 'pulse_rate_2_reading'

In [None]:
# distribution of measurement differences
#plt.hist((data['bp_systolic'] - data['bp_systolic_2_reading']).dropna(), bins = 50)
#plt.hist((data['pulse_rate'] - data['pulse_rate_2_reading']).dropna(), bins = 50)
#plt.hist((data['bp_diastolic'] - data['bp_diastolic_2reading']).dropna(), bins = 50)

In [None]:
# for features where two measurements were taken, exclude any where difference between measurements is larger than 3 standard deviations
def remove_outliers_difference(data, col1, col2, n_std=3):
    """Blank out pairs of readings that disagree with each other too strongly.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both reading columns. It is not modified.
    col1, col2 : str
        Names of the first and second reading.
    n_std : float, default 3
        A pair is kept only when the absolute difference of the two readings
        is strictly smaller than ``n_std`` sample standard deviations of that
        difference.

    Returns
    -------
    pandas.DataFrame
        The two columns ``[col1, col2]`` (same index as ``data``) with both
        readings set to NaN for every rejected pair. Pairs in which either
        reading is missing have no defined difference and are blanked as well.

    The number of complete pairs that were rejected is printed.
    """
    difference = data[col1] - data[col2]
    # Sample standard deviation (pandas default ddof=1), as Series.var() uses.
    limit = n_std * difference.std()
    # Build the keep-mask once and reuse it for both the report and the filter.
    keep = difference.abs() < limit
    # how many measurements were excluded
    discarded = int((difference.notna() & ~keep).sum())
    print('number of discarded measurements')
    print(discarded)
    # keep original values if difference of two measurements is less than n_std standard deviations. NA otherwise
    return data[[col1, col2]].where(keep, other = np.nan)

In [None]:
data[['bp_systolic', 'bp_systolic_2_reading']] = remove_outliers_difference(data, 'bp_systolic', 'bp_systolic_2_reading')
data[['bp_diastolic', 'bp_diastolic_2reading']] = remove_outliers_difference(data, 'bp_diastolic', 'bp_diastolic_2reading')
data[['pulse_rate', 'pulse_rate_2_reading']] = remove_outliers_difference(data, 'pulse_rate', 'pulse_rate_2_reading')

Now that outliers have been removed, aggregate remaining data by finding mean between two readings

In [None]:
# aggregate two reading by finding mean
def aggregate_readings(data, col1, col2):
    """Replace two repeated readings by their mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both reading columns. It is not modified.
    col1, col2 : str
        Names of the first and second reading.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which ``col1`` holds ``(col1 + col2) / 2`` and
        ``col2`` has been removed. The mean is NaN when either reading is NaN.
    """
    # Vectorised column arithmetic instead of a Python-level apply over every row.
    mean_reading = (data[col1] + data[col2]) / 2
    # assign() returns a new frame, so the caller's frame is left untouched.
    return data.assign(**{col1: mean_reading}).drop(columns=col2)

In [None]:
data = aggregate_readings(data, 'bp_systolic', 'bp_systolic_2_reading')
data = aggregate_readings(data, 'bp_diastolic', 'bp_diastolic_2reading')
data = aggregate_readings(data, 'pulse_rate', 'pulse_rate_2_reading')

Systolic - beating, diastolic - resting blood pressure. Likely input/measurement error where systolic < diastolic

In [None]:
# retain original values where resting blood pressure lower than beating. NA otherwise 
data[['bp_diastolic', 'bp_systolic']] = data[['bp_diastolic', 'bp_systolic']].where(data.bp_diastolic < data.bp_systolic, other = np.nan)

Data cleaning steps for heart disease related data: 
- Discarded any pair of readings whose absolute difference was larger than 3 standard deviations of the differences (treating the measurement differences as normally distributed).
- Aggregated two measurements by finding mean
- Discarded any where diastolic pressure was higher than systolic

Lost less than 5% of values for each feature

## Haemoglobin(anemia)
From data dictionary:
- 'haemoglobin_test' - Consent for Haemoglobin test (Yes-1; No-2)
- 'haemoglobin'- Status of Haemoglobin Test (Measured-1; Member not present-2; Refused-3, Other-4)
- 'haemoglobin_level' - Outcome of Haemoglobin Level (Hb) Test (in percentage gms)  

In [None]:
data = data.drop(['haemoglobin_test', 'haemoglobin'], axis = 1)

In [None]:
# Histogram of the measured haemoglobin levels (missing values excluded).
plt.hist(data['haemoglobin_level'].dropna(), bins=50)
plt.title('Blood haemoglobin')
plt.show()

## Blood sugar(diabetes)
From data dictionary:
- 'diabetes_test' - consent for testing
- 'fasting_blood_glucose' - Measured-1; Member not present-2; Refused-3; Other-4
- 'fasting_blood_glucose_mg_dl' - outcome of test

In [None]:
data = data.drop(['diabetes_test', 'fasting_blood_glucose'], axis = 1)

In [None]:
# Only the column label is renamed; the row index is left as it is
# (the previous `index = str` argument converted every row label to a string).
data = data.rename(columns = {'fasting_blood_glucose_mg_dl' : 'glucose'})

In [None]:
# Histogram of fasting blood glucose (missing values excluded).
plt.hist(data['glucose'].dropna(), bins=50)
plt.title('Blood sugar')
plt.show()

In [None]:
# Box plot of fasting blood glucose (missing values excluded).
plt.boxplot(data['glucose'].dropna())
plt.title('Blood sugar')
plt.show()

In [None]:
data['glucose'] = remove_outliers(data,'glucose')

## Features only applicable to women
From data dictionary:
- 'marital_status' - Never married=1,Married but Gauna not performed=2, Married and Gauna perfomed=3, Remarried=4,Widow=5, Divorced=6, Separated=7, Not stated=8
- 'gauna_perfor_not_perfor' - Pregnant-1; Lactating-2; Non-pregnant or Non-lactating-3
- 'duration_pregnanacy' - Duration of pregnancy/lactation (in months)

In [None]:
cols_women = ['marital_status', 'gauna_perfor_not_perfor', 'duration_pregnanacy']

placing NA where marital status 'not stated' 

In [None]:
data['marital_status'] = data['marital_status'].where(~(data['marital_status']==8.0), other = np.nan)

In [None]:
# input errors have to be dealt with
plt.boxplot(data['duration_pregnanacy'].dropna())
plt.show()

## Correlations
From 53 initial features to 21

In [None]:
corr=data.corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'glucose']]
corr.where(abs(corr)>0.1)

Removing features where there's no correlation

In [None]:
data_correlated = data.drop(['district_code', 'stratum', 'test_salt_iodine', 'record_code_iodine', 'date_of_birth', 'month_of_birth', 'duration_pregnanacy'], axis = 1)
corr = data_correlated.corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'glucose']]
corr.where(abs(corr)>0.1)

## Dummies code from Sophie 

Dummies should be drawn by all features that are encoded numerically and which are actually categorical. From the 21 remaining, these are

district_code
stratum
record_code_iodine. Here, 1, 2 and 3 are ordered, while 4 should be 0 (no salt in household) and 5 should be replaced by NaN (no information).
sex should be replaced by male and female as categorical features.
marital status
gauna_perfor_not_perfor: 1 - pregnant, 2 - lactating, 3 - neither. Better to rename to "pregnant" and "lactating" after one-hot encoding

In [None]:
dummieable =['district_code', 'stratum', 'record_code_iodine', 'sex', 'marital_status', 'gauna_perfor_not_perfor']
dummiedata = [data]
for dum in dummieable: 
    dummiedata.append(pd.get_dummies(data[dum], prefix = dum))
dummied_data = pd.concat(dummiedata, axis = 1)

In [None]:
print("Number of features", len(dummied_data.columns))
dummied_data.columns

Finally remove the old columns, rename the new ones and set all not given data NaN.

In [None]:
dummied_data = dummied_data.drop(dummieable, axis =1)
print("Number of features", len(dummied_data.columns))
dummied_data.columns

In [None]:
rename_dict = {'marital_status_1.0': 'never_married', 'marital_status_2.0': 'married_no_gauna',
               'marital_status_3.0': 'married_and_gauna',
       'marital_status_4.0': 'remarried', 'marital_status_5.0': 'widow', 'marital_status_6.0': 'divorced',
       'marital_status_7.0': 'separated', 'gauna_perfor_not_perfor_1.0': 'pregnant',
       'gauna_perfor_not_perfor_2.0': 'lactating', 'gauna_perfor_not_perfor_3.0': 'non_pregnant_non_lactating',
        'sex_1': 'male', 'sex_2': 'female'}
dummied_data = dummied_data.rename(rename_dict, axis = 'columns').drop(['female'], axis = 1)
dummied_data.columns[70:]

# Patterns in diabetes
- First trying to see how blood sugar is distributed among general population
    - among subpopulations? men, women, different counties, strata?
- Looking to predict gestational diabetes
    - pregnant women have a higher risk of developing diabetes
- Creating new categorical column for blood sugar. Where "1" stands for (pre)diabetes
    - 1 >=100mg/dl
    - 0 <100mg/dl

The data is collected as fasting blood sugar, for which the normal range is <100mg/dl

In [None]:
ft_without_district = [x for x in dummied_data.columns if not x.startswith('district')]
ft_without_district.remove('glucose')
print("These", len(ft_without_district),"features are going to be compared to diabetes: \n")
print(ft_without_district)

In [None]:
# Binary label: 1 when fasting glucose is at least 100 mg/dl, otherwise 0.
# NOTE: a missing glucose value compares False and is therefore labelled 0,
# exactly as before; rows without a glucose measurement are not distinguished
# from rows with a normal measurement by this column.
dummied_data['diabetes'] = (dummied_data['glucose'] >= 100).astype(int)
data['diabetes'] = (data['glucose'] >= 100).astype(int)

In [None]:
dummied_data.diabetes.value_counts()

77893 individuals have a high fasting blood sugar

Looking at how each feature correlates to blood sugar values. Using categorical values for depicting blood sugar. Category "1" stands for diabetic or prediabetic based on https://www.mayoclinic.org/diseases-conditions/diabetes/diagnosis-treatment/drc-20371451

In [None]:
# make pairplots of each feature and blood glucose, four features per figure
# The complete-case frame and the feature list are computed once and reused
# for all five figures (features 0-19 in column order).
complete_rows = data.dropna()
feature_cols = data.columns.drop(['diabetes', 'glucose'])
for start in range(0, 20, 4):
    sns.pairplot(complete_rows, x_vars = feature_cols[start:start + 4], y_vars = ['glucose'])

Based on medical knowledge people with higher weight, bmi and age should have higher blood sugar. These plots don't really depict that, hard to make any judgement.

In [None]:
# getting relative frequency of high blood sugar
def diabetes_relative_freq(feature):
    """Bar-plot the share of rows flagged ``diabetes == 1`` per value of ``feature``.

    Reads the notebook-level frame ``data``. ``diabetes`` is a 0/1 column, so
    its mean inside each group equals the relative frequency of high blood
    sugar in that group; a group without any flagged row simply yields 0.

    Parameters
    ----------
    feature : str
        Name of the column of ``data`` to group by.

    Returns
    -------
    None
        The figure is drawn and shown as a side effect.
    """
    high = data.groupby(feature)['diabetes'].mean()
    positions = np.arange(1, len(high) + 1)
    plt.bar(positions, high.values)
    # Label every bar with the group value it belongs to instead of 1..n.
    plt.xticks(positions, high.index)
    plt.ylabel("Relative freq of high blood sugar")
    plt.title(feature)
    plt.show()

### Looking at relative frequency of high blood sugar by different features to find features that might be good indicators of diabetes risk. 

In [None]:
plt.rcParams['figure.figsize'] = [35, 15]
diabetes_relative_freq("district_code")

Relative frequency of high blood sugar by district. Number of individuals with high blood sugar is normalized by total number of individuals in district. This shows that frequency of high blood sugar might be as low as 10% or as high as 45% depending on the district. So district code might be a good indicator of diabetes risk

In [None]:
plt.rcParams['figure.figsize'] = [6, 6]
diabetes_relative_freq("stratum")

Urban and rural populations have similar frequency of high blood sugar. Not really informative 

In [None]:
plt.rcParams['figure.figsize'] = [6, 6]
diabetes_relative_freq('sex')

Same for relative frequency in men/women

## Trying variance filtering techniques

In [None]:
ft_with_district = [x for x in dummied_data.columns if x.startswith('district')]

fig = plt.figure(figsize = (20, 50))

for counter, d in enumerate(ft_with_district): 
    df= dummied_data.loc[dummied_data[d] == 1].glucose.dropna()
    axes= fig.add_subplot(14, 5, 1+ counter)
    axes.hist(df, density = True, bins = 20)
    axes.set_title(d + ' ' +str(df.shape[0]))
plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
plt.show()

Blood glucose seems to be quite similarly distributed in all districts. At least there are no districts that remarkably stand out.   

In [None]:
# Districts whose glucose value-count variance is below this are discarded.
DISTRICT_VAR_THRESHOLD = 2000

# Variance of the glucose value-count profile of every district dummy,
# computed once and reused for both the report and the filter.
district_var = {}
for dist in ft_with_district:
    df = dummied_data.loc[dummied_data[dist] == 1].glucose.dropna()
    district_var[dist] = np.var(df.value_counts().sort_index())
var = list(district_var.values())
print("The lowest glucose distribution variance: ", np.min(var))
print("The highest glucose distribution variance ", np.max(var))

ft_district_droppable = [name for name, variance in district_var.items()
                         if variance < DISTRICT_VAR_THRESHOLD]
print("The following districts will be discarded: ", ft_district_droppable)

In [None]:
print(dummied_data.shape[1], "features before")
dummied_data.drop(ft_district_droppable, axis = 1, inplace = True)
print(dummied_data.shape[1], "features left")

In [None]:
# encode a survey date string as one sortable integer
def parse(string):
    """Convert a date string laid out as ``DD?MM?YYYY`` into the integer ``YYYYMMDD``.

    Characters 0-1 are read as the day, 3-4 as the month and everything from
    position 6 onwards as the year; the separator characters at positions 2
    and 5 are ignored.
    """
    year = int(string[6:])
    month = int(string[3:5])
    day = int(string[:2])
    return year * 10000 + month * 100 + day
dummied_data['year_month_day_survey'] = dummied_data.date_survey.apply(parse)
display(dummied_data[['date_survey', 'year_month_day_survey']].head(10))

ft_numeric = ['year_month_day_survey','test_salt_iodine', 'age', 'date_of_birth', 'month_of_birth', 'year_of_birth', 'weight', 
              'height', 'haemoglobin_level', 'bp_systolic', 'bp_diastolic', 'duration_pregnanacy',
              'bmi', 'pulse_rate']

dummied_data.drop('date_survey', axis = 1, inplace = True);

Lets now handle the other categorical features.

In [None]:
# female is the complement of the male dummy: 1 where male == 0, otherwise 0 (vectorised).
dummied_data['female'] = (dummied_data['male'] == 0).astype(int)

In [None]:
ft_cat_no_distr = [x for x in dummied_data.columns if x not in ft_numeric + ft_with_district + ['glucose'] + ['diabetes']]
fig = plt.figure(figsize = (20, 50))
std_dict = {}

for counter, d in enumerate(ft_cat_no_distr): 
    df= dummied_data.loc[dummied_data[d] == 1].glucose.dropna()
    std_dict[d] = (np.std(df.value_counts().sort_index()/np.nanmean(dummied_data.glucose)))
    axes= fig.add_subplot(14, 5, 1+ counter)
    axes.hist(df, density = True, bins = 20)
    axes.set_title(d + ' ' +str(df.shape[0]))
    
plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
plt.show()

Blood sugar is very similarly distributed across all features. There are some differences in features only applicable to women (remarried, separated, widow, divorced).

In [None]:
std_dict

In [None]:
dummied_data['stratum_1_2'] = dummied_data['stratum_1'] + dummied_data.stratum_2
print(np.nanstd(dummied_data.stratum_1_2.value_counts().sort_index())/np.nanmean(dummied_data.pulse_rate))
dummied_data.drop(['stratum_1', 'stratum_2'], axis = 1, inplace = True)

In [None]:
# Remaining district dummies; the derived 'district_signi' column is excluded
# so that re-running this cell does not treat it as a district.
ft_with_district = [x for x in dummied_data.columns
                    if x.startswith('district') and x != 'district_signi']
glucose_mean = np.nanmean(dummied_data.glucose)
std_dict = {}
# NOTE(review): the spread is taken over pulse_rate value counts but scaled by
# the mean glucose; confirm whether glucose was intended here.
for f in ft_with_district: 
    df = dummied_data.loc[dummied_data[f] == 1].pulse_rate.dropna()
    std_dict[f] = np.std(df.value_counts().sort_index()/glucose_mean)
# Row-wise sum (axis=1): 1 when the row belongs to one of the remaining
# districts. Summing with axis=0 produced per-column totals indexed by column
# name, which do not align with the rows and filled the column with NaN.
dummied_data['district_signi'] = dummied_data[ft_with_district].sum(axis = 1)
# Use the rows flagged by district_signi instead of whatever `df` the loop left behind.
df = dummied_data.loc[dummied_data['district_signi'] == 1].pulse_rate.dropna()
std_dict['district_signi'] = np.std(df.value_counts().sort_index()/glucose_mean)
std_dict

In [None]:
# Threshold = the value three quarters of the way through the sorted spreads;
# district dummies strictly below it are removed from std_dict.
stds = sorted(std_dict.values())
std_treshold = stds[int(len(stds) * 3 / 4)]
for f in ft_with_district: 
    if std_dict[f] < std_treshold: 
        del std_dict[f]

In [None]:
dummied_data.drop([x for x in dummied_data[ft_with_district].columns if x not in std_dict.keys()], axis = 1, inplace= True)
print(dummied_data.shape[1], "features left")

In [None]:
#only apply on numerical features, as categorical ones have too many zeros and might be discarded

dummied_data_numeric = dummied_data[ft_numeric]
#centralize data
dummied_data_numeric_cent = dummied_data_numeric - dummied_data_numeric.mean()
#normalize by mean to get relative information of the feature
d = dummied_data_numeric_cent.apply(np.nanstd, axis= 0) / dummied_data_numeric.mean()
print("This is the normalized standard deviation: ")
display(d)
# Keep the features whose normalized standard deviation exceeds 0.11;
# a single boolean selection replaces the where()/np.where() round trip.
ft_numeric_selected = d[d > 0.11].index.tolist()
print("The following features will be kept: ", )
display(ft_numeric_selected)     
print("The following features will be discarded: ")
ft_numeric_discarded = [x for x in ft_numeric if x not in ft_numeric_selected]
display(ft_numeric_discarded)

Actual age will be needed for filtering, this column will later be dropped. Only centralized age will be used in modeling. 

In [None]:
dummied_data['age_orig'] = dummied_data['age']

In [None]:
dummied_data = dummied_data.drop(ft_numeric, axis = 1)
dummied_data = pd.concat([dummied_data, dummied_data_numeric_cent[ft_numeric_selected]], axis = 1)
print(dummied_data.shape[1], "features left")

# Modeling
Trying to predict categorical variable "diabetes". 
- 1 stands for blood glucose >= 100 mg/dl

## Ensemble methods

### Over 45year olds
- linear svm 68%
- Ensemble rf for men&women 59%. This took 2 hours since subset of data is so large. 

Now looking at men and women separately.

In [None]:
dummied_data.columns

In [None]:
dummied_data.drop('district_signi', axis = 1, inplace = True)

Preparing the separate subsets for men and women over 45 years old. For men dropping features only applicable to women.

In [None]:
# Columns described earlier in the notebook as applicable to women only:
# marital-status dummies, pregnancy/lactation dummies and pregnancy duration.
cols_women = ['never_married', 'married_no_gauna', 'married_and_gauna', 'remarried', 'widow', 'divorced', 'separated', 'pregnant', 'lactating', 'non_pregnant_non_lactating', 'duration_pregnanacy']
# where() keeps the frame's shape: rows with age_orig below 45 become all-NaN
# instead of being removed. age_orig is only needed for this filter.
data_over45 = dummied_data.where(dummied_data['age_orig']>=45)
data_over45.drop('age_orig', axis = 1, inplace = True)

# Rows failing the sex filter are blanked to NaN as well; the final dropna()
# (default how='any') removes them together with every row that has at least
# one missing value in the remaining columns.
men_over45 = data_over45.where(dummied_data.male == 1).drop(cols_women, axis = 1).dropna()
# NOTE(review): the women subset keeps the cols_women columns, so dropna() also
# discards every woman with a NaN in any of them (presumably e.g. pregnancy
# duration for non-pregnant women) - verify the resulting sample size.
women_over45 = data_over45.where(dummied_data.male == 0).dropna()

Predictions for men

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(men_over45.drop(['diabetes', 'glucose'], axis = 1).fillna(0),
                                                    men_over45.diabetes)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score as acc

from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest (and therefore the reported accuracy) is repeatable
# for a given train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc_param_grid = {'n_estimators': np.arange(160, 170, 1), 
                  'criterion': ['entropy', 'gini']}
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20, return_train_score=True)
rfc_grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the
# selected parameters explicitly before referring to them.
print("Best parameters found by the grid search: ", rfc_grid_search.best_params_)
print("Accuracy achieved by Random Forest with parameters above: ", 
     acc(y_test, rfc_grid_search.best_estimator_.predict(X_test)))

Predictions for women

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(women_over45.drop(['diabetes', 'glucose'], axis = 1).fillna(0),
                                                    women_over45.diabetes)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score as acc

from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest (and therefore the reported accuracy) is repeatable
# for a given train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc_param_grid = {'n_estimators': np.arange(160, 170, 1), 
                  'criterion': ['entropy', 'gini']}
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20, return_train_score=True)
rfc_grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the
# selected parameters explicitly before referring to them.
print("Best parameters found by the grid search: ", rfc_grid_search.best_params_)
print("Accuracy achieved by Random Forest with parameters above: ", 
     acc(y_test, rfc_grid_search.best_estimator_.predict(X_test)))

### Gestational diabetes
Trying to predict diabetes in pregnant women. This https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4673797/ paper suggest that a normal upper limit for fasting blood glucose in pregnant women is 92mg/dl. The upper limit is slightly higher for the general population. I have taken this into account when assigning the categorical feature "diabetes".

In [None]:
# Pregnant women with a glucose measurement. Rows without glucose are removed
# because the existing 'diabetes' label marks a missing glucose value as 0,
# which would feed unmeasured women to the model as negatives.
data_pregnant = dummied_data.loc[dummied_data.pregnant == 1].dropna(subset = ['glucose'])
# centralized age is used for modeling
data_pregnant = data_pregnant.drop(['age_orig'], axis = 1)
# The 'diabetes' column already present (threshold 100 mg/dl) is used as the target here.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_pregnant.drop(['diabetes', 'glucose'], axis = 1).fillna(0),
                                                    data_pregnant.diabetes)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score as acc

from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest (and therefore the reported accuracy) is repeatable
# for a given train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc_param_grid = {'n_estimators': np.arange(160, 170, 1), 
                  'criterion': ['entropy', 'gini']}
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20, return_train_score=True)
rfc_grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the
# selected parameters explicitly before referring to them.
print("Best parameters found by the grid search: ", rfc_grid_search.best_params_)
print("Accuracy achieved by Random Forest with parameters above: ", 
     acc(y_test, rfc_grid_search.best_estimator_.predict(X_test)))

Results when predicting with blood glucose normal upper limit 100mg/dl.

In [None]:
# Pregnant women with a glucose measurement; rows without glucose cannot be
# labelled and are removed instead of being counted as non-diabetic.
data_pregnant = dummied_data.loc[dummied_data.pregnant == 1].dropna(subset = ['glucose']).copy()
# centralized age is used for modeling
data_pregnant = data_pregnant.drop(['age_orig'], axis = 1)
# Relabel with the pregnancy-specific upper limit of 92 mg/dl.
data_pregnant['diabetes'] = (data_pregnant.glucose > 92).astype(int)
# Build a fresh train/test split from the relabelled frame; without it the
# following model would be fitted on the split created for the previous labels.
X_train, X_test, y_train, y_test = train_test_split(data_pregnant.drop(['diabetes', 'glucose'], axis = 1).fillna(0),
                                                    data_pregnant.diabetes)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score as acc

from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest (and therefore the reported accuracy) is repeatable
# for a given train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc_param_grid = {'n_estimators': np.arange(160, 170, 1), 
                  'criterion': ['entropy', 'gini']}
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20, return_train_score=True)
# Fits on whatever X_train / y_train are currently in scope.
rfc_grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the
# selected parameters explicitly before referring to them.
print("Best parameters found by the grid search: ", rfc_grid_search.best_params_)
print("Accuracy achieved by Random Forest with parameters above: ", 
     acc(y_test, rfc_grid_search.best_estimator_.predict(X_test)))

Results when predicting with blood glucose normal upper limit 92mg/dl.

# Overview
These models are hoping to predict type 2 diabetes or a prediabetic condition. These are characterized by a fasting blood glucose over 100mg/dl. India has the largest population of type 2 diabetics in the world. Most patients are diagnosed at 46-47years of age. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3920109/
The rising number of diabetic patients in India is thought to be caused by urbanization and excessive weight.

Pregnant women also have a higher risk of developing type 2 diabetes. In addition India has a high prevalence of gestational diabetes(gestational==pregnant) compared to world average. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4673797/

These studies are the reason why the models are hoping to predict high blood glucose specifically in people over 45 and pregnant women.

Issues: 
- Method for assigning diabetes categorical variable. Need to take into account NA values.