In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt # plotting
from math import * # sqrt() etc
# with %matplotlib inline you turn on the immediate display.
# %matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Gather Data

In [None]:
data_dictionary_loc = '../input/CAB_data_dictionary.xlsx'
data_dic = pd.read_excel(data_dictionary_loc, dtype = object)
# Widen the column display for this output only, so the long
# 'File Content Description' texts are not cut off in the rendered table.
with pd.option_context('display.max_colwidth', 1000):
    display(data_dic)

In [None]:
# low_memory = False had to be specified because columns 14 and 43 contain mixed types
up_csv_loc = '../input/CAB_09_UP.csv'
data_u_pradesh = pd.read_csv(up_csv_loc, low_memory = False)
data_u_pradesh.head()

# Cleaning data

Subset of adults has 299 570 individuals

In [None]:
# adults: age recorded in years ('Y') and at least 18
age_in_years = data_u_pradesh['age_code'] == 'Y'
at_least_18 = data_u_pradesh['age'] >= 18
data = data_u_pradesh[age_in_years & at_least_18]
len(data)

Original data had -1 for missing values

In [None]:
# Replace the missing-value marker -1 (stored either as a number or as the text '-1')
# with NaN in every column of the frame.
data = data.replace([-1, '-1'], np.nan)

Dropping columns only applicable to under 5 year olds

In [None]:
# columns that are only filled in for children under 5 years
cols_under5 = [
    'illness_type',
    'illness_duration',
    'treatment_type',
]
# columns that are only filled in for children under 3 years (feeding history)
cols_under3 = [
    'first_breast_feeding',
    'is_cur_breast_feeding',
    'day_or_month_for_breast_feeding_',
    'day_or_month_for_breast_feeding',
    'water_month',
    'ani_milk_month',
    'semisolid_month_or_day',
    'solid_month',
    'vegetables_month_or_day',
]

In [None]:
# remove the child-only columns, one group after the other
for child_only_cols in (cols_under5, cols_under3):
    data = data.drop(columns = child_only_cols)

Dropping unnecessary features
 - 'state_code'
 - 'PSU_ID' - This is a seven digit number to uniquely identify each record.
 - 'ahs_house_unit' - House Number
 - 'house_hold_no' - Household Number
 - 'record_code_iodine_reason' - Why was iodine testing refused
 - 'sl_no' - Each record of the Household has a serial no. 
 - 'usual_residence' - Whether the member usually lives here
 - 'usual_residence_reason' - Reason for member not being usual resident
 - 'identification_code' - Each member of a PSU is assigned a unique number
 - 'v54' ?

In [None]:
# identifiers and bookkeeping columns that carry no information for the analysis
unnecessary_cols = [
    'state_code',
    'psu_id',
    'ahs_house_unit',
    'house_hold_no',
    'record_code_iodine_reason',
    'sl_no',
    'usual_residance',
    'usual_residance_reason',
    'identification_code',
    'v54',
]
data = data.drop(columns = unnecessary_cols)

From data dictionary:
- 'rural_urban' - Rural-1; Urban-2
- 'stratum' - 1 or 2 when 'rural_urban'=1, 0 when 'rural_urban'=2

dropping feature 'rural_urban', since 'stratum' contains the same information

I guess 'stratum' feature values:
- 0 - urban
- 1 - rural  
- 2 - very rural?

not specified in dictionary

In [None]:
# 'stratum' already encodes rural vs. urban
data = data.drop(columns = 'rural_urban')

## Age related
From data dictionary:
- 'age_code' - unit of recording age
- 'age'
- 'date_of_birth' - DD
- 'month_of_birth' - MM
- 'year_of_birth' - YYYY

Dropping feature age_code(values: Y, M, D for years, months, days), since age always recorded in years for adults

In [None]:
# show which age units are still present before the column is removed
remaining_age_codes = np.unique(data['age_code'])
display(remaining_age_codes)
data = data.drop(columns = 'age_code')

In [None]:
# distribution of the adults' ages
plt.hist(data.age.dropna(), bins = 50)
plt.title('Age')
# plt.show must be called; the bare name only echoes the function object
plt.show()

## Iodine
From data dictionary:
- 'test_salt_iodine' - Salt used by the Household has been tested for Iodine content[Recorded as Parts Per Million(PPM)]
- 'record_code_iodine' - No iodine – 1; Less than 15 PPM – 2; More than or equal to 15 PPM – 3; No salt in Household – 4; Salt not tested  – 5

In [None]:
# frequency of each iodine test outcome code (the top-level pd.value_counts is deprecated)
data['record_code_iodine'].value_counts()

## Height/weight
From data dictionary:
- 'weight_measured' - Measured-1;  Member - not present-2, Refused-3, Other-4
- 'weight_in_kg' - outcome
- 'length_height_measured' - Measured-1;  Member not present-2, Refused-3, Other-4
- 'length_height_code' - L- Length, H-Height
- 'length_height_cm' - outcome

Dropping unnecessary columns: the weight/height outcome columns already contain NA wherever the measurement was not conducted

In [None]:
# status/code columns are redundant: the outcome columns are NA when nothing was measured
measurement_status_cols = ['weight_measured', 'length_height_measured', 'length_height_code']
data = data.drop(columns = measurement_status_cols)

In [None]:
# Shorten the outcome column names. Only the columns are renamed: the former
# index=str argument converted every row label to a string for no benefit.
data = data.rename(columns={"weight_in_kg": "weight", "length_height_cm": "height"})

In [None]:
# raw weights, before any outlier handling
plt.boxplot(data['weight'].dropna())
plt.title('Weight with outliers')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# raw heights, before any outlier handling
plt.boxplot(data['height'].dropna())
plt.title('Height with outliers')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# exclude any measurements where difference from median is larger than 3 standard deviations
def remove_outliers(data, feature):
    """Blank out the values of one column that lie far away from the column median.

    Parameters
    ----------
    data : DataFrame that holds the column; it is not modified.
    feature : name of the column to clean.

    Returns
    -------
    Single-column DataFrame (column ``feature``) holding the original value where
    ``abs(value - median) < 3 * stdev`` and NaN everywhere else.

    Side effect: prints a caption and displays how many non-missing values were blanked.
    """
    values = data[feature]
    observed = values.dropna()
    # population standard deviation (ddof = 0) and median of the non-missing values
    stdev = sqrt(np.var(observed))
    median = np.median(observed)
    # A single mask drives both the report and the result, so the displayed count
    # always equals the number of values actually replaced (boundary values included).
    keep = abs(values - median) < (3 * stdev)
    print("number of discarded measurements")
    display(int((values.notna() & ~keep).sum()))
    # keep original values if difference from median is less than 3 standard deviations. NA otherwise
    return data[[feature]].where(keep, other = np.nan)

In [None]:
# remove_outliers returns a single-column frame whose outliers are NaN; it replaces the height column
data['height'] = remove_outliers(data, 'height')

Removing weight outliers. NA for anything under 20kg

In [None]:
# One mask for both the report and the replacement, so that exactly the weights
# under 20kg are counted and set to NA (a weight of exactly 20kg is kept).
too_light = data['weight'] < 20
print('number of discarded measurements')
display(int(too_light.sum()))
data['weight'] = data['weight'].where(~too_light, other=np.nan)

In [None]:
# weights after the under-20kg values were set to NA
plt.boxplot(data['weight'].dropna())
plt.title('Weight without outliers')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# heights after the 3-standard-deviation filter
plt.boxplot(data['height'].dropna())
plt.title('Height without outliers')
# plt.show must be called; the bare name only echoes the function object
plt.show()

Body mass index: weight(kg)/(height(m) * height(m))

In [None]:
# height is divided by 100 (cm -> m) before squaring; the result is NaN when weight or height is NaN
data['bmi'] = data['weight']/(data['height']/100)**2

In [None]:
# histogram of the cleaned weights
weight_values = data['weight'].dropna()
plt.hist(weight_values, bins = 50)
plt.title('Weight without outliers')
plt.show()

In [None]:
# histogram of the cleaned heights
height_values = data['height'].dropna()
plt.hist(height_values, bins = 50)
plt.title('Height without outliers')
plt.show()

A lot of individuals with 130, 140, 150cm height

In [None]:
# histogram of the derived body mass index
bmi_values = data['bmi'].dropna()
plt.hist(bmi_values, bins = 50)
plt.title('BMI')
plt.show()

Data cleaning steps for height/weight related data: 
- Discarded any height measurements where difference from median was further than 3 standard deviations. Looking at distribution of height/weight as normally distributed.
- Discarded any weight measurements under 20kg
- Calculated BMI

Discarded ~800 values for height, ~460 values for weight. Out of ~200 000

## Pulse, blood pressure(heart disease)
From data dictionary:
- 'bp_systolic'
- 'bp_systolic_2_reading'
- 'bp_diastolic'
- 'bp_diastolic_2reading'
- 'pulse_rate',
- 'pulse_rate_2_reading'

In [None]:
# distribution of measurement differences
#plt.hist((data['bp_systolic'] - data['bp_systolic_2_reading']).dropna(), bins = 50)
#plt.hist((data['pulse_rate'] - data['pulse_rate_2_reading']).dropna(), bins = 50)
#plt.hist((data['bp_diastolic'] - data['bp_diastolic_2reading']).dropna(), bins = 50)

In [None]:
# for features where two measurements were taken, exclude any where difference between measurements is larger than 3 standard deviations
def remove_outliers_difference(data, col1, col2):
    """Blank out pairs of readings that disagree too much with each other.

    Parameters
    ----------
    data : DataFrame that holds both readings; it is not modified.
    col1, col2 : names of the first and the second reading.

    Returns
    -------
    Two-column DataFrame (``col1``, ``col2``) holding the original pair where
    ``abs(col1 - col2) < 3 * stdev`` and NaN in both columns everywhere else.
    ``stdev`` is the sample standard deviation of ``col1 - col2``. Rows in which
    one reading is missing have no difference and therefore come back as NaN too.

    Side effect: prints a caption and displays how many complete pairs were blanked.
    """
    difference = data[col1] - data[col2]
    stdev = sqrt(difference.var())
    # A single mask drives both the report and the result, so the displayed count
    # always equals the number of complete pairs actually replaced.
    keep = abs(difference) < (3 * stdev)
# how many measurements were excluded
    print('number of discarded measurements')
    display(int((difference.notna() & ~keep).sum()))
# keep original values if difference of two measurements is less than 3 standard deviations. NA otherwise
    return data[[col1, col2]].where(keep, other = np.nan)

In [None]:
# (first reading, second reading) for every feature that was measured twice
double_readings = [
    ('bp_systolic', 'bp_systolic_2_reading'),
    ('bp_diastolic', 'bp_diastolic_2reading'),
    ('pulse_rate', 'pulse_rate_2_reading'),
]
for first_reading, second_reading in double_readings:
    pair = [first_reading, second_reading]
    data[pair] = remove_outliers_difference(data, first_reading, second_reading)

Now that outliers have been removed, aggregate remaining data by finding mean between two readings

In [None]:
# aggregate two reading by finding mean
def aggregate_readings(data, col1, col2):
    """Merge two readings of the same quantity into their mean.

    Parameters
    ----------
    data : DataFrame that holds both readings; it is left untouched.
    col1 : name of the first reading; in the result it holds the mean.
    col2 : name of the second reading; it is removed from the result.

    Returns
    -------
    New DataFrame in which ``col1`` is ``(col1 + col2) / 2`` and ``col2`` is gone.
    The mean is NaN whenever either reading is NaN.
    """
    # vectorised column arithmetic instead of a Python-level lambda per row
    averaged = (data[col1] + data[col2]) / 2
    return data.assign(**{col1: averaged}).drop(col2, axis = 1)

In [None]:
# collapse every (first reading, second reading) pair into the mean, stored under the first name
reading_pairs = (
    ('bp_systolic', 'bp_systolic_2_reading'),
    ('bp_diastolic', 'bp_diastolic_2reading'),
    ('pulse_rate', 'pulse_rate_2_reading'),
)
for kept_col, dropped_col in reading_pairs:
    data = aggregate_readings(data, kept_col, dropped_col)

Systolic - beating, diastolic - resting blood pressure. Likely input/measurement error where systolic < diastolic

In [None]:
# keep both pressures only where the resting (diastolic) value is below the beating (systolic) one;
# every other row gets NA in both columns
bp_cols = ['bp_diastolic', 'bp_systolic']
diastolic_below_systolic = data.bp_diastolic < data.bp_systolic
data[bp_cols] = data[bp_cols].where(diastolic_below_systolic, other = np.nan)

Data cleaning steps for heart disease related data: 
- Discarded any pair of readings whose absolute difference was 3 or more standard deviations of the measurement differences, treating the differences between the two readings as normally distributed around zero.
- Aggregated two measurements by finding mean
- Discarded any where diastolic pressure was higher than systolic

Lost less than 5% of values for each feature

## Haemoglobin(anemia)
From data dictionary:
- 'haemoglobin_test' - Consent for Haemoglobin test (Yes-1; No-2)
- 'haemoglobin'- Status of Haemoglobin Test (Measured-1; Member not present-2; Refused-3, Other-4)
- 'haemoglobin_level' - Outcome of Haemoglobin Level (Hb) Test (in percentage gms)  

In [None]:
# consent and test-status columns are not needed next to the measured level
haemoglobin_status_cols = ['haemoglobin_test', 'haemoglobin']
data = data.drop(columns = haemoglobin_status_cols)

In [None]:
# distribution of the measured haemoglobin levels (missing values left out)
plt.hist(data.haemoglobin_level.dropna(), bins=50)
plt.title('Blood haemoglobin')
# plt.show must be called; the bare name only echoes the function object
plt.show()

## Blood sugar(diabetes)
From data dictionary:
- 'diabetes_test' - consent for testing
- 'fasting_blood_glucose' - Measured-1; Member not present-2; Refused-3; Other-4
- 'fasting_blood_glucose_mg_dl' - outcome of test

In [None]:
# consent and test-status columns are not needed next to the measured glucose value
glucose_status_cols = ['diabetes_test', 'fasting_blood_glucose']
data = data.drop(columns = glucose_status_cols)

In [None]:
# distribution of the fasting blood glucose values (missing values left out)
plt.hist(data.fasting_blood_glucose_mg_dl.dropna(), bins=50)
plt.title('Blood sugar')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# box plot of the fasting blood glucose values to make the outliers visible
plt.boxplot(data.fasting_blood_glucose_mg_dl.dropna())
plt.title('Blood sugar')
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# same median +/- 3 standard deviations filter as used for height; outliers become NaN
data['fasting_blood_glucose_mg_dl'] = remove_outliers(data,'fasting_blood_glucose_mg_dl')

## Features only applicable to women
From data dictionary:
- 'marital_status' - Never married=1,Married but Gauna not performed=2, Married and Gauna perfomed=3, Remarried=4,Widow=5, Divorced=6, Separated=7, Not stated=8
- 'gauna_perfor_not_perfor' - Pregnant-1; Lactating-2; Non-pregnant or Non-lactating-3
- 'duration_pregnanacy' - Duration of pregnancy/lactation (in months)

In [None]:
# columns listed under "Features only applicable to women"
# NOTE(review): this list is not referenced again in the visible part of the notebook — confirm it is still needed
cols_women = ['marital_status', 'gauna_perfor_not_perfor', 'duration_pregnanacy']

placing NA where marital status 'not stated' 

In [None]:
# code 8 means 'not stated' -> treat it as missing
marital_not_stated = data['marital_status'] == 8.0
data['marital_status'] = data['marital_status'].mask(marital_not_stated, other = np.nan)

In [None]:
# input errors have to be dealt with
plt.boxplot(data['duration_pregnanacy'].dropna())
# plt.show must be called; the bare name only echoes the function object
plt.show()

In [None]:
# Correlation of every numeric feature with the health measurements.
# Non-numeric columns (e.g. the survey date text) are removed first, because
# DataFrame.corr() raises on them in recent pandas versions.
health_measures = ['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'fasting_blood_glucose_mg_dl']
corr = data.select_dtypes(include = 'number').corr()[health_measures]
# only show correlations with an absolute value above 0.1
corr.where(abs(corr)>0.1)

Removing features where there's no correlation

In [None]:
# features that showed no correlation above the 0.1 cut-off with any health measurement
uncorrelated_cols = ['district_code', 'stratum', 'test_salt_iodine', 'record_code_iodine', 'date_of_birth', 'month_of_birth', 'duration_pregnanacy']
data_correlated = data.drop(uncorrelated_cols, axis = 1)
# Non-numeric columns are removed before correlating, because DataFrame.corr()
# raises on them in recent pandas versions.
corr = data_correlated.select_dtypes(include = 'number').corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'fasting_blood_glucose_mg_dl']]
corr.where(abs(corr)>0.1)

# Cleaning and formatting further

## Goals: 
1. Why are there so many heights 130cm, 140cm, 150cm? 
2. Transform actual categorical features (marital status etc) with OneHotEncoding


In [None]:
# rows/columns left after the cleaning above, followed by the remaining column names
print(data.shape)
data.columns


### 1. Why are there so many  heights 130, ... is this measure errors? 

In [None]:
# the five most frequent height values and how often each occurs
data.height.value_counts().head()

In [None]:

# the three most frequent height values
weird_heights = data.height.value_counts().index[:3].tolist()
# test only the height column for membership instead of every cell of the frame
weird_heights_data = data.loc[data.height.isin(weird_heights)]
print(weird_heights_data.shape)


So this affects ~25 000 rows. 

In [None]:
# One bar chart of value frequencies per column of the suspicious-height subset.
# The grid is sized from the number of columns, so it keeps working when
# columns are added or removed upstream.
n_grid_cols = 3
n_grid_rows = -(-len(weird_heights_data.columns) // n_grid_cols)  # ceiling division
fig = plt.figure(figsize = (10, 30))

for counter, column in enumerate(weird_heights_data.columns):
    # count once and reuse for both the x positions and the bar heights
    counts = weird_heights_data[column].value_counts()
    axes = fig.add_subplot(n_grid_rows, n_grid_cols, 1 + counter)
    axes.bar(counts.index, counts.values)
    axes.set_title(column)
plt.subplots_adjust(wspace = 0.5)
plt.show()

As we can observe, most of the other measurements are fairly evenly distributed. As the district code varies, I can only assume that the height was sometimes recorded very loosely. It is also interesting that month and date of birth are all the same within this group of people. Hence I can only conclude that these people do not own a birth certificate and that their height was perhaps simply estimated when they were registered. 

### 2. Transform actual categorical features with OneHotEncoding

Dummies should be drawn by all features that are encoded numerically and which are actually categorical. From the 21 remaining,  these are 
- district_code
- stratum     
- record_code_iodine.  Here, 1, 2 and 3 are ordered, while 4 should be 0 (no salt in household) and 5 should be replaced by NaN (no information).
- sex should be replaced by male and female as categorical features. 
- marital status 
- gauna_perfor_not_perfor:  1 - pregnant, 2 - lactating, 3 - neither of the two. Better rename to "pregnant" and "lactating" after OneHotEncoding


In [None]:
# numerically encoded features that are really categorical
dummieable = ['district_code', 'stratum', 'record_code_iodine', 'sex', 'marital_status', 'gauna_perfor_not_perfor']
# one indicator frame per categorical feature, columns prefixed with the feature name
indicator_frames = [pd.get_dummies(data[feature], prefix = feature) for feature in dummieable]
dummiedata = [data] + indicator_frames
dummied_data = pd.concat(dummiedata, axis = 1)


In [None]:
# column count after the indicator columns were appended (the original categorical columns are still included)
print("Number of features", len(dummied_data.columns))
dummied_data.columns

Finally remove the old columns, rename the new ones and set all not given data NaN. 

In [None]:
# the original categorical columns are now represented by their indicator columns
dummied_data = dummied_data.drop(columns = dummieable)
n_features = len(dummied_data.columns)
print("Number of features", n_features)
dummied_data.columns

In [None]:
# readable names for the marital status, pregnancy and sex indicator columns
rename_dict = {
    'marital_status_1.0': 'never_married',
    'marital_status_2.0': 'married_no_gauna',
    'marital_status_3.0': 'married_and_gauna',
    'marital_status_4.0': 'remarried',
    'marital_status_5.0': 'widow',
    'marital_status_6.0': 'divorced',
    'marital_status_7.0': 'separated',
    'gauna_perfor_not_perfor_1.0': 'pregnant',
    'gauna_perfor_not_perfor_2.0': 'lactating',
    'gauna_perfor_not_perfor_3.0': 'non_pregnant_non_lactating',
    'sex_1': 'male',
    'sex_2': 'female',
}
dummied_data = dummied_data.rename(columns = rename_dict)
# show the columns from position 70 onwards
dummied_data.columns[70:]

# Investigation on Heart Diseases

### Goals
1. Find relevant features
2. Design an evaluation measure
3. Find a good modeling technique
4. Learn a model and assess

### 1. Find relevant features

We expect, that heart pulse rate, age, sex and BMI will be indicators for heart diseases. As we do not have a feature which tells us, that a person is heart sick, we will set the heart pulse rate as target value. So lets investigate how heart pulse rate and the other features interact. 

In [None]:
# all district indicator columns
ft_with_district = [x for x in dummied_data.columns if x.startswith('district')]

fig = plt.figure(figsize = (20, 50))

# one normalised pulse-rate histogram per district; the title carries the sample size
for position, district in enumerate(ft_with_district, start = 1):
    in_district = dummied_data[district] == 1
    district_pulse = dummied_data.loc[in_district].pulse_rate.dropna()
    axes = fig.add_subplot(14, 5, position)
    axes.hist(district_pulse, density = True, bins = 20)
    axes.set_title(district + ' ' + str(district_pulse.shape[0]))
plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
plt.show()


We can see that in most of the districts the pulse_rate is normally distributed. But there are some districts where it is not normally distributed, although their sample size is comparable to that of the normally distributed ones. So belonging to a district might have an impact on the pulse rate. So let's sort out all the districts that have a normally distributed pulse_rate. 

In [None]:
# two example districts: compare the variance of their pulse-rate frequency counts
df1 = dummied_data.loc[dummied_data[ft_with_district[0]] == 1, 'pulse_rate'].dropna()
df2 = dummied_data.loc[dummied_data[ft_with_district[25]] == 1, 'pulse_rate'].dropna()
print("A normally distributed pulse_rate has variance", np.var(df1.value_counts().sort_index()))
print("A non normally distributed pulse_rate has variance", np.var(df2.value_counts().sort_index()))

# districts whose frequency-count variance is below 1000 are marked for removal
ft_district_droppable = [
    district for district in ft_with_district
    if np.var(dummied_data.loc[dummied_data[district] == 1, 'pulse_rate'].dropna().value_counts().sort_index()) < 1000
]
print("The following districts will be discarded: ", ft_district_droppable)

In [None]:
# remove the district indicators marked as droppable and report the feature count before/after
n_before = dummied_data.shape[1]
print(n_before, "features before")
dummied_data.drop(columns = ft_district_droppable, inplace = True)
n_after = dummied_data.shape[1]
print(n_after, "features left")

In [None]:
#transform survey date into one sortable number made of year, month and day
def parse(string):
    """Turn a date text into the integer YYYYMMDD.

    The day is read from characters 0-1, the month from characters 3-4 and the
    year from character 6 to the end, so any single separator character works.
    """
    day, month, year = string[:2], string[3:5], string[6:]
    return int(year) * 10000 + int(month) * 100 + int(day)
# numeric survey date next to the original text, for a visual check
dummied_data['year_month_day_survey'] = dummied_data['date_survey'].apply(parse)
display(dummied_data[['date_survey', 'year_month_day_survey']].head(10))

# features treated as numeric (as opposed to indicator columns) from here on
ft_numeric = [
    'year_month_day_survey', 'test_salt_iodine', 'age',
    'date_of_birth', 'month_of_birth', 'year_of_birth',
    'weight', 'height', 'haemoglobin_level',
    'bp_systolic', 'bp_diastolic', 'fasting_blood_glucose_mg_dl',
    'duration_pregnanacy', 'bmi',
]

In [None]:
# the text date is no longer needed once 'year_month_day_survey' exists; the trailing ';' suppresses cell output
dummied_data.drop('date_survey', axis = 1, inplace = True);

Lets now handle the other categorical features. 

In [None]:
# every column that is neither numeric, nor a district indicator, nor the target
excluded_cols = set(ft_numeric + ft_with_district + ['pulse_rate'])
ft_cat_no_distr = [x for x in dummied_data.columns if x not in excluded_cols]
fig = plt.figure(figsize = (20, 50))
std_dict = {}

# per indicator: pulse-rate histogram of the rows where the indicator is 1, plus the
# standard deviation of the pulse-rate frequency counts scaled by the overall mean pulse rate
for position, feature in enumerate(ft_cat_no_distr, start = 1):
    feature_pulse = dummied_data.loc[dummied_data[feature] == 1].pulse_rate.dropna()
    scaled_counts = feature_pulse.value_counts().sort_index() / np.nanmean(dummied_data.pulse_rate)
    std_dict[feature] = np.std(scaled_counts)
    axes = fig.add_subplot(14, 5, position)
    axes.hist(feature_pulse, density = True, bins = 20)
    axes.set_title(feature + ' ' + str(feature_pulse.shape[0]))

plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
plt.show()

In general, most of the pulse_rates here are normally distributed. There is a clear difference between male and female too. Visually, stratum_1 and stratum_2 have a very similar distribution, as do all the record_code_iodine values. Between the different marital statuses there are visible differences. We could merge some of these similar features into one feature, or omit the ones which are very normally distributed (and have a low variance).


In [None]:
# scaled standard deviation of the pulse-rate frequency counts, per categorical indicator
std_dict

This tells the contrary of the visual inspection. Lets try to construct some extra features and compare them to the single features.  It will show that combining stratum1 and stratum2 will create a feature that has a very high std deviation. 

In [None]:
# combined indicator: stratum_1 + stratum_2
dummied_data['stratum_1_2'] = dummied_data['stratum_1'] + dummied_data.stratum_2
# NOTE(review): this takes the std of the value counts of the new indicator itself, whereas std_dict
# was built from the pulse-rate counts of the rows where an indicator is 1 — confirm the two are meant to be compared
print(np.nanstd(dummied_data.stratum_1_2.value_counts().sort_index())/np.nanmean(dummied_data.pulse_rate))
# the two separate stratum indicators are replaced by the combined one
dummied_data.drop(['stratum_1', 'stratum_2'], axis = 1, inplace = True)

Let's try the same with the districts: Create one feature that contains information if the person is from one of the abnormally distributed districts. 

In [None]:
# remaining single-district indicators (the combined feature is excluded so the cell can be re-run)
ft_with_district = [x for x in dummied_data.columns
                    if x.startswith('district') and x != 'district_signi']
mean_pulse = np.nanmean(dummied_data.pulse_rate)
std_dict = {}
for f in ft_with_district:
    df = dummied_data.loc[dummied_data[f] == 1].pulse_rate.dropna()
    std_dict[f] = np.std(df.value_counts().sort_index()/mean_pulse)
# Row-wise sum (axis = 1): 1 if the person lives in any of the remaining districts.
# A column-wise sum (axis = 0) is indexed by column name, does not align with the
# row index and would fill the new column with NaN.
dummied_data['district_signi'] = dummied_data[ft_with_district].sum(axis = 1)
# measure the combined feature on its own rows, not on the last district of the loop
df = dummied_data.loc[dummied_data['district_signi'] == 1].pulse_rate.dropna()
std_dict['district_signi'] = np.std(df.value_counts().sort_index()/mean_pulse)
std_dict

This shows that the new feature does not have a notably higher variance than the single district features, yet it is still better than most of them. To reduce the number of features further, keep only the ones in the highest quarter. 

In [None]:
# threshold = value at three quarters of the sorted standard deviations;
# districts below it are removed from std_dict
stds = sorted(std_dict.values())
std_treshold = stds[len(stds) * 3 // 4]
for f in ft_with_district:
    if std_dict[f] < std_treshold:
        del std_dict[f]

In [None]:

# district indicators that did not survive the threshold filter on std_dict
districts_below_threshold = [x for x in ft_with_district if x not in std_dict]
dummied_data.drop(columns = districts_below_threshold, inplace = True)
print(dummied_data.shape[1], "features left")

**Standard deviation filtering of numeric features**

In [None]:
#only apply on numerical features, as categorical ones have too many zeros and might be discarded

dummied_data_numeric = dummied_data[ft_numeric]
numeric_means = dummied_data_numeric.mean()
#centralize data
dummied_data_numeric_cent = dummied_data_numeric - numeric_means
#normalize by mean to get relative information of the feature
d = dummied_data_numeric_cent.apply(np.nanstd, axis = 0) / dummied_data_numeric.mean()
print("This is the normalized standard deviation: ")
display(d)
# keep the features whose normalized standard deviation exceeds 0.15
ft_numeric_selected = d.index[(d > 0.15).values].tolist()
print("The following features will be kept: ")
display(ft_numeric_selected)
print("The following features will be discarded: ")
ft_numeric_discarded = [x for x in ft_numeric if x not in ft_numeric_selected]
display(ft_numeric_discarded)

In [None]:
# take all numeric features out, then add back only the selected ones:
# - dummied_data_cent gets the mean-centred version of the selected features
# - dummied_data gets their original values
dummied_data = dummied_data.drop(ft_numeric, axis = 1)
dummied_data_cent = pd.concat([dummied_data, dummied_data_numeric_cent[ft_numeric_selected]], axis = 1)
dummied_data = pd.concat([dummied_data, dummied_data_numeric[ft_numeric_selected]], axis = 1)
print(dummied_data.shape[1], "features left")

As a last thing, we can create two different subsets for men and women and discard all female features for the men's data set. 

In [None]:
ft_female = ['married_no_gauna', 'never_married', 'married_and_gauna', 'remarried', 
             'widow', 'pregnant', 'lactating', 'non_pregnant_non_lactating', 'duration_pregnanacy']
sex_cols = ['male', 'female']
# Only drop the female-only columns that still exist: a numeric one such as
# 'duration_pregnanacy' may already have been removed by the standard deviation
# filter, and drop() raises a KeyError for unknown labels.
ft_female_present = [col for col in ft_female if col in dummied_data_cent.columns]
# rows are selected with the indicators of dummied_data (same index), values come from the centred frame
men_data = dummied_data_cent.loc[dummied_data['male'] == 1].drop(ft_female_present + sex_cols, axis = 1)
fem_data = dummied_data_cent.loc[dummied_data['female']== 1].drop(sex_cols, axis = 1)
print(men_data.shape[1], "features for men")
print(fem_data.shape[1], "features for women")

### 2. Design an evaluation measure

As we focus on the pulse rate, we need a good estimation when a heart pulse is dangerous. Therefore, lets make a classification task to predict if a person has a  normal (0) or high (1, above 76) pulse rate. 

We need original age and pulse rate for the following investigations. 

In [None]:
# 'age' in fem_data/men_data is mean-centred; keep the uncentred age from dummied_data alongside it
is_female = dummied_data.female == 1
is_male = dummied_data.male == 1
fem_data['age_orig'] = dummied_data.loc[is_female, 'age']
men_data['age_orig'] = dummied_data.loc[is_male, 'age']

In [None]:
# pulse-rate distribution of men (top) and women (bottom), mean shown in the title
fig = plt.figure(figsize = (10, 5))
axes = fig.add_subplot(2, 1, 1)
axes.hist(men_data.pulse_rate.dropna(), density = True, bins = 20);
axes.set_title('mens pulse rate, mean ' + str(np.nanmean(men_data.pulse_rate)))
axes = fig.add_subplot(2, 1, 2)
# plot the pulse_rate column only; passing the whole frame drew one histogram per column
axes.hist(fem_data.pulse_rate.dropna(), density = True, bins = 20);
axes.set_title('womans pulse rate, mean ' + str(np.nanmean(fem_data.pulse_rate)))
plt.subplots_adjust(wspace = 0.5, hspace = 0.5)
plt.show()

It is interesting that the mean of the pulse rates is already higher than the pulse rate which is considered to increase the risk of heart attack, which is 76 for postmenopausal women according to https://www.nhs.uk/news/heart-and-lungs/pulse-predicts-heart-attacks/. As this is a clear threshold, let's filter women aged 50 or older, take a look at their pulse rate, and perform our classification task on them first. 

In [None]:
# women aged 50 or older (uncentred age)
aged_50_plus = fem_data['age_orig'] >= 50
fem_data_postmeno = fem_data.loc[aged_50_plus]
postmeno_pulse = fem_data_postmeno.pulse_rate
plt.hist(postmeno_pulse.dropna(), density = True, bins = 20)
plt.title('pulse rate of elder women, mean ' + str(np.nanmean(postmeno_pulse)))

In [None]:
# first rows of the subset of women aged 50 or older
fem_data_postmeno.head()

In [None]:
#drop all women that did not give a pulse rate
print(fem_data_postmeno.shape)
# dropna(subset=...) removes exactly those rows and leaves the dtypes of the other columns unchanged
fem_data_postmeno = fem_data_postmeno.dropna(subset = ['pulse_rate'])
print(fem_data_postmeno.shape)

In [None]:
def pulse_dange(x, threshold = 76): 
    """Flag a pulse rate as dangerous.

    Parameters
    ----------
    x : pulse rate (a number, or anything supporting ``>=`` and ``* 1``).
    threshold : lowest pulse rate counted as dangerous; defaults to 76.

    Returns
    -------
    1 if ``x >= threshold``, otherwise 0.
    """
    return (x >= threshold) * 1
# classification target: pulse_dange applied to every pulse rate
fem_data_postmeno['pulse_rate_dangerous'] = fem_data_postmeno.pulse_rate.apply(pulse_dange)

In [None]:
display(fem_data_postmeno[['pulse_rate', 'pulse_rate_dangerous']].head())
# the features must contain neither the target nor the pulse rate it is derived from
fem_data_postmeno_train = fem_data_postmeno.drop(['pulse_rate', 'pulse_rate_dangerous'], axis = 1)
from sklearn.model_selection import train_test_split
# Missing feature values are filled with 0. The seed is fixed so that the split,
# and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(fem_data_postmeno_train.fillna(0),
                                                    fem_data_postmeno.pulse_rate_dangerous,
                                                    random_state = 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### 3. Find a good modelling technique

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import accuracy_score as acc

In [None]:
# the feature matrix as it is passed to the split (missing values filled with 0)
fem_data_postmeno_train.fillna(0).head()

In [None]:
#this will take ca 14min
from sklearn.ensemble import RandomForestClassifier
# fixed seed: without it every run grows different forests and may report different best parameters
rfc = RandomForestClassifier(random_state = 42)
# candidate settings: 160 and 165 trees, both split criteria
rfc_param_grid = {'n_estimators': np.arange(160, 170, 5), 
                  'criterion': ['entropy', 'gini']}
# 20-fold cross-validation for every parameter combination
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20, return_train_score=True)
rfc_grid_search.fit(X_train, y_train)
rfc_grid_search.best_params_

In [None]:
# accuracy of the best cross-validated forest on the held-out test split
test_predictions = rfc_grid_search.best_estimator_.predict(X_test)
print("Accuracy achieved by Random Forest with parameters above: ", 
     acc(y_test, test_predictions))

In [None]:
# per-fold test-score keys of the grid search ('split0_test_score', 'split1_test_score', ...)
splits = [x for x in rfc_grid_search.cv_results_ if x.startswith('split') and x.endswith('test_score')]
# Read the scores of the winning parameter combination. A fixed position in the
# result arrays is only correct if that position happens to be the best candidate.
best_candidate = rfc_grid_search.best_index_
best_rfc_scores = {counter: rfc_grid_search.cv_results_[x][best_candidate]
                   for counter, x in enumerate(splits)}
# lists instead of dict views, which not every matplotlib version accepts
plt.scatter(list(best_rfc_scores.keys()), list(best_rfc_scores.values()))
plt.title('variance in performance dependent on split');

In [None]:
# importances of the best forest; plot only the features with a non-zero importance
imps = rfc_grid_search.best_estimator_.feature_importances_
important_features = [idx for idx, importance in enumerate(imps) if importance > 0]
print("There are ", len(important_features), "features used for Random Forest: ")
plt.xticks(rotation='vertical')
used_feature_names = fem_data_postmeno_train.columns[important_features]
plt.bar(used_feature_names, imps[important_features])

As we achieve an acceptable score with Random Forest, let's try to train a similar model on older men as well. Maybe later, we can try other ensemble methods. 

**Older Men**

In [None]:
# men aged 50 or older (uncentred age)
men_data_older = men_data.loc[men_data['age_orig'] >= 50]
plt.hist(men_data_older.pulse_rate.dropna(), density = True, bins = 20)
plt.title('pulse rate of elder men, mean ' + str(np.nanmean(men_data_older.pulse_rate)))
plt.show()
#drop all men that did not give a pulse rate
print(men_data_older.shape)
# dropna(subset=...) removes exactly those rows and leaves the dtypes of the other columns unchanged
men_data_older = men_data_older.dropna(subset = ['pulse_rate'])
print(men_data_older.shape)

In [None]:
# classification target: pulse_dange applied to every pulse rate
men_data_older['pulse_rate_dangerous'] = men_data_older.pulse_rate.apply(pulse_dange)
# the features must contain neither the target nor the pulse rate it is derived from
men_data_older_train = men_data_older.drop(['pulse_rate', 'pulse_rate_dangerous'], axis = 1)
# Missing feature values are filled with 0. The seed is fixed so that the split,
# and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(men_data_older_train.fillna(0),
                                                    men_data_older.pulse_rate_dangerous,
                                                    random_state = 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# same search as for the women, with a wider range of forest sizes and without train scores
rfc_param_grid = dict(
    n_estimators = [160, 170, 180, 200],
    criterion = ['entropy', 'gini'],
)
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv=20)
rfc_grid_search.fit(X_train, y_train)
display(rfc_grid_search.best_params_)
men_test_predictions = rfc_grid_search.best_estimator_.predict(X_test)
print("Accuracy achieved by Random Forest: ", 
      acc(y_test, men_test_predictions))

In [None]:
# importances of the best forest for men; plot only the features with a non-zero importance
imps = rfc_grid_search.best_estimator_.feature_importances_
important_features_men = [idx for idx, importance in enumerate(imps) if importance > 0]
print("There are ", len(important_features_men), "features used for Random Forest for men: ")
#fig = plt.figure(figsize = (10, 5))
plt.xticks(rotation='vertical')
used_feature_names_men = men_data_older_train.columns[important_features_men]
plt.bar(used_feature_names_men, imps[important_features_men])