In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt # plotting
from math import * # sqrt() etc
# with %matplotlib inline you turn on the immediate display.
# %matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
import warnings
warnings.filterwarnings("ignore")

# Gather Data

In [None]:
# Load the data dictionary spreadsheet; dtype=object keeps every cell as read.
data_dictionary_loc = '../input/CAB_data_dictionary.xlsx'
data_dic = pd.read_excel(data_dictionary_loc, dtype = object)
# NOTE(review): the next line is a bare expression in the middle of the cell, so it
# renders nothing; only the final `data_dic` is displayed.
data_dic['File Content Description'] #well, how to import the correct column width? can be viewed using other programs
data_dic

In [None]:
# Read the raw survey extract. low_memory=False makes pandas parse the file in one
# pass, which is needed because columns 14 and 43 contain mixed types.
up_csv_path = '../input/CAB_09_UP.csv'
data_u_pradesh = pd.read_csv(up_csv_path, low_memory = False)
data_u_pradesh.head()

# Cleaning data

Subset of adults has 299 570 individuals

In [None]:
# Keep only rows whose age is recorded in years ('Y') and is at least 18.
age_in_years = data_u_pradesh['age_code'] == 'Y'
at_least_18 = data_u_pradesh['age'] >= 18
data = data_u_pradesh.loc[age_in_years & at_least_18]
len(data)

Original data had -1 for missing values

In [None]:
# -1 marks a missing value; it can appear as a number or as the string '-1'.
data = data.replace([-1, '-1'], np.nan)

Dropping columns only applicable to under 5 year olds

In [None]:
# Columns that are dropped from the adult subset in the next cell
# (illness/treatment questions and breast-feeding / weaning questions).
cols_under5 = ['illness_type', 'illness_duration', 'treatment_type']
cols_under3 = ['first_breast_feeding', 'is_cur_breast_feeding', 'day_or_month_for_breast_feeding_', 'day_or_month_for_breast_feeding', 'water_month', 'ani_milk_month', 'semisolid_month_or_day', 'solid_month', 'vegetables_month_or_day']

In [None]:
# Remove both groups of child-only columns in a single call.
data = data.drop(columns = cols_under5 + cols_under3)

Dropping unnecessary features
 - 'state_code'
 - 'PSU_ID' - This is a seven digit number to uniquely identify each record.
 - 'ahs_house_unit' - House Number
 - 'house_hold_no' - Household Number
 - 'record_code_iodine_reason' - Why was iodine testing refused
 - 'sl_no' - Each record of the Household has a serial no. 
 - 'usual_residence' - Whether the member usually lives here
 - 'usual_residence_reason' - Reason for member not being usual resident
 - 'identification_code' - Each member of a PSU is assigned a unique number
 - 'v54' ?

In [None]:
# Drop identifiers and bookkeeping columns.
# NOTE(review): 'usual_residance' / 'psu_id' are spelled differently from the
# 'usual_residence' / 'PSU_ID' names in the markdown list above - presumably these
# are the spellings used in the CSV; confirm.
data = data.drop(['state_code', 'psu_id', 'ahs_house_unit', 'house_hold_no', 'record_code_iodine_reason', 'sl_no', 'usual_residance', 'usual_residance_reason', 'identification_code', 'v54'], axis = 1)

From data dictionary:
- 'rural_urban' - Rural-1; Urban-2
- 'stratum' - 1 or 2 when 'rural_urban'=1, 0 when 'rural_urban'=2

dropping feature 'rural_urban', since 'stratum' contains the same information

I guess 'stratum' feature values:
- 0 - urban
- 1 - rural  
- 2 - very rural?

not specified in dictionary

In [None]:
# 'stratum' is kept in place of 'rural_urban'.
data = data.drop(columns = ['rural_urban'])

## Age related
From data dictionary:
- 'age_code' - unit of recording age
- 'age'
- 'date_of_birth' - DD
- 'month_of_birth' - MM
- 'year_of_birth' - YYYY

Dropping feature age_code(values: Y, M, D for years, months, days), since age always recorded in years for adults

In [None]:
# Show which age units remain in the adult subset, then remove the unit column.
remaining_age_codes = np.unique(data['age_code'])
display(remaining_age_codes)
data = data.drop(columns = 'age_code')

In [None]:
# Histogram of adult ages.
plt.hist(data.age.dropna(), bins = 50)
plt.title('Age')
# `plt.show` without parentheses only referenced the function and never called it
plt.show()

## Iodine
From data dictionary:
- 'test_salt_iodine' - Salt used by the Household has been tested for Iodine content[Recorded as Parts Per Million(PPM)]
- 'record_code_iodine' - No iodine – 1; Less than 15 PPM – 2; More than or equal to 15 PPM – 3; No salt in Household – 4; Salt not tested  – 5

In [None]:
# Frequency of each iodine record code; the Series method replaces the deprecated
# top-level pd.value_counts helper and returns the same counts.
data['record_code_iodine'].value_counts()

## Height/weight
From data dictionary:
- 'weight_measured' - Measured-1;  Member - not present-2, Refused-3, Other-4
- 'weight_in_kg' - outcome
- 'length_height_measured' - Measured-1;  Member not present-2, Refused-3, Other-4
- 'length_height_code' - L- Length, H-Height
- 'length_height_cm' - outcome

Dropping unnecessary columns: the weight/length columns are already NA whenever the measurement was not conducted

In [None]:
# The measurement-status columns are not needed once the outcomes themselves are kept.
measurement_status_cols = ['weight_measured', 'length_height_measured', 'length_height_code']
data = data.drop(columns = measurement_status_cols)

In [None]:
# Shorter names for the two body measurements. Only the columns are renamed:
# passing index=str as well would convert every row label to a string.
data = data.rename(columns={"weight_in_kg": "weight", "length_height_cm": "height"})

In [None]:
# Box plot of the raw weights, before any outlier handling.
plt.boxplot(data['weight'].dropna())
plt.title('Weight with outliers')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# Box plot of the raw heights, before any outlier handling.
plt.boxplot(data['height'].dropna())
plt.title('Height with outliers')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# exclude any measurements where difference from median is larger than 3 standard deviations
def remove_outliers(data, feature):
    """Blank out values of ``feature`` that lie too far from the column median.

    A value is kept when its absolute distance from the median of the
    non-missing values is at most three population standard deviations
    (ddof=0). Values farther away, and values that are already missing,
    come back as NaN. The number of discarded (previously non-missing)
    values is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``; it is not modified.
    feature : str
        Label of the numeric column to clean.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column ``feature``) on the same index as ``data``.
    """
    values = data[feature]
    observed = values.dropna()
    # ddof=0 reproduces the np.var-based spread this threshold was built on
    limit = 3 * observed.std(ddof=0)
    deviation = (values - observed.median()).abs()
    # `<=` (not `<`): a value exactly at the limit is not "larger than" it, so it is
    # kept and the printed count matches what is actually discarded. This also
    # stops a constant column (spread 0) from being wiped out completely.
    within_limit = deviation <= limit
    print("number of discarded measurements")
    print(int((values.notna() & ~within_limit).sum()))
    # keep original values within the limit, NA otherwise
    return data[[feature]].where(within_limit, other = np.nan)

In [None]:
# Replace height outliers (relative to the median) with NaN; the helper also prints
# how many measurements were discarded.
data['height'] = remove_outliers(data, 'height')

Removing weight outliers. NA for anything under 20kg

In [None]:
# Weights under 20 kg are set to NaN. The keep-condition is `>= 20` so that a weight
# of exactly 20 kg is retained: it is not "under 20" and is not included in the
# discarded count printed below.
print('number of discarded measurements')
display(len(data[data['weight']<20]))
data['weight'] = data['weight'].where(data['weight']>=20, other=np.nan)

In [None]:
# Box plot of the weights after the low-weight cut-off.
plt.boxplot(data['weight'].dropna())
plt.title('Weight without outliers')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# Box plot of the heights after outlier removal.
plt.boxplot(data['height'].dropna())
plt.title('Height without outliers')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

Body mass index: weight(kg)/(height(m) * height(m))

In [None]:
# BMI = weight [kg] / height [m]^2; the height column is divided by 100 first.
height_m = data['height'] / 100
data['bmi'] = data['weight'] / height_m**2

In [None]:
# Weight distribution after cleaning.
fig, ax = plt.subplots()
ax.hist(data['weight'].dropna(), bins = 50)
ax.set_title('Weight without outliers')
plt.show()

In [None]:
# Height distribution after cleaning.
fig, ax = plt.subplots()
ax.hist(data['height'].dropna(), bins = 50)
ax.set_title('Height without outliers')
plt.show()

A lot of individuals with 130, 140, 150cm height

In [None]:
# Distribution of the derived body mass index.
fig, ax = plt.subplots()
ax.hist(data['bmi'].dropna(), bins = 50)
ax.set_title('BMI')
plt.show()

Data cleaning steps for height/weight related data: 
- Discarded any height measurement that was more than 3 standard deviations away from the median, treating the distribution of height/weight as approximately normal.
- Discarded any weight measurements under 20kg
- Calculated BMI

Discarded ~800 values for height, ~460 values for weight. Out of ~200 000

## Pulse, blood pressure(heart disease)
From data dictionary:
- 'bp_systolic'
- 'bp_systolic_2_reading'
- 'bp_diastolic'
- 'bp_diastolic_2reading'
- 'pulse_rate',
- 'pulse_rate_2_reading'

In [None]:
# distribution of measurement differences
#plt.hist((data['bp_systolic'] - data['bp_systolic_2_reading']).dropna(), bins = 50)
#plt.hist((data['pulse_rate'] - data['pulse_rate_2_reading']).dropna(), bins = 50)
#plt.hist((data['bp_diastolic'] - data['bp_diastolic_2reading']).dropna(), bins = 50)

In [None]:
# for features where two measurements were taken, exclude any where difference between measurements is larger than 3 standard deviations
def remove_outliers_difference(data, col1, col2):
    """Blank out paired readings that disagree too strongly with each other.

    The spread is the sample standard deviation (ddof=1) of ``col1 - col2``.
    A row keeps both readings when the absolute difference between them is at
    most three times that spread; otherwise, or when either reading is missing,
    both readings come back as NaN. The number of discarded pairs is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns; it is not modified.
    col1, col2 : str
        Labels of the first and second reading.

    Returns
    -------
    pandas.DataFrame
        Two-column frame (``col1``, ``col2``) on the same index as ``data``.
    """
    difference = data[col1] - data[col2]
    # Series.std() defaults to ddof=1, matching the Series.var() used previously
    limit = 3 * difference.std()
    # `<=` (not `<`): a pair exactly at the limit is not "larger than" it, and
    # identical readings with zero spread must not all be thrown away
    consistent = difference.abs() <= limit
    # how many measurements were excluded
    print('number of discarded measurements')
    print(int((difference.notna() & ~consistent).sum()))
    # keep original values for consistent pairs, NA otherwise
    return data[[col1, col2]].where(consistent, other = np.nan)

In [None]:
# For each pair of repeated readings, set both readings to NaN when they differ too much.
data[['bp_systolic', 'bp_systolic_2_reading']] = remove_outliers_difference(data, 'bp_systolic', 'bp_systolic_2_reading')
data[['bp_diastolic', 'bp_diastolic_2reading']] = remove_outliers_difference(data, 'bp_diastolic', 'bp_diastolic_2reading')
data[['pulse_rate', 'pulse_rate_2_reading']] = remove_outliers_difference(data, 'pulse_rate', 'pulse_rate_2_reading')

Now that outliers have been removed, aggregate remaining data by finding mean between two readings

In [None]:
# aggregate two reading by finding mean
def aggregate_readings(data, col1, col2):
    """Collapse two repeated readings into their mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both reading columns; it is left unchanged.
    col1 : str
        First reading; in the result this column holds the mean of both readings.
    col2 : str
        Second reading; removed from the result.

    Returns
    -------
    pandas.DataFrame
        New frame with ``col1`` replaced by ``(col1 + col2) / 2`` and ``col2``
        dropped. The mean is NaN whenever either reading is NaN.
    """
    # vectorised column arithmetic instead of a Python-level lambda per row
    mean_reading = (data[col1] + data[col2]) / 2
    # assign() builds a new frame, so the caller's frame is not modified as a side effect
    return data.assign(**{col1: mean_reading}).drop(columns = col2)

In [None]:
# After these calls the first-reading column holds the mean of both readings and
# the second-reading column is gone.
data = aggregate_readings(data, 'bp_systolic', 'bp_systolic_2_reading')
data = aggregate_readings(data, 'bp_diastolic', 'bp_diastolic_2reading')
data = aggregate_readings(data, 'pulse_rate', 'pulse_rate_2_reading')

Systolic - beating, diastolic - resting blood pressure. Likely input/measurement error where systolic < diastolic

In [None]:
# Keep a blood pressure pair only when diastolic is strictly below systolic;
# otherwise (including when either value is missing) both become NaN.
bp_cols = ['bp_diastolic', 'bp_systolic']
plausible_bp = data['bp_diastolic'] < data['bp_systolic']
data[bp_cols] = data[bp_cols].where(plausible_bp, other = np.nan)

Data cleaning steps for heart disease related data: 
- Discarded any pair of readings whose absolute difference was larger than 3 standard deviations of the differences, treating the measurement differences as approximately normally distributed.
- Aggregated two measurements by finding mean
- Discarded any where diastolic pressure was higher than systolic

Lost less than 5% of values for each feature

## Haemoglobin(anemia)
From data dictionary:
- 'haemoglobin_test' - Consent for Haemoglobin test (Yes-1; No-2)
- 'haemoglobin'- Status of Haemoglobin Test (Measured-1; Member not present-2; Refused-3, Other-4)
- 'haemoglobin_level' - Outcome of Haemoglobin Level (Hb) Test (in percentage gms)  

In [None]:
# Consent and test-status columns are dropped; only the measured level is kept.
data = data.drop(columns = ['haemoglobin_test', 'haemoglobin'])

In [None]:
# Histogram of the measured haemoglobin levels (missing values excluded).
plt.hist(data.haemoglobin_level[~np.isnan(data.haemoglobin_level)], bins=50)
plt.title('Blood haemoglobin')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

## Blood sugar(diabetes)
From data dictionary:
- 'diabetes_test' - consent for testing
- 'fasting_blood_glucose' - Measured-1; Member not present-2; Refused-3; Other-4
- 'fasting_blood_glucose_mg_dl' - outcome of test

In [None]:
# Consent and test-status columns are dropped; only the measured glucose value is kept.
data = data.drop(columns = ['diabetes_test', 'fasting_blood_glucose'])

In [None]:
# Only the column is renamed; passing index=str as well would convert every row
# label to a string.
data = data.rename(columns = {'fasting_blood_glucose_mg_dl' : 'glucose'})
# 1 when fasting glucose is >= 100, 0 when it is below, and NaN when glucose was not
# measured, so people without a test are no longer labelled as non-diabetic.
data['diabetes'] = (data['glucose'] >= 100).astype(float).where(data['glucose'].notna())

In [None]:
# Histogram of the measured fasting glucose values (missing values excluded).
plt.hist(data.glucose[~np.isnan(data.glucose)], bins=50)
plt.title('Blood sugar')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# Box plot of the measured fasting glucose values (missing values excluded).
plt.boxplot(data.glucose[~np.isnan(data.glucose)])
plt.title('Blood sugar')
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# Replace glucose outliers (relative to the median) with NaN.
# NOTE(review): the 'diabetes' flag was derived from 'glucose' in an earlier cell,
# i.e. before this outlier removal - confirm that order is intended.
data['glucose'] = remove_outliers(data,'glucose')

## Features only applicable to women
From data dictionary:
- 'marital_status' - Never married=1,Married but Gauna not performed=2, Married and Gauna perfomed=3, Remarried=4,Widow=5, Divorced=6, Separated=7, Not stated=8
- 'gauna_perfor_not_perfor' - Pregnant-1; Lactating-2; Non-pregnant or Non-lactating-3
- 'duration_pregnanacy' - Duration of pregnancy/lactation (in months)

In [None]:
# Names of the columns described above as applicable to women only.
# NOTE(review): this list does not appear to be used by any later cell shown here.
cols_women = ['marital_status', 'gauna_perfor_not_perfor', 'duration_pregnanacy']

placing NA where marital status 'not stated' 

In [None]:
# Code 8 ("not stated") carries no information, so it becomes NaN.
is_stated = data['marital_status'] != 8.0
data['marital_status'] = data['marital_status'].where(is_stated, other = np.nan)

In [None]:
# input errors have to be dealt with
plt.boxplot(data['duration_pregnanacy'].dropna())
# call show(); the bare attribute `plt.show` did nothing
plt.show()

In [None]:
# Correlation of every feature with the five health measurements; entries with an
# absolute correlation of 0.1 or less are shown as NaN.
health_measurements = ['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'glucose']
corr = data.corr()[health_measurements]
corr.where(corr.abs() > 0.1)

Removing features where there's no correlation

In [None]:
# Drop the features listed below and recompute the same correlation table on the rest.
uncorrelated_features = ['district_code', 'stratum', 'test_salt_iodine', 'record_code_iodine',
                         'date_of_birth', 'month_of_birth', 'duration_pregnanacy']
data_correlated = data.drop(columns = uncorrelated_features)
corr = data_correlated.corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'glucose']]
corr.where(corr.abs() > 0.1)


## Summary of first Data Exploration
- From 53 initial features to 21
- data_correlated contains only data that has relationships to other data
- The data quality is clearly limited: there are many missing values, and the measurements seem to have been taken with varying accuracy

TODO:
- A lot of individuals with 130, 140, 150cm height value??

# Prepare Data for training models

## Goals of preparing data
1. Clarify if the data with heights 130cm, 140cm, 150cm is usable (and maybe find a reason for this)
2. apply OneHotEncoding to categorical features
3. encode date_survey as ordinal feature
4. create a scaled and normalized data set
5. Check which features have a high NaN property (for later use)
6. create data sets for anemia, heart, and glucose testing
7. separate men and women data for each questionnaire


## 1. Clarify if the data with heights 130cm, 140cm, 150cm is usable (and maybe find a reason for this)

In [None]:
# How common are the most frequent height values, and how many rows carry one of
# the three most frequent ones?
print("About how many instances are we talking?\n")
height_counts = data['height'].value_counts()
print(height_counts.head())
weird_heights = height_counts.index[:3].tolist()
data_filter_helper = data.isin(weird_heights)
weird_heights_data = data.loc[data_filter_helper['height']]
n_weird_rows = weird_heights_data.shape[0]
print("This affects ", n_weird_rows, "instances. ")
print("That is ", n_weird_rows*100/data.shape[0], "% of our data in total. ")

So this affects a substantial share of our data; we should investigate before we dismiss these values as measurement errors. 

In [None]:
# Summary statistics of the rows that carry one of the three most frequent heights.
weird_heights_data.describe()

This shows the data has a large variety in other features. Lets visualize: 

In [None]:
fig = plt.figure(figsize = (10, 30))

# One bar chart of value frequencies per column, placed on a 7 x 4 grid.
for position, column in enumerate(weird_heights_data.columns, start = 1):
    frequencies = weird_heights_data[column].value_counts()
    axes = fig.add_subplot(7, 4, position)
    axes.bar(frequencies.index, frequencies.values)
    axes.set_title(column)
plt.subplots_adjust(wspace = 0.5)
plt.show()

As we can observe, most of the other measurements are fairly well spread out. As the district code varies, I can only assume that the height was sometimes recorded very loosely. It is also interesting that the month and date of birth are all the same within this group of people. Hence I can only conclude that these people do not have a birth certificate and that their height was perhaps simply estimated when they were registered. We should keep this data. 

##  2. Apply OneHotEncoding to categorical features



Dummy variables should be created for all features that are encoded numerically but are actually categorical. Of the 21 remaining features, these are

- district_code
- stratum
- record_code_iodine. Here, 1, 2 and 3 are ordered, while 4 should be 0 (no salt in household) and 5 should be replaced by NaN (no information).
- sex should be replaced by binary encoding instead of 0, 1, 2
- marital status
- gauna_perfor_not_perfor: 1 - pregnant, 2 - lactating, 3 - neither. Better to rename to "pregnant" and "lactating" after OneHotEncoding



In [None]:
# Append one indicator column per category (prefixed with the source column name)
# to the original frame; the source columns themselves are still present here.
dummieable = ['district_code', 'stratum', 'record_code_iodine', 'sex', 'marital_status', 'gauna_perfor_not_perfor']
dummy_frames = [pd.get_dummies(data[column], prefix = column) for column in dummieable]
dummied_data = pd.concat([data] + dummy_frames, axis = 1)
print("Number of features now: ", len(dummied_data.columns))
dummied_data.columns

Finally remove the old columns, rename the new ones and set all not given data NaN.

In [None]:
# Remove the original categorical columns now that their indicator columns exist.
dummied_data = dummied_data.drop(columns = dummieable)
print("Number of features after making categorical numeric: ", len(dummied_data.columns))
# Readable names for the indicator columns.
# NOTE(review): rename ignores keys that are not present, so this assumes the sex
# indicators are called 'sex_1'/'sex_2' (not 'sex_1.0'/'sex_2.0') - confirm.
rename_dict = {
    'marital_status_1.0': 'never_married',
    'marital_status_2.0': 'married_no_gauna',
    'marital_status_3.0': 'married_and_gauna',
    'marital_status_4.0': 'remarried',
    'marital_status_5.0': 'widow',
    'marital_status_6.0': 'divorced',
    'marital_status_7.0': 'separated',
    'gauna_perfor_not_perfor_1.0': 'pregnant',
    'gauna_perfor_not_perfor_2.0': 'lactating',
    'gauna_perfor_not_perfor_3.0': 'non_pregnant_non_lactating',
    'sex_1': 'male',
    'sex_2': 'female',
}
dummied_data = dummied_data.rename(columns = rename_dict)

## 3. Encode date_survey as ordinal feature

In [None]:
def parse(string):
    """Encode a date string as the integer YYYYMMDD.

    Characters 0-1 are read as the day, characters 3-4 as the month and
    everything from position 6 onwards as the year; the characters at
    positions 2 and 5 (the separators) are ignored.
    """
    day, month, year = int(string[:2]), int(string[3:5]), int(string[6:])
    return year * 10000 + month * 100 + day
# Store the survey date as a sortable integer and show it next to the original string.
dummied_data['year_month_day_survey'] = dummied_data['date_survey'].apply(parse)
encoding_preview = dummied_data[['date_survey', 'year_month_day_survey']].head(10)
display(encoding_preview) #show how encoding looks like
dummied_data.drop(columns = 'date_survey', inplace = True); #remove the original encoding

## 4. Create a scaled and centered data set

In [None]:
ft_numeric = ['year_month_day_survey','test_salt_iodine', 'age', 'date_of_birth', 'month_of_birth', 'year_of_birth', 'weight', 
              'height', 'haemoglobin_level', 'bp_systolic', 'bp_diastolic', 'glucose', 'duration_pregnanacy',
              'bmi', 'pulse_rate']
print("before: ")
display(dummied_data[ft_numeric].head())
# centre every numeric feature and scale it to unit variance; the scaled copies get a
# "std_" prefix and share the index of dummied_data
cols = ["std_"+ x for x in  dummied_data[ft_numeric].columns]
dummied_data_numeric_std = pd.DataFrame(StandardScaler(with_mean = True).fit_transform(dummied_data[ft_numeric]), 
                                    columns = cols, index = dummied_data.index)
print("after scaling and centralizing: ")
dummied_data_numeric = dummied_data[ft_numeric]
# show the scaled frame here; previously the unscaled one was displayed under this heading
dummied_data_numeric_std.head()

In [None]:
# One histogram per standardized feature; the trailing ';' hides the returned axes array.
dummied_data_numeric_std.hist(figsize = (20, 20));

In [None]:
# Standardized data set: the non-numeric columns as they are, plus the "std_" columns
# in place of the raw numeric ones.
non_numeric_part = dummied_data.drop(columns = ft_numeric)
dummied_data_std = pd.concat([non_numeric_part, dummied_data_numeric_std], axis = 1)

In [None]:
# Quick look at the combined standardized frame.
dummied_data_std.head()

## 5. Check which features have a high NaN property (for later use)



In [None]:
print("Missing values in both dummied data sets: ")
# Count NaNs for all columns at once, then list only the columns that have any.
nan_counts = dummied_data.isna().sum()
for column_name, nan_count in nan_counts[nan_counts > 0].items():
    print(column_name, nan_count)

## 6. create data sets for anemia, heart, and glucose testing

In [None]:
def drop_null_targets(data, target):
    """Return the rows of ``data`` whose ``target`` column is not null.

    ``data`` itself is left unchanged; the result keeps the original row labels.
    """
    has_target = data[target].notna()
    return data.loc[has_target]

# One raw and one standardized frame per health target, each restricted to the rows
# where that target was actually measured.
data_anemia, data_anemia_std = (
    drop_null_targets(dummied_data, 'haemoglobin_level'),
    drop_null_targets(dummied_data_std, 'std_haemoglobin_level'),
)
data_glucose, data_glucose_std = (
    drop_null_targets(dummied_data, 'glucose'),
    drop_null_targets(dummied_data_std, 'std_glucose'),
)
data_heart, data_heart_std = (
    drop_null_targets(dummied_data, 'pulse_rate'),
    drop_null_targets(dummied_data_std, 'std_pulse_rate'),
)

In [None]:
# Sanity check: print the shape of the raw and of the standardized frame.
for frame in (dummied_data, dummied_data_std):
    print(frame.shape)


## Checking correlations

Finding and plotting meaningful pairwise correlations.

In [None]:
# Pairwise correlation matrices of the raw and of the standardized data set.
# NOTE(review): only corr_dummied_data is used by the cells shown below.
corr_dummied_data = dummied_data.corr()
corr_dummied_data_std = dummied_data_std.corr()

In [None]:
# Blank the diagonal (self-correlation), then keep a feature if its absolute
# correlation with at least one other feature is above 0.4.
off_diagonal = corr_dummied_data.mask(np.eye(len(corr_dummied_data), dtype=bool))
m = (off_diagonal.abs() > 0.4).any()

corr_dd = corr_dummied_data.loc[m, m]

In [None]:
corr_matrix_plot = corr_dd
title = 'Pairwise Correlations'
n_rows, n_cols = corr_matrix_plot.shape

f, ax = plt.subplots(figsize=(10, 8))

# Heatmap with a diverging colormap centred on zero and a shrunken color bar
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr_matrix_plot, cmap=cmap, center=0, linewidths=.25, cbar_kws={"shrink": 0.6})

# Ticks sit at the cell centres; the label size shrinks as the matrix grows
ax.set_yticks([row + 0.5 for row in range(n_rows)])
ax.set_yticklabels(list(corr_matrix_plot.index), size = int(160 / n_rows));
ax.set_xticks([col + 0.5 for col in range(n_cols)])
ax.set_xticklabels(list(corr_matrix_plot.columns), size = int(160 / n_cols));

plt.title(title, size = 14)
# Save the figure in both raster and vector form
plt.savefig('correlations.png')
plt.savefig('correlations.pdf')

# Investigations on Anemia

## Steps

1.  Drop irrelevant data
2. Add column for anemia
3. Find correlations
4. Train a model
5. Evaluate model


## 1. Drop irrelevant data

Dropping data that is irrelevant to anemia from a medical point of view. 

In [None]:
anemia_relevant = ['stratum_0', 'stratum_1', 'stratum_2',"test_salt_iodine",'record_code_iodine_1', 'record_code_iodine_2', 
                   'record_code_iodine_3',"age","weight","height","haemoglobin_level","bp_systolic",
                   "bp_diastolic","pulse_rate","glucose","duration_pregnanacy","bmi", 'male', 'female']
# In the standardized frame the numeric features carry a "std_" prefix while the
# indicator columns keep their names: indicator columns first, then the prefixed ones.
anemia_relevant_std = ([x for x in anemia_relevant if x not in ft_numeric]
                       + ["std_" + x for x in anemia_relevant if x in ft_numeric])
# .copy() gives independent frames, so the 'anemia' column that a later cell adds is
# written to them directly instead of to a slice of data_anemia / data_anemia_std
# (which triggers pandas' SettingWithCopyWarning).
data_anemia_red = data_anemia[anemia_relevant].copy()
data_anemia_std_red = data_anemia_std[anemia_relevant_std].copy()

display(data_anemia_red.head())
display(data_anemia_red.describe(include = 'all'))
display(data_anemia_std_red.head())
display(data_anemia_std_red.describe(include = 'all'))

### Create an anemia feature

Adding a column for anemia. For men, anemia is diagnosed when the haemoglobin level is less than 13 g/dL, and for women less than 12 g/dL.

In [None]:
# anemia = 1 for men with haemoglobin below 13.0 and for women with haemoglobin
# below 12.0; 0 for everyone else.
haemoglobin = data_anemia_red['haemoglobin_level']
anemic_men = (data_anemia_red['male'] == 1) & (haemoglobin < 13.0)
anemic_women = (data_anemia_red['female'] == 1) & (haemoglobin < 12.0)
data_anemia_red['anemia'] = np.where(anemic_men | anemic_women, 1, 0)
# the standardized frame reuses the label computed on the raw values
data_anemia_std_red['anemia'] = data_anemia_red.anemia
display(data_anemia_std_red.head()) #no need to find the borders here.
data_anemia_red.head()

In [None]:
# Pairwise correlations within the reduced anemia frames (raw and standardized),
# now including the 'anemia' label column.
data_anemia_red_corr = data_anemia_red.corr()
data_anemia_std_red_corr = data_anemia_std_red.corr()
display(data_anemia_std_red_corr)
display(data_anemia_red_corr)

In [None]:
# Per-sex views for plotting. `.where` keeps the frame's shape and sets non-matching
# rows to NaN, so the plotting code below has to drop/ignore NaNs.
# Women are selected with `female == 1`, the same test the anemia label uses, so
# rows that are neither male nor female no longer end up in the women's view.
data_anemia_women = data_anemia_red.where(data_anemia_red.female == 1)
data_anemia_men = data_anemia_red.where(data_anemia_red.male == 1)

In [None]:
fig = plt.figure(figsize = (15, 5))
# (subset, critical haemoglobin level, group name): women on the left, men on the right
panels = [(data_anemia_women, 12, 'women'), (data_anemia_men, 13, 'men')]
for position, (subset, critical_level, group) in enumerate(panels, start = 1):
    axes = fig.add_subplot(1, 2, position)
    # red band below the critical level, green band above it, histogram drawn on top
    axes.fill_between(x = [0, critical_level], y1= [0.28, 0.28], color = 'lightcoral' )
    axes.fill_between(x = [critical_level, 18], y1= [0.28, 0.28], color = 'lightgreen' )
    axes.hist(subset.haemoglobin_level.dropna(), density = True, bins = 20)
    axes.axvline(critical_level, color = 'red', label = 'critical haemoglobin level')
    axes.set_xlabel('haemoglobin [g/dl]')
    axes.set_ylabel('freq')
    axes.set_title('haemoglobin of ' + group + ', mean ' + str(np.nanmean(subset.haemoglobin_level)))
# the legend is drawn on the last (men's) panel only
axes.legend()
plt.savefig("anemia_men_women.pdf")
plt.show()

Now to get the data ready for training a model. There are many missing values, so we'll have to fill them in. I'll use imputation.

In [None]:
# The anemia label is derived from the haemoglobin level, so that column is removed
# from both frames.
data_anemia_red = data_anemia_red.drop(columns = 'haemoglobin_level')
data_anemia_std_red = data_anemia_std_red.drop(columns = 'std_haemoglobin_level')

In [None]:
def impute_data(data):
    """Return a copy of ``data`` with missing values filled by a default SimpleImputer.

    The default SimpleImputer fills each column with its mean. The result keeps the
    column labels and the index of ``data``.
    """
    filled_values = SimpleImputer().fit_transform(data)
    return pd.DataFrame(filled_values, columns = data.columns, index = data.index)

# Impute the raw and the standardized anemia frame independently.
data_anemia_red = impute_data(data_anemia_red)
data_anemia_std_red = impute_data(data_anemia_std_red)

In [None]:
# Preview both frames after imputation.
display(data_anemia_red.head())
data_anemia_std_red.head()

In [None]:
def create_sampled_train_test_split(data, label, test_size, under = True, random_state = None):
    """Split ``data`` into train and test parts and balance the classes of the train part.

    Parameters
    ----------
    data : pandas.DataFrame
        Features together with the label column; rows are sampled by label value 0 and 1.
    label : str
        Name of the label column.
    test_size
        Passed through to ``train_test_split``.
    under : bool, default True
        True: draw as many rows per class as the smaller class has, without replacement.
        False: draw as many rows per class as the larger class has, with replacement
        (both classes are resampled, not only the smaller one).
    random_state : optional, default None
        Seed forwarded to ``train_test_split`` and to both ``sample`` calls; the
        default None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train_balanced, X_test, y_train, y_test)`` with the label column removed
        from both feature frames. The test part is not rebalanced.
    """
    # a single split of the full frame is enough: the labels are popped from it below
    X_train, X_test = train_test_split(data, test_size=test_size, random_state=random_state)
    class_sizes = X_train.groupby(label)[label].count()
    if under:
        class_count, r = class_sizes.min(), False
    else:
        class_count, r = class_sizes.max(), True

    print("Classes before under- or oversampling")
    display(class_sizes)

    negative_cases = X_train[X_train[label] == 0].sample(n = class_count, replace = r, random_state = random_state)
    positive_sample = X_train[X_train[label] == 1].sample(n = class_count, replace = r, random_state = random_state)
    # sort_index() is now actually called; the bare `sort_index` attribute access
    # that stood here before had no effect
    X_train_balanced = pd.concat([negative_cases, positive_sample]).sort_index()
    print("Classes after under- or oversampling")
    display(X_train_balanced.groupby(label)[label].count())

    y_train = X_train_balanced.pop(label)
    y_test = X_test.pop(label)

    return X_train_balanced, X_test, y_train, y_test

X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_red, label = 'anemia', test_size = 0.2)

In [None]:
#Now train a model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its performance on the test data.

    Prints the model, the confusion matrix, the classification report and the
    accuracy (two decimals). Nothing is returned; ``model`` is fitted in place.
    """
    print("Evaluation of", model)
    fitted_model = model.fit(X_train, y_train)
    predictions = fitted_model.predict(X_test)
    print("Confusion matrix:\n%s" % confusion_matrix(y_test.values, predictions))
    print(classification_report(y_test, predictions))
    print("Accuracy: %.2f" % accuracy_score(y_test, predictions))

# Baseline: shallow random forest on the raw (non-standardized) anemia data.
baseline_forest = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
evaluate_model(baseline_forest, X_train, y_train, X_test, y_test)

In [None]:
# Repeat the random forest on the standardized data set (undersampled training split, the default).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_red, label = 'anemia', test_size = 0.2)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

In [None]:
# Random forest on the standardized data set with an oversampled training split (under = False).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_red, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

## Feature selection
Currently, the anemia dataset has 19 features. We should find and keep k best features to use. Let's try with k=13. Since both data_anemia_red and data_anemia_std_red currently include the anemia feature, doing that will give us 12 best features for classifying anemia.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
# k=13: both frames still contain the 'anemia' column itself (see the text above).
select_k_best_classifier = SelectKBest(score_func=f_classif, k=13)

# Raw data: fit the selector, then read the chosen column names from its support mask.
select_k_best_classifier.fit(data_anemia_red, data_anemia_red.anemia)
relevant_columns = data_anemia_red.columns[select_k_best_classifier.get_support()]
display(relevant_columns)

# Standardized data: the same selector object is refitted.
select_k_best_classifier.fit(data_anemia_std_red, data_anemia_std_red.anemia)
relevant_columns_std = data_anemia_std_red.columns[select_k_best_classifier.get_support()]
display(relevant_columns_std)

In [None]:
# Reduced data sets that contain only the columns chosen by the selector.
data_anemia_new = data_anemia_red[relevant_columns]
data_anemia_std_new = data_anemia_std_red[relevant_columns_std]
display(data_anemia_new.head())
display(data_anemia_std_new.head())

Now let's try training some models with the new datasets.

In [None]:
# Random forest on the selected raw features (undersampled training split, the default).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_new, label = 'anemia', test_size = 0.2)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

In [None]:
# Random forest on the selected standardized features (undersampled training split, the default).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_new, label = 'anemia', test_size = 0.2)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

In [None]:
# Random forest on the selected raw features with an oversampled training split (under = False).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_new, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

In [None]:
# Random forest on the selected standardized features with an oversampled training split (under = False).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_new, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0), X_train, y_train, X_test, y_test)

Let's try learning another model. How will a KNN classifier with n=3 perform relative to the random forest on an oversampled dataset?

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours on the selected raw features with an oversampled training split.
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_new, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(KNeighborsClassifier(n_neighbors=3), X_train, y_train, X_test, y_test)

That's way better! Let's try performing 10-fold cross-validation on our oversampled anemia dataset to see which n gives us the lowest validation error for KNN.

In [None]:
from sklearn.model_selection import cross_val_score

# odd neighbour counts from 1 to 49
neighbors = list(range(1, 50, 2))

# Mean 10-fold cross-validated accuracy for every candidate neighbour count.
# NOTE(review): X_train / y_train are whatever the previously run split cell left
# behind. If that split was oversampled (sampling with replacement), copies of the
# same row can land in both the training and the validation fold, which presumably
# favours very small n - confirm before trusting the optimum.
cv_scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors=n), X_train, y_train, cv=10, scoring='accuracy').mean()
    for n in neighbors
]

In [None]:
# Misclassification error is the complement of the mean accuracy.
errors = [1 - score for score in cv_scores]
# First neighbour count that reaches the lowest error.
optimal_n = min(zip(neighbors, errors), key = lambda pair: pair[1])[0]
print("The optimal number of neighbors is %d" % optimal_n)

# plot misclassification errors for each n
fig, ax = plt.subplots()
ax.plot(neighbors, errors)
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Misclassification Error')
plt.show()

In [None]:
# KNN with the selected neighbour count on the raw features, oversampled training split (fresh split).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_new, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(KNeighborsClassifier(n_neighbors=optimal_n), X_train, y_train, X_test, y_test)

In [None]:
# Same KNN on the standardized features, oversampled training split.
# NOTE(review): optimal_n was selected on a raw-feature split, not on standardized data.
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_new, label = 'anemia', test_size = 0.2, under = False)
evaluate_model(KNeighborsClassifier(n_neighbors=optimal_n), X_train, y_train, X_test, y_test)

In [None]:
# Same KNN on the raw features with an undersampled training split (the default).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_new, label = 'anemia', test_size = 0.2)
evaluate_model(KNeighborsClassifier(n_neighbors=optimal_n), X_train, y_train, X_test, y_test)

In [None]:
# Same KNN on the standardized features with an undersampled training split (the default).
X_train, X_test, y_train, y_test = create_sampled_train_test_split(data_anemia_std_new, label = 'anemia', test_size = 0.2)
evaluate_model(KNeighborsClassifier(n_neighbors=optimal_n), X_train, y_train, X_test, y_test)

The model that gave us the highest accuracy (84%) was KNN classifier with n = 1 on the oversampled anemia dataset with 12 best features.