In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt # plotting
from math import * # sqrt() etc
# with %matplotlib inline you turn on the immediate display.
# %matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the raw files found in ../input before any of them is loaded.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Gather Data

In [None]:
data_dictionary_loc = '../input/CAB_data_dictionary.xlsx'
# dtype=object keeps every cell of the data dictionary as it is stored in the spreadsheet.
data_dic = pd.read_excel(data_dictionary_loc, dtype = object)
# TODO: the 'File Content Description' column is truncated when rendered here
# (how to import the correct column width?); it can be viewed in full using other programs.
# The bare `data_dic['File Content Description']` expression that used to sit here was
# removed: only the last expression of a cell is displayed, so it had no visible effect.
data_dic

In [None]:
# low_memory=False is required because columns 14 and 43 hold mixed types.
data_u_pradesh = pd.read_csv(
    '../input/CAB_09_UP.csv',
    low_memory=False,
)
data_u_pradesh.head(5)

# Cleaning data

Subset of adults has 299 570 individuals

In [None]:
# Keep only rows whose age is recorded in years ('Y') and is at least 18.
data = data_u_pradesh.loc[
    data_u_pradesh['age_code'].eq('Y') & data_u_pradesh['age'].ge(18)
]
data.shape[0]

Original data had -1 for missing values

In [None]:
# Both the numeric and the string form of the -1 sentinel become NaN.
data = data.replace(to_replace=[-1, '-1'], value=np.nan)

Dropping columns that only apply to children under 5 years old (illness/treatment) or under 3 years old (feeding)

In [None]:
# Columns grouped by the age bracket named in each list.
cols_under5 = [
    'illness_type',
    'illness_duration',
    'treatment_type',
]
cols_under3 = [
    'first_breast_feeding',
    'is_cur_breast_feeding',
    'day_or_month_for_breast_feeding_',
    'day_or_month_for_breast_feeding',
    'water_month',
    'ani_milk_month',
    'semisolid_month_or_day',
    'solid_month',
    'vegetables_month_or_day',
]

In [None]:
# Remove both groups of child-only columns in a single pass.
data = data.drop(columns=cols_under5 + cols_under3)

Dropping unnecessary features
 - 'state_code'
 - 'PSU_ID' - This is a seven digit number to uniquely identify each record.
 - 'ahs_house_unit' - House Number
 - 'house_hold_no' - Household Number
 - 'record_code_iodine_reason' - Why was iodine testing refused
 - 'sl_no' - Each record of the Household has a serial no. 
 - 'usual_residance' (column name as spelled in the data) - Whether the member usually lives here
 - 'usual_residance_reason' (column name as spelled in the data) - Reason for member not being usual resident
 - 'identification_code' - Each member of a PSU is assigned a unique number
 - 'v54' ?

In [None]:
# Identifier and bookkeeping columns listed above.
data = data.drop(
    columns=[
        'state_code',
        'psu_id',
        'ahs_house_unit',
        'house_hold_no',
        'record_code_iodine_reason',
        'sl_no',
        'usual_residance',
        'usual_residance_reason',
        'identification_code',
        'v54',
    ]
)

From data dictionary:
- 'rural_urban' - Rural-1; Urban-2
- 'stratum' - 1 or 2 when 'rural_urban'=1, 0 when 'rural_urban'=2

dropping feature 'rural_urban', since 'stratum' contains the same information

I guess 'stratum' feature values:
- 0 - urban
- 1 - rural  
- 2 - very rural?

not specified in dictionary

In [None]:
# 'stratum' already encodes rural vs. urban, so 'rural_urban' is redundant.
data = data.drop(columns='rural_urban')

## Age related
From data dictionary:
- 'age_code' - unit of recording age
- 'age'
- 'date_of_birth' - DD
- 'month_of_birth' - MM
- 'year_of_birth' - YYYY

Dropping feature 'age_code' (values: Y, M, D for years, months, days), since the adult subset only contains ages recorded in years

In [None]:
# Show the distinct age units still present, then remove the column.
display(np.unique(data['age_code']))
data = data.drop(columns='age_code')

In [None]:
# Histogram of recorded ages; missing ages are left out.
plt.hist(data.age.dropna(), bins = 50)
plt.title('Age')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

## Iodine
From data dictionary:
- 'test_salt_iodine' - Salt used by the Household has been tested for Iodine content[Recorded as Parts Per Million(PPM)]
- 'record_code_iodine' - No iodine – 1; Less than 15 PPM – 2; More than or equal to 15 PPM – 3; No salt in Household – 4; Salt not tested  – 5

In [None]:
# Frequency of each iodine result code. The Series method replaces the
# top-level pd.value_counts helper, which recent pandas versions deprecate.
data['record_code_iodine'].value_counts()

## Height/weight
From data dictionary:
- 'weight_measured' - Measured-1;  Member - not present-2, Refused-3, Other-4
- 'weight_in_kg' - outcome
- 'length_height_measured' - Measured-1;  Member not present-2, Refused-3, Other-4
- 'length_height_code' - L- Length, H-Height
- 'length_height_cm' - outcome

Dropping unnecessary columns: the weight/height value is already NA when the measurement was not conducted

In [None]:
# Status/unit columns are not needed once the measured values themselves are kept.
data = data.drop(
    columns=['weight_measured', 'length_height_measured', 'length_height_code']
)

In [None]:
# Shorter names for the two body measurements.
# `index=str` was dropped: it converted every row label to a string as a side effect,
# which is unrelated to renaming columns.
data = data.rename(columns={"weight_in_kg": "weight", "length_height_cm": "height"})

In [None]:
# Box plot of all recorded weights before any outlier handling.
plt.boxplot(data['weight'].dropna())
plt.title('Weight with outliers')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# Box plot of all recorded heights before any outlier handling.
plt.boxplot(data['height'].dropna())
plt.title('Height with outliers')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# exclude any measurements where difference from median is larger than n_std (default 3) standard deviations
def remove_outliers(data, feature, n_std=3):
    """Blank out values of one column that lie far from the column median.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified.
    feature : str
        Name of the numeric column to filter.
    n_std : float, default 3
        Largest allowed distance from the median, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        One-column frame (column ``feature``, same index as ``data``) holding the
        original value where ``|value - median| <= n_std * stdev`` and NaN otherwise.

    Prints the number of non-missing values that were discarded.
    """
    values = data[feature]
    observed = values.dropna()
    # Population standard deviation (ddof=0), the same quantity as sqrt(np.var(...)).
    stdev = observed.std(ddof=0)
    median = observed.median()
    # A single mask drives both the report and the result, so they cannot disagree.
    # `<=` keeps values exactly on the boundary (only *larger* deviations are excluded),
    # which also stops a constant column (stdev == 0) from being wiped out.
    # Missing values compare False here and therefore simply stay missing.
    keep = (values - median).abs() <= n_std * stdev
    print("number of discarded measurements")
    # print instead of display: no dependency on the name IPython injects into notebooks.
    print(int((values.notna() & ~keep).sum()))
    return data[[feature]].where(keep, other = np.nan)

In [None]:
# Overwrite 'height' with the filtered column: heights far from the median
# (3 standard deviations) come back as NaN.
data['height'] = remove_outliers(data, 'height')

Removing weight outliers. NA for anything under 20kg

In [None]:
# Weights under 20 kg are blanked out.
print('number of discarded measurements')
display(len(data[data['weight']<20]))
# `>=` keeps a weight of exactly 20 kg, so the filter matches both the count above
# and the stated rule ("under 20kg"); the previous `>` blanked 20 kg without counting it.
data['weight'] = data['weight'].where(data['weight']>=20, other=np.nan)

In [None]:
# Box plot of weights after the low-weight values were blanked out.
plt.boxplot(data['weight'].dropna())
plt.title('Weight without outliers')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# Box plot of heights after outlier removal.
plt.boxplot(data['height'].dropna())
plt.title('Height without outliers')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

Body mass index: weight(kg)/(height(m) * height(m))

In [None]:
# Height is divided by 100 (cm -> m) and squared before dividing the weight by it.
data['bmi'] = data['weight'].div(data['height'].div(100).pow(2))

In [None]:
# Distribution of the remaining weights.
weight_values = data['weight'].dropna()
plt.hist(weight_values, bins=50)
plt.title(label='Weight without outliers')
plt.show()

In [None]:
# Distribution of the remaining heights.
height_values = data['height'].dropna()
plt.hist(height_values, bins=50)
plt.title(label='Height without outliers')
plt.show()

A lot of individuals with 130, 140, 150cm height

In [None]:
# Distribution of the derived body mass index.
bmi_values = data['bmi'].dropna()
plt.hist(bmi_values, bins=50)
plt.title(label='BMI')
plt.show()

Data cleaning steps for height/weight related data: 
- Discarded any height measurement lying more than 3 standard deviations from the median. This treats the height/weight distributions as approximately normal.
- Discarded any weight measurements under 20kg
- Calculated BMI

Discarded ~800 values for height, ~460 values for weight. Out of ~200 000

## Pulse, blood pressure(heart disease)
From data dictionary:
- 'bp_systolic'
- 'bp_systolic_2_reading'
- 'bp_diastolic'
- 'bp_diastolic_2reading'
- 'pulse_rate',
- 'pulse_rate_2_reading'

In [None]:
# distribution of measurement differences (plots currently disabled; uncomment one line to inspect)
#plt.hist((data['bp_systolic'] - data['bp_systolic_2_reading']).dropna(), bins = 50)
#plt.hist((data['pulse_rate'] - data['pulse_rate_2_reading']).dropna(), bins = 50)
#plt.hist((data['bp_diastolic'] - data['bp_diastolic_2reading']).dropna(), bins = 50)

In [None]:
# for features where two measurements were taken, exclude any where difference between measurements is larger than n_std (default 3) standard deviations
def remove_outliers_difference(data, col1, col2, n_std=3):
    """Blank out paired readings that disagree too strongly with each other.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both reading columns. It is not modified.
    col1, col2 : str
        Names of the first and second reading of the same quantity.
    n_std : float, default 3
        Largest allowed absolute difference between the two readings, expressed in
        standard deviations of the difference ``data[col1] - data[col2]``.

    Returns
    -------
    pandas.DataFrame
        Two-column frame (``col1``, ``col2``; same index as ``data``). A row keeps both
        original values when ``|col1 - col2| <= n_std * stdev``; otherwise both are NaN.
        Rows where either reading is missing also come back as NaN in both columns.

    Prints the number of complete pairs that were discarded.
    """
    difference = data[col1] - data[col2]
    # Sample standard deviation (ddof=1), the same quantity as sqrt(Series.var()).
    stdev = difference.std()
    # A single mask drives both the report and the result, so they cannot disagree.
    # `<=` keeps pairs exactly on the boundary, which also stops a column pair with
    # identical readings everywhere (stdev == 0) from being wiped out.
    keep = difference.abs() <= n_std * stdev
    # how many measurements were excluded
    print('number of discarded measurements')
    # print instead of display: no dependency on the name IPython injects into notebooks.
    print(int((difference.notna() & ~keep).sum()))
    # keep original values if difference of two measurements is within n_std standard deviations. NA otherwise
    return data[[col1, col2]].where(keep, other = np.nan)

In [None]:
# Apply the same paired-reading filter to each of the three double measurements, in order.
for first_reading, second_reading in [
    ('bp_systolic', 'bp_systolic_2_reading'),
    ('bp_diastolic', 'bp_diastolic_2reading'),
    ('pulse_rate', 'pulse_rate_2_reading'),
]:
    data[[first_reading, second_reading]] = remove_outliers_difference(
        data, first_reading, second_reading
    )

Now that outliers have been removed, aggregate remaining data by finding mean between two readings

In [None]:
# aggregate two readings by finding their mean
def aggregate_readings(data, col1, col2):
    """Replace two readings of the same quantity by their mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both reading columns. It is left untouched; callers must use
        the returned frame.
    col1 : str
        First reading; in the result this column holds the mean of both readings.
    col2 : str
        Second reading; this column is absent from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` where ``col1`` is ``(col1 + col2) / 2`` and ``col2`` is dropped.
        The mean is NaN whenever either reading is missing.
    """
    # Vectorised arithmetic gives the same numbers as a row-by-row apply over the whole
    # frame, without building a Python object per row.
    mean_reading = (data[col1] + data[col2]) / 2
    # assign() returns a new frame and keeps col1 in its original position.
    return data.assign(**{col1: mean_reading}).drop(columns=col2)

In [None]:
# Collapse each pair of readings into a single averaged column, in order.
for first_reading, second_reading in [
    ('bp_systolic', 'bp_systolic_2_reading'),
    ('bp_diastolic', 'bp_diastolic_2reading'),
    ('pulse_rate', 'pulse_rate_2_reading'),
]:
    data = aggregate_readings(data, first_reading, second_reading)

Systolic - beating, diastolic - resting blood pressure. Likely input/measurement error where systolic < diastolic

In [None]:
# Keep both pressures only on rows where diastolic is strictly below systolic;
# every other row gets NaN in both columns (the default fill of `where`).
data[['bp_diastolic', 'bp_systolic']] = data[['bp_diastolic', 'bp_systolic']].where(
    data['bp_diastolic'].lt(data['bp_systolic'])
)

Data cleaning steps for heart disease related data: 
- Discarded any pair of readings whose absolute difference was larger than 3 standard deviations of the difference between the two measurements. This treats the measurement differences as approximately normally distributed.
- Aggregated two measurements by finding mean
- Discarded any where diastolic pressure was higher than systolic

Lost less than 5% of values for each feature

## Haemoglobin(anemia)
From data dictionary:
- 'haemoglobin_test' - Consent for Haemoglobin test (Yes-1; No-2)
- 'haemoglobin'- Status of Haemoglobin Test (Measured-1; Member not present-2; Refused-3, Other-4)
- 'haemoglobin_level' - Outcome of Haemoglobin Level (Hb) Test (in percentage gms)  

In [None]:
# Consent and test-status columns are dropped; the measured level stays.
data = data.drop(columns=['haemoglobin_test', 'haemoglobin'])

In [None]:
# Histogram of haemoglobin levels; dropna() matches how the other plots skip missing values.
plt.hist(data.haemoglobin_level.dropna(), bins=50)
plt.title('Blood haemoglobin')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

## Blood sugar(diabetes)
From data dictionary:
- 'diabetes_test' - consent for testing
- 'fasting_blood_glucose' - Measured-1; Member not present-2; Refused-3; Other-4
- 'fasting_blood_glucose_mg_dl' - outcome of test

In [None]:
# Consent and test-status columns are dropped; the measured glucose value stays.
data = data.drop(columns=['diabetes_test', 'fasting_blood_glucose'])

In [None]:
# Histogram of fasting blood glucose; dropna() matches how the other plots skip missing values.
plt.hist(data.fasting_blood_glucose_mg_dl.dropna(), bins=50)
plt.title('Blood sugar')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# Box plot of fasting blood glucose before outlier removal.
plt.boxplot(data.fasting_blood_glucose_mg_dl.dropna())
plt.title('Blood sugar')
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# Overwrite the glucose column with the filtered one: values far from the median
# (3 standard deviations) come back as NaN.
data['fasting_blood_glucose_mg_dl'] = remove_outliers(data,'fasting_blood_glucose_mg_dl')

## Features only applicable to women
From data dictionary:
- 'marital_status' - Never married=1,Married but Gauna not performed=2, Married and Gauna perfomed=3, Remarried=4,Widow=5, Divorced=6, Separated=7, Not stated=8
- 'gauna_perfor_not_perfor' - Pregnant-1; Lactating-2; Non-pregnant or Non-lactating-3
- 'duration_pregnanacy' - Duration of pregnancy/lactation (in months)

In [None]:
# Columns described above as applicable to women only.
cols_women = [
    'marital_status',
    'gauna_perfor_not_perfor',
    'duration_pregnanacy',
]

Placing NA where marital status is 'Not stated' (code 8)

In [None]:
# Every value different from 8.0 is kept; 8.0 itself becomes NaN (the default fill of `where`).
data['marital_status'] = data['marital_status'].where(data['marital_status'].ne(8.0))

In [None]:
# input errors have to be dealt with
plt.boxplot(data['duration_pregnanacy'].dropna())
# plt.show must be called: the bare name only echoed the function object as cell output.
plt.show()

In [None]:
# Correlation of every numeric feature with the five health outcomes.
# Numeric (and boolean) columns are selected explicitly because recent pandas versions
# raise on non-numeric columns in corr() instead of skipping them silently.
corr = data.select_dtypes(include=['number', 'bool']).corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'fasting_blood_glucose_mg_dl']]
# Show only coefficients with absolute value above 0.1; everything else appears as NaN.
corr.where(abs(corr)>0.1)

Removing features where there's no correlation

In [None]:
# Drop the listed features, then recompute the correlation table on the remainder.
data_correlated = data.drop(['district_code', 'stratum', 'test_salt_iodine', 'record_code_iodine', 'date_of_birth', 'month_of_birth', 'duration_pregnanacy'], axis = 1)
# Numeric (and boolean) columns are selected explicitly because recent pandas versions
# raise on non-numeric columns in corr() instead of skipping them silently.
corr = data_correlated.select_dtypes(include=['number', 'bool']).corr()[['haemoglobin_level', 'pulse_rate', 'bp_diastolic', 'bp_systolic', 'fasting_blood_glucose_mg_dl']]
# Show only coefficients with absolute value above 0.1; everything else appears as NaN.
corr.where(abs(corr)>0.1)

## Summary
From 53 initial features to 21

TODO:
- A lot of individuals with 130, 140, 150cm height value??