## Implementing a baseline machine learning model

In [40]:
import pandas as pd

In [41]:
demographics = pd.read_csv('Demographics.csv')

In [42]:
ml = demographics.loc[:,'GENDER': 'HEART_DEATH_FLAG']
del ml['DOB']
del ml['DOD']
del ml['DOA']
ml.head()

Unnamed: 0,GENDER,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,OUTSIDE_DEATH_FLAG,DEATH_FLAG,OLD_FLAG,HEART_ATTACK_FLAG,ATHERO_DIAGNOSIS_FLAG,HEART_DEATH_FLAG
0,M,72.312329,WHITE,MARRIED,UKNOWN,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,1,0,1,0,0,0,0
1,F,,WHITE,UKNOWN,UKNOWN,NOT SPECIFIED,Medicare,EMERGENCY ROOM ADMIT,1,0,1,1,0,0,0
2,M,55.241096,WHITE,MARRIED,UKNOWN,CATHOLIC,Private,CLINIC REFERRAL/PREMATURE,2,0,1,0,0,0,0
3,M,73.539726,WHITE,SINGLE,UKNOWN,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,0,1,0,0,0,0
4,F,44.512329,WHITE,MARRIED,UKNOWN,OTHER,Private,EMERGENCY ROOM ADMIT,2,0,1,0,0,0,0


In [43]:
# Get numerical data
ml_data = pd.get_dummies(ml, columns=['GENDER','ETHNICITY','MARITAL_STATUS', 'LANGUAGE', 'RELIGION', 'INSURANCE', 'ADMISSION_LOCATION'])
ml_data = ml_data[ml_data['OLD_FLAG']==0]

In [44]:
# Reduce population to only those with ages
ml_data = ml_data[ml_data['OLD_FLAG']==0]

In [45]:
# Produce output data sets to create models
heart_attacks = ml_data['HEART_ATTACK_FLAG']
athero_diagnosis = ml_data['ATHERO_DIAGNOSIS_FLAG']
deaths = ml_data['DEATH_FLAG']
heart_deaths = ml_data['HEART_DEATH_FLAG']

In [46]:
# Predict just deaths on non-diagnostic data
del ml_data['HEART_ATTACK_FLAG']
del ml_data['ATHERO_DIAGNOSIS_FLAG']
del ml_data['DEATH_FLAG']
del ml_data['HEART_DEATH_FLAG']
del ml_data['OLD_FLAG']
del ml_data['OUTSIDE_DEATH_FLAG']

ml_data.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,GENDER_F,GENDER_M,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE,ETHNICITY_ASIAN,ETHNICITY_ASIAN - ASIAN INDIAN,ETHNICITY_ASIAN - CAMBODIAN,ETHNICITY_ASIAN - CHINESE,...,INSURANCE_Self Pay,ADMISSION_LOCATION_** INFO NOT AVAILABLE **,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR,ADMISSION_LOCATION_TRSF WITHIN THIS FACILITY
0,72.312329,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,55.241096,2,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,73.539726,2,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,44.512329,2,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,81.627397,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [47]:
# Create randomly undersampled data set
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(ml_data, deaths)

In [48]:
# Check sampling numbers
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())
print(deaths.value_counts())

1    14320
0    14320
dtype: int64
0    30250
1    14320
Name: DEATH_FLAG, dtype: int64


In [49]:
# Create and test an XGBoost with 5 fold cross validation for predicting death on this model with no hyperparameter 
# optimization
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, X_resampled, y_resampled, cv=5)
scores.mean()

0.7068435754189945

In [34]:
# Same process for predicting heart attacks
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(ml_data, heart_attacks)

In [35]:
# Check sampling numbers
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())
print(deaths.value_counts())

1    121
0    121
dtype: int64
0    30250
1    14320
Name: DEATH_FLAG, dtype: int64


So probably not large enough #s for heart attacks alone, instead of undersampling will use SMOTE to oversample

In [36]:
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_sample(ml_data, heart_attacks)

In [37]:
y_resampled = pd.Series(y_resampled)
print(y_resampled.value_counts())

1    44449
0    44449
dtype: int64


In [77]:
base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, X_resampled, y_resampled, cv=5)
scores.mean()

0.7068435754189945

High accuracy with SMOTE, likely overfitting...

#### Next lets look at a model for atherosclerosis death predictions

In [82]:
# First, define atherosclerosis diagnoses from non-atherosclerosis diagnoses
athero_pre = demographics[demographics['OLD_FLAG']==0]
athero_pos = athero_pre[athero_pre['ATHERO_DIAGNOSIS_FLAG']== 1]
athero_neg = athero_pre[athero_pre['ATHERO_DIAGNOSIS_FLAG']==0]

# Clean data sets
del athero_neg['CAUSE']
del athero_pos['CAUSE']

del athero_neg['ATHERO_DIAGNOSIS_FLAG']
del athero_pos['ATHERO_DIAGNOSIS_FLAG']

del athero_neg['OLD_FLAG']
del athero_pos['OLD_FLAG']

del athero_neg['OUTSIDE_DEATH_FLAG']
del athero_pos['OUTSIDE_DEATH_FLAG']

del athero_neg['SUBJECT_ID']
del athero_pos['SUBJECT_ID']

del athero_neg['DOB']
del athero_pos['DOB']

del athero_neg['DOD']
del athero_pos['DOD']

In [83]:
athero_pos['DOA']
del athero_pos['DOA']
del athero_neg['DOA']

athero_neg['HEART_ATTACK_FLAG']
del athero_neg['HEART_ATTACK_FLAG']
del athero_pos['HEART_ATTACK_FLAG']

del athero_pos['Unnamed: 0']

In [84]:
len(athero_pos)

10203

In [85]:
athero_pos.head()

Unnamed: 0,GENDER,ADMIT_AGE,ETHNICITY,MARITAL_STATUS,LANGUAGE,RELIGION,INSURANCE,ADMISSION_LOCATION,#ADMISSIONS,DEATH_FLAG,HEART_DEATH_FLAG
9,M,69.641096,WHITE,MARRIED,UKNOWN,CATHOLIC,Private,TRANSFER FROM HOSP/EXTRAM,4,1,0
12,F,69.005479,WHITE,MARRIED,ENGL,PROTESTANT QUAKER,Medicare,EMERGENCY ROOM ADMIT,2,1,0
17,M,87.882192,WHITE,MARRIED,UKNOWN,JEWISH,Medicare,EMERGENCY ROOM ADMIT,2,1,0
19,F,76.871233,WHITE,MARRIED,PORT,CATHOLIC,Medicare,TRANSFER FROM HOSP/EXTRAM,4,1,1
22,F,85.726027,BLACK/AFRICAN AMERICAN,WIDOWED,UKNOWN,CATHOLIC,Medicare,EMERGENCY ROOM ADMIT,2,1,0


In [65]:
# Create Outcome data sets
athero_heartdeath = pd.Series(athero_pos['HEART_DEATH_FLAG'])
athero_death = pd.Series(athero_pos['DEATH_FLAG'])

In [86]:
del athero_pos['HEART_DEATH_FLAG']
del athero_pos['DEATH_FLAG']

In [87]:
# Get dummies
athero_pos = pd.get_dummies(athero_pos, columns=['GENDER','ETHNICITY','MARITAL_STATUS', 'LANGUAGE', 'RELIGION', 'INSURANCE', 'ADMISSION_LOCATION'])

In [88]:
# Check outcome numbers
print(athero_heartdeath.value_counts())
print(athero_death.value_counts())

0    10001
1      202
Name: HEART_DEATH_FLAG, dtype: int64
0    6576
1    3627
Name: DEATH_FLAG, dtype: int64


In [89]:
base_model_XG = GradientBoostingClassifier()
scores = cross_val_score(base_model_XG, athero_pos, athero_death, cv=5)
scores.mean()

0.6961760077878063

#### So for athero diagnosed patients, we are able to predict with 0.6961 5 fold CV accuracy

In [90]:
athero_pos.head()

Unnamed: 0,ADMIT_AGE,#ADMISSIONS,GENDER_F,GENDER_M,ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE,ETHNICITY_ASIAN,ETHNICITY_ASIAN - ASIAN INDIAN,ETHNICITY_ASIAN - CAMBODIAN,ETHNICITY_ASIAN - CHINESE,ETHNICITY_ASIAN - FILIPINO,...,INSURANCE_Medicaid,INSURANCE_Medicare,INSURANCE_Private,INSURANCE_Self Pay,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,ADMISSION_LOCATION_TRANSFER FROM SKILLED NUR
9,69.641096,4,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
12,69.005479,2,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
17,87.882192,2,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
19,76.871233,4,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
22,85.726027,2,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
