In [1]:
import pandas as pd
import numpy as np
#pd.set_option("display.max_rows", None, "display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the full training set in a single call. The earlier version opened a
# chunked reader and immediately drained it with .read(), then called
# reindex() with no arguments, which changes nothing.
# NOTE(review): this is an absolute, user-specific path -- consider a configurable data directory.
# NOTE(review): the file appears to carry a saved index column ("Unnamed: 0")
# that stays in df as a numeric column; confirm that is intended.
df = pd.read_csv('/Users/chiufengyap/OneDrive - The University of Texas Health Science Center at Houston/Humana_Mays_2020/2020_Competition_Training.csv')
df.head()


Unnamed: 0,person_id_syn,transportation_issues,src_platform_cd,sex_cd,est_age,smoker_current_ind,smoker_former_ind,lang_spoken_cd,mabh_seg,cci_score,...,submcc_rar_scl_ind,rx_gpi2_74_ind,rx_gpi2_89_ind,rx_gpi2_96_ind,submcc_rsk_obe_ind,rx_gpi2_22_ind,submcc_rsk_synx_ind,submcc_rsk_coag_ind,submcc_rsk_othr_ind,submcc_rsk_chol_ind
0,0002MOb79ST17bLYAe46eIc2,0,EM,F,62,1,0,ENG,UNK,3.0,...,0,0,0,0,0,0,0,0,0,0
1,0004cMOS6bTLf34Y7AIca8f3,0,EM,F,59,1,0,ENG,C2,1.0,...,0,0,0,0,1,0,0,0,0,1
2,000536M9O3ST98LaYaeA29Ia,1,EM,F,63,0,0,ENG,UNK,3.0,...,0,0,0,0,0,0,0,0,0,1
3,0009bMO9SfTLYe77A51I4ac3,0,EM,M,75,0,0,ENG,H6,3.0,...,0,0,0,0,1,0,0,0,0,1
4,000M7OeS66bTL8bY89Aa16Ie,0,EM,M,51,1,0,ENG,UNK,3.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Keep the target column as its own Series and show how many rows it has.
y = df.loc[:, 'transportation_issues']
y.shape

In [None]:
# Remove the zip code and the synthetic person id columns from df.
df = df.drop(columns=['zip_cd', 'person_id_syn'])

In [None]:
# Display (rows, columns) of df at this point in the notebook.
df.shape

In [None]:
#impute missing values
from sklearn.impute import SimpleImputer

#get categorical features (object dtype columns)
categorical_features = df.select_dtypes(include=['object']).columns
print(categorical_features)

# Convert only the non-missing entries to str so each column has one type,
# while real NaN stays NaN. A blanket astype(str) turns NaN into the text
# 'nan', which the imputer then treats as an ordinary category.
cat_values = df[categorical_features]
cat_values = cat_values.astype(str).where(cat_values.notna())

# Fit and apply the most-frequent imputer on the SAME prepared data.
cat = SimpleImputer(strategy='most_frequent')
cat1 = cat.fit(cat_values)
df[categorical_features] = cat1.transform(cat_values)

In [None]:
# Median-impute every int64 / float64 column of df.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
num = SimpleImputer(strategy='median', copy=False)
num1 = num.fit(df.loc[:, numeric_features])
imputed_numeric = num1.transform(df.loc[:, numeric_features])
df[numeric_features] = imputed_numeric

In [None]:
# One-hot encode every object column, attach the indicator columns to df by
# index, then remove the original categorical columns. The result is X.
categorical_features = df.select_dtypes(include=['object']).columns
X = pd.get_dummies(df[categorical_features], prefix_sep='_')
X = df.merge(X, how='outer', left_index=True, right_index=True).drop(
    columns=categorical_features
)
X.head()

## feature engineering

In [None]:
# NOTE(review): the numeric cut-offs in this cell were described as 75th-percentile values -- confirm against the data.

# 1 when the medical ER visit rate exceeds 0.1, else 0
X['med_er_visit_high'] = X['med_er_visit_ct_pmpm'].gt(0.1).astype(int)

# 1 when the total ER visit rate exceeds 0.1, else 0
X['total_er_visit_high'] = X['total_er_visit_ct_pmpm'].gt(0.1).astype(int)

# sum of the two ER visit rates
X['med_plus_total_er_visit'] = X[['med_er_visit_ct_pmpm', 'total_er_visit_ct_pmpm']].sum(axis=1, skipna=False)

#combine codes for superficial injuries (likely falls)
#X['ccsp_236_239'] = X['ccsp_239_ind'] + X['ccsp_236_ind']

#combine low income indicators
#X['low_inc'] = X['cms_dual_eligible_ind'] + X['cms_low_income_ind']

#combine behavioral health indicators
#X['bh_tot'] = X['bh_cdto_ind'] + X['bh_bipr_ind'] + X['bh_dema_ind'] + X['bh_cdsb_ind'] + X['bh_aoth_ind']

# 1 when the CMS part D risk-adjustment factor exceeds 1.32, else 0
X['cms_partd_high_risk'] = X['cms_partd_ra_factor_amt'].gt(1.32).astype(int)

# 1 when the CMS MA risk score exceeds 1.34, else 0
X['cms_ma_high_risk'] = X['cms_ma_risk_score_nbr'].gt(1.34).astype(int)

# square root of the total MA payment amount
X['cms_tot_ma_payment_amt_sqrt'] = np.sqrt(X['cms_tot_ma_payment_amt'])

In [None]:
#additional features

# low_inc                                 154.072680
# est_age                                 119.421419
# ccsp_236_239                             80.907964
# cms_tot_partd_payment_amt                78.778600
# med_ambulance_visit_ct_pmpm              62.059773
# total_ambulance_visit_ct_pmpm            52.921809
# cms_rx_risk_score_nbr                    50.098201
# cms_disabled_ind                         48.187203
# bh_tot                                   47.511346

In [None]:
# Bucket cms_rx_risk_score_nbr into ordinal groups 4 (highest) .. 1 (lowest).
# np.select uses the first matching condition, so a value sitting on a shared
# boundary lands in the higher group; rows matching nothing get 0.
rx_risk = X['cms_rx_risk_score_nbr']
conditions = [
    rx_risk.ge(1.84),
    rx_risk.between(1.04, 1.84),
    rx_risk.between(0.24, 1.04),
    rx_risk.between(0, 0.24),
]
values = [4, 3, 2, 1]
X['cms_rx_risk_score_grp'] = np.select(conditions, values)
# show how many rows fall in each group
X['cms_rx_risk_score_grp'].value_counts()

In [None]:
# 1 when the rx risk score is above 1.83, else 0; then count each value.
X['cms_rx_risk_score_hi'] = X['cms_rx_risk_score_nbr'].gt(1.83).astype(int)
X['cms_rx_risk_score_hi'].value_counts()

In [None]:

# Indicator columns for age bands; each band is open on both ends as written
# (e.g. age_80to90 means strictly above 79 and strictly below 90).
est_age = X['est_age']
X['age_90plus'] = est_age.gt(89).astype(int)
X['age_80to90'] = (est_age.gt(79) & est_age.lt(90)).astype(int)
X['age_70to80'] = (est_age.gt(69) & est_age.lt(80)).astype(int)
X['age_60to70'] = (est_age.gt(59) & est_age.lt(70)).astype(int)

In [None]:

# 1 when the total part D payment amount exceeds 243, else 0.
X['cms_tot_partd_payment_hi'] = X['cms_tot_partd_payment_amt'].gt(243).astype(int)

In [None]:
# 1 when the medical ambulance visit rate exceeds 0.08, else 0.
X['med_ambulance_visit_ct_hi'] = X['med_ambulance_visit_ct_pmpm'].gt(.08).astype(int)

In [None]:
# 1 when the total ambulance visit rate exceeds 0.08, else 0.
X['total_ambulance_visit_ct_hi'] = X['total_ambulance_visit_ct_pmpm'].gt(.08).astype(int)

In [None]:
# Sum of the total and medical ambulance visit rates.
X['tot_ambulance_visit_pmpm'] = X['total_ambulance_visit_ct_pmpm'].add(X['med_ambulance_visit_ct_pmpm'])

In [None]:
# Display (rows, columns) of X after the engineered columns were added.
X.shape

In [None]:
# Restrict X using lim_cols and show the resulting shape.
# NOTE(review): lim_cols is not defined in any cell above -- on a fresh kernel
# this raises NameError; presumably it is a list of column names, confirm and define it.
X = X[lim_cols]
X.shape

In [None]:
# Preview the first rows of X.
X.head()

In [None]:
# Store the target Series as column 'y' of X (pandas aligns it on the index), then preview.
X['y'] = y
X.head()

## outlier detection

In [None]:
from sklearn.ensemble import IsolationForest

# fit_predict fits the forest and labels every row in one pass (1 = inlier,
# -1 = outlier). The earlier separate .fit(X) trained the same seeded forest a
# second time for no benefit.
# NOTE(review): X still contains the label column 'y' here -- confirm the label
# should take part in outlier detection.
clf = IsolationForest(random_state=0)
outliers = clf.fit_predict(X)
outliers

In [None]:
# Print the sum of the +1/-1 labels (inliers minus outliers) and the row count.
label_total = np.sum(outliers)
print(label_total, len(outliers))

In [None]:
# Add the IsolationForest labels as a column of X (array order matches row order), then preview.
X['outliers'] = outliers
X.head()

In [None]:
# Keep the rows labelled -1 (outliers) in their own frame.
X_out = X.loc[X['outliers'].eq(-1)]
X_out.head()

In [None]:
# Keep only the rows labelled 1 (inliers) and discard the helper column.
X = X.loc[X['outliers'].eq(1)].drop(columns='outliers')

In [None]:
# Preview X after the outlier rows were removed.
X.head()

# feature engineering with featuretools

In [None]:
import featuretools as ft

# Copy of X with its columns in sorted name order; used as the featuretools entity.
sorted_columns = sorted(X.columns)
entity = X.reindex(columns=sorted_columns)
entity.head()

In [None]:
# First 13 columns of the sorted entity frame.
# NOTE(review): presumably these are the betos_* columns -- confirm against the sorted column list.
betos = entity.iloc[:, :13]
betos.head()

In [None]:
# Create the 'clients' EntitySet and register the full frame as entity 'X',
# asking featuretools to create an index column named 'index'.
es = ft.EntitySet(id='clients').entity_from_dataframe(
    entity_id='X',
    dataframe=entity,
    make_index=True,
    index='index',
)

In [None]:
# Display the EntitySet summary.
es

In [None]:
# Register the 13-column slice as entity 'betos' with 'index' as its index column.
# NOTE(review): betos was sliced from entity before an 'index' column was requested,
# and make_index is not passed here -- confirm how featuretools resolves the index.
es = es.entity_from_dataframe(entity_id = 'betos', dataframe = betos, 
                              index='index')

In [None]:
# Run deep feature synthesis on the 'betos' entity with featuretools' default
# primitives, depth 2, using all available cores.
dfs_options = dict(
    entityset=es,
    target_entity='betos',
    max_depth=2,
    verbose=1,
    n_jobs=-1,
)
feature_matrix, feature_defs = ft.dfs(**dfs_options)

In [None]:
# Start a fresh EntitySet holding only the 'betos' entity, then rerun deep
# feature synthesis restricted to the add_numeric and percentile primitives.
# This replaces feature_matrix and feature_defs from the previous run.
es = ft.EntitySet(id='Turnover')
es.entity_from_dataframe(entity_id='betos', dataframe=betos, index='index')

selected_primitives = ['add_numeric', 'percentile']
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='betos',
    trans_primitives=selected_primitives,
    verbose=True,
    max_depth=3,
)

In [None]:
# Display the generated feature matrix.
feature_matrix

In [None]:
# Look at rows 30-49 of the table of available featuretools primitives.
f = ft.list_primitives()
f.iloc[30:50]

In [None]:
# Outer-join the generated features onto X by index.
# NOTE(review): X keeps its original row labels after outlier removal while
# feature_matrix is indexed by the featuretools 'index' column -- confirm the
# two indexes actually refer to the same rows.
X = X.merge(feature_matrix, how='outer', left_index=True, right_index=True)
X.head()

## test train split

In [None]:
# (Re)store the target Series as column 'y' of X, aligned on the index, then preview.
X['y'] = y
X.head()

In [None]:
#split into test and train sets
from sklearn.model_selection import train_test_split

# Take the label by name. The positional X.iloc[:, -1] silently picks an
# arbitrary feature whenever 'y' is not the last column (for example after new
# columns were merged onto X). X itself still carries 'y'; later cells split on it.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, X['y'], test_size = 0.20,random_state = 12)
print(len(X_Train), len(Y_Train), len(X_Test), len(Y_Test))

In [None]:
# Preview the training rows.
X_Train.head()

In [None]:
#use only after limiting columns based on feature importance for next step
# Make sure X_Train carries the label exactly once, as the LAST column named 'y'.
# Merging Y_Train on the index produced 'y_x'/'y_y' when X_Train already had
# 'y', or put 'y' first when it did not, and re-sorted the rows. Later cells
# read X_Train['y'] and treat the last column as the label.
X_Train = X_Train.drop(columns='y', errors='ignore').assign(y=Y_Train)
X_Train.head()

In [None]:
# 'transportation_issues' is the source of the label y, so it must not stay in
# the feature matrices. Remove it from BOTH splits: dropping it from the test
# rows only leaves the training rows with a leaking column the test set lacks.
# errors='ignore' keeps the cell safe to re-run.
X_Train = X_Train.drop(columns='transportation_issues', errors='ignore')
X_Test = X_Test.drop(columns='transportation_issues', errors='ignore')

## bagging xgboost process

In [None]:
#random undersampling
#tomek undersampling
#fit xgb models and make predictions
#bagging classifiers

In [None]:
# Separate the training rows by class (y == 0 and y == 1) ahead of
# undersampling the y == 0 rows, and print both shapes.
import random

is_positive = X_Train['y'] == 1
is_negative = X_Train['y'] == 0
X_Train_0 = X_Train[is_negative]
X_Train_1 = X_Train[is_positive]

print(X_Train_0.shape, X_Train_1.shape)

In [None]:
#create random samples of majority class
# Ten draws (without replacement within a draw) from the y == 0 rows. Each draw
# has its own fixed seed, so the bags differ from one another but are
# reproducible after a kernel restart.
# NOTE(review): 12285 presumably equals the number of y == 1 training rows -- confirm.
N_MAJORITY_SAMPLE = 12285
X_Train_0_1 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=1)
X_Train_0_2 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=2)
X_Train_0_3 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=3)
X_Train_0_4 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=4)
X_Train_0_5 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=5)
X_Train_0_6 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=6)
X_Train_0_7 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=7)
X_Train_0_8 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=8)
X_Train_0_9 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=9)
X_Train_0_10 = X_Train_0.sample(N_MAJORITY_SAMPLE, random_state=10)
X_Train_0_10.shape

In [None]:
def _with_minority(majority_sample):
    """Stack one y == 0 sample on top of all y == 1 training rows."""
    return pd.concat([majority_sample, X_Train_1])


# one combined training frame per y == 0 sample
X_Train_samp1 = _with_minority(X_Train_0_1)
X_Train_samp2 = _with_minority(X_Train_0_2)
X_Train_samp3 = _with_minority(X_Train_0_3)
X_Train_samp4 = _with_minority(X_Train_0_4)
X_Train_samp5 = _with_minority(X_Train_0_5)
X_Train_samp6 = _with_minority(X_Train_0_6)
X_Train_samp7 = _with_minority(X_Train_0_7)
X_Train_samp8 = _with_minority(X_Train_0_8)
X_Train_samp9 = _with_minority(X_Train_0_9)
X_Train_samp10 = _with_minority(X_Train_0_10)
X_Train_samp10.shape

In [None]:
#tomek undersampling
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(sampling_strategy='majority')


def _tomek_resample(sample):
    """Split one combined frame into features and label 'y', then apply Tomek-link undersampling.

    The label is selected by name. Splitting by position (last column) silently
    uses a feature as the label whenever 'y' is not the final column.
    Returns the (features, label) pair produced by fit_resample.
    """
    return tl.fit_resample(sample.drop(columns='y'), sample['y'])


X_Train_res1, Y_Train_res1 = _tomek_resample(X_Train_samp1)
X_Train_res2, Y_Train_res2 = _tomek_resample(X_Train_samp2)
X_Train_res3, Y_Train_res3 = _tomek_resample(X_Train_samp3)
X_Train_res4, Y_Train_res4 = _tomek_resample(X_Train_samp4)
X_Train_res5, Y_Train_res5 = _tomek_resample(X_Train_samp5)
X_Train_res6, Y_Train_res6 = _tomek_resample(X_Train_samp6)
X_Train_res7, Y_Train_res7 = _tomek_resample(X_Train_samp7)
X_Train_res8, Y_Train_res8 = _tomek_resample(X_Train_samp8)
X_Train_res9, Y_Train_res9 = _tomek_resample(X_Train_samp9)
X_Train_res10, Y_Train_res10 = _tomek_resample(X_Train_samp10)

In [None]:
# Remove the label column from the test features.
X_Test = X_Test.drop(columns='y')

In [None]:
# Share of label-1 rows in the first resampled training set.
positive_share = np.sum(Y_Train_res1) / len(Y_Train_res1)
print(positive_share)

In [None]:
# bg collects the true test labels (column 'y') on a fresh 0..n-1 index; the
# per-model probabilities are added to it as further columns later.
yt = Y_Test.to_numpy()
bg = pd.DataFrame(yt, columns=['y'])

## run series of 10 models

In [None]:
#run model on group 1
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    recall_score,
    roc_auc_score,
)

# Fit on resampled set 1; early stopping monitors AUC on (X_Test, Y_Test).
# NOTE(review): the same test rows are used for early stopping and for the
# reported metrics -- confirm that is acceptable.
eval_set = [(X_Test, Y_Test)]
est1 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est1.fit(
    X_Train_res1,
    Y_Train_res1,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est1.predict(X_Test)
test_preds1 = est1.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds1[:, 1])
bg['preds1'] = test_preds1[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#group 2
# Fit on resampled set 2; early stopping monitors AUC on (X_Test, Y_Test).
est2 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
eval_set = [(X_Test,Y_Test)]
est2.fit(X_Train_res2, Y_Train_res2, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set)

print('Train Results')
# predict once and reuse the labels for every label-based metric
predict_val = est2.predict(X_Train_res2)
print(est2.score(X_Train_res2, Y_Train_res2))
print(recall_score(Y_Train_res2, predict_val))
# 'AUC score' is computed from hard 0/1 labels, 'AUC prob' from probabilities
auc_val = roc_auc_score(Y_Train_res2, predict_val)
print('AUC score : ',auc_val)
print(confusion_matrix(Y_Train_res2, predict_val))
print(classification_report(Y_Train_res2, predict_val, digits=3))
predict_proba_train = est2.predict_proba(X_Train_res2)
auc_prob = roc_auc_score(Y_Train_res2, predict_proba_train[:,1])
print('AUC prob : ',auc_prob)

# '\n' (not '/n') so the header is preceded by a blank line
print('\nTest Results')
predict_val = est2.predict(X_Test)
print(est2.score(X_Test, Y_Test))
print(recall_score(Y_Test, predict_val))
auc_val = roc_auc_score(Y_Test, predict_val)
print('AUC score : ',auc_val)
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
test_preds2 = est2.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds2[:,1])
print('AUC prob : ',auc_prob)

# keep the class-1 probability for the ensemble
bg['preds2'] = test_preds2[:,1]

In [None]:
#run model on group 3
# Fit on resampled set 3; early stopping monitors AUC on (X_Test, Y_Test).
est3 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
eval_set = [(X_Test,Y_Test)]
est3.fit(X_Train_res3, Y_Train_res3, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set)
print('Train Results')

predict_val = est3.predict(X_Train_res3)
print(confusion_matrix(Y_Train_res3, predict_val))
print(classification_report(Y_Train_res3, predict_val, digits=3))
predict_proba_train = est3.predict_proba(X_Train_res3)
auc_prob = roc_auc_score(Y_Train_res3, predict_proba_train[:,1])
print('AUC prob : ',auc_prob)

# '\n' (not '/n') so the header is preceded by a blank line
print('\nTest Results')
predict_val = est3.predict(X_Test)
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
test_preds3 = est3.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds3[:,1])
print('AUC prob : ',auc_prob)

# keep the class-1 probability for the ensemble
bg['preds3'] = test_preds3[:,1]

In [None]:
#run model on group 4
# Fit on resampled set 4; early stopping monitors AUC on (X_Test, Y_Test).
est4 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
eval_set = [(X_Test,Y_Test)]
est4.fit(X_Train_res4, Y_Train_res4, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set)

print('Train Results')

predict_val = est4.predict(X_Train_res4)
print(confusion_matrix(Y_Train_res4, predict_val))
print(classification_report(Y_Train_res4, predict_val, digits=3))
predict_proba_train = est4.predict_proba(X_Train_res4)
auc_prob = roc_auc_score(Y_Train_res4, predict_proba_train[:,1])
print('AUC prob : ',auc_prob)

# '\n' (not '/n') so the header is preceded by a blank line
print('\nTest Results')
predict_val = est4.predict(X_Test)
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
test_preds4 = est4.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds4[:,1])
print('AUC prob : ',auc_prob)
# keep the class-1 probability for the ensemble
bg['preds4'] = test_preds4[:,1]

In [None]:
#run model on group 5
# Fit on resampled set 5; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est5 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est5.fit(
    X_Train_res5,
    Y_Train_res5,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est5.predict(X_Test)
test_preds5 = est5.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds5[:, 1])
bg['preds5'] = test_preds5[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#run model on group 6
# Fit on resampled set 6; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est6 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est6.fit(
    X_Train_res6,
    Y_Train_res6,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est6.predict(X_Test)
test_preds6 = est6.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds6[:, 1])
bg['preds6'] = test_preds6[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#group 7
# Fit on resampled set 7; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est7 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est7.fit(
    X_Train_res7,
    Y_Train_res7,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est7.predict(X_Test)
test_preds7 = est7.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds7[:, 1])
bg['preds7'] = test_preds7[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#group 8
# Fit on resampled set 8; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est8 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est8.fit(
    X_Train_res8,
    Y_Train_res8,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est8.predict(X_Test)
test_preds8 = est8.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds8[:, 1])
bg['preds8'] = test_preds8[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#group 9
# Fit on resampled set 9; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est9 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est9.fit(
    X_Train_res9,
    Y_Train_res9,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est9.predict(X_Test)
test_preds9 = est9.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds9[:, 1])
bg['preds9'] = test_preds9[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
#group 10
# Fit on resampled set 10; early stopping monitors AUC on (X_Test, Y_Test).
eval_set = [(X_Test, Y_Test)]
est10 = XGBClassifier(eta=0.1, min_child_weight=1, max_depth=4)
est10.fit(
    X_Train_res10,
    Y_Train_res10,
    early_stopping_rounds=10,
    eval_metric="auc",
    eval_set=eval_set,
)

# Score the test rows and keep the class-1 probability for the ensemble.
predict_val = est10.predict(X_Test)
test_preds10 = est10.predict_proba(X_Test)
auc_prob = roc_auc_score(Y_Test, test_preds10[:, 1])
bg['preds10'] = test_preds10[:, 1]

print('Test Results')
print(confusion_matrix(Y_Test, predict_val))
print(classification_report(Y_Test, predict_val, digits=3))
print('AUC prob : ',auc_prob)

In [None]:
# Top 20 features of model 2 ranked by the 'gain' importance score.
f = 'gain'
i = est2.get_booster().get_score(importance_type=f)
j = pd.Series(i).sort_values(ascending=False)
j.iloc[:20]

In [None]:
# Preview the true labels next to the per-model probabilities.
bg.head()

In [None]:
# Average the per-model probabilities row-wise, threshold the mean at 0.5 for
# the ensemble label, and record the spread across models.
col = bg.loc[:, "preds1":"preds10"]
bg['cum_prob'] = col.mean(axis='columns')

bg['cum_pred'] = bg['cum_prob'].gt(0.5).astype(int)
bg['stdev'] = col.std(axis='columns')

# inspect rows at positions 61-79
bg.iloc[61:80]

In [None]:
# Report for the averaged ensemble: label-based metrics from cum_pred, AUC from cum_prob.
ensemble_report = classification_report(bg['y'], bg['cum_pred'], digits=3)
print(ensemble_report)
auc_prob = roc_auc_score(bg['y'], bg['cum_prob'])
print('AUC prob : ',auc_prob)