In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from catboost import Pool, CatBoostClassifier
import category_encoders as ce


In [2]:
#Importing
train_data = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/training_v2.csv')
test_data =  pd.read_csv('/Users/s0c02nj/Desktop/WiDS/unlabeled.csv')

sub_data = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/samplesubmission.csv')
data_dict = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/WiDS Datathon 2020 Dictionary.csv')

In [3]:
def weighted_classt(x): 
    if pd.isna(x):
        return np.nan
    elif x < 15: 
        return 'very severely underweight' 
    elif x >= 15 and x < 16: 
        return 'severely weight' 
    elif x >=16 and x < 18.5: 
        return 'underweight' 
    elif x >= 18.5 and x < 25: 
        return 'healthy weight' 
    elif x >= 25 and x < 30: 
        return 'overweight'
    elif x >= 30 and x < 35: 
        return 'class 1' 
    elif x >= 35 and x < 40: 
        return 'class 2' 
    else: 
        return 'class 3' 

In [4]:
train_data['weightclass'] = train_data['bmi'].map(weighted_classt)
test_data['weightclass'] = test_data['bmi'].map(weighted_classt)

In [None]:
ls  = list(set(test_data['hospital_id']).intersection(set(train_data['hospital_id'])))

In [None]:
cat_var = list(data_dict[data_dict['Data Type'].isin(['binary','string'])]['Variable Name'])
cat_var.remove('icu_admit_type')
cat_var.remove('bmi')
cat_var = cat_var[1:]

In [None]:
#cat_var = cat_var[1:]
cat_var = cat_var + ['weightclass']


In [None]:
cont_var = list(data_dict[data_dict['Data Type'].isin(['integer','numeric'])]['Variable Name'])
cont_var.remove('pred')
cont_var = cont_var + ['bmi']

#### Baseline

In [None]:
x_train = train_data.drop('hospital_death',axis=1)
y_train = train_data['hospital_death']

In [None]:
x_test = test_data.drop('hospital_death',axis=1)

In [None]:
x_comb = pd.concat([x_train,x_test],sort=False)

In [None]:
### impute bmi values based on height and weight
count = 0
bmi_list = []
for i in tqdm(range(x_comb.shape[0])):
    
    if pd.isnull(x_comb["weight"].iloc[i]) or pd.isnull(x_comb["height"].iloc[i]):
        bmi_list.append(x_comb["bmi"].iloc[i])
        continue
    else:
        
        count += 1
        bmi_list.append(x_comb["weight"].iloc[i] / ((x_comb["height"].iloc[i]/100) ** 2))

In [None]:
x_comb['bmi'] = bmi_list

In [None]:
# # Remove Features with more than 70 percent missing values
# data_missing = (x_comb.isnull().sum() / len(x_comb)).sort_values(ascending = False)
# data_missing = data_missing.index[data_missing > 0.60]

# x_comb = x_comb.drop(columns = data_missing)
#x_comb.isna().sum()

In [None]:
#Imputing Missing values

In [None]:
x_comb["apache_4a_hospital_death_prob"] = x_comb["apache_4a_hospital_death_prob"].replace(-1, np.nan)
x_comb["apache_4a_icu_death_prob"] = x_comb["apache_4a_icu_death_prob"].replace(-1, np.nan)

In [None]:
new_cat_var = list(set(cat_var).intersection(set(x_comb.columns)))

In [None]:
for col in tqdm(new_cat_var):
    le = LabelEncoder()
    x_comb[col] = le.fit_transform(x_comb[col].astype(str))


In [None]:
new_cont_var = list(set(cont_var).intersection(set(x_comb.columns)))

In [None]:
for col in tqdm(new_cont_var):
    x_comb[col] = x_comb[col].fillna(x_comb[col].mode().values[0])

In [None]:
count_var = []
for i,col in tqdm(enumerate(new_cont_var + new_cat_var)):
    counter = Counter(x_comb[col])
    x_comb[str(col)+'count'] = x_comb[col].apply(lambda x:counter[x])
    count_var.append(str(col)+'count')

In [None]:
age_cat = []

for i in tqdm(range(0,len(x_comb))):
    
    val = x_comb['age'].iloc[i]
    
    if val >= 15 and val <= 24: 
        age_cat.append('igen')
    
    elif val >= 25 and val <= 54: 
       age_cat.append('Prime_working_Age')
    
    elif val >= 55 and val <= 64: 
        age_cat.append('Mature_working_Age')
        
    else: 
        age_cat.append('Elderly_working_Age')


In [None]:
x_comb['age_category'] = age_cat
le = LabelEncoder()
x_comb['age_category'] = le.fit_transform(x_comb['age_category'].astype(str))

In [None]:
list_illness = ['aids',
                'cirrhosis',
                'diabetes_mellitus',
                'hepatic_failure',
                'immunosuppression',
                'leukemia',
                'lymphoma',
                'solid_tumor_with_metastasis']

In [None]:
inter_cols1 = []

for col in tqdm(list_illness) :
    x_comb['hospital_id'+str(col)] = x_comb['hospital_id'].astype(str) + '_' + x_comb[col].astype(str)
    x_comb['gender'+str(col)] = x_comb['gender'].astype(str) + '_' + x_comb[col].astype(str)
    x_comb['age'+str(col)] = x_comb['age'].astype(str) + '_' + x_comb[col].astype(str)
    x_comb['bmi'+str(col)] = x_comb['bmi'].astype(str) + '_' + x_comb[col].astype(str)
    
    inter_cols1.append('hospital_id'+str(col))
    inter_cols1.append('gender'+str(col))
    inter_cols1.append('age'+str(col))
    inter_cols1.append('bmi'+str(col))

In [None]:
for cols in tqdm(inter_cols1):
    le = LabelEncoder()
    x_comb[cols] = le.fit_transform(x_comb[cols])
    

In [None]:
inter_cols2 = []

for col in tqdm(list_illness) :
    x_comb['age_category'+str(col)] = x_comb['age_category'].astype(str) + '_' + x_comb[col].astype(str)
    inter_cols2.append('age_category'+str(col))

In [None]:
for cols in tqdm(inter_cols2):
    le = LabelEncoder()
    x_comb[cols] = le.fit_transform(x_comb[cols])
    

In [None]:
inter_cols3 = []

for col1 in tqdm(list_illness) :
    for col2 in list_illness:
        if col1 !=col2 :
            x_comb[col1+col2] = x_comb[col1].astype(str) + '_' + x_comb[col2].astype(str)
            inter_cols3.append(col1+col2)

In [None]:
for cols in tqdm(inter_cols3):
    le = LabelEncoder()
    x_comb[cols] = le.fit_transform(x_comb[cols])
    

In [None]:
#train_data.columns[0:100]

list_imp_cols = ['icu_type','bmi','age','gender','d1_heartrate_max']

inter_cols4 = []

for col1 in tqdm(list_imp_cols) :
    for col2 in list_imp_cols:
        if col1 !=col2 :
            x_comb[col1+col2] = x_comb[col1].astype(str) + '_' + x_comb[col2].astype(str)
            inter_cols4.append(col1+col2)
            

In [None]:
for cols in tqdm(inter_cols4):
    le = LabelEncoder()
    x_comb[cols] = le.fit_transform(x_comb[cols])

In [18]:
list_groups = list(data_dict['Category'].unique())
list_groups = list_groups[1:]

grp_ls = []

for grp in tqdm(list_groups):
    
    grp_ls.append(list(data_dict[data_dict['Category'] == grp]['Variable Name']))

100%|██████████| 9/9 [00:00<00:00, 894.35it/s]


In [41]:
inter_cols5 = []

for i in tqdm(range(0,len(grp_ls))):
    p = grp_ls[i]
    p_net = list(set(p).intersection(set(x_comb.columns)))
    
    temp = ''
    x_comb[temp] = []

    for j in range(0,len(p_net)):
        #print (i)

        temp = p[j]+ '_'+ temp
        x_comb[temp] = x_comb[p[j]] + x_comb[temp]
    
    inter_cols5.append(temp)



100%|██████████| 9/9 [00:00<00:00, 15847.50it/s]


In [40]:
x

'weight_readmission_status_pre_icu_los_days_icu_type_icu_stay_type_icu_id_icu_admit_type_icu_admit_source_hospital_admit_source_height_gender_ethnicity_elective_surgery_bmi_age_hospital_death_'

In [47]:
a = pd.DataFrame()
b = [1,2,3,4]
a['a1'] = b

In [48]:
a['temp'] = ''

In [51]:
a['temp']+'_'+a['a1'].astype(str)

0    _1
1    _2
2    _3
3    _4
dtype: object

In [None]:
final_cat_var = new_cat_var + count_var + ['age_category'] + inter_cols1 + inter_cols2 + inter_cols3 + inter_cols4

##### Target Encoding

In [None]:
temp_train = x_train.copy()
temp_train['hospital_death'] = train_data['hospital_death']

In [None]:
cat_count_var = []

for i,col in tqdm(enumerate(cat_var)):
    mean = temp_train['hospital_death'].mean()
    
    #Compute the number of values and the mean of each group
    agg = temp_train.groupby(col)['hospital_death'].agg(['count', 'mean'])
    counts = agg['count']
    means =  agg['mean']
    
    #Compute the "smoothed" means
    m=3
    smooth = (counts * means + m * mean) / (counts + m)
    
    #Final_val
    x_comb[str(col)+'count_new'] = x_comb[col].map(smooth)
    cat_count_var.append(str(col)+'count_new')
    

In [None]:
# # Remove Features with more than 70 percent missing values
data_missing = (x_comb.isnull().sum() / len(x_comb)).sort_values(ascending = False)
data_missing = data_missing.index[data_missing > 0.90]

x_comb = x_comb.drop(columns = data_missing)

In [None]:
cat_count_var_new = list(set(cat_count_var).intersection(set(x_comb.columns)))

In [None]:
for col in tqdm(cat_count_var_new):
    x_comb[col] = x_comb[col].fillna(x_comb[col].mode().values[0])

In [None]:
x_comb = x_comb.drop(['encounter_id','patient_id'],axis=1)

In [None]:
x_comb.shape

In [None]:
train_x = x_comb[0:91713]
test_x = x_comb[91713:]

In [None]:
x1_train, x1_val, y1_train, y1_val = train_test_split(train_x, y_train, 
                                                      test_size=0.2,
                                                      random_state=42,
                                                      stratify = y_train)

In [None]:
#let us make the catboost model, use_best_model params will make the model prevent overfitting
model = CatBoostClassifier(iterations=2000, 
                           learning_rate=0.04, 
                           l2_leaf_reg= 3.5,  
                           depth= 8, 
                           rsm=0.98, 
                           loss_function= 'Logloss', 
                           eval_metric='AUC',
                           use_best_model=True,
                           random_seed=42)

In [None]:
model.fit(x1_train,y1_train, cat_features= new_cat_var, eval_set=(x1_val,y1_val))


In [None]:
#last let us make the submission,note that you have to make the pred to be int!
pred = model.predict_proba(test_x)
preds= pred[:,1]

In [None]:
#sub_data

In [None]:
data_sub = x_test[['encounter_id']]
data_sub['hospital_death'] = preds

In [None]:
data_sub.to_csv('/Users/s0c02nj/Desktop/WiDS/sub4_catb_01022020.csv',index = False)

In [None]:
data_sub.shape

In [None]:
kf = StratifiedKFold(n_splits=12, random_state=42, shuffle=True)

In [None]:
predictions = np.zeros(len(x_test))
num_models = 0

for train_index, valid_index in tqdm(kf.split(train_x, y_train)):
    
    x1_train = train_x.iloc[train_index]
    y1_train =y_train[train_index]
    x1_val = train_x.iloc[valid_index]
    y1_val = y_train[valid_index]

    catb_model.fit(x1_train,y1_train, cat_features= new_cat_var, eval_set=(x1_val,y1_val))

    num_models += 1
    
    predictions += catb_model.predict(test_x)

In [None]:
# a1 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub2.csv')
# a2 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub3.csv')
# a3 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub3_lgb.csv')
# a4 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub4_lgb.csv')
# a5 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub5_lgb.csv')
# a6 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub6_lgb.csv')
# a7 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub7_catb_agg.csv')
# a8 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub6_catb.csv')
# a9 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub5_catb.csv')
                      
a1 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble.csv')
a2 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble_gm.csv')
a3 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble_gm1.csv')
a4 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub_ensemble_gm2.csv')
a5 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub8_lgb_10fold1.csv')
a6 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub8_lgb_10fold.csv')
a7 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub10_lgb_kfold2.csv')
a8 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/xgboost.csv')
a9 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub11_ensemble_imp1.csv')
a10 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub12_catb.csv')
$a11 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub14_catb_sudolabel.csv')

In [None]:
sub = pd.DataFrame()
sub['encounter_id'] = a1['encounter_id']

In [None]:
pred_ens = (a1['hospital_death'] + a2['hospital_death'] + a3['hospital_death'] + 
            a4['hospital_death'] + a5['hospital_death'] + a6['hospital_death'] +
            a7['hospital_death'] + a8['hospital_death'] + a9['hospital_death'] +
            a10['hospital_death'] + a11['hospital_death']
           )/11.0
            

In [None]:
sub['hospital_death'] = pred_ens

In [None]:
sub.to_csv('/Users/s0c02nj/Desktop/WiDS/sub15_ensemble_sudolabel.csv',index = False)

In [None]:
predictions = np.zeros(len(x_test))
num_models = 0

for train_index, valid_index in tqdm(kf.split(train_x, y_train)):
    
    d_train = lgb.Dataset(train_x.iloc[train_index], label=y_train[train_index])
    d_val = lgb.Dataset(train_x.iloc[valid_index], label=y_train[valid_index])

    clf = lgb.train(lgb_params, d_train, 130000, verbose_eval=1000, 
                    valid_sets = [d_train, d_val], early_stopping_rounds = 3000)
    

    
    num_models += 1
    
    predictions += clf.predict(test_x)

In [None]:
x_test1 = x_test.copy()

In [None]:
x_test1

In [None]:
test_x['hospital_death'] = pred_ens

In [None]:
df_zero = test_x[test_x['hospital_death']< 0.0008]
df_zero['hospital_death'] = 0

In [None]:
df_ones = test_x[test_x['hospital_death']>0.9]
df_ones['hospital_death'] = 1

In [None]:
df_sudo = pd.concat([df_zero,df_ones])

In [None]:
y_sudo = df_sudo['hospital_death']
train_sudo = df_sudo.drop(['hospital_death'],axis=1)

In [None]:
# a1 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble.csv')
# a2 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble_gm.csv')
# a3 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/Submission/sub_ensemble_gm1.csv')
# a4 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub_ensemble_gm2.csv')
# a5 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub8_lgb_10fold1.csv')
# a6 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub8_lgb_10fold.csv')
# a7 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub10_lgb_kfold2.csv')
# a8 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/xgboost.csv')
# a9 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub11_ensemble_imp1.csv')
# a10 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub12_catb.csv')
# $a11 = pd.read_csv('/Users/s0c02nj/Desktop/WiDS/sub14_catb_sudolabel.csv')