In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score

# Never truncate wide DataFrames when they are displayed: show every column.
pd.options.display.max_columns = None

In [6]:
# Labelled training table; it contains the `diabetes_mellitus` target used below.
train_df = pd.read_csv(filepath_or_buffer='data/TrainingWiDS2021.csv')

In [7]:
# Ten columns with the highest Pearson correlation to the target.
# Numeric/bool columns are selected explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and fails if the frame has any.
# NOTE: the target itself (1.0) and any column whose correlation is NaN
# (sort_values puts NaN last) also land in this tail.
train_df.select_dtypes(include=['number', 'bool']).corr()['diabetes_mellitus'].sort_values().tail(10)

bun_apache            0.145241
d1_bun_max            0.146990
weight                0.155517
bmi                   0.169043
h1_glucose_min        0.304520
h1_glucose_max        0.316847
glucose_apache        0.354359
d1_glucose_max        0.400742
diabetes_mellitus     1.000000
readmission_status         NaN
Name: diabetes_mellitus, dtype: float64

In [8]:
# Ten columns with the lowest (most negative) Pearson correlation to the target.
# Numeric/bool columns are selected explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and fails if the frame has any.
train_df.select_dtypes(include=['number', 'bool']).corr()['diabetes_mellitus'].sort_values().head(10)

h1_diasbp_invasive_min   -0.103672
h1_diasbp_invasive_max   -0.088664
h1_hco3_max              -0.086883
h1_hco3_min              -0.086548
d1_diasbp_invasive_min   -0.084994
d1_hemaglobin_max        -0.078801
d1_hematocrit_max        -0.067271
h1_hemaglobin_min        -0.065774
h1_mbp_invasive_min      -0.065732
h1_hemaglobin_max        -0.063734
Name: diabetes_mellitus, dtype: float64

In [9]:
# Rows to score for the submission; `encounter_id` and the feature columns are read from it below.
unlabeled_df = pd.read_csv(filepath_or_buffer='data/UnlabeledWiDS2021.csv')

## Let's put features here

In [11]:
# As an example, I will just take some columns that already correlate with the target.

In [10]:
# Starter feature set, picked by eye from the correlation tables above (plus age).
features = (
    ['d1_glucose_max', 'glucose_apache']   # glucose columns: strongest positive correlations
    + ['weight', 'age']                    # weight also appears in the positive top ten
    + ['h1_diasbp_invasive_min']           # most negative correlation
)

In [15]:
# Hold out 33% of the labelled rows as a local test set; the rest (X, y) feeds the
# K-fold loop below. The fixed random_state keeps the split reproducible.
X, X_test, y, y_test = train_test_split(
    train_df[features],
    train_df['diabetes_mellitus'],
    test_size=0.33,
    random_state=42,
)

In [30]:
# Out-of-fold prediction buffer, aligned row-for-row with y.
# It is created as float on purpose: `0*y` would inherit the target's dtype
# (integer for a 0/1 label), and writing probabilities into an integer Series
# forces an implicit upcast that pandas >= 2.1 warns about.
y_valid_pred = pd.Series(0.0, index=y.index, name=y.name)
# Running sum of the per-fold probabilities for the held-out test rows;
# it is divided by K after the loop to get the fold average.
y_test_pred = np.zeros(len(X_test))

# Set up folds
K = 5
kf = KFold(n_splits=K, random_state=1, shuffle=True)

# One classifier object, refitted from scratch on each fold's training rows.
model = CatBoostClassifier(
    learning_rate=0.04,
    depth=6,
    l2_leaf_reg=14,
    iterations=650,
    verbose=False,
    loss_function='Logloss'
)

In [31]:
# Reset the accumulators inside this cell so that re-running it starts from scratch.
# Otherwise a second run would add new predictions on top of the already averaged
# ones and silently corrupt y_test_pred.
# NaN marks rows that have not received an out-of-fold prediction yet.
y_valid_pred = pd.Series(np.nan, index=y.index, dtype=float, name=y.name)
y_test_pred = np.zeros(len(X_test))

# NOTE: `test_index` is the fold's *validation* rows inside X; it is unrelated to
# the held-out X_test / y_test split.
for i, (train_index, test_index) in enumerate(kf.split(X)):

    # Create data for this fold
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index, :], X.iloc[test_index, :]
    print( "Fold ", i)

    # Run model for this fold.
    # The same `model` object is refitted every iteration, so after the loop
    # `fit_model` refers to the last fold's model only.
    fit_model = model.fit(X_train, y_train)

    # Generate validation predictions for this fold (probability of class 1)
    pred = fit_model.predict_proba(X_valid)[:, 1]
    y_valid_pred.iloc[test_index] = pred

    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]

y_test_pred /= K  # Average test set predictions

# Every training row belongs to exactly one validation fold, so none may be left unfilled.
assert y_valid_pred.notna().all(), "some rows never received an out-of-fold prediction"

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4


In [33]:
# ROC AUC of the fold-averaged probabilities on the held-out split.
roc_auc_score(y_true=y_test, y_score=y_test_pred)

0.815993383025388

## Prepare submission

In [34]:
# `fit_model` is whatever the CV loop fitted last, i.e. a model trained on a single
# fold's training rows. Scoring the submission with it would ignore both the other
# folds and the held-out test rows. Instead, refit a fresh classifier with the same
# hyper-parameters on *all* labelled rows.
final_model = CatBoostClassifier(**model.get_params())
final_model.fit(train_df[features], train_df['diabetes_mellitus'])

# One row per unlabeled encounter: its id and the predicted probability of class 1.
submission = pd.DataFrame({
    'encounter_id': unlabeled_df['encounter_id'],
    'diabetes_mellitus': final_model.predict_proba(unlabeled_df[features])[:, 1],
})

In [36]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)