In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

from sklearn.metrics import roc_auc_score

pd.set_option('display.max_columns', None)

In [2]:
train_df = pd.read_csv('data/TrainingWiDS2021.csv')

In [3]:
train_df.corr()['diabetes_mellitus'].sort_values().tail(10)

bun_apache            0.145241
d1_bun_max            0.146990
weight                0.155517
bmi                   0.169043
h1_glucose_min        0.304520
h1_glucose_max        0.316847
glucose_apache        0.354359
d1_glucose_max        0.400742
diabetes_mellitus     1.000000
readmission_status         NaN
Name: diabetes_mellitus, dtype: float64

In [4]:
train_df.corr()['diabetes_mellitus'].sort_values().head(10)

h1_diasbp_invasive_min   -0.103672
h1_diasbp_invasive_max   -0.088664
h1_hco3_max              -0.086883
h1_hco3_min              -0.086548
d1_diasbp_invasive_min   -0.084994
d1_hemaglobin_max        -0.078801
d1_hematocrit_max        -0.067271
h1_hemaglobin_min        -0.065774
h1_mbp_invasive_min      -0.065732
h1_hemaglobin_max        -0.063734
Name: diabetes_mellitus, dtype: float64

In [5]:
unlabeled_df = pd.read_csv('data/UnlabeledWiDS2021.csv')

## Let's put features here

In [6]:
# as an example I will just take some columns that already correllate with the target

In [7]:
features = [
    'd1_glucose_max',
    'glucose_apache', 
    'weight',
    'age',
    'h1_diasbp_invasive_max',
    'h1_hco3_max',
    'd1_bun_max'
]

In [8]:
train_df[features].corr() # probably not the best idea to use features that correllate with each other

Unnamed: 0,d1_glucose_max,glucose_apache,weight,age,h1_diasbp_invasive_max,h1_hco3_max,d1_bun_max
d1_glucose_max,1.0,0.923322,0.0886,0.014772,-0.045575,-0.293122,0.184887
glucose_apache,0.923322,1.0,0.092489,0.009594,-0.009383,-0.265253,0.148887
weight,0.0886,0.092489,1.0,-0.125448,0.062234,0.091822,0.050998
age,0.014772,0.009594,-0.125448,1.0,-0.250053,0.163897,0.239831
h1_diasbp_invasive_max,-0.045575,-0.009383,0.062234,-0.250053,1.0,0.021565,-0.164543
h1_hco3_max,-0.293122,-0.265253,0.091822,0.163897,0.021565,1.0,-0.218076
d1_bun_max,0.184887,0.148887,0.050998,0.239831,-0.164543,-0.218076,1.0


## Train model

In [9]:
from sklearn.model_selection import train_test_split
X, X_test, y, y_test = train_test_split(
     train_df[features], train_df['diabetes_mellitus'], test_size=0.33, random_state=42)

In [10]:
y_valid_pred = 0*y
y_test_pred = 0

# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

model = CatBoostRegressor(
    learning_rate=0.06, 
    depth=5, 
    l2_leaf_reg = 14, 
    iterations = 850,
    verbose = False
)

In [11]:
for i, (train_index, test_index) in enumerate(kf.split(X)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:], X.iloc[test_index,:]
    print( "Fold ", i)
    
    # Run model for this fold

    fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict(X_valid)
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict(X_test)
    
y_test_pred /= K  # Average test set predictions

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4


In [12]:
roc_auc_score(y_test, y_test_pred) # 0.8155648390360045

0.8195511060201544

## Prepare submission

In [13]:
submission = pd.DataFrame()
submission['encounter_id'] = unlabeled_df['encounter_id']
submission['diabetes_mellitus'] = fit_model.predict_proba(unlabeled_df[features])[:,1]

AttributeError: 'CatBoostRegressor' object has no attribute 'predict_proba'

In [None]:
submission.to_csv('submission.csv', index=False)