In [165]:
import re

import numpy as np
import pandas as pd
import shap
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor

In [166]:
def preprocess(df, feat_required):
    """Extract the visual features and the PCR target from ``df``.

    Rows with a missing target are dropped, feature names are reduced to the
    characters ``[0-9a-zA-Z_-]``, and feature columns with too few observed
    values are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing ``df.Input.Visual`` (features) and ``df.Target.pcr``
        (target). The input frame itself is not modified.
    feat_required : float
        Minimum fraction of non-missing values (after row filtering) a
        feature column must have to be kept; compared with ``>=``.

    Returns
    -------
    X : pandas.DataFrame
        Filtered features with sanitized column names.
    y : target values restricted to rows where the target is present.
    df : pandas.DataFrame
        Copy of the input restricted to the same rows as ``X`` and ``y``.
    """
    df = df.copy()
    X, y = df.Input.Visual.copy(), df.Target.pcr.copy()

    # Compute the row mask once and apply it to all three objects so they
    # stay aligned.
    has_target = ~y.isna()
    X, y, df = X[has_target], y[has_target], df[has_target]

    # Flatten MultiIndex column tuples into single strings before sanitizing.
    if isinstance(X.columns, pd.MultiIndex):
        raw_names = ['_'.join(map(str, col)).strip() for col in X.columns.values]
    else:
        raw_names = [str(col) for col in X.columns.values]

    # The pattern previously ended in a literal space, so a run of special
    # characters was only removed when followed by a space (e.g. 'a:b' was
    # left untouched). Strip every disallowed character unconditionally.
    X.columns = [re.sub(r'[^0-9a-zA-Z_-]+', '', name) for name in raw_names]

    has_sufficient_values = (1 - X.isna().mean() >= feat_required)
    X = X.loc[:, has_sufficient_values]
    return X, y, df

def get_feature_importances(X, y, return_estimator=False, do_preprocess=True, clf=None):
    """Compute the mean absolute SHAP value per feature.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target aligned with ``X``.
    return_estimator : bool, default False
        When True, also return the estimator that was explained.
    do_preprocess : bool, default True
        When True, rows with a missing target are dropped before fitting.
    clf : estimator or None, default None
        Estimator to explain. When None, a new ``LGBMClassifier`` is fitted
        on ``(X, y)``; a supplied estimator is used as-is and not refitted.

    Returns
    -------
    importances, or ``(importances, clf)`` when ``return_estimator`` is True.
    """
    if do_preprocess:
        # preprocess() takes (df, feat_required) and returns three values, so
        # it cannot be called on an (X, y) pair -- the old call always failed.
        # Apply the one step that is defined for (X, y): drop rows whose
        # target is missing.
        has_target = ~y.isna()
        X, y = X[has_target], y[has_target]

    if clf is None:
        clf = LGBMClassifier(n_jobs=-1)
        clf.fit(X, y)

    explainer = shap.TreeExplainer(clf)
    # Index 1 along the last axis -- presumably the positive class; TODO
    # confirm the explanation shape for the installed shap version.
    shap_values = explainer(X)[:, :, 1]

    # Multiplying by the not-missing mask zeroes the SHAP values of cells
    # whose feature value was missing before averaging over rows.
    importances = np.abs(shap_values.values * (~X.isna())).mean(axis=0)
    if return_estimator:
        return importances, clf
    return importances

In [226]:
# Load the processed dataset. NOTE: unpickling can execute arbitrary code,
# so only read pickle files that come from a trusted source.
df = pd.read_pickle('../data/processed/rumc.pkl')

In [227]:
# Build features/target and keep only feature columns that are non-missing in
# at least 10% of the rows. `df` is rebound to the rows that have a target.
X, y, df = preprocess(df, 0.1)
# Legacy calls kept for reference; they pass three arguments and unpack two
# results, which does not match the current preprocess(df, feat_required).
# X_train, y_train = preprocess(df[~is_test].Input.Clinical, df[~is_test].Target.corads, 0.16)
# X_test, y_test = preprocess(df[is_test].Input.Clinical, df[is_test].Target.corads, 0.16)

In [228]:
# Split using the partition recorded in the data: rows whose Meta.part equals
# 'test' form the test set, all others the training set.
# NOTE(review): the train_test_split cell that follows reassigns
# X_train/X_test/y_train/y_test, so this split is not the one that gets used.
is_test = df.Meta.part == 'test'
X_train, y_train = X[~is_test], y[~is_test]
X_test, y_test = X[is_test], y[is_test]

In [229]:
from sklearn.model_selection import train_test_split
# Stratified random 80/20 split; this reassigns the X_train/X_test/y_train/
# y_test variables produced by the Meta.part-based split.
# random_state pins the shuffle so the scores below are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42
)

In [189]:
# List the columns whose third-level name contains 'POLYCHROMASIE'. The output
# shows two names that differ only by a trailing ', '.
df.columns[df.columns.get_level_values(2).str.contains('POLYCHROMASIE')]

MultiIndex([('Input', 'Clinical',   'POLYCHROMASIE'),
            ('Input', 'Clinical', 'POLYCHROMASIE, ')],
           )

In [190]:
from covidcf.evaluation.metrics import corads_roc_auc

In [230]:
# Fit a LightGBM classifier on the training split. Note that the variable is
# called `reg` although it holds a classifier.
reg = LGBMClassifier(n_jobs=-1).fit(X_train, y_train)

In [231]:
# Score on the training data itself (for scikit-learn classifiers `score` is
# mean accuracy). A value of 1.0 here next to a much lower test AUC presumably
# indicates overfitting.
reg.score(X_train, y_train)

1.0

In [232]:
from sklearn.metrics import roc_auc_score
# ROC AUC on the test split, using the predicted probability in column 1
# (presumably the 'Positief' class -- confirm against reg.classes_).
roc_auc_score(y_test, reg.predict_proba(X_test)[:, 1])

0.5757575757575757

In [233]:
from sklearn.metrics import roc_auc_score
# Model-free baseline: rank each test row by the mean of its feature values;
# rows whose mean is NaN (no observed features) get the constant score 2.
roc_auc_score(y_test, X_test.mean(axis=1).fillna(2))

0.5656565656565656

In [85]:
# NOTE(review): stale cell -- its execution count (85) precedes the cell that
# fits the current `reg` (230). The negative value shown cannot be an accuracy,
# so it presumably came from an earlier regressor's R^2. Re-run or remove.
reg.score(X_test, y_test)

-0.7806456653358114

In [137]:
# NOTE(review): stale cell (execution count 137, before the current fit at
# 230). `corads_roc_auc` is a project metric; presumably it was used with the
# Target.corads target seen in the commented-out preprocess calls -- confirm
# it applies to the current `pcr` target before re-running.
corads_roc_auc(y_test, reg.predict(X_test))

0.49237288135593227

In [138]:
# NOTE(review): stale cell (execution count 138, before the current fit at
# 230); the displayed training-set value comes from an earlier kernel state.
corads_roc_auc(y_train, reg.predict(X_train))

0.8788826349801959

In [67]:
# NOTE(review): duplicate of the earlier corads_roc_auc test-set cell, with an
# even older execution count (67) and a different output; the two results were
# produced by different kernel states. Keep one and re-run it.
corads_roc_auc(y_test, reg.predict(X_test))

0.9370056497175141

In [140]:
from covidcf.evaluation.metrics import custom_metrics

In [177]:
# Display the target series; the output shows a two-category column
# ('Negatief' / 'Positief').
y

patientprimarymrn  study
10008              st000    Negatief
10017              st000    Negatief
10020              st000    Negatief
10025              st000    Negatief
10033              st000    Negatief
                              ...   
10605              st000    Negatief
10608              st000    Positief
10610              st000    Negatief
10611              st000    Positief
10614              st000    Negatief
Name: pcr, Length: 155, dtype: category
Categories (2, object): ['Negatief', 'Positief']

In [176]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
# Mean ROC AUC over repeated stratified 10-fold cross-validation on the full
# (X, y). n_repeats is left at the scikit-learn default.
# random_state pins the fold assignment so the reported mean is reproducible.
cross_val_score(
    LGBMClassifier(n_jobs=-1, boosting_type='gbdt'),
    X,
    y,
    cv=RepeatedStratifiedKFold(n_splits=10, random_state=42),
    scoring='roc_auc',
).mean()

0.5217234848484849