In [1]:
import pickle
import pandas as pd

with open("data/X_meta_use-target_sub.pickle", "rb") as f:
    X_meta = pickle.load(f)

X_train_meta, X_valid_meta, X_test_meta = X_meta
    
with open("data/X_token_use-target_sub.pickle", "rb") as f:
    X_token = pickle.load(f)

X_train_token, X_valid_token, X_test_token = X_token
    
with open("data/y_use-target_sub.pickle", "rb") as f:
    y = pickle.load(f)
    
y_train, y_valid, y_test = y

In [2]:
def concat_tokens(tokens):
    return [" ".join(x) for x in tokens]

X_train_token = concat_tokens(X_train_token)
X_valid_token = concat_tokens(X_valid_token)
X_test_token = concat_tokens(X_test_token)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

count_vectorizer = CountVectorizer(min_df=.015, max_df=.8, max_features=1000, ngram_range=[1, 3])
count_vectorizer.fit(X_train_token)
X_train_token_vec = pd.DataFrame.sparse.from_spmatrix(count_vectorizer.transform(X_train_token))
X_valid_token_vec = pd.DataFrame.sparse.from_spmatrix(count_vectorizer.transform(X_valid_token))
X_test_token_vec = pd.DataFrame.sparse.from_spmatrix(count_vectorizer.transform(X_test_token))

In [4]:
X_train = pd.concat([X_train_meta, X_train_token_vec], axis=1)
X_valid = pd.concat([X_valid_meta, X_valid_token_vec], axis=1)
X_test = pd.concat([X_test_meta, X_test_token_vec], axis=1)

y_train = pd.DataFrame(y_train, columns=['target'])
y_valid = pd.DataFrame(y_valid, columns=['target'])
y_test = pd.DataFrame(y_test, columns=['target'])

train_df = pd.concat([X_train, y_train], axis=1)
valid_df = pd.concat([X_valid, y_valid], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [5]:
from pycaret.classification import *

clf = setup(data=train_df, target="target", 
            session_id=42,
            fold=5, fold_shuffle=True, 
            fix_imbalance=True, 
            normalize=True,)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(16549, 387)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,377
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
lr = create_model('lr')
tuned_lr = tune_model(lr, optimize='f1')
final_lr = finalize_model(tuned_lr)
pred_lr = predict_model(final_lr, data=test_df)
pred_lr

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6897,0.6932,0.5525,0.3925,0.459,0.2501,0.2573
1,0.6836,0.7265,0.6431,0.3984,0.492,0.2803,0.2972
2,0.7061,0.7376,0.6004,0.4192,0.4937,0.2957,0.3052
3,0.694,0.7083,0.5588,0.3992,0.4657,0.2596,0.2668
4,0.6826,0.7073,0.5797,0.3888,0.4655,0.2521,0.2622
Mean,0.6912,0.7146,0.5869,0.3996,0.4752,0.2675,0.2777
Std,0.0085,0.0156,0.0328,0.0105,0.0146,0.0177,0.0196


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6996,0.7614,0.6454,0.4146,0.5048,0.3036,0.3189


Unnamed: 0,center,age,gender,height,weight,BMI,BMI_group,is_operation,is_pain,pain_NRS,...,358,359,360,361,362,363,364,target,Label,Score
0,13,20,1,187.0,71.0,20.303698,1.0,0.0,1.0,3.0,...,0,0,1,0,0,0,0,0,0,0.6370
1,2,24,1,175.0,73.0,23.836735,1.0,0.0,1.0,2.0,...,0,0,0,1,0,1,0,0,0,0.5351
2,12,22,1,178.0,64.0,20.199470,1.0,1.0,1.0,3.0,...,0,0,0,0,0,0,0,0,1,0.5113
3,12,21,1,172.0,80.0,27.041644,2.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,1,0.6481
4,14,20,1,178.0,58.0,18.305769,1.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,1,0.6227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,11,20,1,170.0,70.0,24.221453,1.0,1.0,1.0,4.0,...,0,0,0,1,0,0,0,0,1,0.7440
2040,15,23,1,171.0,82.0,28.042817,2.0,0.0,1.0,3.0,...,0,0,0,0,0,0,0,0,1,0.5492
2041,12,21,1,173.0,60.0,20.047446,1.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0.5905
2042,12,21,1,168.0,66.0,23.384354,1.0,0.0,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0.6527


In [7]:
dt = create_model('dt')
tuned_dt = tune_model(dt, optimize='f1')
final_dt = finalize_model(tuned_dt)
pred_dt = predict_model(final_dt, data=test_df)
pred_dt

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7514,0.6598,0.3587,0.4714,0.4074,0.2538,0.2576
1,0.7462,0.6508,0.3605,0.4585,0.4037,0.2454,0.2483
2,0.7579,0.6669,0.3544,0.49,0.4113,0.2638,0.2693
3,0.7393,0.6547,0.3526,0.4422,0.3924,0.2291,0.2315
4,0.7332,0.6412,0.3333,0.424,0.3732,0.2068,0.2092
Mean,0.7456,0.6547,0.3519,0.4572,0.3976,0.2398,0.2432
Std,0.0087,0.0086,0.0097,0.0228,0.0137,0.02,0.021


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.7779,0.694,0.3402,0.5518,0.4209,0.293,0.3061


Unnamed: 0,center,age,gender,height,weight,BMI,BMI_group,is_operation,is_pain,pain_NRS,...,358,359,360,361,362,363,364,target,Label,Score
0,13,20,1,187.0,71.0,20.303698,1.0,0.0,1.0,3.0,...,0,0,1,0,0,0,0,0,0,0.7066
1,2,24,1,175.0,73.0,23.836735,1.0,0.0,1.0,2.0,...,0,0,0,1,0,1,0,0,0,0.9155
2,12,22,1,178.0,64.0,20.199470,1.0,1.0,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0.7066
3,12,21,1,172.0,80.0,27.041644,2.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,0.6718
4,14,20,1,178.0,58.0,18.305769,1.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,0.8923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,11,20,1,170.0,70.0,24.221453,1.0,1.0,1.0,4.0,...,0,0,0,1,0,0,0,0,0,0.8851
2040,15,23,1,171.0,82.0,28.042817,2.0,0.0,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0.8886
2041,12,21,1,173.0,60.0,20.047446,1.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0.9236
2042,12,21,1,168.0,66.0,23.384354,1.0,0.0,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0.8851


In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb_param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7, 10, 20]
}

xgb = XGBClassifier()
xgb_grid = GridSearchCV(xgb, param_grid=xgb_param_grid, scoring="f1", 
                        verbose=1, n_jobs=-1, cv=5)

xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed: 19.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     lea...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                        

In [11]:
params = xgb_grid.best_params_
xgb = XGBClassifier(learning_rate=params['learning_rate'],
                    max_depth=params['max_depth'], 
                    n_estimators=params['n_estimators'])

xgb.fit(X_train, y_train)
xgb_out = xgb.predict_proba(X_test)[:, 1]

In [9]:
import pickle

with open("weights/stomach_tree.pkl", "wb") as f:
    pickle.dump(xgb, f)



In [12]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, xgb_out)

0.7832234514591034

In [17]:
def compute_meta_risk(x):
    risk = 0
    risk += x['pain_NRS'] * 0.1
    
    if x["temperature"] > 37 and x["temperature"] <= 38:
        risk += 0.3
    elif x["temperature"] > 38 and x["temperature"] <= 39:
        risk += 0.6
    else:
        risk += 1
    
    if x["pulse"] > 80 and x["pulse"] <= 90:
        risk += 0.3
    elif x["pulse"] > 90 and x["pulse"] <= 100:
        risk += 0.6
    elif x["pulse"] > 100:
        risk += 1
        
    if x["respiration"] > 16 and x["respiration"] <= 18:
        risk += 0.3
    elif x["respiration"] > 18 and x["respiration"] <= 20:
        risk += 0.6
    elif x["respiration"] > 20:
        risk += 1
        
    if x["is_operation"] == 1:
        risk += 0.3
        
    if x["is_medical_history"] == 1:
        risk += 0.3
        
    if x["is_alertness"] == 0:
        risk += 1
        
    if x["is_digestive"] == 1:
        risk += 0.3
        
    if x["is_hemoptysis"] == 1:
        risk += 0.5
        
    if x["is_bloody_excrement"] == 1:
        risk += 0.8
        
    if risk > 1:
        risk = 1
        
    return risk

meta_risk = X_test.apply(compute_meta_risk, axis=1)
ensemble_prob = (xgb_out + meta_risk) / 2

roc_auc_score(y_test, ensemble_prob)

0.7500710870700885

In [18]:
mlp = create_model('mlp')
tuned_mlp = tune_model(mlp, optimize='f1')
final_mlp = finalize_model(tuned_mlp)
pred_mlp = predict_model(final_mlp, data=test_df)
pred_mlp

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.659,0.7019,0.5779,0.3642,0.4468,0.2183,0.2305
1,0.6625,0.7294,0.663,0.3805,0.4835,0.2592,0.2813
2,0.7216,0.7458,0.5913,0.4383,0.5035,0.3159,0.3228
3,0.7195,0.7166,0.5479,0.431,0.4825,0.2938,0.2978
4,0.5924,0.7139,0.7011,0.3319,0.4505,0.1877,0.2211
Mean,0.671,0.7215,0.6163,0.3892,0.4733,0.255,0.2707
Std,0.0475,0.0149,0.0568,0.0404,0.0215,0.0471,0.0391


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,MLP Classifier,0.7324,0.7661,0.6144,0.4529,0.5214,0.3416,0.3492


Unnamed: 0,center,age,gender,height,weight,BMI,BMI_group,is_operation,is_pain,pain_NRS,...,358,359,360,361,362,363,364,target,Label,Score
0,13,20,1,187.0,71.0,20.303698,1.0,0.0,1.0,3.0,...,0,0,1,0,0,0,0,0,0,0.6955
1,2,24,1,175.0,73.0,23.836735,1.0,0.0,1.0,2.0,...,0,0,0,1,0,1,0,0,0,0.5060
2,12,22,1,178.0,64.0,20.199470,1.0,1.0,1.0,3.0,...,0,0,0,0,0,0,0,0,1,0.5319
3,12,21,1,172.0,80.0,27.041644,2.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,1,0.6076
4,14,20,1,178.0,58.0,18.305769,1.0,0.0,0.0,3.0,...,0,0,0,0,0,0,0,0,1,0.6275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2039,11,20,1,170.0,70.0,24.221453,1.0,1.0,1.0,4.0,...,0,0,0,1,0,0,0,0,1,0.7019
2040,15,23,1,171.0,82.0,28.042817,2.0,0.0,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0.5390
2041,12,21,1,173.0,60.0,20.047446,1.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0.6339
2042,12,21,1,168.0,66.0,23.384354,1.0,0.0,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0.6682


In [21]:
res = pd.read_excel("model_performances_use-target_sub.xlsx", engine="openpyxl")
res["Score_lr"] = pred_lr["Score"].values
res['Score_dt'] = pred_dt['Score'].values
res['Score_xgboost'] = xgb_out
res['Score_ensemble'] = ensemble_prob
res['Score_mlp'] = pred_mlp['Score'].values

res.head()

Unnamed: 0,label,Score_dnn,Score_lr,Score_dt,Score_xgboost,Score_mlp,Score_ensemble
0,0,0.210132,0.637,0.7066,0.342983,0.6955,0.471492
1,0,0.404816,0.5351,0.9155,0.306893,0.506,0.653447
2,0,0.198936,0.5113,0.7066,0.189198,0.5319,0.544599
3,0,0.126523,0.6481,0.6718,0.260582,0.6076,0.630291
4,0,0.322968,0.6227,0.8923,0.110405,0.6275,0.555203


In [22]:
res.to_excel("model_performances_use-target_sub.xlsx", index=False)