In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from utils import permutation_metric

In [None]:
# Data import, outcome definition and train/test split.
data = pd.read_csv("anonymized_data_final_sep.csv")

# Binary outcome `recc`: 1 where `timediff` is at most 188, otherwise 0.
data["recc"] = np.where(data["timediff"].le(188), 1, 0)

# The split is not drawn here; it is read from the `train` column of the file.
traindata = data.loc[data["train"].eq("train")]
testdata = data.loc[data["train"].eq("test")]

In [None]:
# Train the model using the tuned hyperparameters.

# One vectorizer per text column. `use_idf=False` disables the
# inverse-document-frequency reweighting; a float `max_df` is a proportion of
# documents, while an integer `min_df` is an absolute document count.
chat_vectorizer = TfidfVectorizer(max_df=0.8, min_df=150, use_idf=False)
couns_vectorizer = TfidfVectorizer(max_df=0.3, min_df=75, use_idf=False)

column_transformer = ColumnTransformer(
    [
        ("tfidf1", chat_vectorizer, "body_chat"),
        ("tfidf2", couns_vectorizer, "body_couns"),
    ],
    remainder="passthrough",
)

# Classifier settings, kept together so the tuned values are easy to audit.
xgb_params = {
    "colsample_bytree": 0.9,
    "eta": 0.05,
    "gamma": 1.5,
    "max_depth": 8,
    "min_child_weight": 20,
    "subsample": 0.6,
}

pipeline = Pipeline(
    steps=[
        ["transf", column_transformer],
        ["classifier", XGBClassifier(**xgb_params)],
    ]
)

# Features are the two free-text columns; the target is the derived outcome.
X = traindata[["body_couns", "body_chat"]]
y = traindata["recc"]

pipeline.fit(X, y)

In [None]:
# Calculate AUROC score, balanced accuracy, and accuracy score on the test dataset.
#
# A notebook cell only renders its LAST bare expression, so the three metrics
# are collected into one labelled Series instead of being evaluated as three
# separate statements (which would display the accuracy only).

X_test = testdata[["body_couns", "body_chat"]]
y_test = testdata["recc"]

# `preds` holds the class probabilities; column 1 is the positive class.
# The name is kept because later cells read `preds`.
preds = pipeline.predict_proba(X_test)

# Hard labels at the 0.5 probability cut-off, computed once for both
# accuracy-type metrics.
pred_labels = np.where(preds[:, 1] > 0.5, 1, 0)

test_metrics = pd.Series(
    {
        "AUROC": roc_auc_score(y_test, preds[:, 1]),
        "Balanced accuracy": balanced_accuracy_score(y_test, pred_labels),
        "Accuracy": accuracy_score(y_test, pred_labels),
    },
    name="test",
)
test_metrics

In [None]:
# Plot the confusion matrix for the test set.

# Hard class labels at the 0.5 probability cut-off on the positive class.
predicted_class = (preds[:, 1] > 0.5).astype(int)

cm = confusion_matrix(testdata["recc"], predicted_class)

# Label order follows the sorted class values: 0 first, then 1.
class_labels = ["No Recurrence", "Recurrence"]

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
)
disp.plot(cmap="Blues", colorbar=False)
plt.show()

In [None]:
# Permutation test for the AUROC metric.
# `permutation_metric` is a project helper defined in utils.py; it is imported
# in the notebook's import cell together with the other dependencies.
#
# NOTE(review): the helper presumably shuffles at random; if it accepts a seed
# (or relies on NumPy's global RNG), fix it so the p-value is reproducible --
# TODO confirm against utils.py.

auroc_, pval = permutation_metric(
    testdata["recc"],
    preds[:, 1],
    roc_auc_score,
    side="right",
    n=5000,
)

# Show the result: without a trailing expression the cell renders nothing.
auroc_, pval