In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import auc
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from xgboost import XGBClassifier

# Load the data set, derive the binary target column `recc`, and keep only the
# rows flagged as belonging to the training set.
# NOTE(review): the unit and rationale of the 188 cutoff on `timediff` are not
# visible in this notebook -- confirm and document them.

data = pd.read_csv("anonymized_data_final_sep.csv")

# recc = 1 where timediff is at most 188, otherwise 0.
is_within_cutoff = data["timediff"] <= 188
data["recc"] = np.where(is_within_cutoff, 1, 0)

traindata = data.loc[data["train"] == "train"]

In [None]:
# Modelling pipeline: a separate TF-IDF vectorizer for each of the two text
# columns feeds an XGBoost classifier; evaluation uses repeated 5-fold
# cross-validation (5 repeats).

# The column selectors are plain strings (not lists) so that each
# TfidfVectorizer receives a 1-D column of documents. Columns not named here
# are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ("tfidf1", TfidfVectorizer(), "body_chat"),
        ("tfidf2", TfidfVectorizer(), "body_couns"),
    ],
    remainder="passthrough",
)

# Step names ('transf', 'classifier') are part of the hyperparameter keys used
# by the search space, so they must not be renamed.
pipeline = Pipeline(
    steps=[
        ("transf", column_transformer),
        ("classifier", XGBClassifier()),
    ]
)

# A fixed random_state makes every call to cv.split() return the same folds, so
# the hyperparameter search and any later per-fold evaluation use identical
# splits and the notebook is reproducible across kernel restarts.
# NOTE(review): the target is binary; consider RepeatedStratifiedKFold if the
# classes are imbalanced -- confirm against the class distribution.
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

In [None]:
# Hyperparameter search space: candidate values around the defaults of the two
# TF-IDF vectorizers and of the XGBoost model. Keys follow the
# `<pipeline step>__<sub-step>__<parameter>` naming of the pipeline.


def _tfidf_candidates(step_name):
    """Return the candidate values for one TF-IDF step of the column transformer.

    Both vectorizers share the same candidates; a fresh set of lists is built
    on every call so the two steps never share list objects.
    """
    prefix = f"transf__{step_name}__"
    return {
        prefix + "min_df": [1, 2, 5, 10, 25, 50, 75, 100, 150, 200],
        prefix + "max_df": [pct / 100 for pct in range(20, 101, 10)],  # 0.2 ... 1.0
        prefix + "use_idf": [True, False],
    }


param_grid = {
    **_tfidf_candidates("tfidf1"),
    **_tfidf_candidates("tfidf2"),
    # XGBoost model parameters
    "classifier__min_child_weight": [1, 5, 10, 20],
    "classifier__gamma": [0, 0.25, 0.5, 1, 1.5, 2, 5, 10],
    "classifier__subsample": [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    "classifier__colsample_bytree": [0.2, 0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
    "classifier__max_depth": [2, 4, 6, 8, 10, 12, 14, 16],
    "classifier__eta": [0.005, 0.01, 0.05, 0.1, 0.2],
}

In [None]:
# Run the hyperparameter search: Bayesian optimisation over `param_grid`,
# scoring each candidate by cross-validated ROC AUC using `cv`.
# The name `grid_search` is kept for the cells that read `best_params_` from
# it, although the search is Bayesian rather than an exhaustive grid.

grid_search = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,        # use all available cores
    n_iter=250,       # number of parameter settings sampled
    random_state=42,  # seed the optimiser so the search is reproducible
    verbose=2,
)

grid_search.fit(traindata[["body_couns", "body_chat"]], traindata["recc"])

In [None]:
# ROC analysis for the best hyperparameters found by the search: refit the
# pipeline on every cross-validation training fold, draw the ROC curve of each
# held-out fold, and overlay the mean curve with a +/- 1 std. dev. band.

# Configure the pipeline with the best hyperparameters from the search.
pipeline.set_params(**grid_search.best_params_)

tprs = []  # per-fold TPR values interpolated onto the shared FPR grid
aucs = []  # per-fold AUC values
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged

X = traindata[["body_couns", "body_chat"]]
y = traindata["recc"]
fig, ax = plt.subplots(figsize=(6, 6))
for train, test in cv.split(X, y):
    pipeline.fit(X.iloc[train], y.iloc[train])
    # Thin, translucent curve for this fold; the label keeps it out of the legend.
    viz = RocCurveDisplay.from_estimator(
        pipeline,
        X.iloc[test],
        y.iloc[test],
        alpha=0.3,
        lw=1,
        ax=ax,
        label='_nolegend_'
    )
    # Resample the fold's curve onto the shared grid and pin it to the origin.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
# `auc` is sklearn.metrics.auc (area under a curve given its x/y points); it
# was previously called without being imported, which raised a NameError.
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(
    mean_fpr,
    mean_tpr,
    color="b",
    label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
    lw=2,
    alpha=0.8,
)

# Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(
    mean_fpr,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
)
ax.axis("square")
ax.legend(loc="lower right")

plt.show()