In [None]:
import shap
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Two independent TF-IDF vocabularies, one per text column, each with its own
# document-frequency cut-offs.
vectorizerchatter = TfidfVectorizer(use_idf=True, min_df=10, max_df=0.9)
vectorizercouns = TfidfVectorizer(use_idf=True, min_df=5, max_df=0.3)

# "body_chat" column: fit on the training text, densify, and tag every term
# with "_chat" so it cannot collide with the same term from the other column.
X_trainchat = vectorizerchatter.fit_transform(train["body_chat"])
X_trainchatdata = pd.DataFrame(
    X_trainchat.toarray(),
    columns=vectorizerchatter.get_feature_names_out(),
).add_suffix("_chat")

# "body_couns" column: same treatment, tagged with "_couns".
X_traincouns = vectorizercouns.fit_transform(train["body_couns"])
X_traincounsdata = pd.DataFrame(
    X_traincouns.toarray(),
    columns=vectorizercouns.get_feature_names_out(),
).add_suffix("_couns")

# Training matrix: "_chat" columns first, "_couns" columns second.
X_train = pd.concat([X_trainchatdata, X_traincounsdata], axis=1)

In [None]:
# Train the classifier on the (randomly oversampled) training data.

# Fixed seed so that Restart & Run All reproduces the same resampled training
# set, the same fitted model and therefore the same SHAP values downstream.
RANDOM_SEED = 42

# NOTE(review): scale_pos_weight=0.5 is applied on top of random oversampling
# of train["outcome"]; confirm this combined re-weighting is intentional.
classi = XGBClassifier(
    colsample_bytree=0.2,
    eta=0.05,
    gamma=5.0,
    max_depth=12,
    min_child_weight=10,
    scale_pos_weight=0.5,
    subsample=0.8,  # row/column subsampling is stochastic, so pin the seed
    random_state=RANDOM_SEED,
)

# Without random_state the sampler draws different duplicate rows on every
# execution, which makes every later result non-reproducible.
oversampler = RandomOverSampler(random_state=RANDOM_SEED)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, train["outcome"])

classi.fit(X_train_resampled, y_train_resampled)


In [None]:
# Prepare the test dataset.
# Only .transform() is called here: the vectorizers are reused as they were
# fitted earlier in the notebook and are not refitted on the test text.

X_testchat = vectorizerchatter.transform(test["body_chat"])
X_testcouns = vectorizercouns.transform(test["body_couns"])

# Dense frames labelled with the vectorizer terms plus the per-column suffix,
# mirroring the column naming used for the training matrix.
X_testchatdata = pd.DataFrame(
    X_testchat.toarray(),
    columns=vectorizerchatter.get_feature_names_out(),
).add_suffix("_chat")

X_testcounsdata = pd.DataFrame(
    X_testcouns.toarray(),
    columns=vectorizercouns.get_feature_names_out(),
).add_suffix("_couns")

# Test matrix: "_chat" columns first, "_couns" columns second.
X_test = pd.concat([X_testchatdata, X_testcounsdata], axis=1)

In [None]:
# Map selected feature columns (German term + "_chat"/"_couns" suffix) to
# English display labels. Columns that are not listed keep their original name.
# Every key must appear only once: Python silently keeps the last value of a
# repeated key, so a duplicate entry would be dead code.
new_column_names = {
    'nein_chat': 'No (Chatter)',
    'mal_chat': 'Once (Chatter)',
    'gerne_chat': 'Gladly (Chatter)',
    'anfangen_chat': 'Start (Chatter)',
    'dank_chat': 'Thanks (Chatter)',
    'toll_chat': 'Great (Chatter)',
    'schlimm_couns': 'Bad (Counselor)',
    'freundin_chat': 'Friend (Chatter)',
    'eher_couns': 'More (Counselor)',
    'daher_couns': 'Therefore (Counselor)',
    'danken_chat': 'Thank (Chatter)',
    'bissch_chat': 'A bit (Chatter)',
    'trotzdem_chat': 'Nevertheless (Chatter)',
    'anliegen_couns': 'Issue (Counselor)',
    'fragen_chat': 'Question (Chatter)',
    'versuchen_chat': 'Try (Chatter)',
    'ermutigen_couns': 'Encourage (Counselor)',
    'weiterhin_couns': 'Further One (Counselor)',
    'leider_chat': 'Unfortunately (Chatter)',
    '17_chat': '17 (Chatter)',
}

# Rebind instead of mutating in place so the cell has no hidden side effects.
# NOTE(review): after this cell X_test's column names no longer match the
# names in X_train; confirm nothing later feeds X_test back into
# classi.predict.
X_test = X_test.rename(columns=new_column_names)


In [None]:
# Run the SHAP tree explainer for the fitted classifier on the test dataset.

explainer = shap.TreeExplainer(classi)
shap_values = explainer.shap_values(X_test)

# Same values as a DataFrame, labelled with X_test's column names.
shap_data = pd.DataFrame(data=shap_values, columns=X_test.columns)

In [None]:
# SHAP summary plot of the test-set values, capped at 20 displayed features.
shap.summary_plot(shap_values, X_test, max_display=20)