In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import numpy as np
import shap
import gensim.models.word2vec
import gensim
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [None]:
# Data import, train/test split and outcome definition.

# Cut-off applied to `timediff` when deriving the binary outcome `recc`.
# NOTE(review): the unit and meaning of 188 are not visible here - confirm.
TIMEDIFF_CUTOFF = 188

data = pd.read_csv("anonymized_data_final_sep.csv")

# recc is 1 where timediff is at most the cut-off and 0 otherwise.
data["recc"] = np.where(data["timediff"] <= TIMEDIFF_CUTOFF, 1, 0)

# The CSV carries its own split assignment in the `train` column.
split_flag = data["train"]
traindata = data[split_flag == "train"]
testdata = data[split_flag == "test"]

In [None]:
# Vectorize the text bodies of the training data and convert them to DataFrames.
# The suffix of each column indicates whether the word was used by the
# counselor ("_couns") or the chatter ("_chat").

def _term_frame(term_matrix, fitted_vectorizer, suffix):
    """Return a dense DataFrame with one suffixed column per vocabulary term.

    term_matrix is the sparse output of the vectorizer, fitted_vectorizer
    supplies the column names, and suffix is appended to every name.
    """
    frame = pd.DataFrame(term_matrix.toarray())
    frame.columns = fitted_vectorizer.get_feature_names_out()
    return frame.add_suffix(suffix)


# use_idf=False switches off the inverse-document-frequency reweighting.
# Chatter and counselor texts get separate vocabularies with their own
# document-frequency limits.
vectorizerchatter = TfidfVectorizer(use_idf=False, min_df=150, max_df=0.8)
vectorizercouns = TfidfVectorizer(use_idf=False, min_df=75, max_df=0.3)

X_trainchat = vectorizerchatter.fit_transform(traindata["body_chat"])
X_traincouns = vectorizercouns.fit_transform(traindata["body_couns"])

X_trainchatdata = _term_frame(X_trainchat, vectorizerchatter, "_chat")
X_traincounsdata = _term_frame(X_traincouns, vectorizercouns, "_couns")

# Chatter columns first, counselor columns second.
X_train = pd.concat([X_trainchatdata, X_traincounsdata], axis=1)

In [None]:
# Train the classifier on the training data.

# Hyper-parameters are kept in one place so they are easy to read and tune.
xgb_params = {
    "colsample_bytree": 0.9,
    "eta": 0.05,
    "gamma": 1.5,
    "max_depth": 8,
    "min_child_weight": 20,
    "subsample": 0.6,
}
classi = XGBClassifier(**xgb_params)

# Features: the suffixed term columns in X_train; target: the binary `recc` column.
classi.fit(X_train, traindata["recc"])

In [None]:
# Prepare the test dataset.
# The vectorizers are only applied here (transform, no re-fitting), so the
# test columns match the vocabulary learned on the training data.

chat_columns = [f"{term}_chat" for term in vectorizerchatter.get_feature_names_out()]
couns_columns = [f"{term}_couns" for term in vectorizercouns.get_feature_names_out()]

X_testchat = vectorizerchatter.transform(testdata["body_chat"])
X_testcouns = vectorizercouns.transform(testdata["body_couns"])

X_testchatdata = pd.DataFrame(X_testchat.toarray(), columns=chat_columns)
X_testcounsdata = pd.DataFrame(X_testcouns.toarray(), columns=couns_columns)

# Chatter columns first, counselor columns second.
X_test = pd.concat([X_testchatdata, X_testcounsdata], axis=1)

In [None]:
# Run the SHAP tree explainer on the test dataset.

explainer = shap.TreeExplainer(classi)
shap_values = explainer.shap_values(X_test)

# Wrap the SHAP values in a DataFrame that reuses the feature names of X_test.
shap_data = pd.DataFrame(data=shap_values, columns=X_test.columns)

In [None]:
#Rename columns (German to English)

X_test = X_test.rename(columns={"tagsub_couns" : "Daytime (CO)", "morgen_chat" : "Tommorow (CH)",
                   "nacht_chat" : "Night (CH)", "männlich_chat" : "Male (CH)",
                   "12_chat" : "12 (CH)", "13_chat" : "13 (CH)",
                   "verletz_couns" : "Harm (CO)", "verletzt_couns" : "Harmed (CO)",
                   "rat_chat" : "Advice (CH)", "anspann_couns" : "Tension (CH)"})
shap_data = shap_data.rename(columns= {"tagsub_couns" : "Daytime (CO)", "morgen_chat" : "Tommorow (CH)",
                   "nacht_chat" : "Night (CH)", "männlich_chat" : "Male (CH)",
                   "12_chat" : "12 (CH)", "13_chat" : "13 (CH)",
                   "verletz_couns" : "Harm (CO)", "verletzt_couns" : "Harmed (CO)",
                   "rat_chat" : "Advice (CH)", "anspann_couns" : "Tension (CH)"})
shap.summary_plot(shap_data.to_numpy(),
                  X_test)

In [None]:
#Generate Shap plot for 20 Selected Variables

columns = ["Daytime (CO)", "Tommorow (CH)", "Night (CO)", "Male (CH)",
           "12 (CH)", "13 (CH)", "Harm (CO)", "Harmed (CO)",
           "Advice (CH)", "Tension (CH)", "Friend/Girlfriend (CH)", "Girl (CH)",
           "Child (CO)", "Internet Care (CO)", "Professional (CO)", "Job (CH)",
           "Suicide(CO)", "Dying (CH)", "Work (CH)", "Everyday Life (CH)",
           "14 (CH)", "Spot for Therapy (CO)", "Suicide (CO)", "Cutting (CH)"]

X_test = X_test[columns]
shap_data = shap_data[columns]

shap.summary_plot(shap_data.to_numpy(),
                  X_test)

In [None]:
# Clustering: load the Word2Vec model (source: https://github.com/devmount/GermanWordEmbeddings)

# Raw string: "\g" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning. The path value itself is unchanged.
# NOTE(review): the leading backslash makes this path platform dependent - confirm.
model = gensim.models.KeyedVectors.load_word2vec_format(r'\german.model', binary=True)

# Vocabulary learned by the two fitted vectorizers (without column suffixes).
chatdata_words = vectorizerchatter.get_feature_names_out()
counsdata_words = vectorizercouns.get_feature_names_out()

In [None]:
# Combine chatter and counselor words for clustering.

# Sorted union of both vocabularies; each word appears once.
words_combined = np.union1d(chatdata_words, counsdata_words)
out = []  # embedding vector of every word found in the Word2Vec model
wor = []  # the corresponding words, in the same order as `out`

for word in words_combined:
    try:
        vector = model[word]
    except KeyError:
        # Word is not in the embedding vocabulary: leave it out of the clustering.
        # Any other error is allowed to surface instead of being silently ignored.
        continue
    out.append(vector)
    wor.append(word)

In [None]:
# Elbow curve and silhouette scores for candidate numbers of clusters.

k_values = range(2, 30)
sse = []             # KMeans inertia for each k
silhouette_avg = []  # silhouette score for each k

for k in k_values:
    # Fixed seed and explicit n_init keep the curves reproducible across runs
    # and across scikit-learn versions (the n_init default differs by version).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(out)
    sse.append(kmeans.inertia_)
    cluster_labels = kmeans.labels_
    silhouette_avg.append(silhouette_score(out, cluster_labels))

plt.plot(k_values, sse)
plt.title("Elbow Curve")
plt.xlabel("Number of clusters k")
plt.ylabel("Inertia")
plt.show()

plt.plot(k_values, silhouette_avg)
plt.title("Silhouette Scores for k Clusters")
plt.xlabel("Number of clusters k")
plt.ylabel("Silhouette score")
plt.show()

In [None]:
# Build the clusters using KMeans and the identified number of clusters.

# Fixed seed and explicit n_init: cluster ids are used by later cells, so the
# assignment has to be the same on every run and scikit-learn version.
kmeans = KMeans(n_clusters=20, n_init=10, random_state=42)
kmeans.fit(out)

In [None]:
# Predict the cluster of every word stem and summarise the result.

worddata = pd.DataFrame({"word": wor, "pred": kmeans.predict(out)})

# After count(), the `word` column holds the number of words per cluster.
# The displayed means are therefore the mean cluster id (`pred`) and the
# mean number of words per cluster (`word`).
words_per_cluster = worddata.groupby("pred").count().reset_index()
words_per_cluster.mean()

In [None]:
# Build the final dataset: one row per word cluster with its summed absolute
# SHAP values, its words and its size.

worddata = pd.DataFrame({"word": wor})
worddata["pred"] = kmeans.predict(out)

# Split the raw SHAP matrix by position instead of by column name, so this
# cell does not depend on how shap_data's columns are labelled at this point.
# Feature order is chatter terms first, counselor terms second (as in X_test).
chat_terms = vectorizerchatter.get_feature_names_out()
couns_terms = vectorizercouns.get_feature_names_out()
shap_matrix = np.asarray(shap_values)
n_chat = len(chat_terms)
assert shap_matrix.shape[1] == n_chat + len(couns_terms), "unexpected number of SHAP columns"

chatter_shap = pd.DataFrame(shap_matrix[:, :n_chat], columns=chat_terms)
couns_shap = pd.DataFrame(shap_matrix[:, n_chat:], columns=couns_terms)

cluster = []
values = []
words = []
clustersize = []

# Iterate over the clusters the fitted model actually has instead of a
# hard-coded count.
for i in range(kmeans.n_clusters):
    cluster_words = worddata.loc[worddata["pred"] == i, "word"].values
    cluster.append(i)
    words.append(cluster_words)
    clustersize.append(len(cluster_words))
    # Both speaker roles use the words of cluster i (the counselor part
    # previously always used cluster 0).
    chat_total = chatter_shap.filter(items=list(cluster_words)).abs().sum().sum()
    couns_total = couns_shap.filter(items=list(cluster_words)).abs().sum().sum()
    values.append(chat_total + couns_total)

# NOTE: this rebinds `data`, which held the raw CSV frame earlier in the notebook.
data = pd.DataFrame({"cluster": cluster,
                     "value": values,
                     "words": words,
                     "clustersize": clustersize})