In [None]:
# pandas supplies the DataFrame that carries the comments through this notebook
import pandas as pd

# Read "data_tutorial.csv" into a DataFrame.
# The path is relative, so the file has to sit in the current working directory;
# this should be the case when the whole GitHub repository was cloned.
csv_file = "data_tutorial.csv"
data = pd.read_csv(csv_file)

In [None]:
# Regular expressions from the standard library
import re

# Matches every character that is neither a word character (\w) nor whitespace (\s),
# i.e. punctuation and other symbols
pattern = r'[^\w\s]'
non_word_chars = re.compile(pattern)

# Step 1: delete all matches from 'body' and keep the result in a new 'body_cleaned' column
data['body_cleaned'] = data['body'].apply(lambda comment: non_word_chars.sub('', comment))

# Step 2: turn every '\n' into a single space. The line breaks were present in the text
# and don't seem to contain relevant information.
without_newlines = data['body_cleaned'].str.replace("\n", " ")

# Step 3: lowercase everything and store the result back in 'body_cleaned'
data['body_cleaned'] = without_newlines.str.lower()

In [None]:
# Cistem is the stemmer for German text shipped with NLTK
from nltk.stem.cistem import Cistem

# A single stemmer instance is reused for every comment
stemmer = Cistem()


def _stem_comment(comment):
    """Split a cleaned comment on whitespace and return the list of stemmed words."""
    return [stemmer.stem(token) for token in comment.split()]


# The new 'stemmed' column holds one list of word stems per comment
data["stemmed"] = data['body_cleaned'].apply(_stem_comment)



In [None]:
# Import stopwords for the German language from the NLTK library
from nltk.corpus import stopwords

# Access the collection of German stopwords from NLTK
german_stop_words = stopwords.words('german')

# The tokens in 'stemmed' have already been run through the stemmer, whereas the NLTK
# list holds unstemmed words. Whenever the stemmer alters a stopword, the plain list
# entry can never match the token, so the lookup also contains the stemmed form of
# every stopword. A set is used so each membership test is a hash lookup instead of
# a scan over the whole list.
stop_word_lookup = set(german_stop_words)
stop_word_lookup.update(stemmer.stem(word) for word in german_stop_words)

# Remove stopwords from the 'stemmed' column and create a new 'body_stem_nostop' column
data['body_stem_nostop'] = data["stemmed"].apply(
    lambda stems: [stem for stem in stems if stem not in stop_word_lookup]
)

# Note: Words that are German stopwords (in plain or stemmed form) are removed from
# the list of word stems.

In [None]:
# scikit-learn helper that shuffles and splits a dataset
from sklearn.model_selection import train_test_split

# First cut: 70% of the rows become training data, the other 30% are held out
train, holdout = train_test_split(data, test_size=0.3, random_state=123)

# Second cut: the held-out rows are divided again, 67% for validation and 33% for
# the final test. Relative to the full dataset that is about 20% and 10%.
validation, test = train_test_split(holdout, test_size=0.33, random_state=123)

# Note: the validation part is used for model decisions such as choosing
# hyperparameters or feature extraction methods.
# The test part is saved for a final one-time prediction.

In [None]:
# Bag-of-words and TF-IDF vectorizers from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Every training comment is already a list of word stems, so the analyzer simply
# hands the list through unchanged; no other settings are used.
train_tokens = train['body_stem_nostop']

count_vectorizer = CountVectorizer(analyzer=lambda tokens: tokens)
count_vectorizer.fit(train_tokens)

tfidf_vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
tfidf_vectorizer.fit(train_tokens)

# Turn the training comments into vectors with the fitted vectorizers
cv_data = count_vectorizer.transform(train_tokens)
tf_data = tfidf_vectorizer.transform(train_tokens)

In [None]:
# Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Both models predict the 'category' column; one is fitted on the count vectors
# (cv_data), the other on the TF-IDF vectors (tf_data).
train_labels = train["category"]

cv_logistic_model = LogisticRegression()
cv_logistic_model.fit(cv_data, train_labels)

tf_logistic_model = LogisticRegression()
tf_logistic_model.fit(tf_data, train_labels)

# Two made-up examples to try the models out.
# Note: the input has to be a list of token lists, the same format the vectorizers
# were fitted on.
example_tokens_cv = ["example", "word", "for", "prediction"]
example_tokens_tf = ["active", "concentrate"]
test_data_cv = count_vectorizer.transform([example_tokens_cv])
test_data_tf = tfidf_vectorizer.transform([example_tokens_tf])

# Let each model predict the category of its example
prediction_cv = cv_logistic_model.predict(test_data_cv)
prediction_tf = tf_logistic_model.predict(test_data_tf)

# Show both predictions
print("Prediction using CountVectorizer:", prediction_cv)
print("Prediction using TF-IDF:", prediction_tf)

In [None]:
# Import necessary modules from the Gensim library
import gensim.models.keyedvectors as word2vec
import gensim
import os
import urllib.request

# Location of the pre-trained German Word2Vec model and the local file it is stored in
url = 'https://cloud.devmount.de/d2bc5672c523b086/german.model'
local_path = 'word2vecgerman.model'

# Download the model only when it is not already present, so that re-running the
# cell (or the whole notebook) does not fetch the file again.
# The download goes to a temporary name first and is renamed afterwards; an
# interrupted download therefore never leaves a truncated file under local_path.
if not os.path.exists(local_path):
    partial_path = local_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, local_path)

# Load the German Word2Vec model from the local file
model = gensim.models.KeyedVectors.load_word2vec_format(local_path, binary=True)

In [None]:
# Import the numpy library for numerical operations
import numpy as np

# Extract the unique words known to the Word2Vec model.
# The set is built once; it is only used for the membership test below.
words = set(model.index_to_key)

# For every training comment, split the raw 'body' text on single spaces and stack
# the vectors of all tokens the model knows into one array (one row per token).
# A comment without any known token yields an empty array.
# NOTE(review): this reads the raw 'body' column, not 'body_cleaned', so a token
# with attached punctuation is only embedded if that exact string is in the
# vocabulary - confirm that this is intended.
train["embedd"] = train["body"].apply(
    lambda ls: np.array([model[i] for i in ls.split(" ") if i in words])
)

In [None]:
# One averaged vector per comment: the mean over the rows of its 'embedd' array,
# or a zero vector of length 300 when the comment has no word embeddings at all.
train_vect_avg = [
    np.mean(vectors, axis=0) if vectors.size else np.zeros(300, dtype=float)
    for vectors in train["embedd"]
]

# Fit a logistic regression on the averaged embeddings with 'category' as target.
w2v_features = pd.DataFrame(train_vect_avg)
w2v_model = LogisticRegression()
w2v_model.fit(w2v_features, train["category"])

In [None]:
# fastText utilities (model download) together with the fasttext package itself
import fasttext.util

# Fetch the pre-trained German language model unless it already exists locally
fasttext.util.download_model('de', if_exists='ignore')

# Load the model from its local file
fasttext_model_file = 'cc.de.300.bin'
ft = fasttext.load_model(fasttext_model_file)

In [None]:
# Ask the fastText model for one sentence vector per cleaned comment and keep it
# in the new 'embedd_fasttext' column
train["embedd_fasttext"] = train['body_cleaned'].apply(ft.get_sentence_vector)

# Spread each vector over separate columns, one column per vector component
fasttext_train = train["embedd_fasttext"].apply(pd.Series)

# Fit a logistic regression on the fastText embeddings with 'category' as target.
fasttext_model = LogisticRegression()
fasttext_model.fit(fasttext_train, train["category"])

In [None]:
# Word2Vec implementation from Gensim
from gensim.models import Word2Vec

# Fit our own Word2Vec model on the stemmed, stopword-free training comments.
# NOTE(review): this rebinds `w2v_model`, a name an earlier cell used for a
# LogisticRegression classifier.
w2v_model = Word2Vec(train["body_stem_nostop"], vector_size=500, window=5, min_count=10)

# Vocabulary of the freshly trained model, as a set for the membership test below
own_vectors = w2v_model.wv
words = set(own_vectors.index_to_key)


def _embed_stems(stems):
    """Stack the vectors of all stems the self-trained model knows into one array."""
    return np.array([own_vectors[stem] for stem in stems if stem in words])


# 'embedd_own' holds, per comment, the vectors of its known word stems
train["embedd_own"] = train["body_stem_nostop"].apply(_embed_stems)

# Average the vectors of each comment; comments without any known stem get a
# zero vector of length 500 instead
train_vect_avg = [
    matrix.mean(axis=0) if matrix.size else np.zeros(500, dtype=float)
    for matrix in train["embedd_own"]
]

# Fit a logistic regression on the averaged self-trained embeddings with 'category' as target.
self_trained_w2v_model = LogisticRegression()
self_trained_w2v_model.fit(pd.DataFrame(train_vect_avg), train["category"])

In [None]:
# Create a new column 'category_fasttext' by adding '__label__' to the existing 'category' column
train['category_fasttext'] = '__label__' + train['category']

# Combine 'category_fasttext' and preprocessed comments into a single string for each row
fasttext_train = train['category_fasttext'] + " " + train['body_stem_nostop'].apply(lambda word_list: ' '.join(word_list))

# Write the formatted data to a file named "train.txt", one comment per line.
# The file is opened with "w" so that re-running this cell replaces the file;
# appending would add every training line a second time.
with open("train.txt", "w", encoding="utf-8") as f_train:
    for line in fasttext_train:
        f_train.write(line + "\n")

# Train a supervised fastText model using the formatted data in "train.txt"
model_fasttext_selftrained = fasttext.train_supervised(input="train.txt")

# Example prediction for a string of wordstems.
model_fasttext_selftrained.predict(["beispiel wort"])

In [None]:
# PyTorch is only needed here to query the hardware
import torch

# True when a CUDA-capable GPU can be used for acceleration, False otherwise
cuda_backend = torch.cuda
is_cuda_available = cuda_backend.is_available()

# Report the outcome
print("CUDA (GPU) is available:", is_cuda_available)

In [None]:
# Dataset container and tokenizer loader from the Hugging Face libraries
from datasets import Dataset
from transformers import AutoTokenizer

# Copy the comment text into a 'text' column and encode the outcome as a number
# in a 'label' column: 'ADHS' becomes 0, 'depression_de' becomes 1.
category_names = ['ADHS', 'depression_de']
category_ids = [0, 1]
train["text"] = train["body"]
train["label"] = train["category"].replace(category_names, category_ids)

# Keep only the two columns needed for the transformer
train_transformer = train[["label", "text"]]

# Wrap the DataFrame in a Dataset, the format the transformers library works with.
# NOTE(review): this rebinds `data`, which held the full pandas DataFrame before.
data = Dataset.from_pandas(train_transformer)

# Load the tokenizer that belongs to the German DistilBERT model.
# Documentation of the model: https://huggingface.co/distilbert-base-german-cased
checkpoint_name = "distilbert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Tokenization helper that is mapped over the datasets
def tokenize(dataset):
    """Tokenize the "text" entry of ``dataset`` with the module-level ``tokenizer``.

    The text is truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(dataset["text"], **encoding_options)

# Run the helper over the training dataset
data_tokenized = data.map(function=tokenize)

In [None]:
# Model class and training utilities from the transformers library
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Pre-trained German DistilBERT with a sequence classification head for 2 labels.
# NOTE(review): this rebinds `model`, which held the Gensim word vectors before.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-german-cased", num_labels=2)

# Training configuration: results go to ./results, no reporting integrations, 5 epochs
training_options = {
    "output_dir": "./results",
    "report_to": [],
    "num_train_epochs": 5,
}
training_args = TrainingArguments(**training_options)

# The Trainer ties model, configuration and tokenized training data together
trainer = Trainer(model=model, args=training_args, train_dataset=data_tokenized)

# Start the fine-tuning run
trainer.train()

In [None]:
# Example predictions for the first 10 tokenized training comments.
# The result is kept in a variable; previously it was computed and then discarded.
example_predictions = trainer.predict(data_tokenized.select(range(10)))

# Save the fine-tuned model locally for later use.
trainer.save_model("distillbert_german_classification_reddit")

# Last expression of the cell, so the notebook displays the example predictions
example_predictions

In [None]:
# Zero-shot classifier and configuration object from the skllm library
from skllm import ZeroShotGPTClassifier
from skllm.config import SKLLMConfig

# The OpenAI key and organization are not used locally, but some string has to be set
placeholder_credential = "any string"
SKLLMConfig.set_openai_key(placeholder_credential)
SKLLMConfig.set_openai_org(placeholder_credential)

# Create the zero-shot classifier for the chosen openai_model
local_llm = "gpt4all::mistral-7b-instruct-v0.1.Q4_0.gguf"
clf = ZeroShotGPTClassifier(openai_model=local_llm)

# Fitting only registers the candidate labels; None is passed as the input data
candidate_labels = ["aufmerksamkeitsdefizit subreddit", "depression subreddit"]
clf.fit(None, candidate_labels)

# Classify a list of example comments
predictions = clf.predict(["Example Comment"])

In [None]:
# Import the ROC AUC metric from scikit-learn.
# It is first needed in this cell; without the import the call below raises a NameError.
from sklearn.metrics import roc_auc_score

# Train a TfidfVectorizer on the training data
tf = TfidfVectorizer(analyzer=lambda x: x).fit(train['body_stem_nostop'])

# Transform the training and validation data into TF-IDF vectors
tf_train = tf.transform(train['body_stem_nostop'])
tf_val = tf.transform(validation['body_stem_nostop'])

# Train a logistic regression model on the TF-IDF vectors
tflog = LogisticRegression().fit(tf_train, train["category"])

# Calculate the ROC AUC score on the validation set.
# The score passed in is the second column of predict_proba, i.e. the predicted
# probability of the class listed second in tflog.classes_.
roc_auc = roc_auc_score(validation["category"], tflog.predict_proba(tf_val)[:, 1])

# Print the ROC AUC score
print("ROCAUC Score:", roc_auc)

In [None]:
# The fastText model expects one string per comment, so the word stems of every
# validation comment are joined with single spaces.
validation_stems = validation["body_stem_nostop"]
validation["dataforfasttext"] = validation_stems.apply(' '.join)

# Row-wise prediction helper for the self-trained fastText model
def predict(row):
    """Return the fastText prediction for the 'dataforfasttext' entry of ``row``."""
    comment_text = row['dataforfasttext']
    return model_fasttext_selftrained.predict(comment_text)

# Turn a (label, probability) prediction into one probability per comment.
# This manual step is needed because the prediction only carries the probability
# of the more likely class.

def process_row(row):
    """Convert a ``(label, value)`` prediction into a probability for 'depression_de'.

    ``value[0]`` is taken as the probability of the predicted label. It is returned
    unchanged for '__label__depression_de' and as ``1 - value[0]`` for
    '__label__ADHS'. Any other label yields ``None``.
    """
    predicted_labels, probabilities = row
    # Checked in this order; the first matching label decides the conversion
    conversions = (
        ('__label__ADHS', lambda top_probability: 1 - top_probability),
        ('__label__depression_de', lambda top_probability: top_probability),
    )
    for label_name, convert in conversions:
        if label_name in predicted_labels:
            return convert(probabilities[0])
    return None

# Run the fastText model on every validation row and keep the raw output in a new
# 'predictions' column
fasttext_outputs = validation.apply(predict, axis=1)
validation['predictions'] = fasttext_outputs

# Convert each raw prediction into a probability and store it in a new 'proba' column
validation["proba"] = validation['predictions'].apply(process_row)

# ROC AUC of those probabilities against the true categories of the validation set
roc_auc = roc_auc_score(validation["category"], validation['proba'])

# Show the score
print("ROCAUC Score:", roc_auc)

In [None]:
# Load the fine-tuned model for sequence classification with 2 labels.
# The path is relative and names the directory the model was saved to with
# trainer.save_model("distillbert_german_classification_reddit"); a leading "/"
# would point at the filesystem root instead.
model_path = "distillbert_german_classification_reddit"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define a function to compute metrics, including ROC AUC score
def compute_metrics(pred):
    """Compute the ROC AUC for a two-class prediction result.

    Parameters:
        pred: object with the attributes ``label_ids`` (true labels) and
            ``predictions``. ``predictions`` is indexed as a 2-D array with one
            column per class; it is assumed to hold the raw model outputs
            (logits) - TODO confirm.

    Returns:
        dict with the single key 'rocauc'.
    """
    labels = pred.label_ids
    logits = pred.predictions
    # For two classes the softmax probability of class 1 is a monotone function of
    # logit_1 - logit_0, so this margin ranks the samples exactly like the
    # probability would. The class-1 logit on its own does not.
    scores = logits[:, 1] - logits[:, 0]
    roc = roc_auc_score(labels, scores)
    return {'rocauc': roc}

# Set up the arguments for the Trainer (no training happens here, only evaluation).
# The output directory is the same relative folder used for training rather than
# the filesystem root.
args = TrainingArguments(report_to=[], output_dir="./results")

# Initialize the Trainer with the model and training arguments
trainer = Trainer(model=model, args=args, compute_metrics=compute_metrics)

# Preparing the validation data: copy the text into 'text' and encode the category
# as a number ('ADHS' -> 0, 'depression_de' -> 1)
validation["text"] = validation["body"]
validation["label"] = validation["category"].replace(['ADHS', 'depression_de'], [0, 1])

validation_transformer = validation[["label", "text"]]

validation_data = Dataset.from_pandas(validation_transformer)

# Tokenize the text in the validation dataset using the defined function
validation_tokenized = validation_data.map(tokenize)

# Evaluate the model on the validation dataset using the Trainer
evaluation_result = trainer.evaluate(eval_dataset=validation_tokenized)

# Print the ROC AUC score.
# Trainer.evaluate prefixes every metric name with "eval_", so the value returned
# by compute_metrics under 'rocauc' is found under 'eval_rocauc'.
print("ROCAUC Score:", evaluation_result['eval_rocauc'])

In [None]:
# Preparing the test data: copy the text into 'text' and encode the category
# as a number ('ADHS' -> 0, 'depression_de' -> 1)
test["text"] = test["body"]
test["label"] = test["category"].replace(['ADHS', 'depression_de'], [0, 1])

test_transformer = test[["label", "text"]]

test_data = Dataset.from_pandas(test_transformer)

# Tokenize the text in the test dataset using the defined function
test_data_tokenized = test_data.map(tokenize)

# Evaluate the model on the test dataset using the Trainer
evaluation_result_test = trainer.evaluate(eval_dataset=test_data_tokenized)

# Print the ROC AUC score on the test dataset.
# Trainer.evaluate prefixes every metric name with "eval_", so the value returned
# by compute_metrics under 'rocauc' is found under 'eval_rocauc'.
print("ROCAUC Score on Test Data:", evaluation_result_test['eval_rocauc'])