In [None]:
import pandas as pd

# Read the scraped dataset from disk into a DataFrame.
DATASET_CSV = "data_tutorial.csv"
data = pd.read_csv(DATASET_CSV)

In [None]:
import re

# Character class matching everything that is neither a word character nor whitespace.
pattern = r'[^\w\s]'
non_word_chars = re.compile(pattern)

# Remove every matched character from each post body.
data['body_cleaned'] = data['body'].apply(lambda text: non_word_chars.sub('', text))

# The corpus contains "\n" line breaks. They do not appear to carry relevant
# information, so each one is replaced by a space before the text is lower-cased.
without_linebreaks = data['body_cleaned'].str.replace("\n"," ")
data['body_cleaned'] = without_linebreaks.str.lower()

In [None]:
from nltk.stem.cistem import Cistem

# Cistem is a stemmer built specifically for the German language.
st = Cistem()

# Split every cleaned body on whitespace and stem each token; every row of the
# new column holds the resulting list of stems.
data["stemmed"] = data['body_cleaned'].apply(
    lambda text: list(map(st.stem, text.split()))
)

In [None]:
from nltk.corpus import stopwords

# NLTK's German stop-word list (plain, unstemmed word forms).
german_stop_words = stopwords.words('german')

# data["stemmed"] holds tokens that already went through the Cistem stemmer `st`,
# so a stop word whose stem differs from its surface form would never match the
# raw list (the hand-written example tokens further down still contain "fur").
# Compare against both the raw and the stemmed forms; a set keeps every
# membership test O(1) instead of scanning the whole list per token.
stop_word_forms = set(german_stop_words) | {st.stem(word) for word in german_stop_words}

data['body_stem_nostop'] = data["stemmed"].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_forms]
)

In [None]:
from sklearn.model_selection import train_test_split

# First split: 30 % of the rows are held out, the rest is used for training.
train, holdout = train_test_split(data, test_size=0.3, random_state=123)

# Second split: 33 % of the held-out rows become the test set, the remainder
# the validation set.
validation, test = train_test_split(holdout, test_size=0.33, random_state=123)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The documents are already lists of tokens, so the analyzer simply hands each
# list back instead of tokenizing again.
train_tokens = train['body_stem_nostop']

cv = CountVectorizer(analyzer=lambda tokens: tokens)
cv.fit(train_tokens)

tf = TfidfVectorizer(analyzer=lambda tokens: tokens)
tf.fit(train_tokens)

# Document-term matrices of the training set: raw counts and TF-IDF weights.
cv_data, tf_data = cv.transform(train_tokens), tf.transform(train_tokens)

In [None]:
from sklearn.linear_model import LogisticRegression

# One logistic regression per feature representation (counts and TF-IDF).
cvlog = LogisticRegression().fit(cv_data, train["category"])
tflog = LogisticRegression().fit(tf_data, train["category"])

# Hand-written token lists used as a quick sanity check of both classifiers.
test_data_cv = cv.transform([["beispiel", "wort", "fur", "vorhersag"]])
test_data_tf = tf.transform([["aktiv", "konzentration"]])

# A cell only displays its last expression. Returning both predictions as one
# tuple shows the count-based result too instead of silently discarding it.
cvlog.predict(test_data_cv), tflog.predict(test_data_tf)

In [None]:
import gensim.models.keyedvectors as word2vec
import gensim
import urllib.request 

import os

WORD2VEC_URL = 'https://cloud.devmount.de/d2bc5672c523b086/german.model'
WORD2VEC_FILE = 'word2vecgerman.model'

# Download the pre-trained German word2vec model only when it is not on disk
# yet, so re-running the notebook does not fetch the file again. The download
# goes to a temporary name first and is renamed afterwards, so an interrupted
# transfer is never mistaken for a complete model file.
if not os.path.exists(WORD2VEC_FILE):
    partial_file = WORD2VEC_FILE + '.part'
    urllib.request.urlretrieve(WORD2VEC_URL, partial_file)
    os.replace(partial_file, WORD2VEC_FILE)

# Load the vectors from the binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format('./word2vecgerman.model', binary=True)

In [None]:
import numpy as np

# Vocabulary of the pre-trained embedding as a set, for fast membership tests.
words = set(model.index_to_key)


def _embed_known_tokens(text):
    """Return the stacked embedding vectors of all space-separated tokens of `text` found in `words`.

    The result is empty (size 0) when no token is part of the vocabulary.
    """
    return np.array([model[token] for token in text.split(" ") if token in words])


# Every document yields an array with a different number of rows. Wrapping such
# ragged arrays in one np.array(...) raises on NumPy >= 1.24, so they are stored
# one array per row in an object column instead.
train["embedd"] = train["body"].apply(_embed_known_tokens)

In [None]:
# Collapse every document to the mean of its word vectors. Documents without a
# single known token (empty array) get a zero vector of length 300 instead
# (assumed to match the embedding size of the loaded model).
train_vect_avg = [
    np.mean(doc_vectors, axis=0) if doc_vectors.size else np.zeros(300, dtype=float)
    for doc_vectors in train["embedd"]
]

# Logistic regression on the averaged document vectors.
averaged_features = pd.DataFrame(train_vect_avg)
w2vmodel = LogisticRegression().fit(averaged_features, train["category"])

In [None]:
import fasttext.util

# Fetch the pre-trained German fastText model; if_exists='ignore' is passed so
# an existing local file is presumably left untouched (see fastText util docs).
fasttext.util.download_model('de', if_exists='ignore')

# Load the binary model file from the working directory.
FASTTEXT_MODEL_FILE = 'cc.de.300.bin'
ft = fasttext.load_model(FASTTEXT_MODEL_FILE)

In [None]:
# One sentence embedding per cleaned post. body_cleaned is used because its
# line breaks were already replaced by spaces.
train["embedd_fasttext"] = train['body_cleaned'].apply(ft.get_sentence_vector)

# Spread every vector over one column per dimension. Building the frame in a
# single call avoids creating a separate pandas Series for every row, which is
# what .apply(lambda x: pd.Series(x)) does.
fasttext_train = pd.DataFrame(train["embedd_fasttext"].tolist(), index=train.index)

fastmodel = LogisticRegression().fit(fasttext_train, train["category"])

In [None]:
from gensim.models import Word2Vec

# Train a word2vec model on the stemmed, stop-word-free training documents.
w2v_model = gensim.models.Word2Vec(train["body_stem_nostop"],
                                   vector_size=500,
                                   window=5,
                                   min_count=10)

# Vocabulary that survived min_count, as a set for fast membership tests.
words = set(w2v_model.wv.index_to_key)

# One array of word vectors per document. The arrays have different numbers of
# rows; wrapping them in a single np.array(...) raises on NumPy >= 1.24, so they
# are stored one array per row in an object column.
train["embedd_own"] = train["body_stem_nostop"].apply(
    lambda tokens: np.array([w2v_model.wv[token] for token in tokens if token in words])
)

# Average the word vectors per document. Documents without any known token get a
# zero vector whose length is read from the model rather than repeated as a
# literal, so it cannot drift from vector_size above.
embedding_dim = w2v_model.wv.vector_size
train_vect_avg = [
    doc_vectors.mean(axis=0) if doc_vectors.size else np.zeros(embedding_dim, dtype=float)
    for doc_vectors in train["embedd_own"]
]

selftrainedw2v = LogisticRegression().fit(pd.DataFrame(train_vect_avg), train["category"])

In [None]:
# fastText's supervised format: "__label__<category> <space-separated tokens>".
train['category_fasttext'] = '__label__' + train['category']
fasttexttrain = train['category_fasttext'] + " " + train['body_stem_nostop'].apply(' '.join)

# Open with "w", not "a": in append mode every re-run of this cell adds another
# full copy of the training data to train.txt. The context manager closes the
# file even if writing fails.
with open("train.txt", "w", encoding="utf-8") as f_train:
    f_train.writelines(line + "\n" for line in fasttexttrain)

model_fasttext_selftrained = fasttext.train_supervised(input="train.txt")

In [None]:
# Quick sanity check: classify a single token with the self-trained fastText model.
sample_text = "sauf"
model_fasttext_selftrained.predict(sample_text)

In [None]:
import torch 
# Displays whether PyTorch reports an available CUDA device.
torch.cuda.is_available()


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Copy the raw post text into a "text" column for the transformer pipeline.
train["text"] = train["body"]

# Encode the two categories as integers with an explicit mapping. The list-based
# replace() relied on pandas implicitly downcasting the replaced column to int
# (deprecated since pandas 2.2) and let unknown categories through as strings;
# map() + astype() fails immediately on an unexpected category instead.
label_ids = {'ADHS': 0, 'depression_de': 1}
train["label"] = train["category"].map(label_ids).astype("int64")

train_transformer = train[["label", "text"]]
# NOTE: this rebinds `data`, which held the raw DataFrame earlier in the notebook.
data = Dataset.from_pandas(train_transformer)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")

def tokenize(dataset):
    """Tokenize the "text" field of `dataset` with the module-level `tokenizer`.

    Truncation is enabled and the output is padded to max_length=512.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(dataset["text"], **encoding_options)

# Tokenize in batches: the tokenizer accepts a list of texts, so batched=True
# yields the same columns as per-example mapping with far fewer tokenizer calls.
data_tokenized = data.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# German DistilBERT checkpoint with a two-class sequence-classification head.
# NOTE: this rebinds `model`, which held the word2vec vectors earlier on.
PRETRAINED_CHECKPOINT = "distilbert-base-german-cased"
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, num_labels=2)

# Five epochs, outputs written to ./results, experiment reporting switched off.
training_args = TrainingArguments(output_dir="./results", report_to="none", num_train_epochs=5)

# Fine-tune the model on the tokenized training set.
trainer = Trainer(model=model, args=training_args, train_dataset=data_tokenized)
trainer.train()

In [None]:
# Persist the fine-tuned model first ...
trainer.save_model("distillbert_german_classification_reddit")

# ... and keep the prediction as the cell's last expression. Previously
# save_model() came last, so the predictions for the first ten training
# examples were computed but never displayed.
trainer.predict(data_tokenized.select(range(10)))

In [None]:
from skllm import ZeroShotGPTClassifier
from skllm.config import SKLLMConfig

# Placeholder credentials: per the strings themselves, the values are not used
# for the local model but still have to be set.
SKLLMConfig.set_openai_key("not used as locally, but needed anyway.")
SKLLMConfig.set_openai_org("any string can be used")

# Zero-shot classifier backed by a gpt4all model.
candidate_labels = ["aufmerksamkeitsdefizit subreddit", "depression subreddit"]
clf = ZeroShotGPTClassifier(openai_model="gpt4all::mistral-7b-instruct-v0.1.Q4_0.gguf")

# No training data is passed (X is None); only the candidate labels are supplied.
clf.fit(None, candidate_labels)
clf.predict(["Example Comment"])

In [None]:
# Refit the TF-IDF vectorizer and the logistic regression on the training set, then score the classifier on the validation set.
from sklearn.metrics import roc_auc_score

# TF-IDF vectorizer over the pre-tokenized training documents (the analyzer
# passes the token lists through unchanged).
train_tokens = train['body_stem_nostop']
tf = TfidfVectorizer(analyzer=lambda tokens: tokens)
tf.fit(train_tokens)

tf_train = tf.transform(train_tokens)
tf_val = tf.transform(validation['body_stem_nostop'])

tflog = LogisticRegression()
tflog.fit(tf_train, train["category"])

# Score the validation set with the probability of the second class in tflog.classes_.
validation_scores = tflog.predict_proba(tf_val)[:, 1]
roc_auc_score(validation["category"], validation_scores)
# ROC AUC score of 0.91 (as noted by the author for their run)

In [None]:
# Join each validation row's token list back into one space-separated string.
validation["dataforfasttext"] = validation["body_stem_nostop"].apply(' '.join)

def predict(row):
    """Classify one row's 'dataforfasttext' text with the self-trained fastText model.

    Returns whatever `model_fasttext_selftrained.predict` returns for that text.
    """
    text = row['dataforfasttext']
    return model_fasttext_selftrained.predict(text)

def process_row(row):
    """Turn a (labels, confidences) prediction pair into a score for 'depression_de'.

    If the labels contain '__label__ADHS', the score is one minus the first
    confidence; if they contain '__label__depression_de', it is the first
    confidence itself. Any other label yields None.
    """
    predicted_labels, confidences = row

    is_adhs = '__label__ADHS' in predicted_labels
    if not is_adhs and '__label__depression_de' not in predicted_labels:
        return None

    top_confidence = confidences[0]
    return 1 - top_confidence if is_adhs else top_confidence
    
# Raw fastText output for every validation row.
raw_predictions = validation.apply(predict, axis=1)
validation['predictions'] = raw_predictions

# Convert each prediction into a single score via process_row.
validation["proba"] = validation["predictions"].apply(process_row)

fasttext_scores = validation["proba"]
roc_auc_score(validation["category"], fasttext_scores)


In [None]:
def compute_metrics(pred):
    """Compute the ROC AUC of a two-class classifier from an evaluation result.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores with at least two columns; assumed to be the
            model's raw logits - TODO confirm against the Trainer in use).

    Returns:
        dict with the single key 'rocauc'.
    """
    labels = pred.label_ids
    preds = pred.predictions
    # Predictions may arrive as a tuple; the per-class scores are then taken
    # from its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # Rank by the margin between the class-1 and class-0 scores. For logits the
    # margin is a monotonic function of the softmax probability of class 1,
    # whereas the class-1 logit on its own is not, which can distort the AUC.
    # For probabilities that sum to one the ranking is unchanged.
    scores = preds[:, 1] - preds[:, 0]

    roc = roc_auc_score(labels, scores)
    return {'rocauc': roc}

In [None]:
from transformers import pipeline

# Load the fine-tuned model from the directory written by trainer.save_model()
# earlier in this notebook, relative to the working directory, instead of an
# absolute path that only exists on one machine.
model = AutoModelForSequenceClassification.from_pretrained("./distillbert_german_classification_reddit", num_labels=2)

# report_to="none" (the string) disables experiment reporting, matching the
# training cell; report_to=None does not. Outputs go to ./results rather than
# the filesystem root.
trainer = Trainer(
    model=model,
    args=TrainingArguments(report_to="none", output_dir="./results"),
    compute_metrics=compute_metrics,
)

validation["text"] = validation["body"]
# Explicit mapping instead of list-based replace(), which relies on implicit
# downcasting (deprecated since pandas 2.2); unknown categories fail loudly.
validation["label"] = validation["category"].map({'ADHS': 0, 'depression_de': 1}).astype("int64")

validation_transformer = validation[["label", "text"]]
# Use a separate name for the Dataset so `validation` stays a DataFrame and
# this cell can be re-run.
validation_dataset = Dataset.from_pandas(validation_transformer)

validation_tokenized = validation_dataset.map(tokenize)

trainer.evaluate(eval_dataset=validation_tokenized)

In [None]:
# Prepare the test set for the transformer pipeline: a "text" and a "label" column.
test["text"] = test["body"]
# Explicit mapping instead of list-based replace(), which relies on implicit
# downcasting (deprecated since pandas 2.2); an unexpected category now fails
# here instead of passing through as a string label.
test["label"] = test["category"].map({'ADHS': 0, 'depression_de': 1}).astype("int64")

test_transformer = test[["label", "text"]]
testdata = Dataset.from_pandas(test_transformer)

testdata_tokenized = testdata.map(tokenize)

# Evaluation on the held-out test set with the Trainer's compute_metrics.
trainer.evaluate(eval_dataset=testdata_tokenized)