In [1]:
import pandas as pd

# Load the sentiment-annotated dataset.
df = pd.read_csv("../data/processed/sentiment_annotated_data.csv")

# Relative frequency of every value in the sentiment_label column.
label_shares = df["sentiment_label"].value_counts(normalize=True)
print(label_shares)


sentiment_label
LABEL_1    0.840846
ERROR      0.112818
LABEL_0    0.024679
LABEL_2    0.021657
Name: proportion, dtype: float64


In [3]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below (a no-op when they are already up to date).
nltk.download('punkt')
# NLTK 3.9 and later load the Punkt tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' models; without it word_tokenize raises a
# LookupError on a fresh environment. Downloading both keeps older and newer
# NLTK releases working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): only English stop words are removed. The token counts printed
# later in this notebook still contain German function words ('und', 'die',
# 'der'), so the corpus is not purely English -- decide whether German stop
# words should be filtered as well.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load data
df = pd.read_csv("../data/processed/cleaned_social_media_data.csv")

# Build the text to analyze (title + description); missing values become empty strings
df["text"] = df["title"].fillna("") + " " + df["description/text"].fillna("")

# Preprocessing & tokenization function
def tokenize_and_lemmatize(text):
    """Convert one raw text string into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lowercase the text,
      2. delete URL-like substrings (anything starting with ``http`` or
         ``www`` up to the next whitespace),
      3. replace every run of non-word characters with a single space,
      4. split into tokens with NLTK's ``word_tokenize``,
      5. drop tokens that are in ``stop_words`` or have two or fewer characters,
      6. lemmatize the surviving tokens with ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', lowered)
    normalized = re.sub(r'\W+', ' ', without_urls)

    lemmas = []
    for token in word_tokenize(normalized):
        # The filter looks at the raw token, i.e. before lemmatization.
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Create the token column: one list of tokens per row
token_lists = df["text"].apply(tokenize_and_lemmatize)
df["tokens"] = token_lists

# Optional: tokens as one space-joined string (for later vectorizing)
df["tokenized_text"] = df["tokens"].apply(" ".join)

# Save
df.to_csv("../data/processed/tokenized_social_media_data.csv", index=False)
print("Tokenisierte Datei gespeichert unter: data/processed/tokenized_social_media_data.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SofiePischl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SofiePischl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SofiePischl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokenisierte Datei gespeichert unter: data/processed/tokenized_social_media_data.csv


In [4]:
from collections import Counter
from itertools import chain

# Flatten the per-row token lists into a single list, keeping row order.
all_tokens = [token for row_tokens in df["tokens"] for token in row_tokens]

# Show the 30 most frequent tokens together with their counts.
token_counts = Counter(all_tokens)
print(token_counts.most_common(30))

[('subreddits', 2231), ('community', 1960), ('year', 1529), ('subscriber', 1436), ('trending', 1172), ('subreddit', 646), ('interesting', 603), ('2021', 579), ('und', 449), ('die', 439), ('also', 411), ('discussion', 394), ('work', 389), ('feel', 376), ('video', 357), ('please', 355), ('discus', 344), ('based', 336), ('free', 331), ('started', 331), ('der', 328), ('like', 328), ('small', 324), ('keep', 320), ('try', 314), ('comment', 304), ('front', 303), ('page', 302), ('check', 300), ('hope', 300)]


In [5]:
import pyLDAvis
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

try:
    # pyLDAvis 3.4 renamed the scikit-learn adapter module to `lda_model`.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    # Older pyLDAvis releases still expose the adapter as `pyLDAvis.sklearn`.
    import pyLDAvis.sklearn as pyldavis_sklearn

# No other cell of this notebook defines `vectorizer`, `dtm` or `lda`, so they
# are built here; otherwise this cell only works with leftover kernel state.
# Assumes `df` still carries the space-joined `tokenized_text` column created
# by the tokenization cell.
N_LDA_TOPICS = 10  # scikit-learn's default topic count; tune for the corpus
LDA_SEED = 42      # fixed seed so repeated runs produce the same topics

vectorizer = CountVectorizer()
# fillna guards against rows whose joined token string is missing (for example
# after the frame was reloaded from CSV, where empty strings come back as NaN).
dtm = vectorizer.fit_transform(df["tokenized_text"].fillna(""))
lda = LatentDirichletAllocation(n_components=N_LDA_TOPICS, random_state=LDA_SEED)
lda.fit(dtm)

pyLDAvis.enable_notebook()
panel = pyldavis_sklearn.prepare(lda, dtm, vectorizer)
pyLDAvis.save_html(panel, "lda_topics.html")  # writes a clickable overview


ModuleNotFoundError: No module named 'pyLDAvis'

In [6]:
# NOTE(review): `lda` and `dtm` are not defined in this cell; the last recorded
# run of this cell failed with a NameError for `lda`. Presumably `lda` is a
# fitted topic model and `dtm` the matching document-term matrix -- confirm
# where they are meant to come from.
topic_distributions = lda.transform(dtm)

# For every row, keep the column index holding the largest value.
dominant_topic_ids = topic_distributions.argmax(axis=1)
df["dominant_topic"] = dominant_topic_ids


NameError: name 'lda' is not defined

In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# === Configurable paths ===
# Root data folder; taken from the DATA_DIR environment variable when it is set.
BASE_DIR = os.environ.get("DATA_DIR", "../data")

# Input CSV, topic-assignment CSV and HTML visualization, all below BASE_DIR.
INPUT_FILE, OUTPUT_TOPIC_FILE, TOPIC_VIS_FILE = (
    os.path.join(BASE_DIR, relative_path)
    for relative_path in (
        "processed/cleaned_social_media_data.csv",
        "processed/bertopic_topics.csv",
        "processed/bertopic_visualization.html",
    )
)

# === Load data ===
df = pd.read_csv(INPUT_FILE)

# Replace missing values with "" before casting to str. A bare astype(str)
# turns NaN into the literal text "nan", which would then be embedded and
# clustered like a real document. The list keeps one entry per DataFrame row.
texts = df["description/text"].fillna("").astype(str).tolist()

# === Prepare embeddings ===
# Small & fast sentence-transformers checkpoint.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# === Train the BERTopic model ===
# Topic keywords are built from unigrams and bigrams with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
topic_model = BERTopic(vectorizer_model=vectorizer_model)
# The embeddings computed above are handed to BERTopic together with the texts.
# NOTE(review): no random seed is set anywhere in this cell, so topic
# assignments may differ between runs -- confirm whether that is acceptable.
topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)

# === Save results ===
# presumably `topics` and `probs` hold one value per input text -- verify
df["topic"] = topics
df["topic_probability"] = probs
df.to_csv(OUTPUT_TOPIC_FILE, index=False)

# === Show top topics ===
# The heading announces ten topics, so print only the ten most frequent ones
# instead of every topic the model found. The outlier topic (-1) is excluded.
topic_keywords = topic_model.get_topics()
topic_freq = topic_model.get_topic_freq().sort_values("Count", ascending=False)
top_topic_ids = topic_freq.loc[topic_freq["Topic"] != -1, "Topic"].head(10)

print("\nTop 10 Themen mit Stichwörtern:")
for topic_id in top_topic_ids:
    words = topic_keywords[topic_id]
    # Each entry's first element is the keyword; show the first five per topic.
    print(f"\nTopic {topic_id}: {[w[0] for w in words[:5]]}")

# === Example texts per topic ===
# Walk the first five rows of the topic-frequency table, skipping the outlier topic (-1).
for topic_id in topic_model.get_topic_freq().head(5)["Topic"]:
    if topic_id == -1:
        continue
    print(f"\n=== Beispiele für Topic {topic_id} ===")
    topic_texts = df.loc[df["topic"] == topic_id, "description/text"]
    # Rows with a missing description hold NaN (a float); slicing it would
    # raise TypeError, so missing values are skipped and the rest cast to str.
    for text in topic_texts.dropna().astype(str).head(3):
        print("-", text[:200])

# === Save the interactive visualization ===
# Build the topic figure first, then export it as an HTML file.
topic_figure = topic_model.visualize_topics()
topic_figure.write_html(TOPIC_VIS_FILE)
print(f"\n✅ Interaktive Visualisierung gespeichert unter: {TOPIC_VIS_FILE}")
