In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from transformers import AutoTokenizer, pipeline

# Load the datasets
articles_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_toKenized_topic.csv')
topics_df = pd.read_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/df_topics.csv')

# Convert column names to lowercase for consistency
topics_df.columns = map(str.lower, topics_df.columns)
articles_df.columns = map(str.lower, articles_df.columns)

# Merge the dataframes on the 'topic' column using a left join
merged_df = articles_df.merge(topics_df, on='topic', how='left')

# Save the merged file (optional)
merged_df.to_csv('/Users/reppmazc/Documents/IRONHACK/quests/final_project/topics.csv')

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

# Filter for the specific topic (174)
filtered_df = merged_df[merged_df['topic'] == 174]

# Define text preprocessing function
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    tokens = simple_preprocess(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Apply preprocessing to the 'content' column
filtered_df['processed_content'] = filtered_df['content'].apply(preprocess_text)

# Load tokenizer and pipeline for sentiment analysis
model_name = "ssary/XLM-RoBERTa-German-sentiment"  # Specify the model for German sentiment analysis
max_token_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Truncate text for tokenization
def truncate_text(text, tokenizer):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_token_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

filtered_df['processed_content'] = filtered_df['processed_content'].apply(lambda x: truncate_text(x, tokenizer))

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer, max_length=max_token_length, truncation=True)

# Apply the model to get sentiment
filtered_df['sentiment'] = filtered_df['processed_content'].apply(lambda x: sentiment_pipeline(x)[0]['label'])

# Replace topic numbers with topic names in the final results
filtered_df['topic'] = '174_gendern_gender_sprache_rechtschreibung'  # Manually setting the topic name for this specific analysis

# Save filtered and processed results to CSV (optional)
filtered_df.to_csv(f"filtered_topic_sentiment_test_{model_name.split('/')[0]}.csv", index=False)
print(f"Sentiment analysis completed for topic 174. Results saved to 'filtered_topic_sentiment_test_{model_name.split('/')[0]}.csv'")

# Analyze sentiment by source
sentiment_by_source = filtered_df.groupby('source')['sentiment'].value_counts(normalize=True).unstack(fill_value=0)
print("Sentiment distribution by source for topic 174:")
print(sentiment_by_source)

# Optional: Save distribution table to CSV
sentiment_by_source.to_csv(f"sentiment_distribution_topic_174_by_source_{model_name.split('/')[0]}.csv")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reppmazc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['processed_content'] = filtered_df['content'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['processed_content'] = filtered_df['processed_content'].apply(lambda x: truncate_text(x, tokenizer))
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model w

Sentiment analysis completed for topic 174. Results saved to 'filtered_topic_sentiment_test_ssary.csv'
Sentiment distribution by source for topic 174:
sentiment     negative   neutral
source                          
bild          0.333333  0.666667
faz           0.250000  0.750000
focus         0.000000  1.000000
sueddeutsche  0.250000  0.750000
tagesschau    0.000000  1.000000
tagesspiegel  0.333333  0.666667
welt          0.000000  1.000000
zeit          0.000000  1.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['sentiment'] = filtered_df['processed_content'].apply(lambda x: sentiment_pipeline(x)[0]['label'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['topic'] = '174_gendern_gender_sprache_rechtschreibung'  # Manually setting the topic name for this specific analysis


# Sentiment model functionality test

In [10]:
from transformers import AutoTokenizer, pipeline

# Load the sentiment analysis model
model_name = "ssary/XLM-RoBERTa-German-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=tokenizer)

# Function to analyze sentiment of individual sentences or phrases
def analyze_sentence(sentence):
    result = sentiment_pipeline(sentence)
    sentiment = result[0]['label']
    confidence = result[0]['score']
    return sentiment, confidence

# Examples of single sentences or word combinations
test_sentences = [
    "Die Debatte um das Gendern ist umstritten.",
    "Das Gendersternchen ist eine Schande für die Gesellschaft",
    "Inklusive Sprache beinhaltet die Nutzung des Gendersternchens",
    "Am Samstag wurden zwei Männer tot aufgefunden",
    "Am Samstag wurden zwei Männer brutal abgeschlachtet"]

# Analyze each sentence
for sentence in test_sentences:
    sentiment, confidence = analyze_sentence(sentence) 
    print(f"Sentence: {sentence}\nSentiment: {sentiment} (Confidence: {confidence:.2f})\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentence: Die Debatte um das Gendern ist umstritten.
Sentiment: neutral (Confidence: 1.00)

Sentence: Das Gendersternchen ist eine Schande für die Gesellschaft
Sentiment: negative (Confidence: 0.99)

Sentence: Inklusive Sprache beinhaltet die Nutzung des Gendersternchens
Sentiment: neutral (Confidence: 1.00)

Sentence: Am Samstag wurden zwei Männer tot aufgefunden
Sentiment: neutral (Confidence: 0.76)

Sentence: Am Samstag wurden zwei Männer brutal abgeschlachtet
Sentiment: negative (Confidence: 0.96)

