In [3]:
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from transformers import pipeline, AutoTokenizer
import pandas as pd

In [None]:
# Read the cleaned article CSV into the working DataFrame.
# NOTE(review): this is a machine-specific absolute path; the cell will not
# run on another machine until the location is adjusted.
articles_csv_path = "/Users/reppmazc/Documents/IRONHACK/quests/final_project/cleaned_articles_wo_date.csv"
df = pd.read_csv(articles_csv_path)

In [8]:
# Compare three sentiment models on a random sample of articles
# (the sampling step further down in this cell draws 100 rows).
# Fetch the NLTK stopword corpus first so the German list can be read.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('german')}

# Step 1: Define text preprocessing function
def preprocess_text(text):
    """Normalize one article body into a space-joined string of tokens.

    Digits are deleted, every non-word character is replaced by a space, the
    text is lowercased, tokenized with gensim's ``simple_preprocess`` and then
    filtered against the module-level ``stop_words`` set.

    Args:
        text: Raw article text. Any value that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty document.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when no token
        survives or the input was not a string.
    """
    # Missing values would otherwise make re.sub raise TypeError and abort
    # the whole DataFrame .apply() that drives this function.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'\d+', '', text)  # Drop digit runs
    text = re.sub(r'\W', ' ', text)  # Replace every non-word character with a space
    text = text.lower()  # Convert to lowercase

    # NOTE(review): simple_preprocess is called with its defaults; per the
    # gensim docs these discard tokens shorter than 2 or longer than 15
    # characters, which may drop long German compound words -- confirm this
    # is intended.
    tokens = simple_preprocess(text)  # Tokenize

    # NOTE(review): if the German stopword list contains negations such as
    # "nicht"/"kein", removing them can change the sentiment of a sentence --
    # verify against the stopword list before trusting model comparisons.
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

# Clean every article body and keep the result next to the raw text.
# Note that this adds the 'processed_content' column to `df` itself.
raw_contents = df['content']
df['processed_content'] = raw_contents.apply(preprocess_text)

# Step 2: Draw a reproducible random sample of 100 articles (fixed seed)
sample_df = df.sample(random_state=1, n=100)

# Step 3: Register the checkpoints to compare and load one tokenizer per model.
# Keys are the short names used in the output column names; values are the
# identifiers handed to AutoTokenizer.from_pretrained / pipeline.
model_names = {
    "XLM-RoBERTa-German-sentiment": "ssary/XLM-RoBERTa-German-sentiment",
    "GermanFinBert_SC_Sentiment": "scherrmann/GermanFinBert_SC_Sentiment",
    "twitter-xlm-roberta-base-sentiment-finetunned": "citizenlab/twitter-xlm-roberta-base-sentiment-finetunned",
}

# Maximum number of tokens kept per text when truncating
max_token_length = 512

# One tokenizer per registered model, in the same order as `model_names`
tokenizers = dict(
    (short_name, AutoTokenizer.from_pretrained(checkpoint))
    for short_name, checkpoint in model_names.items()
)

# Truncate each text if it exceeds the maximum length for each model
def truncate_text(text, tokenizer, max_length=None):
    """Cut ``text`` down to a token budget and return it as a string again.

    The text is encoded with truncation enabled and the resulting token ids
    are decoded back to text with special tokens skipped.

    Args:
        text: The string to shorten.
        tokenizer: Object exposing ``encode(text, truncation=..., max_length=...)``
            and ``decode(ids, skip_special_tokens=...)``.
        max_length: Token limit passed to ``encode``. When omitted, the
            module-level ``max_token_length`` is used, which is the previous
            behaviour.

    Returns:
        The string produced by decoding the (possibly truncated) token ids.
    """
    # Only fall back to the notebook-wide constant when no explicit limit is
    # given, so the helper can also be used with a different budget.
    limit = max_token_length if max_length is None else max_length
    token_ids = tokenizer.encode(text, truncation=True, max_length=limit)
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# Build one truncated-text column per model, each cut with that model's own tokenizer
for model_name, tokenizer in tokenizers.items():
    cleaned_texts = sample_df['processed_content']
    sample_df[f'processed_content_{model_name}'] = cleaned_texts.apply(truncate_text, args=(tokenizer,))

# Step 4: Build one sentiment pipeline per model, pairing each checkpoint with
# the tokenizer loaded for it and passing the same truncation settings.
pipelines = {
    short_name: pipeline(
        "sentiment-analysis",
        model=checkpoint,
        tokenizer=tokenizers[short_name],
        max_length=max_token_length,
        truncation=True,
    )
    for short_name, checkpoint in model_names.items()
}

# Run every model over its own truncated column and keep only the label of
# the first result returned by the pipeline for each text.
for model_name, sentiment_pipeline in pipelines.items():
    input_column = f'processed_content_{model_name}'
    label_column = f'sentiment_{model_name}'
    sample_df[label_column] = sample_df[input_column].apply(
        lambda article_text: sentiment_pipeline(article_text)[0]['label']
    )

# Step 5: Write the sampled articles plus all sentiment columns to CSV
# (relative path, row index omitted) and report completion.
results_csv_path = "sample_processed_articles_with_multiple_sentiments.csv"
sample_df.to_csv(results_csv_path, index=False)

print("Sentiment analysis with multiple models, including the Twitter model, completed and results saved to 'sample_processed_articles_with_multiple_sentiments.csv'")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reppmazc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Sentiment analysis with multiple models, including the Twitter model, completed and results saved to 'sample_processed_articles_with_multiple_sentiments.csv'
