<a href="https://colab.research.google.com/github/stepanjaburek/workingpaper_czech_psp_speeches/blob/main/Streamline_Translation_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine translation using the OPUS-MT model from the University of Helsinki**

# Setup

In [None]:
!pip install transformers sentencepiece sacremoses torch tqdm

import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from tqdm.notebook import tqdm
import torch

In [None]:
def translate_csv(file_path, source_lang='cs', target_lang='en', batch_size=8,
                  text_column='context_full', output_column=None):
    """Translate one text column of a CSV file with a Helsinki-NLP OPUS-MT model.

    The file is read with pandas, every non-blank cell of ``text_column`` is
    translated in batches, and the translations are stored in a new column of
    the returned frame. Texts longer than 512 tokens are truncated by the
    tokenizer before they are translated.

    Args:
        file_path: Path of the CSV file to read.
        source_lang: Source language code used to build the model name.
        target_lang: Target language code used to build the model name.
        batch_size: Number of texts sent through the model per step.
        text_column: Name of the column holding the texts to translate.
        output_column: Name of the column that receives the translations;
            defaults to ``f'translated_{text_column}'``.

    Returns:
        The DataFrame read from ``file_path`` with ``output_column`` added.
        Rows whose text is missing or blank receive an empty string.

    Raises:
        KeyError: If ``text_column`` is not a column of the file.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    df = pd.read_csv(file_path)
    # Fail before the (slow) model download/load if the input is unusable.
    if text_column not in df.columns:
        raise KeyError(f"column {text_column!r} not found in {file_path!r}")
    if output_column is None:
        output_column = f'translated_{text_column}'

    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = MarianMTModel.from_pretrained(model_name).to(device)

    def translate_batch(texts):
        """Translate a list of strings and return the decoded outputs in order."""
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Tokenizer output must live on the same device as the model weights.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            return tokenizer.batch_decode(model.generate(**inputs), skip_special_tokens=True)

    # read_csv turns empty cells into NaN (a float), which the tokenizer cannot
    # handle; normalise everything to str and only translate rows with content.
    texts = df[text_column].fillna('').astype(str).tolist()
    pending = [pos for pos, text in enumerate(texts) if text.strip()]

    translations = [''] * len(texts)
    for start in tqdm(range(0, len(pending), batch_size)):
        chunk = pending[start:start + batch_size]
        translated = translate_batch([texts[pos] for pos in chunk])
        for pos, text in zip(chunk, translated):
            translations[pos] = text

    df[output_column] = translations
    return df

# Translation left


In [None]:
# Translate left.csv and write the result under /content, the location the
# sentiment section later reads left_translated.csv from.
translated_df = translate_csv('/content/left.csv')
translated_df.to_csv('/content/left_translated.csv', index=False)

# Translation right

In [None]:
# Translate right.csv and write the result under /content, the location the
# sentiment section later reads right_translated.csv from.
translated_df = translate_csv('/content/right.csv')
translated_df.to_csv('/content/right_translated.csv', index=False)

# **Sentiment classification using the Political DEBATE model by Burnham et al. (2024)**

# Setup

In [None]:
import torch
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
def analyze_sentiments(df, classifier, classes, hypothesis, batch_size=16,
                       text_column='translated_context_full'):
    """Label every text in ``df[text_column]`` with a zero-shot classifier.

    Args:
        df: DataFrame holding the texts to classify.
        classifier: Callable invoked as ``classifier(texts, classes,
            hypothesis_template=..., multi_label=False, batch_size=...)`` that
            returns, per text, a dict with parallel ``'labels'`` and
            ``'scores'`` lists whose first entries are taken as the prediction.
        classes: Candidate labels handed to the classifier.
        hypothesis: Hypothesis template handed to the classifier.
        batch_size: Number of texts passed to the classifier per call.
        text_column: Name of the column holding the texts.

    Returns:
        A DataFrame indexed like ``df`` with columns ``label``, ``score`` and
        one ``<label>_score`` column per label the classifier returned. Rows
        whose text is missing are left as NaN instead of being classified.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    texts = df[text_column]
    # Empty cells come back from read_csv as NaN; they cannot be classified,
    # so only the positions that actually hold a value are sent to the model.
    valid_positions = [pos for pos, present in enumerate(texts.notna()) if present]

    # One (initially empty) record per input row keeps output aligned with df.
    rows = [{} for _ in range(len(df))]
    for start in tqdm(range(0, len(valid_positions), batch_size)):
        positions = valid_positions[start:start + batch_size]
        batch_output = classifier(
            [str(texts.iloc[pos]) for pos in positions],
            classes,
            hypothesis_template=hypothesis,
            multi_label=False,
            batch_size=batch_size
        )
        # NOTE(review): some transformers releases appear to return a bare dict
        # rather than a one-element list for a single sequence; normalise so
        # the loop below never iterates over dict keys.
        if isinstance(batch_output, dict):
            batch_output = [batch_output]

        for pos, item in zip(positions, batch_output):
            rows[pos] = {
                'label': item['labels'][0],
                'score': item['scores'][0],
                **{f'{label}_score': score for label, score in zip(item['labels'], item['scores'])}
            }

    # Reusing df's index makes a later pd.concat([df, results], axis=1) line up
    # row by row even when df does not carry a default RangeIndex.
    results = pd.DataFrame(rows, index=df.index)
    if 'label' not in results.columns:
        # Nothing was classified (empty or all-missing input): still expose the
        # summary columns so callers can use results['label'] safely.
        results = results.reindex(columns=['label', 'score'])
    return results


# Model specification and labeling - Left

In [None]:
# Zero-shot configuration for the "left" run: the candidate labels, the
# hypothesis sentence each label is slotted into, and the checkpoint to load.
classes = ["negative", "neutral", "positive"]
hypothesis_template = "The emotional valence of this text towards the political left is {}"
model_name = "mlburnham/Political_DEBATE_large_v1.0"

# Device index for the pipeline: 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline("zero-shot-classification", model=model_name, device=device)

In [None]:
# Load the translated left.csv texts and score them with the classifier above.
df = pd.read_csv("/content/left_translated.csv")
results = analyze_sentiments(
    df,
    classifier,
    classes,
    hypothesis_template,
)

# Column-wise concat puts each input row beside its predicted label and scores.
pd.concat([df, results], axis=1).to_csv('debate_sentiment_left.csv', index=False)

# Count of texts per predicted label.
print("\nSentiment Distribution:", results['label'].value_counts(), sep="\n")

# Model specification and labeling - Right

In [None]:
# Zero-shot configuration for the "right" run: the candidate labels, the
# hypothesis sentence each label is slotted into, and the checkpoint to load.
classes = ["negative", "neutral", "positive"]
hypothesis_template = "The emotional valence of this text towards the political right is {}"
model_name = "mlburnham/Political_DEBATE_large_v1.0"

# Device index for the pipeline: 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline("zero-shot-classification", model=model_name, device=device)

In [None]:
# Load the translated right.csv texts and score them with the classifier above.
df = pd.read_csv("/content/right_translated.csv")
results = analyze_sentiments(
    df,
    classifier,
    classes,
    hypothesis_template,
)

# Column-wise concat puts each input row beside its predicted label and scores.
pd.concat([df, results], axis=1).to_csv('debate_sentiment_right.csv', index=False)

# Count of texts per predicted label.
print("\nSentiment Distribution:", results['label'].value_counts(), sep="\n")