In [1]:
%pip install transformers
%pip install torch



In [2]:
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

data = pd.read_csv('PSE_ISR-filtered_data.csv') 

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True)
max_length = 256

# Change here: Set device to 'cpu'
sentiment_pipeline = pipeline(task="sentiment-analysis",
                              model=model,
                              tokenizer=tokenizer,
                              device=-1,  # Changed from 0 to -1 to use CPU
                              max_length=max_length)

In [5]:
threshold_strongly_pro_palestine = 0.7
threshold_pro_palestine = 0.6
threshold_neutral = 0.5
threshold_pro_israel = 0.4
threshold_strongly_pro_israel = 0.3

In [6]:
predicted_labels = []
batch_size = 100
total_records = len(data)
start_index = 0

while start_index < total_records:
    end_index = start_index + batch_size
    batch_data = data['self_text'][start_index:end_index]
    results = sentiment_pipeline([str(i) for i in batch_data])

    for result in results:
        score = result['score']
        if score >= threshold_strongly_pro_palestine:
            predicted_labels.append("strongly pro-Palestine")
        elif score >= threshold_pro_palestine:
            predicted_labels.append("pro-Palestine")
        elif score >= threshold_neutral:
            predicted_labels.append("neutral")
        elif score >= threshold_pro_israel:
            predicted_labels.append("pro-Israel")
        else:
            predicted_labels.append("strongly pro-Israel")

    start_index = end_index

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
data['predicted_label'] = predicted_labels

for txt, pred_label, rslt in zip(data['self_text'], predicted_labels, results):
    print(f'Text: "{txt}"\nPredicted Label: {pred_label}\nScore: {rslt["score"]}\n{"-" * 40}')

data.to_csv('predicted-dateset.csv', index=False)

Text: "care belgium crack violent settler israel honest question hope answer"
Predicted Label: strongly pro-Palestine
Score: 0.5105515122413635
----------------------------------------
Text: "talking palestinian killed year"
Predicted Label: strongly pro-Palestine
Score: 0.5708707571029663
----------------------------------------
Text: "totally fascist regime nazi shit germany"
Predicted Label: strongly pro-Palestine
Score: 0.9894216656684875
----------------------------------------
Text: "attack mention responsible caught killed howard happened guess settler blood lust israeli government support refuse hold accountable free innocent people"
Predicted Label: neutral
Score: 0.7417247295379639
----------------------------------------
Text: "palestine water country outsized power stage rely national gas supply economy"
Predicted Label: pro-Palestine
Score: 0.5722042322158813
----------------------------------------
Text: "hegemony"
Predicted Label: strongly pro-Palestine
Score: 0.93044960