In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import numpy as np

In [2]:
def get_sentiment(text, model, tokenizer, config):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # ranking = np.argsort(scores)
    # ranking = ranking[::-1]
    # for i in range(scores.shape[0]):
    #     l = config.id2label[ranking[i]]
    #     s = scores[ranking[i]]
    #     print(f"{i+1}) {l} {np.round(float(s), 4)}")
    return scores[0] # value for positive sentiment
        
        


HYPOTHESIS TO TEST: Instead of coming up with "pro-israel" or "pro-palestine" words, use the article source as an original hypothesis of words that are more likely to appear in the article.

Could do this by finding the set of words in source-a, the set of words in source-b, and then finding words that only appear in source-a or source-b - the difference.

In [5]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
sentences_df = pd.read_csv('data/sentences_df.csv')

In [4]:
sentences_df

Unnamed: 0,original_article,source,sentence
0,0,alj,The UN agency for Palestinian refugees (UNRWA)...
1,0,alj,"In a statement on Saturday, the agency noted “..."
2,0,alj,UNRWA teams work tirelessly to reach families ...
3,0,alj,UNICEF spokesperson James Elder also described...
4,0,alj,“More aid workers have been killed in this war...
...,...,...,...
174353,6195,bbc,"This is a democracy,” he told reporters.The Is..."
174354,6195,bbc,There have been further signs of strain in the...
174355,6195,bbc,Supplies have been held back at the crossing p...
174356,6195,bbc,"It added that they had located weapons, struck..."


In [24]:
import pandas as pd
from collections import defaultdict

# Assuming get_sentiment is a pre-defined function
def get_sentiment(text):
    # Placeholder for actual sentiment analysis function
    return 0.7

def filter_sentences_by_keywords(sentence, keywords):
    """Check if any keyword is present in the sentence."""
    words = sentence.lower().split()
    return any(keyword in words for keyword in keywords)

def collect_article_sentiments(df, pro_x_words, pro_y_words):
    """Collect sentiment scores for each article based on the presence of pro_x or pro_y keywords."""
    article_sentiments = defaultdict(lambda: {'pro_x': [], 'pro_y': []})
    
    for _, row in df.iterrows():
        sentence = row['sentence']
        article_id = row['original_article']
        
        if filter_sentences_by_keywords(sentence, pro_x_words):
            sentiment = get_sentiment(sentence)
            article_sentiments[article_id]['pro_x'].append(sentiment)
        
        if filter_sentences_by_keywords(sentence, pro_y_words):
            sentiment = get_sentiment(sentence)
            article_sentiments[article_id]['pro_y'].append(sentiment)
    
    return article_sentiments

def calculate_article_averages(article_sentiments):
    """Calculate the average sentiment scores for each article."""
    article_scores = {}
    
    for article_id, sentiments in article_sentiments.items():
        pro_x_avg = sum(sentiments['pro_x']) / len(sentiments['pro_x']) if sentiments['pro_x'] else None
        pro_y_avg = sum(sentiments['pro_y']) / len(sentiments['pro_y']) if sentiments['pro_y'] else None
        article_scores[article_id] = {'pro_x_avg': pro_x_avg, 'pro_y_avg': pro_y_avg}
    
    return article_scores

def aggregate_source_scores(df, article_scores):
    """Aggregate article-level sentiment averages into source-level sentiment scores."""
    source_scores = defaultdict(lambda: {'pro_x': [], 'pro_y': []})
    
    for article_id, scores in article_scores.items():
        source = df[df['original_article'] == article_id].iloc[0]['source']
        if scores['pro_x_avg'] is not None:
            source_scores[source]['pro_x'].append(scores['pro_x_avg'])
        if scores['pro_y_avg'] is not None:
            source_scores[source]['pro_y'].append(scores['pro_y_avg'])
    
    return source_scores

def calculate_final_source_averages(source_scores):
    """Calculate the final average sentiment scores for each source."""
    final_source_scores = {}
    
    for source, scores in source_scores.items():
        pro_x_final = sum(scores['pro_x']) / len(scores['pro_x']) if scores['pro_x'] else None
        pro_y_final = sum(scores['pro_y']) / len(scores['pro_y']) if scores['pro_y'] else None
        final_source_scores[source] = {'pro_x_avg': pro_x_final, 'pro_y_avg': pro_y_final}
    
    return final_source_scores

def process_dataframe(df, pro_x_words, pro_y_words):
    """Process the dataframe to calculate sentiment scores at the article and source levels."""
    article_sentiments = collect_article_sentiments(df, pro_x_words, pro_y_words)
    article_scores = calculate_article_averages(article_sentiments)
    source_scores = aggregate_source_scores(df, article_scores)
    final_source_scores = calculate_final_source_averages(source_scores)
    
    return article_scores, final_source_scores


In [25]:
pro_x_words = ["israel", "terrorist", "terror"]
pro_y_words = ["palestine", "occupied", "militant"]

article_scores, final_source_scores = process_dataframe(sentences_df, pro_x_words, pro_y_words)




In [26]:
article_scores

{0: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 1: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 2: {'pro_x_avg': 0.7000000000000001, 'pro_y_avg': 0.6999999999999998},
 4: {'pro_x_avg': 0.6999999999999998, 'pro_y_avg': 0.7},
 5: {'pro_x_avg': 0.7, 'pro_y_avg': 0.7},
 6: {'pro_x_avg': 0.7, 'pro_y_avg': 0.7},
 7: {'pro_x_avg': 0.7000000000000001, 'pro_y_avg': None},
 8: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 9: {'pro_x_avg': 0.7, 'pro_y_avg': 0.7},
 10: {'pro_x_avg': None, 'pro_y_avg': 0.7},
 11: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 12: {'pro_x_avg': 0.7, 'pro_y_avg': 0.7},
 13: {'pro_x_avg': 0.7000000000000001, 'pro_y_avg': None},
 14: {'pro_x_avg': 0.7, 'pro_y_avg': 0.6999999999999998},
 15: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 17: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 18: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 19: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 20: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 21: {'pro_x_avg': 0.7, 'pro_y_avg': None},
 23: {'pro_x_avg': 0.7000000000000001, 'pro_y_