In [5]:
import pandas as pd
import spacy

In [6]:
all_articles = pd.read_csv('./scraped_folders/Cleaned_Articles_manually_labeled.csv')
all_articles = all_articles.drop(all_articles.columns[[6,7, 8, 9, 10]], axis=1)
all_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10525 entries, 0 to 10524
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          10525 non-null  object
 1   News Publisher   10525 non-null  object
 2   Country Stance   10525 non-null  object
 3   Headline         10525 non-null  object
 4   Date             10525 non-null  object
 5   Headline Stance  700 non-null    object
dtypes: object(6)
memory usage: 493.5+ KB


In [7]:
from dateutil import parser
import pandas as pd
import re

def parse_and_format_date(date_str):
    if pd.isna(date_str):
        return None
    if isinstance(date_str, pd.Timestamp):
        return date_str.date().strftime('%Y-%m-%d')
    if isinstance(date_str, str):
        try:
            # Attempt to parse with dateutil
            parsed_date = parser.parse(date_str, fuzzy_with_tokens=True)
            # parsed_date is a tuple where the first element is the datetime object
            parsed_datetime = parsed_date[0]
            return parsed_datetime.date().strftime('%Y-%m-%d')
        except ValueError:
            return None
    return date_str

all_articles['parsed_dates'] = all_articles['Date'].apply(parse_and_format_date)

print(all_articles)


            Country News Publisher Country Stance  \
0      South Africa           eNCA  Pro Palestine   
1      South Africa           eNCA  Pro Palestine   
2      South Africa           eNCA  Pro Palestine   
3      South Africa           eNCA  Pro Palestine   
4      South Africa           eNCA  Pro Palestine   
...             ...            ...            ...   
10520            US       Fox News     Pro Israel   
10521            US       Fox News     Pro Israel   
10522            US       Fox News     Pro Israel   
10523            US       Fox News     Pro Israel   
10524            US       Fox News     Pro Israel   

                                                Headline  \
0            Biden announces emergency port for Gaza aid   
1      Israel strikes Gaza's Rafah as truce talks und...   
2      Gaza hospitals out of fuel, caught in Israel-H...   
3      Gazans bury their dead in orchards and footbal...   
4      'Exhausted' Gazans desperate for war to end as...   
...

In [8]:
def clean_headline(headline):
    parts = headline.split("Published", 1)
    return parts[0].strip()

all_articles['Headline'] = all_articles['Headline'].apply(clean_headline)
all_articles.head()

Unnamed: 0,Country,News Publisher,Country Stance,Headline,Date,Headline Stance,parsed_dates
0,South Africa,eNCA,Pro Palestine,Biden announces emergency port for Gaza aid,Thursday 07 March 2024 - 21:00pm,Neutral,2024-03-07
1,South Africa,eNCA,Pro Palestine,Israel strikes Gaza's Rafah as truce talks und...,Thursday 22 February 2024 - 13:00pm,Pro-Israel,2024-02-22
2,South Africa,eNCA,Pro Palestine,"Gaza hospitals out of fuel, caught in Israel-H...",Monday 13 November 2023 - 05:51am,Pro-Palestine,2023-11-13
3,South Africa,eNCA,Pro Palestine,Gazans bury their dead in orchards and footbal...,Thursday 09 November 2023 - 15:55pm,Pro-Palestine,2023-11-09
4,South Africa,eNCA,Pro Palestine,'Exhausted' Gazans desperate for war to end as...,Saturday 30 December 2023 - 05:50am,Pro-Palestine,2023-12-30


In [9]:
all_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10525 entries, 0 to 10524
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          10525 non-null  object
 1   News Publisher   10525 non-null  object
 2   Country Stance   10525 non-null  object
 3   Headline         10525 non-null  object
 4   Date             10525 non-null  object
 5   Headline Stance  700 non-null    object
 6   parsed_dates     8261 non-null   object
dtypes: object(7)
memory usage: 575.7+ KB


In [11]:
all_articles.to_csv('./scraped_folders/all_articles_cleaned_with_python.csv')

In [12]:
labeled_data = all_articles.loc[all_articles['Headline Stance'].isin(['Pro-Israel', 'Neutral','Pro-Palestine'])]
labeled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 633 entries, 0 to 10524
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Country          633 non-null    object
 1   News Publisher   633 non-null    object
 2   Country Stance   633 non-null    object
 3   Headline         633 non-null    object
 4   Date             633 non-null    object
 5   Headline Stance  633 non-null    object
 6   parsed_dates     539 non-null    object
dtypes: object(7)
memory usage: 39.6+ KB


In [13]:
labeled_data.groupby('Headline Stance').count()

Unnamed: 0_level_0,Country,News Publisher,Country Stance,Headline,Date,parsed_dates
Headline Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Neutral,221,221,221,221,221,200
Pro-Israel,128,128,128,128,128,113
Pro-Palestine,284,284,284,284,284,226


In [17]:
#SVM
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# Split the data into training and test sets
train_data, test_data = train_test_split(labeled_data, test_size=0.30, random_state=42, stratify=labeled_data['Headline Stance'])

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform headlines to TF-IDF features
X_train = vectorizer.fit_transform(train_data['Headline'])
X_test = vectorizer.transform(test_data['Headline'])

# Target labels
y_train = train_data['Headline Stance']
y_test = test_data['Headline Stance']

# Initialize and train the SVM classifier
svm = SVC(kernel='linear',class_weight='balanced')
svm.fit(X_train, y_train)

# Predict on test data
y_pred = svm.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")


Classification Report:
               precision    recall  f1-score   support

      Neutral       0.66      0.58      0.61        66
   Pro-Israel       0.57      0.64      0.60        39
Pro-Palestine       0.74      0.76      0.75        85

     accuracy                           0.67       190
    macro avg       0.65      0.66      0.66       190
 weighted avg       0.67      0.67      0.67       190

Accuracy: 0.67


In [None]:
label_mapping = {'Neutral': 0, 'Pro-Israel': 1, 'Pro-Palestine': 2}
labeled_data['label'] = labeled_data['Headline Stance'].map(label_mapping)

# Split the data into training and test sets
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42, stratify=labeled_data['label'])


In [None]:
test_data['label'].unique()

In [10]:
#XLNet
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset


# Ensure the labels are consistent
label_mapping = {'Neutral': 0, 'Pro-Israel': 1, 'Pro-Palestine': 2}
labeled_data['label'] = labeled_data['Headline Stance'].map(label_mapping)

# Split the data into training and test sets
train_data, test_data = train_test_split(labeled_data, test_size=0.2, random_state=42, stratify=labeled_data['label'])

# Initialize the tokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['Headline'], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

print(train_data['label'].dtype, " ",test_data['label'].dtype)
# Load the model
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Predict on test data
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=1)

# Evaluate and print the classification report
print("Classification Report:")
print(classification_report(test_data['label'], pred_labels))
print(f"Accuracy: {accuracy_score(test_data['label'], pred_labels):.2f}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['label'] = labeled_data['Headline Stance'].map(label_mapping)


Map:   0%|          | 0/506 [00:00<?, ? examples/s]

Map:   0%|          | 0/127 [00:00<?, ? examples/s]

int64   int64


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/320 [00:00<?, ?it/s]

{'loss': 1.1102, 'learning_rate': 1.9375e-05, 'epoch': 0.16}
{'loss': 1.1711, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.31}
{'loss': 1.0813, 'learning_rate': 1.8125e-05, 'epoch': 0.47}
{'loss': 1.08, 'learning_rate': 1.7500000000000002e-05, 'epoch': 0.62}
{'loss': 1.0974, 'learning_rate': 1.6875e-05, 'epoch': 0.78}
{'loss': 1.0171, 'learning_rate': 1.6250000000000002e-05, 'epoch': 0.94}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.036670207977295, 'eval_runtime': 3.6868, 'eval_samples_per_second': 34.447, 'eval_steps_per_second': 4.34, 'epoch': 1.0}
{'loss': 1.1111, 'learning_rate': 1.5625e-05, 'epoch': 1.09}
{'loss': 1.0572, 'learning_rate': 1.5000000000000002e-05, 'epoch': 1.25}
{'loss': 1.106, 'learning_rate': 1.4375e-05, 'epoch': 1.41}
{'loss': 1.0831, 'learning_rate': 1.375e-05, 'epoch': 1.56}
{'loss': 1.0463, 'learning_rate': 1.3125e-05, 'epoch': 1.72}
{'loss': 0.9777, 'learning_rate': 1.25e-05, 'epoch': 1.88}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.0552150011062622, 'eval_runtime': 3.6099, 'eval_samples_per_second': 35.181, 'eval_steps_per_second': 4.432, 'epoch': 2.0}
{'loss': 0.9976, 'learning_rate': 1.1875e-05, 'epoch': 2.03}
{'loss': 1.0235, 'learning_rate': 1.125e-05, 'epoch': 2.19}
{'loss': 0.9647, 'learning_rate': 1.0625e-05, 'epoch': 2.34}
{'loss': 1.0337, 'learning_rate': 1e-05, 'epoch': 2.5}
{'loss': 0.9756, 'learning_rate': 9.375000000000001e-06, 'epoch': 2.66}
{'loss': 0.9724, 'learning_rate': 8.750000000000001e-06, 'epoch': 2.81}
{'loss': 0.9356, 'learning_rate': 8.125000000000001e-06, 'epoch': 2.97}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.9715608954429626, 'eval_runtime': 11.7322, 'eval_samples_per_second': 10.825, 'eval_steps_per_second': 1.364, 'epoch': 3.0}
{'loss': 0.905, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.12}
{'loss': 0.8745, 'learning_rate': 6.875e-06, 'epoch': 3.28}
{'loss': 0.8394, 'learning_rate': 6.25e-06, 'epoch': 3.44}
{'loss': 0.7859, 'learning_rate': 5.625e-06, 'epoch': 3.59}
{'loss': 0.8276, 'learning_rate': 5e-06, 'epoch': 3.75}
{'loss': 0.7996, 'learning_rate': 4.3750000000000005e-06, 'epoch': 3.91}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.9467660784721375, 'eval_runtime': 6.2842, 'eval_samples_per_second': 20.209, 'eval_steps_per_second': 2.546, 'epoch': 4.0}
{'loss': 0.8421, 'learning_rate': 3.7500000000000005e-06, 'epoch': 4.06}
{'loss': 0.6652, 'learning_rate': 3.125e-06, 'epoch': 4.22}
{'loss': 0.695, 'learning_rate': 2.5e-06, 'epoch': 4.38}
{'loss': 0.7793, 'learning_rate': 1.8750000000000003e-06, 'epoch': 4.53}
{'loss': 0.622, 'learning_rate': 1.25e-06, 'epoch': 4.69}
{'loss': 0.6139, 'learning_rate': 6.25e-07, 'epoch': 4.84}
{'loss': 0.7269, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 0.9287413358688354, 'eval_runtime': 5.7821, 'eval_samples_per_second': 21.964, 'eval_steps_per_second': 2.767, 'epoch': 5.0}
{'train_runtime': 437.2754, 'train_samples_per_second': 5.786, 'train_steps_per_second': 0.732, 'train_loss': 0.931811572611332, 'epoch': 5.0}


  0%|          | 0/16 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.9287413358688354, 'eval_runtime': 4.3847, 'eval_samples_per_second': 28.964, 'eval_steps_per_second': 3.649, 'epoch': 5.0}


  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.50      0.57        44
           1       0.47      0.31      0.37        26
           2       0.56      0.75      0.64        57

    accuracy                           0.57       127
   macro avg       0.57      0.52      0.53       127
weighted avg       0.58      0.57      0.56       127

Accuracy: 0.57


In [15]:
#VADER with logistic regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to apply VADER sentiment analysis and return compound score
def analyze_sentiment(text):
    return analyzer.polarity_scores(text)['compound']

# Apply sentiment analysis to each headline
label_mapping = {'Neutral': 0, 'Pro-Israel': 1, 'Pro-Palestine': 2}
labeled_data['label'] = labeled_data['Headline Stance'].map(label_mapping)

labeled_data['compound_score'] = labeled_data['Headline'].apply(analyze_sentiment)

# Split the data into training and test sets
train_data, test_data = train_test_split(labeled_data, test_size=0.20, random_state=42, stratify=labeled_data['label'])

# Define features and labels
X_train = train_data[['compound_score']]
y_train = train_data['label']
X_test = test_data[['compound_score']]
y_test = test_data['label']

# Initialize and train logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42,class_weight='balanced')
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.45      0.40        44
           1       0.19      0.50      0.28        26
           2       0.33      0.02      0.03        57

    accuracy                           0.27       127
   macro avg       0.29      0.32      0.24       127
weighted avg       0.31      0.27      0.21       127

Accuracy: 0.27


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shivam/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['label'] = labeled_data['Headline Stance'].map(label_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['compound_score'] = labeled_data['Headline'].apply(analyze_sentiment)


In [20]:
#Naive Bayes
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Example dataset with labeled headlines


# Split the data into training and test sets
train_data, test_data = train_test_split(labeled_data, test_size=0.25, random_state=42, stratify=labeled_data['Headline Stance'])

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Transform headlines to TF-IDF features
X_train = vectorizer.fit_transform(train_data['Headline'])
X_test = vectorizer.transform(test_data['Headline'])

# Target labels
y_train = train_data['Headline Stance']
y_test = test_data['Headline Stance']
class_counts = y_train.value_counts()
class_weights = {cls: np.sum(class_counts) / (len(class_counts) * count) for cls, count in class_counts.items()}
# Initialize and train the Naive Bayes classifier (MultinomialNB)
nb = MultinomialNB(class_prior=list(class_weights.values()))
nb.fit(X_train, y_train)

# Predict on test data
y_pred = nb.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")


Classification Report:
               precision    recall  f1-score   support

      Neutral       1.00      0.04      0.07        56
   Pro-Israel       0.45      0.16      0.23        32
Pro-Palestine       0.46      0.94      0.62        71

     accuracy                           0.47       159
    macro avg       0.64      0.38      0.31       159
 weighted avg       0.65      0.47      0.35       159

Accuracy: 0.47


In [18]:
svm_label = all_articles[['Country','News Publisher','Country Stance','Headline','parsed_dates']]

# Transform new headlines to TF-IDF features
X_new = vectorizer.transform(svm_label['Headline'])

# Predict using the SVM classifier
y_pred_new = svm.predict(X_new)
svm_label['Headline Stance'] = y_pred_new
svm_label.to_csv('./scraped_folders/all_articles_svm_labeled.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  svm_label['Headline Stance'] = y_pred_new


In [19]:
stance_counts = svm_label.groupby(['Country','News Publisher', 'Headline Stance']).size().reset_index(name='Count')
print(stance_counts)

         Country News Publisher Headline Stance  Count
0        Ireland  Breaking News         Neutral    311
1        Ireland  Breaking News      Pro-Israel    225
2        Ireland  Breaking News   Pro-Palestine    471
3        Ireland    The Journal         Neutral    129
4        Ireland    The Journal      Pro-Israel     87
5        Ireland    The Journal   Pro-Palestine    230
6   South Africa           SABC         Neutral    208
7   South Africa           SABC      Pro-Israel     63
8   South Africa           SABC   Pro-Palestine    265
9   South Africa           eNCA         Neutral    118
10  South Africa           eNCA      Pro-Israel     67
11  South Africa           eNCA   Pro-Palestine    202
12            UK            BBC         Neutral    200
13            UK            BBC      Pro-Israel    153
14            UK            BBC   Pro-Palestine    296
15            UK   The Guardian         Neutral    766
16            UK   The Guardian      Pro-Israel    398
17        