**NEWS BIAS DETECTOR**

In [62]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

**Load Data**

In [63]:
# Fetch the CSV once and cache it next to the notebook so that
# "Restart & Run All" does not hit the network on every run.
filename = "allsides_balanced_news_headlines-texts.csv"
url = 'https://raw.githubusercontent.com/irgroup/Qbias/refs/heads/main/allsides_balanced_news_headlines-texts.csv'

if not Path(filename).exists():
    # timeout: never hang the kernel indefinitely on a stalled connection.
    res = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as the CSV.
    res.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(res.content)

df = pd.read_csv(filename)
print("Shape:", df.shape)
print(df.head(10))
print(f"Columns: {df.columns}")

Shape: (21754, 7)
   Unnamed: 0                                              title  \
0           0           Gun Violence Over Fourth of July Weekend   
1           1           Gun Violence Over Fourth of July Weekend   
2           2           Gun Violence Over Fourth of July Weekend   
3           3  Yellen Warns Congress of 'Economic Recession' ...   
4           4  Yellen Warns Congress of 'Economic Recession' ...   
5           5  Yellen Warns Congress of 'Economic Recession' ...   
6           6                       Night 2: Christie on Hillary   
7           7                       Night 2: Christie on Hillary   
8           8                       Night 2: Christie on Hillary   
9           9  Denying Abortion Medication Could Violate Civi...   

                                                tags  \
0  ['Protests', 'Fourth Of July', 'Gun Control An...   
1  ['Protests', 'Fourth Of July', 'Gun Control An...   
2  ['Protests', 'Fourth Of July', 'Gun Control An...   
3  ['Jane

**Drop 'Unnamed: 0' column and reset index**

In [64]:
# Drop the leftover CSV index column and renumber the rows.
# errors="ignore" keeps this cell re-runnable: once the column is gone a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore").reset_index(drop=True)
print(df.columns)

Index(['title', 'tags', 'heading', 'source', 'text', 'bias_rating'], dtype='object')


In [65]:
# Show the first five rows of the current frame.
print(df.head(n=5))

                                               title  \
0           Gun Violence Over Fourth of July Weekend   
1           Gun Violence Over Fourth of July Weekend   
2           Gun Violence Over Fourth of July Weekend   
3  Yellen Warns Congress of 'Economic Recession' ...   
4  Yellen Warns Congress of 'Economic Recession' ...   

                                                tags  \
0  ['Protests', 'Fourth Of July', 'Gun Control An...   
1  ['Protests', 'Fourth Of July', 'Gun Control An...   
2  ['Protests', 'Fourth Of July', 'Gun Control An...   
3  ['Janet Yellen', 'Debt Ceiling', 'Economic Pol...   
4  ['Janet Yellen', 'Debt Ceiling', 'Economic Pol...   

                                             heading                 source  \
0  Chicago Gun Violence Spikes and Increasingly F...  New York Times (News)   
1  ‘Bullets just came from nowhere’: Fourth of Ju...        Chicago Tribune   
2  Dozens of shootings across US mark bloody July...   New York Post (News)   
3  Federal

**Data Exploration**

In [66]:
# Partition the articles by bias label and report each partition's (rows, columns).
bias_labels = df["bias_rating"]
left_df = df.loc[bias_labels.eq("left")]
right_df = df.loc[bias_labels.eq("right")]
center_df = df.loc[bias_labels.eq("center")]

print(f"Left: {left_df.shape}")
print(f"Right: {right_df.shape}")
print(f"Center: {center_df.shape}")

Left: (10275, 6)
Right: (7226, 6)
Center: (4253, 6)


In [67]:
# Total whitespace-separated word count of the article text in each bias bucket.
def _total_words(texts):
    """Sum the number of whitespace-delimited words over a Series of texts.

    Missing texts are replaced by the empty string and contribute zero words.
    """
    per_article = texts.fillna("").str.split().map(len)
    return int(per_article.sum())


left_word_count = _total_words(left_df["text"])
right_word_count = _total_words(right_df["text"])
center_word_count = _total_words(center_df["text"])

print("Left Leaning Articles Word Count:", left_word_count)
print("Right Leaning Articles Word Count:", right_word_count)
print("Center Leaning Articles Word Count:", center_word_count)

Left Leaning Articles Word Count: 658761
Right Leaning Articles Word Count: 481681
Center Leaning Articles Word Count: 301266


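There is a discrepancy in the total word count of the articles labeled left, right, and center. Most of it reflects the class imbalance (10,275 left vs. 7,226 right vs. 4,253 center articles): the average length is similar across classes, at roughly 64, 67, and 71 words per article respectively.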

In [68]:
# Explore tag counts across articles
def count_tags(raw_tags):
    """Return the number of tags encoded in a single ``tags`` cell.

    The column stores each tag list as the text of a Python list literal
    (e.g. "['Protests', 'Fourth Of July']"), so the cell is parsed rather than
    split on commas: a comma inside a tag is not double counted, and a missing
    or empty cell counts as zero tags instead of one.
    """
    if not isinstance(raw_tags, str):
        return 0  # NaN / missing value
    try:
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid literal: fall back to a comma split, ignoring empty pieces.
        pieces = raw_tags.strip("[]").split(",")
        return sum(1 for piece in pieces if piece.strip())
    if isinstance(parsed, (list, tuple, set)):
        return len(parsed)
    return 1  # a single scalar literal is treated as one tag


left_tags_count = int(left_df["tags"].apply(count_tags).sum())
right_tags_count = int(right_df["tags"].apply(count_tags).sum())
center_tags_count = int(center_df["tags"].apply(count_tags).sum())

print("Left Leaning Articles Tags Count:", left_tags_count)
print("Right Leaning Articles Tags Count:", right_tags_count)
print("Center Leaning Articles Tags Count:", center_tags_count)

Left Leaning Articles Tags Count: 35898
Right Leaning Articles Tags Count: 27676
Center Leaning Articles Tags Count: 19801


Tag counts are closer across the three classes than the total word counts.

**Data Preparation**

In [69]:
# Lowercase every string cell in the frame; non-string values pass through unchanged.
def _lowercase_if_str(value):
    """Return ``value.lower()`` for strings and ``value`` itself for anything else."""
    if isinstance(value, str):
        return value.lower()
    return value


df = df.map(_lowercase_if_str)
print(df.head())

                                               title  \
0           gun violence over fourth of july weekend   
1           gun violence over fourth of july weekend   
2           gun violence over fourth of july weekend   
3  yellen warns congress of 'economic recession' ...   
4  yellen warns congress of 'economic recession' ...   

                                                tags  \
0  ['protests', 'fourth of july', 'gun control an...   
1  ['protests', 'fourth of july', 'gun control an...   
2  ['protests', 'fourth of july', 'gun control an...   
3  ['janet yellen', 'debt ceiling', 'economic pol...   
4  ['janet yellen', 'debt ceiling', 'economic pol...   

                                             heading                 source  \
0  chicago gun violence spikes and increasingly f...  new york times (news)   
1  ‘bullets just came from nowhere’: fourth of ju...        chicago tribune   
2  dozens of shootings across us mark bloody july...   new york post (news)   
3  federal

In [None]:
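# NOTE: superseded by the df.sample + stratified train_test_split cells below; kept for reference only.
# Shuffle df and split into X and Y (manual index-based 70/10/20 train/validation/test split)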
# all_indices = np.arange(df.shape[0])
# np.random.shuffle(all_indices)

# test_size = int(0.2 * df.shape[0])
# validation_size = int(0.1 * df.shape[0])

# test_indices = all_indices[: test_size]
# validation_indices = all_indices[test_size: test_size + validation_size]
# train_indices = all_indices[test_size + validation_size: ]

# X_train = df.iloc[train_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)
# X_validation = df.iloc[validation_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)
# X_test = df.iloc[test_indices].drop(columns = ["bias_rating"]).reset_index(drop = True)

# y_train = df.iloc[train_indices]["bias_rating"].reset_index(drop = True)
# y_validation = df.iloc[validation_indices]["bias_rating"].reset_index(drop = True)
# y_test = df.iloc[test_indices]["bias_rating"].reset_index(drop = True)

# print(f"X_train shape: {X_train.shape}")
# print(f"X_validation shape: {X_validation.shape}")
# print(f"X_test shape: {X_test.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"y_validation shape: {y_validation.shape}")
# print(f"y_test shape: {y_test.shape}")

# print(X_train.head())
# print(y_train.head())

In [70]:
# Shuffle df and split into X and Y
# A fixed random_state makes the shuffle reproducible. Without it the row
# order changes on every run, so the seeded train_test_split in the next cell
# would still assign different articles to each subset.
df = df.sample(frac = 1, random_state = 42)
# Features: the three free-text columns. Target: the bias label.
X = df[['title', 'heading', 'text']]
y = df["bias_rating"]
print(X.head())
print(y.head())

                                                   title  \
14492   senate acquits trump in second impeachment trial   
16612  perspectives: the supreme court and bipartisan...   
14058                 changes to the democratic platform   
8282                 media coverage of jobs report tweet   
17498                               improving the budget   

                                                 heading  \
14492  senate acquits trump in impeachment trial — again   
16612                the supreme court’s surprising term   
14058  clinton reaffirms support for public option in...   
8282   kudlow on trump jobs tweet: 'i don't think he ...   
17498  bipartisan house majority defeats senate's unb...   

                                                    text  
14492  the u.s. senate on saturday acquitted former p...  
16612  the supreme court term that began last fall ha...  
14058  hillary clinton reaffirmed her support on satu...  
8282   president donald trump was briefed 

In [75]:
# Stratified 70/15/15 split: every subset keeps the label proportions of y.
from sklearn.model_selection import train_test_split

# Step 1: carve off a 30% hold-out, stratified on the bias label.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Step 2: halve the hold-out into test and validation sets.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"y_valid shape: {y_valid.shape}")

print(X_train.head(2))
print(y_train.head(2))

X_train shape: (15227, 3)
X_test shape: (3263, 3)
X_valid shape: (3264, 3)
y_train shape: (15227,)
y_test shape: (3263,)
y_valid shape: (3264,)
                         title  \
15083        texas voting laws   
14395  vp debate tuesday night   

                                                 heading  \
15083  holder now going after texas voting law in rac...   
14395  bidenã¢â‚¬â„¢s debate aim: reclaim edge after ...   

                                                    text  
15083  there is one thing to be said about attorney g...  
14395  with president obama looking to an unpredictab...  
15083     left
14395    right
Name: bias_rating, dtype: object


Resetting indices

In [76]:
# Give every split a fresh 0..n-1 index, discarding the shuffled row labels.
X_train, X_test, X_valid, y_train, y_test, y_valid = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, X_valid, y_train, y_test, y_valid)
)

print(X_train.head(2))
print(y_train.head(2))

                     title                                            heading  \
0        texas voting laws  holder now going after texas voting law in rac...   
1  vp debate tuesday night  bidenã¢â‚¬â„¢s debate aim: reclaim edge after ...   

                                                text  
0  there is one thing to be said about attorney g...  
1  with president obama looking to an unpredictab...  
0     left
1    right
Name: bias_rating, dtype: object


**Setting up for MS2**

In [None]:
import tensorflow as tf
import huggingface_hub
from transformers import AutoTokenizer

In [None]:
# Name of the pretrained checkpoint to load. Note that `model` holds this
# name as a string, not a model object.
model = "bert-base-uncased"
# Load the tokenizer associated with that checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Sanity check: tokenize the text of the training row labelled 0 and map the
# resulting tokens to their vocabulary ids.
sample_text = X_train.loc[0, "text"]
test_tokens = tokenizer.tokenize(sample_text)
test_token_ids = tokenizer.convert_tokens_to_ids(test_tokens)
print(test_tokens)
print(test_token_ids)

['as', 'ya', '##sm', '##in', 'miller', 'drove', 'home', 'from', 'a', 'lau', '##nd', '##rom', '##at', 'in', 'chicago', '’', 's', 'eng', '##lewood', 'neighborhood', 'last', 'weekend', ',', 'a', 'gun', '##man', 'in', 'another', 'car', 'pepper', '##ed', 'her', 'red', 'hyundai', 'sedan', 'with', 'bullets', ',', 'grazing', 'her', 'head', 'and', 'striking', 'her', 'son', ',', 'sincere', 'gaston', ',', 'in', 'the', 'chest', '.', 'sincere', 'died', 'in', 'his', 'car', 'seat', '.', 'he', 'was', '20', 'months', 'old', '.', 'on', 'june', '20', ',', 'a', 'man', 'fired', 'gunshot', '##s', 'through', 'the', 'back', 'of', 'a', 'dark', 'blue', 'suv', ',', 'wounding', 'the', '27', '-', 'year', '-', 'old', 'man', 'driving', 'and', 'hitting', 'his', 'steps', '##on', ',', 'me', '##kh', '##i', 'james', ',', 'in', 'the', 'back', ',', 'killing', 'him', '.', 'me', '##kh', '##i', 'was', 'three', '.', 'two', 'girls', ',', 'both', 'aged', 'three', ',', 'were', 'hospitalized', '.', '.', '.']
[2004, 8038, 6491, 237

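Next, we will tokenize every text column (`title`, `heading`, `text`) in the training, validation, and test sets. **TODO:** decide whether the `source` column should be used at all — it is already excluded from `X`, which keeps only `title`, `heading`, and `text`.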