**Imports**

In [1]:
import re

import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay
)

**Load and preprocess data**

In [2]:
# Read only the article text and its label, then discard rows missing either one
df = (
    pd.read_parquet(
        '/content/drive/MyDrive/NLP News Bias Data/data_newsbias_cleaned.parquet',
        columns=['text', 'bias_label'],
    )
    .dropna(subset=['text', 'bias_label'])
    .copy()
)

# Lowercase, remove punctuation, remove stopwords
def clean_for_model(text, stop_words=None):
    """Normalise one article for bag-of-words models.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the remaining whitespace-separated tokens
    are filtered against a stop-word collection.

    Args:
        text: The raw article text (a ``str``).
        stop_words: Optional collection of lowercase tokens to drop. When
            omitted, scikit-learn's ``ENGLISH_STOP_WORDS`` is used, which is
            the original behaviour.

    Returns:
        The surviving tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    # Punctuation is deleted rather than replaced by a space, so "don't"
    # becomes the single token "dont".
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(tok for tok in stripped.split() if tok not in stop_words)

# Derive the model-ready text column from the raw article text
df['text_clean'] = df['text'].astype(str).apply(clean_for_model)

# Retain only the cleaned text and its label
df_cleaned = df.loc[:, ['text_clean', 'bias_label']]

# Persist as a snappy-compressed Parquet file
df_cleaned.to_parquet(
    '/content/drive/MyDrive/NLP News Bias Data/df_cleaned_model_ready.parquet',
    compression='snappy',
)

print(f"Cleaned data saved: {df_cleaned.shape[0]} rows")


KeyboardInterrupt: 

**Baselines: Apply Naive Bayes and Logisitc Regression with BoW and TF-IDF**

In [None]:
# Load cleaned dataset
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/df_cleaned_model_ready.parquet')

# Randomly sample 50k rows per class (balanced total = 150k for three classes).
# Each group is sampled with the same seed, exactly as before, but the groups
# are iterated explicitly instead of going through groupby(...).apply(...):
# apply() operating on the grouping column is deprecated in pandas 2.2 and
# slated for removal, after which `bias_label` would be missing from the result.
# sample() raises ValueError if a class has fewer than 50,000 rows.
df = pd.concat(
    [group.sample(50000, random_state=42) for _, group in df.groupby('bias_label')],
    ignore_index=True,
)

# Features and labels
X = df['text_clean']
y = df['bias_label']

# Train/test split: 20% held out, stratified so label proportions match in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Vectorizers (vocabulary capped at 20,000 terms each)
# NOTE(review): stop_words='english' looks redundant if text_clean was produced
# by clean_for_model, which already drops ENGLISH_STOP_WORDS — confirm.
vectorizers = {
    'BoW': CountVectorizer(stop_words='english', max_features=20000),
    'TF-IDF': TfidfVectorizer(stop_words='english', max_features=20000)
}

# Models
# The saga solver shuffles the data, so a fixed random_state is required for
# the reported scores to be reproducible between runs.
models = {
    'LogisticRegression': LogisticRegression(max_iter=500, solver='saga', random_state=42),
    'NaiveBayes': MultinomialNB()
}

# Run all combinations: each vectorizer is fitted once on the training split,
# then every model is fitted on the resulting matrix. The same estimator objects
# are reused, with fit() called again for each vectorizer.
for vec_name, vectorizer in vectorizers.items():
    print(f"\n🔹 Vectorizing with {vec_name}...")
    X_train_vec = vectorizer.fit_transform(X_train)
    # transform only: the test split must not influence the vocabulary
    X_test_vec = vectorizer.transform(X_test)

    for model_name, model in models.items():
        print(f"\n{model_name} + {vec_name}")
        model.fit(X_train_vec, y_train)
        preds = model.predict(X_test_vec)
        acc = accuracy_score(y_test, preds)

        print(f"Accuracy: {acc:.4f}")
        print(classification_report(y_test, preds))

        # Confusion matrix, with rows/columns in the fitted model's class order
        cm = confusion_matrix(y_test, preds, labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
        disp.plot(cmap='Blues', values_format='d')
        plt.title(f"{model_name} + {vec_name} Confusion Matrix")
        plt.show()

**Distribution of Sources of misclassified articles (LR with Bag of Words)**

In [None]:
# Load the sampled + cleaned dataset (written by the sampling/cleaning cell
# that saves df_sampled_cleaned.parquet)
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/df_sampled_cleaned.parquet')

# Features and labels
X = df['text_clean']
y = df['bias_label']

# Split the dataset: 20% held out, stratified by label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Vectorize with Bag of Words (vocabulary learned from the training split only)
vectorizer = CountVectorizer(stop_words='english', max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression
# saga shuffles the data; random_state is fixed so that re-training elsewhere
# with the same settings reproduces the same predictions.
lr_model = LogisticRegression(max_iter=500, solver='saga', random_state=42)
lr_model.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = lr_model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Attach predictions to test data
# y_test.index holds index *labels* of df, so the rows are selected with .loc.
# .iloc would treat the labels as positions, which is only correct while df
# happens to carry a default 0..n-1 index.
df_test = df.loc[y_test.index].copy()
# y_pred is a positional array in the same row order as y_test
df_test['predicted'] = y_pred

# Identify misclassified samples
df_misclassified = df_test[df_test['bias_label'] != df_test['predicted']].copy()

# Load spaCy for POS tagging
nlp = spacy.load("en_core_web_sm")
# Registers Series.progress_apply so the tagging pass shows a progress bar
tqdm.pandas()

# Count NOUN and ADJ
def count_pos(text, pipeline=None):
    """Count the NOUN and ADJ tokens in one text.

    Args:
        text: Text to tag; it is converted with ``str()`` first.
        pipeline: Optional callable that turns a string into an iterable of
            tokens exposing a ``pos_`` attribute. Defaults to the
            notebook-level ``nlp`` pipeline, which is the original behaviour.

    Returns:
        A two-element ``pd.Series``: the noun count at position 0 and the
        adjective count at position 1.
    """
    if pipeline is None:
        pipeline = nlp
    noun_count = 0
    adj_count = 0
    # Single pass over the tagged tokens instead of iterating the doc twice
    for token in pipeline(str(text)):
        pos = token.pos_
        if pos == "NOUN":
            noun_count += 1
        elif pos == "ADJ":
            adj_count += 1
    return pd.Series([noun_count, adj_count])

# Tag every misclassified article; count_pos returns a two-element Series per row,
# so the result is a two-column frame assigned to the new columns in order.
df_misclassified[['noun_count', 'adj_count']] = df_misclassified['text_clean'].progress_apply(count_pos)

# Add text length (number of whitespace-separated tokens in text_clean)
df_misclassified['text_length'] = df_misclassified['text_clean'].apply(lambda x: len(str(x).split()))

# Analysis
# Requires a `source` column in the loaded frame.
print("\nTop 10 misclassified sources:")
print(df_misclassified['source'].value_counts().head(10))

# Means over all misclassified articles, regardless of label
print("\nAverage noun count:", df_misclassified['noun_count'].mean())
print("Average adjective count:", df_misclassified['adj_count'].mean())
print("Average article length (in words):", df_misclassified['text_length'].mean())


**Distributions of nouns, adjectives and text lengths across misclassified articles grouped by bias label**

In [None]:
# Mean noun count, adjective count and word count of the misclassified
# articles, one row per true bias label
grouped_by_label = (
    df_misclassified
    .groupby('bias_label')[['noun_count', 'adj_count', 'text_length']]
    .mean()
)

# Show the per-label table
print(grouped_by_label)


📊 Average POS statistics for misclassified articles, grouped by true label:
            noun_count  adj_count  text_length
bias_label                                    
center      112.418612  31.541073   270.414065
left        126.301767  34.707903   306.753918
right       119.375149  33.351367   289.372176


**Distributions of nouns, adjectives and text lengths across correctly classified articles grouped by bias label**

In [None]:
# Load the sampled + cleaned data (predictions are recomputed below, not loaded)
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/df_sampled_cleaned.parquet')

# Rebuild the same test set as the misclassification analysis:
# identical inputs, stratification, test_size and random_state give the same split
X = df['text_clean']
y = df['bias_label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Vectorize again with the same Bag-of-Words settings
vectorizer = CountVectorizer(stop_words='english', max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Re-train logistic regression.
# saga shuffles the data, so without a fixed random_state this re-fit is not
# guaranteed to reproduce an earlier model's predictions; the seed makes the
# re-training deterministic.
lr_model = LogisticRegression(max_iter=500, solver='saga', random_state=42)
lr_model.fit(X_train_vec, y_train)
y_pred = lr_model.predict(X_test_vec)

# Attach predictions to the test rows
# y_test.index holds index *labels* of df, so the rows are selected with .loc.
# .iloc would treat the labels as positions, which is only correct while df
# happens to carry a default 0..n-1 index.
df_test = df.loc[y_test.index].copy()
# y_pred is a positional array in the same row order as y_test
df_test['predicted'] = y_pred

# Filter correctly classified rows
df_correct = df_test[df_test['bias_label'] == df_test['predicted']].copy()

# Load spaCy
nlp = spacy.load("en_core_web_sm")
# Registers Series.progress_apply so the tagging pass shows a progress bar
tqdm.pandas()

# POS tagging function
def count_pos(text, pipeline=None):
    """Count the NOUN and ADJ tokens in one text.

    Args:
        text: Text to tag; it is converted with ``str()`` first.
        pipeline: Optional callable that turns a string into an iterable of
            tokens exposing a ``pos_`` attribute. Defaults to the
            notebook-level ``nlp`` pipeline, which is the original behaviour.

    Returns:
        A two-element ``pd.Series``: the noun count at position 0 and the
        adjective count at position 1.
    """
    if pipeline is None:
        pipeline = nlp
    noun_count = 0
    adj_count = 0
    # Single pass over the tagged tokens instead of iterating the doc twice
    for token in pipeline(str(text)):
        pos = token.pos_
        if pos == "NOUN":
            noun_count += 1
        elif pos == "ADJ":
            adj_count += 1
    return pd.Series([noun_count, adj_count])

# Tag each correctly classified article and store its noun/adjective counts
df_correct[['noun_count', 'adj_count']] = (
    df_correct['text_clean'].progress_apply(count_pos)
)
# Word count = number of whitespace-separated tokens in text_clean
df_correct['text_length'] = df_correct['text_clean'].apply(
    lambda x: len(str(x).split())
)

# Per-label means of the three statistics
grouped_correct = (
    df_correct
    .groupby('bias_label')[['noun_count', 'adj_count', 'text_length']]
    .mean()
)
print(grouped_correct)


100%|██████████| 20342/20342 [18:39<00:00, 18.18it/s]


            noun_count  adj_count  text_length
bias_label                                    
center      127.685467  33.146822   298.602208
left        166.153055  50.621074   396.582667
right       133.164256  37.373418   339.279687


**Average length of all articles grouped by bias label**

In [None]:
# Word count per article = number of whitespace-separated tokens in text_clean
df['text_length'] = df['text_clean'].apply(
    lambda x: len(str(x).split())
)

# Mean word count for each bias label
avg_lengths = (
    df
    .groupby('bias_label')['text_length']
    .mean()
)

# Show the per-label averages
print("Average article length (in words) by label:")
print(avg_lengths)

📏 Average article length (in words) by label:
bias_label
center    289.98156
left      373.33196
right     321.78388
Name: text_length, dtype: float64


**To Delete**

In [None]:
# Load cleaned dataset
# No per-class sampling happens in this cell, so every row of the cleaned file is used.
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/df_cleaned_model_ready.parquet')

X = df['text_clean']
y = df['bias_label']

# Train/test split: 20% held out, stratified by label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Vectorizers (vocabulary capped at 20,000 terms each)
vectorizers = {
    'BoW': CountVectorizer(stop_words='english', max_features=20000),
    'TF-IDF': TfidfVectorizer(stop_words='english', max_features=20000)
}

# Models
# NOTE(review): the saga solver is given no random_state, so the logistic
# regression results are presumably not reproducible run to run — confirm.
models = {
    'LogisticRegression': LogisticRegression(max_iter=500, solver='saga'),
    'NaiveBayes': MultinomialNB()
}

# Run all combinations: each vectorizer is fitted once on the training split,
# then every model is fitted and evaluated on the resulting matrices.
for vec_name, vectorizer in vectorizers.items():
    print(f"\n Vectorizing with {vec_name}...")
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for model_name, model in models.items():
        print(f"\n {model_name} + {vec_name}")
        model.fit(X_train_vec, y_train)
        preds = model.predict(X_test_vec)
        acc = accuracy_score(y_test, preds)

        print(f" Accuracy: {acc:.4f}")
        print(classification_report(y_test, preds))

        # Confusion matrix, with rows/columns in the fitted model's class order
        cm = confusion_matrix(y_test, preds, labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
        disp.plot(cmap='Blues', values_format='d')
        plt.title(f"{model_name} + {vec_name} Confusion Matrix")
        plt.show()

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the full dataset, keeping only the columns needed downstream
# (`source` is kept for the later error analysis)
df = pd.read_parquet('/content/drive/MyDrive/NLP News Bias Data/data_newsbias_cleaned.parquet', columns=['text', 'bias_label', 'source'])
df = df.dropna(subset=['text', 'bias_label'])

# Randomly sample 50k rows per class (balanced total = 150k for three classes).
# Each group is sampled with the same seed, exactly as before, but the groups
# are iterated explicitly instead of going through groupby(...).apply(...):
# apply() operating on the grouping column is deprecated in pandas 2.2 and
# slated for removal, after which `bias_label` would be missing from the result.
# sample() raises ValueError if a class has fewer than 50,000 rows.
df_sampled = pd.concat(
    [group.sample(50000, random_state=42) for _, group in df.groupby('bias_label')],
    ignore_index=True,
)

# 🧼 Define text cleaning function
def clean_for_model(text, stop_words=None):
    """Normalise one article for bag-of-words models.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, and the remaining whitespace-separated tokens
    are filtered against a stop-word collection.

    Args:
        text: The raw article text (a ``str``).
        stop_words: Optional collection of lowercase tokens to drop. When
            omitted, scikit-learn's ``ENGLISH_STOP_WORDS`` is used, which is
            the original behaviour.

    Returns:
        The surviving tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    # Punctuation is deleted rather than replaced by a space, so "don't"
    # becomes the single token "dont".
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(tok for tok in stripped.split() if tok not in stop_words)

# Build the model-ready text column on the sampled frame
df_sampled['text_clean'] = (
    df_sampled['text'].astype(str).apply(clean_for_model)
)

# Write the whole sampled frame (all of its columns) to snappy-compressed
# Parquet so it can be reloaded for later error analysis
df_sampled.to_parquet(
    '/content/drive/MyDrive/NLP News Bias Data/df_sampled_cleaned.parquet',
    compression='snappy',
)

print(f"Preprocessing done. Shape: {df_sampled.shape}")


  .apply(lambda g: g.sample(50000, random_state=42))


✅ Preprocessing done. Shape: (150000, 4)
