This file constructs the baselines using the outlet metadata and runs a logistic regression and random forest classifier to classify newspapers as Republican or Democrat.

In [None]:
# Imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

import pickle
from google.colab import drive
import os
import numpy as np
from tqdm import tqdm
!pip install datasets > /dev/null 2>&1
from datasets import load_dataset
import pandas as pd
from datetime import datetime

# Mount Google Drive at /content/drive in the Colab runtime.
drive.mount('/content/drive', force_remount=True)
# Work from the project folder so the relative "karsen_redo/..." paths used
# in the cells below resolve against it.
os.chdir('/content/drive/MyDrive/cs224w_project')

In [None]:
# Module-level accumulator: evaluate_baseline appends one dict per fitted model.
results = []


def _load_labelled_features(TOPIC, DECADE):
    """Load the outlet metadata for a topic-decade and join it with the labels.

    Reads the outlet metadata pickle, expands the coordinate and embedding
    list columns into one column per component, drops the city/state columns,
    and inner-joins the ground-truth table on 'outlet_name'. Rows containing
    any missing value after the join are dropped.

    Returns:
        (X, y): the feature DataFrame (everything except 'label' and
        'outlet_name') and the 'label' Series.
    """
    outlet_metadata_df = pd.read_pickle(f"karsen_redo/DATA/outlet_metadata_{DECADE}{DECADE+10}_{TOPIC}.pkl")

    # Build the expanded frames on the metadata's own index: column assignment
    # aligns on index labels, so a default 0..n-1 index would silently produce
    # NaNs whenever the pickled frame is indexed differently.
    coordinates = pd.DataFrame(
        outlet_metadata_df['newspaper_coordinates'].tolist(),
        index=outlet_metadata_df.index,
    )
    outlet_metadata_df[['latitude', 'longitude']] = coordinates
    outlet_metadata_df = outlet_metadata_df.drop(columns='newspaper_coordinates')

    # Convert the list of embeddings into one 'embed<i>' column per dimension.
    embeds = pd.DataFrame(
        outlet_metadata_df['avg_embedding'].tolist(),
        index=outlet_metadata_df.index,
    ).add_prefix('embed')
    outlet_metadata_df = pd.concat([outlet_metadata_df, embeds], axis=1)
    outlet_metadata_df = outlet_metadata_df.drop(columns=['avg_embedding', 'newspaper_city', 'newspaper_state'])

    ground_truths = pd.read_pickle("karsen_redo/DATA/clean_ground_truths.pkl")

    # Inner merge + dropna: outlets without a ground truth are discarded.
    merged = pd.merge(outlet_metadata_df, ground_truths, on='outlet_name').dropna()
    return merged.drop(columns=['label', 'outlet_name']), merged['label']


def _score_split(best_model, X_split, y_split, split_name, leading_newline=False):
    """Print and return accuracy, report, AUC-ROC and ROC points for one split.

    Returns:
        dict with keys 'accuracy', 'report', 'auc', 'fpr', 'tpr'.
    """
    y_pred = best_model.predict(X_split)
    # Probability of the second class in classes_; used for both the AUC and
    # the ROC curve so the stored curve matches the reported AUC.
    y_score = best_model.predict_proba(X_split)[:, 1]

    accuracy = accuracy_score(y_split, y_pred)
    prefix = "\n" if leading_newline else ""
    print(f"{prefix}{split_name} Accuracy:", accuracy)
    print(f"{split_name} Classification Report:")
    report = classification_report(y_split, y_pred)
    print(report)
    roc_auc = roc_auc_score(y_split, y_score)
    print("AUC-ROC:", roc_auc)

    # pos_label is tied to the column taken from predict_proba above.
    fpr, tpr, _ = roc_curve(y_split, y_score, pos_label=best_model.classes_[1])
    return {'accuracy': accuracy, 'report': report, 'auc': roc_auc, 'fpr': fpr, 'tpr': tpr}


def _fit_and_evaluate(estimator, param_grid, reg_model, splits):
    """Grid-search an estimator on the training split and score val/test.

    Args:
        estimator: unfitted scikit-learn classifier.
        param_grid: hyperparameter grid handed to GridSearchCV (5-fold CV).
        reg_model: display name stored under result['reg_model'].
        splits: (X_train, X_val, X_test, y_train, y_val, y_test).

    Returns:
        dict with metrics, reports, ROC points and the selected hyperparameters.
    """
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Evaluate the estimator GridSearchCV refit with the best parameters, so
    # the reported hyperparameters are the ones that produced the metrics.
    best_model = grid_search.best_estimator_

    val = _score_split(best_model, X_val, y_val, "Validation")
    test = _score_split(best_model, X_test, y_test, "Test", leading_newline=True)

    return {
        'fpr_val': val['fpr'],
        'tpr_val': val['tpr'],
        'fpr_test': test['fpr'],
        'tpr_test': test['tpr'],
        'hyperparameters': {},
        'model': 'Raw Input Features (Baseline)',
        'reg_model': reg_model,
        'val_accuracy': val['accuracy'],
        'test_accuracy': test['accuracy'],
        'auc_val': val['auc'],
        'auc_test': test['auc'],
        'report_val': val['report'],
        'report_test': test['report'],
        'reg_hyperparameters': best_params,
    }


def evaluate_baseline(TOPIC, DECADE):
    """
    Given a topic and decade to analyze, this function evaluates the baselines:
    it fits a logistic regression and a random forest classifier on the raw
    outlet features to classify newspapers as Republican or Democrat, and
    prints the resulting metrics.

    Args:
        TOPIC: topic name used in the metadata file name.
        DECADE: integer decade; the file name uses DECADE and DECADE + 10.

    Returns:
        The module-level `results` list, with one dict appended per model
        (logistic regression first, then random forest).
    """
    X, y = _load_labelled_features(TOPIC, DECADE)

    # Hold out 10% for test, then 22.2% of the remainder for validation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=126)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.222, random_state=126)

    # Fit the scaler on the training split only so validation/test statistics
    # do not leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    splits = (X_train, X_val, X_test, y_train, y_val, y_test)

    # REGRESSION
    print("REGRESSION\n")
    print("------------------------")
    results.append(_fit_and_evaluate(
        LogisticRegression(max_iter=1000, class_weight='balanced'),
        {'C': [0.01, 0.1, 1, 10, 100]},  # regularization strength
        'Logistic Regression',
        splits,
    ))

    # RANDOM FOREST
    print("------------------------")
    print("\nRANDOM FOREST\n")
    print("------------------------")
    results.append(_fit_and_evaluate(
        RandomForestClassifier(random_state=126, class_weight='balanced'),
        {
            'n_estimators': [100, 200],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5],
        },
        'Random Forest',
        splits,
    ))

    return results

In [None]:
# Edit TOPIC and DECADE below to choose which topic-decade is evaluated; the
# results are pickled into karsen_redo/EVAL/<TOPIC>_<DECADE><DECADE+10>/.
TOPIC = 'macro'
DECADE = 60
# evaluate_baseline appends to the module-level `results` list, so clear it
# first to avoid carrying entries over from a previous run of this cell.
results = []
results = evaluate_baseline(TOPIC, DECADE)

folder_name = f"karsen_redo/EVAL/{TOPIC}_{DECADE}{DECADE+10}"
# exist_ok avoids the check-then-create race and is a no-op when the folder
# is already there.
os.makedirs(folder_name, exist_ok=True)

with open(f"{folder_name}/results_baseline.pkl", "wb") as file:
    pickle.dump(results, file)

In [None]:
def generate_markdown_table(results, topic=None, decade=None):
    """
    Converts results into a Markdown-compatible table.

    Args:
        results: iterable of dicts with the keys 'model', 'hyperparameters',
            'reg_model', 'val_accuracy', 'auc_val', 'test_accuracy' and
            'auc_test'.
        topic: topic name for the heading; defaults to the global TOPIC.
        decade: two-digit decade (e.g. 60) for the heading; defaults to the
            global DECADE.

    Returns:
        A string holding a level-2 heading, the header row, the separator row
        and one row per result, each terminated by a newline.
    """
    if topic is None:
        topic = TOPIC
    if decade is None:
        decade = DECADE

    headers = ["Model", "Hyperparameters", "Regression Model", "Validation Accuracy", "Validation AUC-ROC", "Test Accuracy", "Test AUC-ROC"]

    # Add to 1900 instead of gluing "19" in front of the number, so the end
    # year rolls over correctly (90 -> 2000) and single-digit decades work.
    start_year = 1900 + decade
    lines = [
        f"## {topic.title()} {start_year} - {start_year + 10} Results",
        f"| {' | '.join(headers)} |",
        f"|{'|'.join(['---'] * len(headers))}|",
    ]

    for result in results:
        lines.append(
            f"| {result['model']} | ```{result['hyperparameters']}``` | {result['reg_model']} "
            f"| {result['val_accuracy']:.3f} | {result['auc_val']:.3f} "
            f"| {result['test_accuracy']:.3f} | {result['auc_test']:.3f} |"
        )

    # Every line, including the last, ends with a newline.
    return "\n".join(lines) + "\n"

In [None]:
# Render the collected results as a Markdown table and save it as a text file
# in the EVAL folder for the current topic-decade.
table = generate_markdown_table(results)

table_path = f'karsen_redo/EVAL/{TOPIC}_{DECADE}{DECADE+10}/table_baseline.txt'
with open(table_path, "w") as file:
    file.write(table)