This notebook loads newswire data from Hugging Face, computes metadata for each news outlet, and exports the result as a pickle file to a configured path.

In [None]:
# Imports
import os
from google.colab import drive
from tqdm import tqdm
from collections import defaultdict
import ast
!pip install datasets > /dev/null 2>&1
from datasets import load_dataset
import pandas as pd
from datetime import datetime
import requests
import time
import numpy as np
import pickle

# Mount Google Drive inside the Colab runtime (requesting a fresh mount each time)
# and make the project folder the working directory, so that relative paths used
# later in this notebook resolve inside that folder.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/STANFORD/SENIOR (2024-2025)/CS224W/cs224w_project')

In [None]:
# EDIT ACCORDINGLY: choose the decade and topic to process.
# DECADE is the start of a ten-unit range; e.g. 60 yields "60-70" in the dataset
# names below and "6070" in the output filename.
DECADE = 60
TOPIC = "labor"

# 'train' split of the decade-wide newswire dataset.
full_decade_data = load_dataset(f"amyguan/newswire-{DECADE}-{DECADE+10}")['train']
# 'train' split of the dataset for the chosen decade and topic; per its name it
# presumably carries the article embeddings (TODO confirm against the dataset card).
topic_decade_data = load_dataset(f"pnsahoo/{DECADE}-{DECADE+10}-{TOPIC}-embedding")['train']
# Relative path the outlet-metadata DataFrame is pickled to by compute_outlet_metadata.
outlet_metadata_filename = f"karsen_redo/DATA/outlet_metadata_{DECADE}{DECADE+10}_{TOPIC}.pkl"

# Google Maps Geocoding API key passed to get_coordinates_google.
# Replace the placeholder before running; avoid committing a real key.
api_key = "INSERT API KEY"


def get_coordinates_google(city, state, api_key):
    """
    Get the coordinates of a city and state using the Google Maps Geocoding API.

    Args:
        city: City name to geocode.
        state: State name or abbreviation; sent together with the city as
            the address "city,state".
        api_key: Google Maps Geocoding API key.

    Returns:
        A (lat, lng) tuple taken from the first geocoding result, or None when
        the request fails or times out, the HTTP status is not 200, the body
        is not valid JSON, or the response contains no results.
    """
    # Passing the query through `params` percent-encodes every value, so names
    # containing reserved characters ('&', '#', '+', ...) cannot corrupt the
    # query string the way raw f-string interpolation could.
    try:
        response = requests.get(
            "https://maps.googleapis.com/maps/api/geocode/json",
            params={"address": f"{city},{state}", "key": api_key},
            timeout=10,  # never let a stalled connection hang the whole run
        )
    except requests.RequestException:
        # Network failure or timeout: report "no coordinates", same as a bad status.
        return None

    if response.status_code != 200:
        return None

    try:
        results = response.json().get('results')
    except ValueError:
        # Body was not valid JSON.
        return None

    if results:
        location = results[0]['geometry']['location']
        return location['lat'], location['lng']
    return None

def find_outlet_metadata_decade(full_decade_data):
    """
    This finds the main outlet metadata for all topics in the decade.

    Iterates over the dataset in batches and, for every newspaper entry in each
    article's 'newspaper_metadata', attributes the article to the first title
    listed for that newspaper.

    The first time an outlet is seen, its first listed city and state are
    recorded and geocoded with get_coordinates_google, using the module-level
    `api_key`. Successful lookups are cached per (city, state), so outlets
    sharing a location trigger only one API call. An outlet whose city or
    state is missing is not geocoded.

    Args:
        full_decade_data: Dataset supporting len() and slicing into a mapping
            of column name -> list, with columns 'ca_topic',
            'wire_coordinates' and 'newspaper_metadata'. Each newspaper entry
            holds 'newspaper_title', 'newspaper_city' and 'newspaper_state'
            as strings containing Python list literals.

    Returns:
        A 7-tuple:
            outlets: set of outlet names.
            articles_per_outlet: outlet -> number of articles.
            articles_per_ca_topic: outlet -> {ca_topic -> number of articles}.
            wire_coordinates_per_outlet: outlet -> set of wire coordinate tuples.
            city_per_outlet: outlet -> city (None if none was listed).
            state_per_outlet: outlet -> state (None if none was listed).
            newspaper_coordinates_per_outlet: outlet -> (lat, lng), or (0, 0)
                when geocoding was skipped or returned nothing.
    """
    # Initialize variables
    outlets = set()
    articles_per_outlet = defaultdict(int)
    articles_per_ca_topic = defaultdict(lambda: defaultdict(int))
    wire_coordinates_per_outlet = defaultdict(set)
    city_per_outlet = {}
    state_per_outlet = {}
    newspaper_coordinates_per_outlet = {}
    # (city, state) -> coordinates, successful lookups only, so that a
    # transient failure for one outlet is retried for the next one.
    geocode_cache = {}

    batch_size = 1000

    # Iterate through all articles
    for i in tqdm(range(0, len(full_decade_data), batch_size), desc="Processing batches"):
        batch = full_decade_data[i: i + batch_size]
        rows = zip(batch['ca_topic'], batch['wire_coordinates'], batch['newspaper_metadata'])

        for ca_topic, raw_wire_coord, row in rows:
            wire_coord = tuple(raw_wire_coord)

            for outlet in row:  # Per newspaper
                names = ast.literal_eval(outlet['newspaper_title'])
                if not names:
                    continue

                # Only look at first newspaper name in the metadata if multiple names
                outlet_name = names[0]

                # Membership test (rather than `.get(...) is None`) so an outlet
                # whose stored city is None is not re-geocoded on every article.
                if outlet_name not in city_per_outlet:
                    # City/state are only needed the first time an outlet is seen.
                    cities = ast.literal_eval(outlet['newspaper_city'])
                    states = ast.literal_eval(outlet['newspaper_state'])
                    city = cities[0] if cities else None
                    state = states[0] if states else None
                    city_per_outlet[outlet_name] = city
                    state_per_outlet[outlet_name] = state

                    coordinates = None
                    if city is not None and state is not None:
                        location_key = (city, state)
                        coordinates = geocode_cache.get(location_key)
                        if coordinates is None:
                            coordinates = get_coordinates_google(city, state, api_key)
                            time.sleep(0.1)  # avoid overwhelming the geocoding service
                            if coordinates:
                                geocode_cache[location_key] = coordinates

                    # (0, 0) marks "no coordinates available".
                    newspaper_coordinates_per_outlet[outlet_name] = coordinates if coordinates else (0, 0)

                outlets.add(outlet_name)
                articles_per_outlet[outlet_name] += 1
                articles_per_ca_topic[outlet_name][ca_topic] += 1
                wire_coordinates_per_outlet[outlet_name].add(wire_coord)

    return (
        outlets,
        articles_per_outlet,
        articles_per_ca_topic,
        wire_coordinates_per_outlet,
        city_per_outlet,
        state_per_outlet,
        newspaper_coordinates_per_outlet,
    )

def compute_average_embeddings_topic(topic_decade_data):
    """
    Average the article embeddings of each outlet in the topic-decade.

    Every article's embedding is credited to the first title of each newspaper
    entry in its 'newspaper_metadata'; entries with no title are skipped.

    Returns:
        (final_outlets, average_embeddings_per_outlet): the list of outlet
        names that received at least one embedding, and a dict mapping each of
        those names to the element-wise mean of its embeddings.
    """
    chunk_size = 100
    vectors_by_outlet = defaultdict(list)

    # Walk the dataset one slice at a time
    for start in tqdm(range(0, len(topic_decade_data), chunk_size), desc="Processing embeddings"):
        chunk = topic_decade_data[start: start + chunk_size]
        n_articles = len(chunk['article'])  # Number of articles in this slice
        for idx in range(n_articles):
            vector = chunk['embedding'][idx]
            papers = chunk['newspaper_metadata'][idx]
            for paper in papers:
                titles = ast.literal_eval(paper['newspaper_title'])
                if not titles:
                    continue
                vectors_by_outlet[titles[0]].append(vector)

    average_embeddings_per_outlet = {}
    for outlet_name, vectors in vectors_by_outlet.items():
        average_embeddings_per_outlet[outlet_name] = np.mean(vectors, axis=0)

    return list(average_embeddings_per_outlet), average_embeddings_per_outlet

def compute_outlet_metadata(topic_decade_data):
    """
    This computes the outlet metadata for the topic-decade, calling on other functions as needed.

    Only outlets returned by compute_average_embeddings_topic(topic_decade_data)
    get a row. Each row holds the outlet name, article count, wire-coordinate
    diversity (unique wire coordinates / article count), city, state, newspaper
    coordinates, one "share_<topic>" column per ca_topic, and the average
    embedding.

    Reads these module-level names, which must exist before calling:
    articles_per_outlet, articles_per_ca_topic, wire_coordinates_per_outlet,
    city_per_outlet, state_per_outlet, newspaper_coordinates_per_outlet
    (presumably the values returned by find_outlet_metadata_decade - confirm
    against the calling cell) and outlet_metadata_filename.

    Side effects:
        Pickles the resulting DataFrame to `outlet_metadata_filename`.

    Returns:
        The outlet metadata as a pandas DataFrame (the same object that is pickled).
    """
    # Compute only for outlets in the topic-decade
    final_outlets, average_embeddings_per_outlet = compute_average_embeddings_topic(topic_decade_data)

    # Construct outlet metadata in dataframe
    outlet_metadata = []
    for outlet in final_outlets:
        # `.get` is used throughout so that an outlet absent from the decade-wide
        # metadata neither inserts empty entries into the shared defaultdicts
        # nor raises KeyError on the plain dicts.
        num_articles = articles_per_outlet.get(outlet, 0)
        ca_topic_distribution = articles_per_ca_topic.get(outlet, {})

        if num_articles > 0:
            ca_topic_shares = {
                f"share_{topic}": count / num_articles
                for topic, count in ca_topic_distribution.items()
            }
        else:
            ca_topic_shares = {}

        num_unique_wire_sources = len(wire_coordinates_per_outlet.get(outlet, ()))
        wire_coord_diversity = num_unique_wire_sources / num_articles if num_articles > 0 else 0

        outlet_metadata.append({
            'outlet_name': outlet,
            'num_articles': num_articles,
            'wire_coord_diversity': wire_coord_diversity,
            'newspaper_city': city_per_outlet.get(outlet),
            'newspaper_state': state_per_outlet.get(outlet),
            # (0, 0) is the "no coordinates" marker used when geocoding finds nothing.
            'newspaper_coordinates': newspaper_coordinates_per_outlet.get(outlet, (0, 0)),
            **ca_topic_shares,
            'avg_embedding': average_embeddings_per_outlet[outlet],
        })

    outlet_metadata_df = pd.DataFrame(outlet_metadata)

    # Fill NaNs with 0 in the topic-share columns only (in case some outlets don't
    # have articles in specific topics); a blanket fill would also overwrite a
    # missing city/state with the integer 0.
    share_columns = [col for col in outlet_metadata_df.columns if str(col).startswith("share_")]
    if share_columns:
        outlet_metadata_df[share_columns] = outlet_metadata_df[share_columns].fillna(0)

    # Export metadata
    outlet_metadata_df.to_pickle(outlet_metadata_filename)  # CHANGE NAME if different
    return outlet_metadata_df
