This notebook builds a homogeneous graph for a chosen subset of the newswire data and exports it to disk.

In [None]:
# Imports

import os
import ast
from google.colab import drive
from tqdm import tqdm
from collections import defaultdict
import ast
!pip install datasets > /dev/null 2>&1
from datasets import load_dataset
import pandas as pd
from datetime import datetime
import numpy as np
import pickle
import torch
import torch_geometric as pyg
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Mount Google Drive in the Colab runtime (forcing a remount) and switch the
# working directory to the project folder, so that the relative paths used
# later in this file (e.g. "karsen_redo/...") resolve inside that folder.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/STANFORD/SENIOR (2024-2025)/CS224W/cs224w_project')

In [None]:
def prepare_features(newspaper_df, topic_data):
    """
    Build a single feature table with newspaper rows first, then article rows.

    Args:
        newspaper_df: DataFrame of outlet metadata. Every column except the
            identifier/location columns excluded below is kept as a feature.
            The frame itself is not modified.
        topic_data: Iterable of article records. Each record is read through
            the keys 'embedding', 'wire_coordinates' (two values, stored as
            latitude/longitude) and 'dates' (strings in '%b-%d-%Y' format).

    Returns:
        DataFrame with a fresh 0..n-1 index whose first len(newspaper_df) rows
        are newspapers (node_type == 0) and whose remaining rows are articles
        (node_type == 1). Columns are the union of both feature sets; cells a
        node type does not provide are filled with 0.
    """
    # Newspaper features: drop identifier / free-text / raw-location columns.
    excluded_columns = ['outlet_name', 'avg_embedding', 'city', 'state', 'newspaper_city', 'newspaper_state', 'newspaper_coordinates']
    feature_columns = [col for col in newspaper_df.columns if col not in excluded_columns]

    # Explicit copy so that adding 'node_type' below is an unambiguous write to
    # our own frame (avoids pandas' chained-assignment warning on a selection).
    newspaper_features_df = newspaper_df[feature_columns].copy()
    newspaper_features_df['node_type'] = 0  # 0 for newspaper

    # Article features
    embeddings = []
    wire_coordinates = []
    date_features = []

    for article in topic_data:
        embeddings.append(article['embedding'])
        wire_coordinates.append(article['wire_coordinates'])

        # Earliest publication date as a POSIX timestamp; 0.0 when the article
        # carries no dates.
        # NOTE(review): the parsed datetimes are naive, so timestamp() uses the
        # machine's local timezone -- confirm this is acceptable.
        date_list = article['dates']
        if date_list:
            earliest_date = min(datetime.strptime(date, '%b-%d-%Y') for date in date_list)
            earliest_timestamp = earliest_date.timestamp()
        else:
            earliest_timestamp = 0.0

        date_features.append(earliest_timestamp)

    embeddings_df = pd.DataFrame(embeddings)
    wire_coordinates_df = pd.DataFrame(wire_coordinates, columns=['latitude', 'longitude'])
    date_features_df = pd.DataFrame(date_features, columns=['earliest_timestamp'])

    article_features_df = pd.concat([
        embeddings_df,
        wire_coordinates_df,
        date_features_df
    ], axis=1)
    article_features_df['node_type'] = 1  # 1 for article

    # Stack newspapers on top of articles; ignore_index makes the row position
    # the node id. Columns missing for one node type become NaN, then 0.
    combined_features_df = pd.concat([newspaper_features_df, article_features_df], ignore_index=True)
    combined_features_df = combined_features_df.fillna(0)

    return combined_features_df

def process_features(combined_features_df, n_components=50):
    """
    Standardize the float feature columns and reduce them with PCA.

    Args:
        combined_features_df: Feature table that contains a 'node_type'
            column. The frame is not modified.
        n_components: Number of principal components to keep (passed to PCA).

    Returns:
        Tuple (processed_features, node_type_tensor):
            processed_features: float tensor holding the PCA output, one row
                per input row.
            node_type_tensor: long tensor holding the 'node_type' column.
    """
    # Work on a relabelled copy whose column labels are all strings, instead of
    # overwriting the caller's column labels in place.
    # NOTE(review): presumably needed because scikit-learn expects uniform
    # string feature names -- confirm.
    features_df = combined_features_df.rename(columns=str)

    # Only float columns are scaled / projected; non-float columns (such as an
    # integer 'node_type') are left out of the PCA input.
    features_to_scale = features_df.select_dtypes(include=['float64', 'float32'])
    node_type = features_df['node_type']

    # Scale features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features_to_scale)

    # Apply PCA
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(scaled_features)

    # Convert PCA features and node types to tensors
    processed_features = torch.tensor(pca_features, dtype=torch.float)
    node_type_tensor = torch.tensor(node_type.values, dtype=torch.long)

    return processed_features, node_type_tensor

def create_homogeneous_graph(topic_data, newspaper_df):
    """
    Build a homogeneous PyG graph that links articles to the newspapers that ran them.

    Nodes are numbered with all newspapers first (in the row order of
    newspaper_df) followed by all articles (in the order of topic_data), which
    matches the row order of the feature table from prepare_features.

    Args:
        topic_data: Sequence of article records. In addition to the fields read
            by prepare_features, each record has 'newspaper_metadata': an
            iterable of dicts whose 'newspaper_title' is the string form of a
            Python literal sequence; its first element is the title.
        newspaper_df: DataFrame of outlet metadata with an 'outlet_name' column.

    Returns:
        Tuple (data, num_newspapers, newspaper_indices):
            data: pyg.data.Data with x (PCA features), edge_index of shape
                [2, num_edges] and node_type.
            num_newspapers: number of newspaper nodes (offset of the first
                article node).
            newspaper_indices: dict mapping outlet_name -> newspaper node id.
    """
    # Prepare combined features
    combined_features_df = prepare_features(newspaper_df, topic_data)

    # Process features with PCA
    node_features, node_type = process_features(combined_features_df)

    num_newspapers = newspaper_df.shape[0]

    # Node ids must be row POSITIONS, because the feature table is stacked
    # positionally; DataFrame index labels would misalign edges and features
    # whenever newspaper_df does not carry a default 0..n-1 index.
    newspaper_indices = {
        outlet_name: position
        for position, outlet_name in enumerate(newspaper_df['outlet_name'])
    }

    edges = []

    for idx, article in enumerate(topic_data):
        # Article nodes are offset by the number of newspaper nodes.
        article_idx = idx + num_newspapers
        for newspaper in article["newspaper_metadata"]:
            newspaper_tuple = ast.literal_eval(newspaper['newspaper_title'])
            # NOTE(review): titles are stripped/lower-cased here but
            # 'outlet_name' is used as-is -- assumes outlet names are already
            # normalized the same way; confirm.
            newspaper_title = newspaper_tuple[0].strip().lower()

            newspaper_idx = newspaper_indices.get(newspaper_title)
            if newspaper_idx is None:
                # Newspaper is not part of this subset: no edge.
                continue

            # Add both directions so the graph is effectively undirected.
            edges.append((article_idx, newspaper_idx))
            edges.append((newspaper_idx, article_idx))

    # Create edge index tensor; reshape(-1, 2) keeps the [2, num_edges] layout
    # even when no edge was found (result is then [2, 0] rather than [0]).
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    print(edge_index.shape)

    # Create PyG data object
    data = pyg.data.Data(
        x=node_features,  # Node features after PCA
        edge_index=edge_index,
        node_type=node_type
    )
    print(data)

    return data, num_newspapers, newspaper_indices

In [None]:
"""
Build the homogeneous graph for every decade/topic combination listed below and
save each graph, together with its newspaper-name -> node-index mapping, under
karsen_redo/HOMOGNN/.
"""

for decade in range(20, 70, 10):
    # The newswire dataset depends only on the decade, so load it once per
    # decade rather than once per topic.
    # NOTE(review): main_dataset is not used in this cell; the binding is kept
    # in case later cells rely on it.
    main_dataset = load_dataset(f"amyguan/newswire-{decade}-{decade+10}")
    main_dataset = main_dataset['train']

    for topic in ["labor", "civil-rights", "macro"]:
        embeddings_data = load_dataset(f"pnsahoo/{decade}-{decade+10}-{topic}-embedding")
        embeddings_data = embeddings_data['train']

        outlet_metadata_df = pd.read_pickle(f"karsen_redo/DATA/outlet_metadata_{decade}{decade+10}_{topic}.pkl")

        # Work on a copy and split each coordinate pair into float
        # latitude / longitude columns so they can be used as features.
        newspaper_df = outlet_metadata_df.copy()
        newspaper_df[['latitude', 'longitude']] = newspaper_df['newspaper_coordinates'].apply(
            lambda x: pd.Series([float(coord) for coord in x])
        )

        # Create homogeneous graph
        homogeneous_graph, num_newspapers, newspaper_dict = create_homogeneous_graph(embeddings_data, newspaper_df)

        # Save the graph; the file name records the newspaper count, i.e. the
        # node id of the first article node.
        torch.save(homogeneous_graph, f"karsen_redo/HOMOGNN/homogeneous_graph-{decade}-{decade+10}-{topic}-{num_newspapers}.pt")
        with open(f"karsen_redo/HOMOGNN/newspaper_node_index-{decade}-{decade+10}-{topic}.pkl", "wb") as f:
            pickle.dump(newspaper_dict, f)
