This file evaluates the heterogeneous GNN models using two downstream classifiers: logistic regression and a random forest. It uses the newspaper embeddings produced by the heterogeneous GNN models to classify newspapers as Republican or Democrat, then prints and exports the results.

# Setup

In [None]:
# Imports
import pickle
from google.colab import drive
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
import glob
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt

# Mount Google Drive (re-mounting if it is already mounted) and work from the
# project folder so the relative 'karsen_redo/...' paths used below resolve.
_DRIVE_MOUNT_POINT = '/content/drive'
_PROJECT_DIR = '/content/drive/MyDrive/cs224w_project'

drive.mount(_DRIVE_MOUNT_POINT, force_remount=True)
os.chdir(_PROJECT_DIR)

In [None]:
"""
Merge graph embeddings with ground truths.
"""
def create_merged_dataset(path, topic, decade):
  """Join saved newspaper embeddings with outlet names and ground-truth data.

  Args:
    path: File readable by ``torch.load`` that holds a mapping with a
      'newspaper' entry containing the embedding matrix. Row ``i`` is treated
      as the embedding of the node whose index is ``i``.
    topic: Topic name; used to locate the node-index pickle.
    decade: Start of the decade; the node-index filename is built from
      ``decade`` and ``decade+10``.

  Returns:
    A DataFrame with the columns 'outlet_name', 'index', one column per
    embedding dimension (named 0, 1, ...), and every other column of the
    ground-truth table (callers in this file read 'label'). Both merges use
    pandas' default inner join, so only outlets present in the node index, the
    embeddings and the ground truths are kept.
  """
  # map_location='cpu' keeps loading working on a CPU-only runtime even when
  # the embeddings were saved from a GPU.
  hetero_embeds = torch.load(path, weights_only=True, map_location='cpu')
  hetero_embeds = hetero_embeds['newspaper']
  # presumably a DataFrame with at least 'outlet_name' and 'label' columns --
  # verify against the script that writes this pickle.
  ground_truths = pd.read_pickle("karsen_redo/DATA/clean_ground_truths.pkl")
  # Mapping of outlet name -> row index into the embedding matrix.
  newspaper_indices = pd.read_pickle(f"karsen_redo/HETEROGNN/newspaper_node_index-{decade}-{decade+10}-{topic}.pkl")

  # Convert tensor to a NumPy array. Tensor.numpy() raises for tensors that
  # require grad or live on a non-CPU device, hence detach().cpu() first.
  if isinstance(hetero_embeds, torch.Tensor):
    embeds_array = hetero_embeds.detach().cpu().numpy()
  else:
    embeds_array = hetero_embeds

  # Create a DataFrame for embeddings, exposing the row number as an 'index'
  # column so it can be joined against the node index.
  embeds_df = pd.DataFrame(embeds_array)
  embeds_df = embeds_df.rename_axis('index').reset_index()

  # Convert newspaper_indices to a DataFrame and attach the embeddings.
  indices_df = pd.DataFrame(list(newspaper_indices.items()), columns=['outlet_name', 'index'])
  merged_indices_embeds = pd.merge(indices_df, embeds_df, on='index')
  merged_indices_embeds['outlet_name'] = merged_indices_embeds['outlet_name'].astype(str)

  # Merge with ground_truths on outlet_name to get the label
  final_dataset = pd.merge(merged_indices_embeds, ground_truths, on='outlet_name')
  return final_dataset

"""
Fit a regression model on the graph embeddings at path, for a given topic and decade.
Return a results dictionary with metrics saved.
"""
def regression_hetgnn(path, topic, decade, result=None):
  """Tune and evaluate a logistic regression on the graph embeddings at `path`.

  The merged dataset is split into train/validation/test parts, the
  regularization strength ``C`` is chosen by 5-fold cross-validated grid search
  on the training part, and the selected model is scored on the validation and
  test parts. Progress and metrics are printed along the way.

  Args:
    path: Embedding file passed through to ``create_merged_dataset``.
    topic: Topic passed through to ``create_merged_dataset``.
    decade: Decade passed through to ``create_merged_dataset``.
    result: Optional dict to add the metrics to (updated in place and
      returned). When omitted, a fresh dict with empty 'hyperparameters' and
      'model' entries is created for this call.

  Returns:
    The ``result`` dict with these keys set: 'fpr_val', 'tpr_val', 'fpr_test',
    'tpr_test', 'reg_model', 'val_accuracy', 'test_accuracy', 'auc_val',
    'auc_test', 'report_val', 'report_test', 'reg_hyperparameters'.
  """
  # A dict literal as default would be shared (and mutated) across calls.
  if result is None:
    result = {'hyperparameters': {}, 'model': ''}

  df = create_merged_dataset(path, topic, decade)
  print(df)
  X = df.drop(columns=['outlet_name', 'label', 'index'])
  y = df['label']

  # Hold out 10% for test, then 22.2% of the remaining 90% (about 20% of all
  # rows) for validation.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=126)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.222, random_state=126)

  # Fit the scaler on the training rows only so that validation/test
  # statistics do not leak into the features the model is trained on.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_val = scaler.transform(X_val)
  X_test = scaler.transform(X_test)

  param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # regularization strength
  }
  # REGRESSION
  model = LogisticRegression(max_iter=1000, class_weight='balanced')
  grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
  grid_search.fit(X_train, y_train)

  best_params = grid_search.best_params_
  print("REGRESSION\n")
  print("------------------------")
  print("Best hyperparameters:", best_params)

  # With the default refit=True, GridSearchCV has already refitted the winning
  # configuration on all of X_train. Use it directly: refitting `model` here
  # would silently evaluate the untuned default C instead of `best_params`.
  best_model = grid_search.best_estimator_

  # Validation metrics. Column 1 of predict_proba is the score of the second
  # class in best_model.classes_ (the positive class for binary labels).
  y_val_pred = best_model.predict(X_val)
  y_val_score = best_model.predict_proba(X_val)[:, 1]
  val_accuracy = accuracy_score(y_val, y_val_pred)
  print("Validation Accuracy:", val_accuracy)
  print("Validation Classification Report:")
  report_val = classification_report(y_val, y_val_pred)
  print(report_val)
  roc_auc_val = roc_auc_score(y_val, y_val_score)
  print("AUC-ROC:", roc_auc_val)

  # The ROC curve needs continuous scores; hard class predictions collapse it
  # to a single operating point.
  fpr, tpr, _ = roc_curve(y_val, y_val_score)
  roc_auc = auc(fpr, tpr)
  print("roc_auc", roc_auc)
  result['fpr_val'], result['tpr_val'] = fpr, tpr

  # Test metrics, computed the same way as the validation metrics.
  y_test_pred = best_model.predict(X_test)
  y_test_score = best_model.predict_proba(X_test)[:, 1]
  test_accuracy = accuracy_score(y_test, y_test_pred)
  print("\nTest Accuracy:", test_accuracy)
  print("Test Classification Report:")
  report_test = classification_report(y_test, y_test_pred)
  print(report_test)
  roc_auc_test = roc_auc_score(y_test, y_test_score)
  print("AUC-ROC:", roc_auc_test)

  fpr, tpr, _ = roc_curve(y_test, y_test_score)
  result['fpr_test'], result['tpr_test'] = fpr, tpr

  result['reg_model'] = 'Logistic Regression'
  result['val_accuracy'] = val_accuracy
  result['test_accuracy'] = test_accuracy
  result['auc_val'] = roc_auc_val
  result['auc_test'] = roc_auc_test

  result['report_val'] = report_val
  result['report_test'] = report_test

  result['reg_hyperparameters'] = best_params

  return result

"""
Fit a random forest model on the graph embeddings at path, for a given topic and decade.
Return a results dictionary with metrics saved.
"""
def random_forest(path, topic, decade, result=None):
    """Tune and evaluate a random forest on the graph embeddings at `path`.

    The merged dataset is split into train/validation/test parts, the forest's
    ``n_estimators``, ``max_depth`` and ``min_samples_split`` are chosen by
    5-fold cross-validated grid search on the training part, and the selected
    model is scored on the validation and test parts. Progress and metrics are
    printed along the way.

    Args:
        path: Embedding file passed through to ``create_merged_dataset``.
        topic: Topic passed through to ``create_merged_dataset``.
        decade: Decade passed through to ``create_merged_dataset``.
        result: Optional dict to add the metrics to (updated in place and
            returned). When omitted, a fresh dict with empty 'hyperparameters'
            and 'model' entries is created for this call.

    Returns:
        The ``result`` dict with these keys set: 'fpr_val', 'tpr_val',
        'fpr_test', 'tpr_test', 'reg_model', 'val_accuracy', 'test_accuracy',
        'auc_val', 'auc_test', 'report_val', 'report_test',
        'reg_hyperparameters'.
    """
    # A dict literal as default would be shared (and mutated) across calls.
    if result is None:
        result = {'hyperparameters': {}, 'model': ''}

    df = create_merged_dataset(path, topic, decade)
    X = df.drop(columns=['outlet_name', 'label', 'index'])
    y = df['label']

    # Hold out 10% for test, then 22.2% of the remaining 90% (about 20% of all
    # rows) for validation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=126)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.222, random_state=126)

    # Fit the scaler on the training rows only so that validation/test
    # statistics do not leak into the features the model is trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    print("------------------------")
    print("\nRANDOM FOREST\n")
    print("------------------------")
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
    }

    model = RandomForestClassifier(random_state=126, class_weight='balanced')
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # With the default refit=True, GridSearchCV has already refitted the
    # winning configuration on all of X_train. Use it directly: refitting
    # `model` here would silently evaluate the untuned default forest instead
    # of `best_params`.
    best_model = grid_search.best_estimator_

    # Validation metrics. Column 1 of predict_proba is the score of the second
    # class in best_model.classes_ (the positive class for binary labels).
    y_val_pred = best_model.predict(X_val)
    y_val_score = best_model.predict_proba(X_val)[:, 1]
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print("Validation Accuracy:", val_accuracy)
    print("Validation Classification Report:")
    report_val = classification_report(y_val, y_val_pred)
    print(report_val)
    roc_auc_val = roc_auc_score(y_val, y_val_score)
    print("AUC-ROC:", roc_auc_val)

    # The ROC curve needs continuous scores; hard class predictions collapse
    # it to a single operating point.
    fpr, tpr, _ = roc_curve(y_val, y_val_score)
    roc_auc = auc(fpr, tpr)
    print("AUC", roc_auc)
    result['fpr_val'], result['tpr_val'] = fpr, tpr

    # Test metrics, computed the same way as the validation metrics.
    y_test_pred = best_model.predict(X_test)
    y_test_score = best_model.predict_proba(X_test)[:, 1]
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("\nTest Accuracy:", test_accuracy)
    print("Test Classification Report:")
    report_test = classification_report(y_test, y_test_pred)
    print(report_test)
    roc_auc_test = roc_auc_score(y_test, y_test_score)
    print("AUC-ROC:", roc_auc_test)

    fpr, tpr, _ = roc_curve(y_test, y_test_score)
    result['fpr_test'], result['tpr_test'] = fpr, tpr

    result['reg_model'] = 'Random Forest'
    result['val_accuracy'] = val_accuracy
    result['test_accuracy'] = test_accuracy
    result['auc_val'] = roc_auc_val
    result['auc_test'] = roc_auc_test

    result['report_val'] = report_val
    result['report_test'] = report_test

    result['reg_hyperparameters'] = best_params

    return result

In [None]:
# Walk the hyperparameter sweep; for each saved embedding file, run both
# downstream classifiers and collect one result dict per classifier.
for hidden_size, out_size in sizes:
    for temperature in temperatures:
        for alpha in alphas:
            print(f'\n===================\n')
            print(f'MODEL {conv}, HIDDEN SIZE {hidden_size}, OUTPUT SIZE {out_size}, TEMPERATURE {temperature}, ALPHA {alpha}')
            path = f'karsen_redo/HETEROGNN/{topic}_{decade}{decade+10}/het_{conv}_t{temperature}_a{alpha}_h{hidden_size}_o{out_size}_l{layers}_lr{lr*1000}.pt'

            # Logistic regression first, then random forest. Each evaluator
            # gets its own freshly built dict because it fills it in place.
            for evaluate in (regression_hetgnn, random_forest):
                result = {
                    'model': conv_map[conv],
                    'hyperparameters': {
                        'alpha': alpha,
                        'lr': lr,
                        'hidden size': hidden_size,
                        'output size': out_size,
                        'layers': layers,
                        'temperature': temperature,
                    },
                }
                results.append(evaluate(path, topic, decade, result))

# make table

In [None]:
"""
Generate a markdown table from the results dictionary outputted from evaluation step.
"""
def generate_markdown_table(results):
    """
    Render every evaluation result as one row of a Markdown table.

    The title line is built from the module-level `topic` and `decade`
    variables. Each entry of `results` must provide the keys 'reg_model',
    'model', 'val_accuracy', 'test_accuracy', 'auc_val', 'auc_test' and
    'hyperparameters'; numeric metrics are shown with three decimals.
    Returns the table as a single string.
    """
    columns = ["Model", "Hyperparameters", "Regression Model", "Validation Accuracy", "Validation AUC-ROC", "Test Accuracy", "Test AUC-ROC"]
    title_line = f"## {topic.title()} 19{decade}-19{decade+10} Results\n"
    column_line = f"| {' | '.join(columns)} |\n"
    divider_line = f"|{'|'.join(['---'] * len(columns))}|\n"

    pieces = [title_line, column_line, divider_line]
    wanted_keys = ('reg_model', 'model', 'val_accuracy', 'test_accuracy', 'auc_val', 'auc_test', 'hyperparameters')
    for entry in results:
        classifier, gnn, acc_val, acc_test, roc_val, roc_test, hparams = (
            entry[key] for key in wanted_keys
        )
        pieces.append(
            f"| {gnn} | ```{hparams}``` | {classifier} | {acc_val:.3f} | {roc_val:.3f} | {acc_test:.3f} | {roc_test:.3f} |\n"
        )

    return "".join(pieces)

In [None]:
# Render all collected results and save the Markdown table as a text file in
# the evaluation folder for this topic and decade.
table = generate_markdown_table(results)

full_table_path = f'karsen_redo/EVAL/{topic}_{decade}{decade+10}/table_hetero.txt'
with open(full_table_path, "w") as file:
    file.write(table)

In [None]:
# Keep the raw result dicts too, pickled next to the Markdown table.
results_pickle_path = f'karsen_redo/EVAL/{topic}_{decade}{decade+10}/results_hetero.pkl'
with open(results_pickle_path, "wb") as file:
    pickle.dump(results, file)

# Select each best model amongst hyperparameter sets --> final table

In [None]:
"""
Generates separate Markdown tables for each regression model.
Each table includes one row per model type, selecting the entry with the highest Validation AUC-ROC for that model type.
"""
def final_tables(results):
    """
    Build one Markdown table per downstream classifier ('reg_model').

    Within each classifier, results are grouped by GNN model type ('model') and
    only the entry with the highest validation AUC-ROC ('auc_val') of each
    group is listed; on ties the earliest entry wins. The document title is
    built from the module-level `topic` and `decade` variables. Returns the
    whole document as a single string.
    """
    def _val_auc(entry):
        return entry['auc_val']

    # Bucket the results by classifier, keeping first-seen order.
    by_classifier = {}
    for entry in results:
        by_classifier.setdefault(entry['reg_model'], []).append(entry)

    document = f"## {topic.title()} {decade}-{decade+10} Results\n\n"
    columns = ["Model", "Hyperparameters", "Validation Accuracy", "Validation AUC-ROC", "Test Accuracy", "Test AUC-ROC"]
    table_head = f"| {' | '.join(columns)} |\n|{'|'.join(['---'] * len(columns))}|\n"

    for classifier_name, entries in by_classifier.items():
        # Bucket this classifier's results by GNN model type.
        by_gnn = {}
        for entry in entries:
            by_gnn.setdefault(entry['model'], []).append(entry)

        # One winner per model type, chosen on the validation set only.
        winners = [max(candidates, key=_val_auc) for candidates in by_gnn.values()]

        section = f"### {classifier_name} Results\n" + table_head
        for winner in winners:
            section += (
                f"| {winner['model']} | ```{winner['hyperparameters']}``` "
                f"| {winner['val_accuracy']:.3f} | {winner['auc_val']:.3f} "
                f"| {winner['test_accuracy']:.3f} | {winner['auc_test']:.3f} |\n"
            )
        document += section + "\n"

    return document

In [None]:
# Build the best-per-model summary and save it alongside the full table.
reduced_table = final_tables(results)

reduced_table_path = f'karsen_redo/EVAL/{topic}_{decade}{decade+10}/table_hetero_reduced.txt'
with open(reduced_table_path, "w") as file:
    file.write(reduced_table)