In [None]:
# Import libraries
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import random
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility.
# Python's `random`, NumPy and PyTorch each keep their own global RNG, so all
# three are seeded from a single constant.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # returns a Generator; kept off the last line so the cell shows no output
random.seed(SEED)

In [None]:
# Load the saved tokenizer and classifier from the same directory.
# NOTE(review): the path ends with a trailing space — confirm it matches the
# actual folder name on Drive.
model_path = "/content/drive/MyDrive/issue classifier model "
tokenizer = BertTokenizer.from_pretrained(model_path)
# eval() returns the module it is called on, so switching to evaluation mode
# can be chained directly onto loading.
model = BertForSequenceClassification.from_pretrained(model_path).eval()

print("Model and tokenizer loaded successfully!")

In [None]:
# Load the dataset
# The encoding is stated explicitly: without it, open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
data_path = "/content/speeches_111_gpt_topic_labels.json"
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Convert JSON to DataFrame
def _to_record(speech_id, details):
    """Flatten one (speech_id, details) entry of `data` into a row dict."""
    return {
        "speech_id": speech_id,
        "speech_content": details["speech"],
        "true_issues": details["topics"],
    }


# One row per entry of `data`, in the mapping's iteration order.
records = [_to_record(*entry) for entry in data.items()]
df = pd.DataFrame(records)

print(f"Dataset loaded with {len(df)} speeches.")

In [None]:
# Preprocessing function
def preprocess_data(df):
    """Return a filtered copy of `df` without the "Governance and Democracy" label.

    Steps, in order:
      1. drop rows whose `true_issues` is exactly ["Governance and Democracy"];
      2. remove that label from the remaining rows' `true_issues` lists;
      3. drop rows whose `speech_content` is null or only whitespace;
      4. drop rows whose `true_issues` list ended up empty.

    The input frame is not modified; surviving rows keep their original index.
    """
    dropped_label = "Governance and Democracy"

    def _without_dropped(labels):
        return [label for label in labels if label != dropped_label]

    # Step 1: rows tagged with nothing but the dropped label
    not_only_dropped = df["true_issues"].apply(lambda labels: labels != [dropped_label])
    kept = df[not_only_dropped].copy()

    # Step 2: strip the dropped label from every remaining label list
    kept.loc[:, "true_issues"] = kept["true_issues"].apply(_without_dropped)

    # Step 3: missing text first, then text that is blank after stripping
    kept = kept[kept["speech_content"].notnull()]
    kept = kept[kept["speech_content"].str.strip() != ""]

    # Step 4: rows left without any label
    kept = kept[kept["true_issues"].apply(len) > 0]
    return kept.copy()

In [None]:
# Apply preprocessing
# Rebinds `df` to the filtered frame returned by preprocess_data, so the
# unfiltered frame is no longer reachable under this name afterwards.
df = preprocess_data(df)
# Preview the first rows of the filtered frame (plain-text rendering via print).
print(df.head())
# Print summary
print(f"Preprocessed dataset contains {len(df)} speeches.")

In [None]:
# Load unique topics from training
# The position of each topic in this list is used as its index in the binary
# label vectors built below, so the order of the file's contents matters.
# The encoding is stated explicitly: without it, open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
unique_topics_path = "/content/unique_topics.json"
with open(unique_topics_path, "r", encoding="utf-8") as f:
    unique_topics = json.load(f)

print("Loaded unique topics:", unique_topics)

In [None]:
# Encode topics for evaluation
def encode_topics(topics):
    """Turn a list of topic names into a 0/1 vector aligned with `unique_topics`.

    Position i of the result is 1 when `unique_topics[i]` occurs in `topics`
    and 0 otherwise; repeated topics have no additional effect.

    Raises:
        ValueError: if a topic is not present in `unique_topics`.
    """
    # list.index raises ValueError on the first topic missing from unique_topics
    active_positions = {unique_topics.index(topic) for topic in topics}
    return [1 if position in active_positions else 0 for position in range(len(unique_topics))]

# Add one binary label vector per speech, aligned with unique_topics.
# A label that is missing from unique_topics makes encode_topics raise
# ValueError (via list.index), which aborts the whole apply.
df["true_issues_binary"] = df["true_issues"].apply(encode_topics)

In [None]:
# Function to sample speeches
def sample_speeches(df, n, random_state=42):
    """Draw a reproducible random sample of rows from `df` without replacement.

    Args:
        df: DataFrame to sample from.
        n: Number of rows requested. Requests larger than the frame are capped
            at `len(df)`, because `DataFrame.sample` raises ValueError when
            asked for more rows than exist (with replace=False).
        random_state: Seed passed to `DataFrame.sample`; the default of 42
            keeps the previously hard-coded behaviour.

    Returns:
        A DataFrame with `min(n, len(df))` rows, keeping the original index labels.
    """
    # None is passed through untouched so pandas' own default for n still applies
    if n is not None:
        n = min(n, len(df))
    return df.sample(n=n, random_state=random_state)

# Input parameter: Number of speeches to sample
n = 500  # Modify this to tweak the sample size
sampled_df = sample_speeches(df, n)

# Report the size of the sample itself, so the message stays accurate even if
# the sampler returns a different number of rows than was requested.
print(f"Sampled {len(sampled_df)} speeches.")

In [None]:
# Predict issues for sampled speeches
def predict_issues(model, tokenizer, speech, unique_topics, threshold=0.5):
    """Return the topics whose predicted probability reaches `threshold`.

    Args:
        model: Callable accepting the tokenizer's outputs as keyword arguments
            and returning an object with a `.logits` tensor; must expose `.device`.
        tokenizer: Callable producing a mapping of tensors for `speech`.
        speech: Text to classify.
        unique_topics: Topic names; index i corresponds to the i-th probability.
        threshold: Minimum sigmoid probability for a topic to be reported.

    Returns:
        List of topic names, in `unique_topics` order.
    """
    # Tokenize to a fixed 512-token window (padded and truncated)
    tokenized = tokenizer(
        speech,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_device = model.device
    model_inputs = {name: tensor.to(target_device) for name, tensor in tokenized.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
        probabilities = torch.sigmoid(logits).cpu().numpy().flatten()

    # Threshold the whole array at once, then map the selected positions back to topic names
    selected_positions = (probabilities >= threshold).nonzero()[0]
    return [unique_topics[position] for position in selected_positions]

In [None]:
# Predict issues for the sampled speeches
def _predict_for_speech(speech_text):
    """Run predict_issues on one speech with the loaded model, tokenizer and topics."""
    return predict_issues(model, tokenizer, speech_text, unique_topics)


# One model call per row, using predict_issues' default threshold
sampled_df["predicted_issues"] = sampled_df["speech_content"].apply(_predict_for_speech)

In [None]:
# Encode predicted issues as binary vectors
# encode_topics is the same encoder used for true_issues_binary, so both
# columns share the position-per-topic layout defined by unique_topics.
sampled_df["predicted_issues_binary"] = sampled_df["predicted_issues"].apply(encode_topics)

print("Predictions completed.")

In [None]:
# Preview the first sampled rows as plain text, including the prediction
# columns added to sampled_df above.
print(sampled_df.head())

In [None]:
from sklearn.metrics import hamming_loss, classification_report, accuracy_score

# Stack the per-speech label lists into arrays, one row per sampled speech
true_labels, predicted_labels = (
    np.array(sampled_df[column].tolist())
    for column in ("true_issues_binary", "predicted_issues_binary")
)

# Hamming loss: fraction of individual label slots predicted incorrectly
hamming_loss_score = hamming_loss(true_labels, predicted_labels)
print(f"Hamming Loss: {hamming_loss_score:.4f}")

# Subset accuracy: share of speeches whose whole label vector matches exactly
subset_accuracy = (true_labels == predicted_labels).all(axis=1).mean()
print(f"Subset Accuracy: {subset_accuracy:.4f}")

# Accuracy over every label slot, with both arrays flattened to 1-D
# (for 0/1 indicator arrays this is the complement of the Hamming loss)
overall_accuracy = accuracy_score(true_labels.ravel(), predicted_labels.ravel())
print(f"Overall Accuracy (flattened): {overall_accuracy:.4f}")

# Per-topic precision / recall / F1, with rows named after unique_topics
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=unique_topics))


In [None]:
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

# Generate confusion matrices for each label (one 2x2 matrix per topic)
confusion_matrices = multilabel_confusion_matrix(true_labels, predicted_labels)

# Create a combined confusion matrix by summing the per-label matrices
# element-wise, which pools the counts across all topics
overall_confusion_matrix = np.sum(confusion_matrices, axis=0)

# Plot the overall confusion matrix.
# The axes are created here and handed to plot(): when no `ax` is given,
# ConfusionMatrixDisplay.plot() creates its own figure, so a preceding
# plt.figure(figsize=...) would be left empty and its size ignored.
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(overall_confusion_matrix, display_labels=["No", "Yes"]).plot(
    ax=ax, cmap="Blues", values_format="d"
)
ax.set_title("Overall Confusion Matrix")
plt.show()

# Print the confusion matrix as raw values
print("Overall Confusion Matrix (raw values):")
print(overall_confusion_matrix)
