# Issue classification: fine-tuning BERT to tag speeches with topics (multi-label)

In [1]:
import wandb
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import pandas as pd

In [4]:
# Authenticate this runtime with Weights & Biases; --relogin forces a fresh API-key prompt
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Initialize Weights & Biases: start a run in the "issue_classifier" project
wandb.init(project="issue_classifier")

[34m[1mwandb[0m: Currently logged in as: [33mnicmeda[0m ([33mnicmeda-massachusetts-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [26]:

# Read the speech corpus; each row's "topics" cell is a ", "-separated string
# that is split into a list of topic names
data_file = "/content/formatted_speeches_with_topics.csv"
df = pd.read_csv(data_file)
df["topics"] = df["topics"].str.split(", ")

# Label vocabulary for multi-label encoding: every distinct topic in sorted
# order, each mapped to its position in the label vector
unique_topics = sorted({topic for topic_list in df["topics"] for topic in topic_list})
topic_to_id = dict(zip(unique_topics, range(len(unique_topics))))

def encode_topics(topics, topic_index=None):
    """Turn one speech's topic list into a multi-hot label vector.

    Args:
        topics: Iterable of topic names attached to a single speech.
        topic_index: Optional mapping from topic name to its slot in the
            label vector (slots are expected to be 0..len(mapping)-1).
            Defaults to the notebook-level ``topic_to_id``, so
            ``df["topics"].apply(encode_topics)`` works unchanged.

    Returns:
        A list of ints with one slot per entry of the mapping: 1 where the
        topic is present in ``topics``, 0 everywhere else.

    Raises:
        KeyError: If a topic is not present in the mapping.
    """
    if topic_index is None:
        # Resolved at call time so the notebook global can be rebuilt freely.
        topic_index = topic_to_id
    labels = [0] * len(topic_index)
    for topic in topics:
        labels[topic_index[topic]] = 1
    return labels

# Store the multi-hot vector of every speech in a new "labels" column
df["labels"] = df["topics"].apply(encode_topics)

# Hold out 20% of the speeches for evaluation; the fixed random_state keeps
# the split identical across runs
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["speech_content"],
    df["labels"],
    random_state=42,
    test_size=0.2,
)
print(topic_to_id)

{'alcohol': 0, 'budget': 1, 'business': 2, 'crime': 3, 'defense': 4, 'economy': 5, 'education': 6, 'elections': 7, 'environment': 8, 'federalism': 9, 'foreign': 10, 'government': 11, 'health': 12, 'immigration': 13, 'justice': 14, 'labor': 15, 'mail': 16, 'minorities': 17, 'money': 18, 'religion': 19, 'tax': 20, 'topic': 21, 'trade': 22}


In [8]:
# Tokenizer and Dataset
# model_name is the pretrained checkpoint id; it is reused further down when the
# classification model is loaded, so tokenizer and model come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
class SpeechDataset(Dataset):
    """Map-style dataset that tokenizes one speech per item for multi-label training.

    Args:
        texts: Speeches, as a pandas Series or any indexable sequence of str.
        labels: Multi-hot label vectors aligned position-by-position with
            ``texts`` (pandas Series or indexable sequence).
        tokenizer: Callable that accepts ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and returns
            a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        if len(texts) != len(labels):
            # A mismatch would otherwise only surface as an index error (or as
            # silently ignored labels) somewhere in the middle of training.
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(values, idx):
        """Positional lookup for pandas objects (``.iloc``) and plain sequences."""
        # After train_test_split a Series keeps its original (shuffled) index,
        # so pandas objects must be addressed by position, not by label.
        if hasattr(values, "iloc"):
            return values.iloc[idx]
        return values[idx]

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch dimension removed) and ``labels``
            (a float tensor built from the label vector).
        """
        text = self._item_at(self.texts, idx)
        label = torch.tensor(self._item_at(self.labels, idx), dtype=torch.float)
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" yields a batch of one; drop that dimension.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label,
        }

In [10]:
# Wrap both splits so examples can be fetched (and tokenized) by position
train_dataset = SpeechDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = SpeechDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

In [11]:
# Model
# One output per topic, configured for multi-label classification.
# id2label / label2id record the topic names in the model config so that the
# checkpoint written by save_pretrained() carries its own label vocabulary and
# predictions can be decoded without rebuilding it from the training DataFrame.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_topics),
    problem_type="multi_label_classification",
    id2label=dict(enumerate(unique_topics)),
    label2id=topic_to_id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration; metrics are reported to Weights & Biases
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to line up
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging
    logging_steps=10,
    report_to="wandb",  # Enable W&B logging
)



In [13]:
# Trainer
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and raises a FutureWarning when
# this cell runs. NOTE: requires a transformers version that already knows
# `processing_class` (the one that emits the deprecation warning does).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [14]:
# Train the model
# Fine-tunes with the settings in training_args; as the last expression of the
# cell, the returned TrainOutput is displayed below.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.4402,0.406996
2,0.3596,0.359802
3,0.364,0.349516


TrainOutput(global_step=135, training_loss=0.4157728707348859, metrics={'train_runtime': 278.9241, 'train_samples_per_second': 7.744, 'train_steps_per_second': 0.484, 'total_flos': 568427036590080.0, 'train_loss': 0.4157728707348859, 'epoch': 3.0})

In [15]:
# Persist the fine-tuned weights and the tokenizer files into one directory
export_dir = "./issue_classifier_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./issue_classifier_model/tokenizer_config.json',
 './issue_classifier_model/special_tokens_map.json',
 './issue_classifier_model/vocab.txt',
 './issue_classifier_model/added_tokens.json')

In [16]:
# Finish W&B logging for the run started by wandb.init()
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


KeyboardInterrupt: 

In [17]:
# Load the saved model and tokenizer
# Read from the same relative directory the save step wrote to
# ("./issue_classifier_model"); an absolute /content/... path only resolves
# when the working directory happens to be /content.
model_path = "./issue_classifier_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

In [18]:
# Ensure the model is in evaluation mode before running inference
# (the architecture contains Dropout layers, which should be inactive when predicting)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
# Rebuild the sorted topic vocabulary used for decoding; the order must be the
# same as the one the labels were encoded with during training
unique_topics = sorted({topic for topic_list in df["topics"] for topic in topic_list})

In [20]:
# Define a helper function to decode predictions
def decode_predictions(probabilities, threshold=0.5, topics=None):
    """Convert per-topic probabilities into the list of predicted topic names.

    Args:
        probabilities: 1-D sequence (NumPy array, list, ...) with one
            probability per topic, in label-index order.
        threshold: A topic is predicted when its probability is >= this value.
        topics: Topic names in label-index order. Defaults to the
            notebook-level ``unique_topics``.

    Returns:
        Names of all topics whose probability reaches the threshold, in
        label-index order (an empty list when none does).

    Raises:
        ValueError: If the number of probabilities differs from the number of
            topics, which would otherwise decode to the wrong names silently.
    """
    if topics is None:
        # Resolved at call time so the notebook global can be redefined freely.
        topics = unique_topics
    if len(probabilities) != len(topics):
        raise ValueError(
            f"got {len(probabilities)} probabilities for {len(topics)} topics"
        )
    return [topic for topic, prob in zip(topics, probabilities) if prob >= threshold]

In [32]:
# Tokenize one hand-written sentence with the same settings the dataset uses
# (padded / truncated to 512 tokens, returned as PyTorch tensors)
sample_text = "The government needs to address issues related to healthcare and justice."
encoding = tokenizer(
    sample_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt"
)

In [33]:
# Forward pass without gradient tracking, then map the logits to independent
# per-topic probabilities with a sigmoid
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
probabilities = logits.sigmoid().numpy().flatten()

In [34]:
# Inspect the raw per-topic probabilities (one value per label index)
print(probabilities)

[0.17028458 0.22592826 0.19354716 0.15021409 0.1635427  0.10473033
 0.12771916 0.33811757 0.12776667 0.33693352 0.2456419  0.17784792
 0.22708787 0.16126098 0.38376215 0.25002068 0.18091772 0.28002822
 0.15948132 0.14844513 0.13115278 0.1456296  0.15174747]


In [35]:
# Decode the predictions: keep only the topics whose probability is at least 0.5
predicted_topics = decode_predictions(probabilities, threshold=0.5)

In [36]:
# Show the input sentence followed by the topics decoded for it
print("Sample Input:", sample_text, "\nPredicted Topics:", predicted_topics, sep="\n")

Sample Input:
The government needs to address issues related to justice.

Predicted Topics:
[]
