In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Dataset from: https://github.com/JerryWeiAI/NewB/blob/master/README.md
f = open('political_affiliation_dataset.txt', 'r')
X = []
y = []
for line in f.readlines():
    X.append(line.split('\t')[1])
    y.append(int(line.split('\t')[0]))
X = np.array(X)
y = np.array(y)

In [5]:
# 0 is liberal, 1 is neutral, 2 is conservative
def classify(x):
    if x <= 4:
        return 0
    elif x >= 6:
        return 2
    else:
        return 1
y = np.array(list(map(classify, y)))

In [6]:
label_0_indices = np.where(y == 0)[0]
label_1_indices = np.where(y == 1)[0]
label_2_indices = np.where(y == 2)[0]

num_samples_per_label = 2000

selected_label_0_indices = np.random.choice(label_0_indices, num_samples_per_label, replace=False)
selected_label_1_indices = np.random.choice(label_1_indices, num_samples_per_label, replace=False)
selected_label_2_indices = np.random.choice(label_2_indices, num_samples_per_label, replace=False)

selected_indices = np.concatenate((selected_label_0_indices, selected_label_1_indices, selected_label_2_indices))

X_sample = X[selected_indices]
y_sample = y[selected_indices]

In [7]:
max_sequence_length = 512
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_sentences = [tokenizer.encode(text, add_special_tokens=True, max_length=max_sequence_length, truncation=True) for text in X_sample]
labels = y_sample

In [8]:
encoded_set = [torch.LongTensor(text) for text in encoded_sentences]
train_labels = list(y_sample)

In [9]:
# Train-validation-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(encoded_set, train_labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)

In [10]:
train_texts = pad_sequence(train_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
train_dataset = TensorDataset(torch.LongTensor(train_texts), torch.LongTensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [11]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [13]:
for epoch in range(5):
    model.train()
    for batch in train_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(input_ids, labels=labels)
        loss = loss_fn(output.logits, labels)
        loss.backward()
        optimizer.step()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [14]:
val_texts = pad_sequence(val_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
val_dataset = TensorDataset(torch.LongTensor(val_texts), torch.LongTensor(val_labels))
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [15]:
model.eval()
with torch.no_grad():
    val_loss = 0
    correct = 0
    for batch in val_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        output = model(input_ids)
        labels = labels.to(device)
        val_loss += loss_fn(output.logits, labels).item()
        pred = output.logits.argmax(dim=1, keepdim=True)
        correct += pred.eq(labels.view_as(pred)).sum().item()

accuracy = correct / len(val_texts)
print(f"Valid accuracy: {accuracy}")

Valid accuracy: 0.5155555555555555


In [11]:
# Define your hyperparameter search space
learning_rates = [1e-5, 1e-4, 1e-3]  # Example learning rates
batch_sizes = [8, 16, 32]  # Example batch sizes

for learning_rate in learning_rates:
    for batch_size in batch_sizes:
        # Reinitialize the model with the current hyperparameters
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
        model.to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
        loss_fn = torch.nn.CrossEntropyLoss()

        for epoch in range(5):
            model.train()
            for batch in train_loader:
                input_ids, labels = batch
                input_ids = input_ids.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                output = model(input_ids, labels=labels)
                loss = loss_fn(output.logits, labels)
                loss.backward()
                optimizer.step()

        # Evaluate on the validation set
        val_texts = pad_sequence(val_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
        val_texts = val_texts.to(device)
        val_dataset = TensorDataset(torch.LongTensor(val_texts), torch.LongTensor(val_labels))
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

        model.eval()
        with torch.no_grad():
            val_loss = 0
            correct = 0
            for batch in val_loader:
                input_ids, labels = batch
                input_ids = input_ids.to(device)
                output = model(input_ids)
                labels = labels.to(device)
                val_loss += loss_fn(output.logits, labels).item()
                pred = output.logits.argmax(dim=1, keepdim=True)
                correct += pred.eq(labels.view_as(pred)).sum().item()

        val_accuracy = correct / len(val_texts)
        print(f"Validation accuracy (LR={learning_rate}, Batch Size={batch_size}): {val_accuracy}")

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_epoch = epoch
            best_batch_size = batch_size

            # Save the best model checkpoint
            torch.save(model.state_dict(), "best_model_checkpoint.pth")

print(f"Best Validation Accuracy: {best_accuracy}")
print(f"Best Epoch: {best_epoch}")
print(f"Best Batch Size: {best_batch_size}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


RuntimeError: The expanded size of the tensor (601) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [32, 601].  Tensor sizes: [1, 512]

In [17]:
#Classifier = Nearest Neighbors, Score (test, accuracy) = 42.78, Training time = 1729.20 seconds
#Classifier = Linear SVM, Score (test, accuracy) = 43.77, Training time = 57367.66 seconds

In [18]:
# Test accuracy: 0.6268256803614585 with only 2 epochs
# Sample test accuracy: 0.548 with 50 epochs, not equal size
# Test accuracy: 0.5417777777777778 with 5000 samples of each