In [1]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
import torch
import pandas as pd

# Load model and tokenizer
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2)
model = model.to("cuda")

# Custom Dataset Class
class ScriptDataset(Dataset):
    def __init__(self, scripts, labels, tokenizer, max_length=4096):
        self.scripts = scripts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.scripts)

    def __getitem__(self, idx):
        text = self.scripts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)
        }

# Prepare DataLoaders
def prepare_dataloader(data, tokenizer, max_length, batch_size):
    dataset = ScriptDataset(
        scripts=data["script"].tolist(),
        labels=data["passed_bechdel"].tolist(),
        tokenizer=tokenizer,
        max_length=max_length
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load datasets
train_data = pd.read_csv("../2_preprocessing/train_case_sensitive.csv")
val_data = pd.read_csv("../2_preprocessing/validation_case_sensitive.csv")
test_data = pd.read_csv("../2_preprocessing/test_case_sensitive.csv")

train_loader = prepare_dataloader(train_data, tokenizer, max_length=4096, batch_size=4)
val_loader = prepare_dataloader(val_data, tokenizer, max_length=4096, batch_size=4)
test_loader = prepare_dataloader(test_data, tokenizer, max_length=4096, batch_size=4)

# Training Loop
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

def train(model, dataloader):
    model.train()
    total_loss, total_correct = 0, 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["label"].to("cuda")

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (outputs.logits.argmax(1) == labels).sum().item()

    return total_loss / len(dataloader), total_correct / len(dataloader.dataset)

# Validation Loop
def evaluate(model, dataloader):
    model.eval()
    total_loss, total_correct = 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to("cuda")
            attention_mask = batch["attention_mask"].to("cuda")
            labels = batch["label"].to("cuda")

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            total_correct += (outputs.logits.argmax(1) == labels).sum().item()
            all_preds.extend(outputs.logits.argmax(1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = total_correct / len(dataloader.dataset)
    f1 = f1_score(all_labels, all_preds)
    return total_loss / len(dataloader), accuracy, f1

# Main Training Loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss, train_accuracy = train(model, train_loader)
    val_loss, val_accuracy, val_f1 = evaluate(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation F1: {val_f1:.4f}")

# Evaluate on Test Data
test_loss, test_accuracy, test_f1 = evaluate(model, test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1: {test_f1:.4f}")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Initializing global attention on CLS token...


OutOfMemoryError: CUDA out of memory. Tried to allocate 722.00 MiB. GPU 0 has a total capacity of 21.95 GiB of which 280.12 MiB is free. Process 152451 has 21.67 GiB memory in use. Of the allocated memory 20.44 GiB is allocated by PyTorch, and 1.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)