Created by: 

Date: 2024-12-07 

Latest change when and what:

Notes:

# 2. Preprocessing

Cleaning the data, tokenizing it, splitting it into test, train and validation, and finally embedding the data.

In [1]:
# importing packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import kagglehub
import shutil
import seaborn as sns
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch



In [2]:
# load data
data = pd.read_csv("../1_data_acquisition/data/labels_and_scripts.csv")
data.head()

Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,\n\n 1922\n\n\n\n...,1920,1920
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,The Phantom of the Opera\n\nTHE PHANTOM OF THE...,1920,1925
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,Battleship Potemkin\n\nScenario and script by ...,1920,1925
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,THE LOST WORLD\nJURASSIC PARK\n\nscreenplay by...,1920,1925
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n ...,1920,1925


## 2.1 Cleaning the data

Removing '/n', lowercasing, removing special characters, etc

In [6]:
data["script"] = (
    data["script"]
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
    .str.replace('\n', ' ')                   # Remove newlines
    .str.lower()                             # Convert to lowercase
    .str.replace(r'\s+', ' ', regex=True)    # Replace multiple spaces with a single space
    .str.strip()                             # Remove leading/trailing spaces
)

data.head()

Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin,script_tokens
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...,1920,1920,"[101, 4798, 16839, 27709, 8525, 3459, 4175, 18..."
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...,1920,1925,"[101, 1996, 11588, 1997, 1996, 3850, 1996, 115..."
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...,1920,1925,"[101, 17224, 8962, 6633, 4939, 11967, 1998, 58..."
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...,1920,1925,"[101, 1996, 2439, 2088, 19996, 2380, 9000, 201..."
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...,1920,1925,"[101, 18236, 2011, 18132, 2158, 12662, 12985, ..."


## 2.2 Tokenizing (500 tokens)

Tokenizing using a pre-trained BERT tokenizer from transformers

max_length=500: Specifies the maximum number of tokens to include.

truncation=True: Ensures that if the text exceeds 500 tokens, it will be truncated to fit the specified length.

add_special_tokens=True: Includes any special tokens required by the model, such as [CLS] and [SEP] for BERT.

Transformed-based models like BERT need inputs of same length -> pad & attention mask

In [7]:
def tokenize_and_encode(text, tokenizer, max_length=500):
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"  # Return PyTorch tensors
    )
    return encoded['input_ids'][0], encoded['attention_mask'][0]

# Apply to all scripts
data[['input_ids', 'attention_mask']] = data['script'].apply(
    lambda x: pd.Series(tokenize_and_encode(x, tokenizer, max_length=500))
)


# Display the DataFrame with the new column
data.head()


Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin,script_tokens,input_ids,attention_mask
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...,1920,1920,"[101, 4798, 16839, 27709, 8525, 3459, 4175, 18...","[tensor(101), tensor(4798), tensor(16839), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...,1920,1925,"[101, 1996, 11588, 1997, 1996, 3850, 1996, 115...","[tensor(101), tensor(1996), tensor(11588), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...,1920,1925,"[101, 17224, 8962, 6633, 4939, 11967, 1998, 58...","[tensor(101), tensor(17224), tensor(8962), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...,1920,1925,"[101, 1996, 2439, 2088, 19996, 2380, 9000, 201...","[tensor(101), tensor(1996), tensor(2439), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...,1920,1925,"[101, 18236, 2011, 18132, 2158, 12662, 12985, ...","[tensor(101), tensor(18236), tensor(2011), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [8]:
#checking
len(data['script_tokens'][0])

500

## 2.3 Splitting the data into test, train and validation

In [11]:

# Step 1: Split the data into train and temp (validation+test) sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)  # 20% for validation+test

# Step 2: Split temp_data into validation and test sets (10% each)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 50% of temp (10% of original)

# Display the sizes of each set
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1424
Validation size: 178
Test size: 179


In [12]:
# saving the test, train and val datasets
train_data.to_csv("train.csv", index= False)
test_data.to_csv("test.csv", index= False)
val_data.to_csv("validation.csv", index= False)

## Model selection

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from torch.utils.data import Dataset, DataLoader

class ScriptDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.stack(data['input_ids'].tolist())
        self.attention_mask = torch.stack(data['attention_mask'].tolist())
        self.labels = torch.tensor(data['passed_bechdel'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }


train_dataset = ScriptDataset(train_data)
val_dataset = ScriptDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

## Training loop

In [26]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from torch.optim import lr_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
for epoch in range(6):  # 3 epochs as a start
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}: Loss = {loss.item()}")


Epoch 1: Loss = 0.5384442210197449
Epoch 2: Loss = 0.26806384325027466
Epoch 3: Loss = 0.07606004923582077
Epoch 4: Loss = 0.1101921796798706
Epoch 5: Loss = 0.04116030037403107
Epoch 6: Loss = 0.02917688526213169


## Evaluation

In [28]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")


Validation Accuracy: 0.65


## --- Trying bigger models ---

In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load Longformer tokenizer and model (base version, 4096 token support)
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2)  # 2 labels for binary classification

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Tokenize the entire script (max_length increased to 4096)
tokenized = tokenizer(
    data['script'].tolist(),
    max_length=4096,  # Longformer supports up to 4096 tokens
    truncation=True,
    padding=True,
    return_tensors="pt"
)

data['input_ids'] = tokenized['input_ids']
data['attention_mask'] = tokenized['attention_mask']

In [34]:
class ScriptDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.stack(data['input_ids'].tolist())
        self.attention_mask = torch.stack(data['attention_mask'].tolist())
        self.labels = torch.tensor(data['passed_bechdel'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = ScriptDataset(train_data)
val_dataset = ScriptDataset(val_data)
test_dataset = ScriptDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [35]:
from torch.optim import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=3e-5) # changed 1e -> 3e

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(6):  # 6 epochs
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}: Training Loss = {total_loss / len(train_loader)}")

    # Validation
    model.eval()
    correct = 0
    total = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            val_loss += outputs.loss.item()

    val_accuracy = correct / total
    print(f"Epoch {epoch + 1}: Validation Loss = {val_loss / len(val_loader)}, Validation Accuracy = {val_accuracy:.2f}")


  0%|          | 0/89 [00:00<?, ?it/s]

100%|██████████| 89/89 [03:59<00:00,  2.69s/it]


Epoch 1: Training Loss = 0.6232423501068287
Epoch 1: Validation Loss = 0.6969125866889954, Validation Accuracy = 0.63


100%|██████████| 89/89 [03:59<00:00,  2.69s/it]


Epoch 2: Training Loss = 0.6146960559855686
Epoch 2: Validation Loss = 0.6896079381306967, Validation Accuracy = 0.61


100%|██████████| 89/89 [03:59<00:00,  2.69s/it]


Epoch 3: Training Loss = 0.5928333649474583
Epoch 3: Validation Loss = 0.6704256335894266, Validation Accuracy = 0.67


100%|██████████| 89/89 [03:58<00:00,  2.68s/it]


Epoch 4: Training Loss = 0.5772817124811451
Epoch 4: Validation Loss = 0.7360468953847885, Validation Accuracy = 0.63


100%|██████████| 89/89 [03:59<00:00,  2.69s/it]


Epoch 5: Training Loss = 0.5363863555902846
Epoch 5: Validation Loss = 0.8584823037187258, Validation Accuracy = 0.63


100%|██████████| 89/89 [03:59<00:00,  2.69s/it]


Epoch 6: Training Loss = 0.4790857340512651
Epoch 6: Validation Loss = 0.8743902146816254, Validation Accuracy = 0.62


## --- stella_en_v5 ---

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset


In [38]:
# Load the tokenizer and model
model_name = "blevlabs/stella_en_v5"  # or the specific model you want to use

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at blevlabs/stella_en_v5 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151646, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
