Created by: 

Date: 2024-12-07 

Latest change when and what:

Notes:

# 2. Preprocessing

Cleaning the data, tokenizing it, splitting it into test, train and validation, and finally embedding the data.

In [1]:
# importing packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import kagglehub
import shutil
import seaborn as sns
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch



In [2]:
# load data
data = pd.read_csv("../1_data_acquisition/data/labels_and_scripts.csv")
data.head()

Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,\n\n 1922\n\n\n\n...,1920,1920
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,The Phantom of the Opera\n\nTHE PHANTOM OF THE...,1920,1925
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,Battleship Potemkin\n\nScenario and script by ...,1920,1925
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,THE LOST WORLD\nJURASSIC PARK\n\nscreenplay by...,1920,1925
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n ...,1920,1925


## 2.1 Cleaning the data

Removing '/n', lowercasing, removing special characters, etc

In [6]:
data["script"] = (
    data["script"]
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
    .str.replace('\n', ' ')                   # Remove newlines
    .str.lower()                             # Convert to lowercase
    .str.replace(r'\s+', ' ', regex=True)    # Replace multiple spaces with a single space
    .str.strip()                             # Remove leading/trailing spaces
)

data.head()

Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin,script_tokens
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...,1920,1920,"[101, 4798, 16839, 27709, 8525, 3459, 4175, 18..."
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...,1920,1925,"[101, 1996, 11588, 1997, 1996, 3850, 1996, 115..."
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...,1920,1925,"[101, 17224, 8962, 6633, 4939, 11967, 1998, 58..."
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...,1920,1925,"[101, 1996, 2439, 2088, 19996, 2380, 9000, 201..."
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...,1920,1925,"[101, 18236, 2011, 18132, 2158, 12662, 12985, ..."


## 2.2 Tokenizing (500 tokens)

Tokenizing using a pre-trained BERT tokenizer from transformers

max_length=500: Specifies the maximum number of tokens to include.

truncation=True: Ensures that if the text exceeds 500 tokens, it will be truncated to fit the specified length.

add_special_tokens=True: Includes any special tokens required by the model, such as [CLS] and [SEP] for BERT.

Transformed-based models like BERT need inputs of same length -> pad & attention mask

In [7]:
def tokenize_and_encode(text, tokenizer, max_length=500):
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"  # Return PyTorch tensors
    )
    return encoded['input_ids'][0], encoded['attention_mask'][0]

# Apply to all scripts
data[['input_ids', 'attention_mask']] = data['script'].apply(
    lambda x: pd.Series(tokenize_and_encode(x, tokenizer, max_length=500))
)


# Display the DataFrame with the new column
data.head()


Unnamed: 0,rating,id,title,imdbid,year,passed_bechdel,script_filename,script,decade,5_year_bin,script_tokens,input_ids,attention_mask
0,2,1307,"Nosferatu, eine Symphonie des Grauens",13442,1922,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...,1920,1920,"[101, 4798, 16839, 27709, 8525, 3459, 4175, 18...","[tensor(101), tensor(4798), tensor(16839), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
1,2,1305,"Phantom of the Opera, The",16220,1925,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...,1920,1925,"[101, 1996, 11588, 1997, 1996, 3850, 1996, 115...","[tensor(101), tensor(1996), tensor(11588), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
2,0,1308,Battleship Potemkin,15648,1925,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...,1920,1925,"[101, 17224, 8962, 6633, 4939, 11967, 1998, 58...","[tensor(101), tensor(17224), tensor(8962), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
3,2,5514,"Lost World, The",16039,1925,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...,1920,1925,"[101, 1996, 2439, 2088, 19996, 2380, 9000, 201...","[tensor(101), tensor(1996), tensor(2439), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
4,1,1267,Metropolis,17136,1927,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...,1920,1925,"[101, 18236, 2011, 18132, 2158, 12662, 12985, ...","[tensor(101), tensor(18236), tensor(2011), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [8]:
#checking
len(data['script_tokens'][0])

500

## 2.3 Splitting the data into test, train and validation

In [11]:

# Step 1: Split the data into train and temp (validation+test) sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)  # 20% for validation+test

# Step 2: Split temp_data into validation and test sets (10% each)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 50% of temp (10% of original)

# Display the sizes of each set
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1424
Validation size: 178
Test size: 179


In [12]:
# saving the test, train and val datasets
train_data.to_csv("train.csv", index= False)
test_data.to_csv("test.csv", index= False)
val_data.to_csv("validation.csv", index= False)

## Model selection

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from torch.utils.data import Dataset, DataLoader

class ScriptDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.stack(data['input_ids'].tolist())
        self.attention_mask = torch.stack(data['attention_mask'].tolist())
        self.labels = torch.tensor(data['passed_bechdel'].values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }


train_dataset = ScriptDataset(train_data)
val_dataset = ScriptDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

## Training loop

In [26]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from torch.optim import lr_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Training loop
for epoch in range(6):  # 3 epochs as a start
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}: Loss = {loss.item()}")


Epoch 1: Loss = 0.5384442210197449
Epoch 2: Loss = 0.26806384325027466
Epoch 3: Loss = 0.07606004923582077
Epoch 4: Loss = 0.1101921796798706
Epoch 5: Loss = 0.04116030037403107
Epoch 6: Loss = 0.02917688526213169


## Evaluation

In [28]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")


Validation Accuracy: 0.65


## -------------------------- New stuff above, old stuff below -------------------

#### trying to feed the tokens of one script to distilbert 

In [None]:
one_script = train_data.iloc[0]
one_script['script_tokens']

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Load pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

# Define Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model('model')

## 2.4 Embedding the data

Because we are using a pre-trained model and tokenizer, we need to first get the token IDs

In [None]:
from transformers import AutoTokenizer
import torch

# Load the pre-trained tokenizer (e.g., BERT tokenizer)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a function to convert words to token IDs using the tokenizer
def words_to_token_ids(words):
    # Tokenize the words into token IDs
    return tokenizer.convert_tokens_to_ids(words)

# Apply the function to your tokenized column (assuming `script_tokens` contains words)
train_data["script_token_ids"] = train_data["script_tokens"].apply(words_to_token_ids)
val_data["script_token_ids"] = val_data["script_tokens"].apply(words_to_token_ids)
test_data["script_token_ids"] = test_data["script_tokens"].apply(words_to_token_ids)

# Check the result
print(train_data["script_token_ids"].head())


In [None]:
train_data.head()

In [None]:
# embedding the script token ids

# Load the pre-trained BERT model
#model_name = 'bert-base-uncased'
#model = AutoModel.from_pretrained(model_name)

# Define a function to obtain embeddings for token IDs
def get_embeddings(token_ids):
    # Convert list of token IDs to a tensor
    tokens_tensor = torch.tensor([token_ids])  # Add a batch dimension
    
    # Create an attention mask (1 for real tokens, 0 for padding)
    attention_mask = (tokens_tensor != tokenizer.pad_token_id).int()
    
    # Pass through the model (no gradient computation to save memory)
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
    
    # Extract the embeddings (usually from the last hidden layer)
    # The shape of outputs.last_hidden_state is (batch_size, sequence_length, hidden_size)
    # We usually take the embeddings of the [CLS] token (index 0)
    cls_embedding = outputs.last_hidden_state[0, 0, :]  # [CLS] token's embedding

    return cls_embedding

# Apply the function to the tokenized column
train_data["embeddings"] = train_data["script_token_ids"].apply(get_embeddings)
val_data["embeddings"] = val_data["script_token_ids"].apply(get_embeddings)
test_data["embeddings"] = test_data["script_token_ids"].apply(get_embeddings)

# Check the result
print(train_data["embeddings"].head())


In [None]:
# save embeddings
train_data.to_csv("train_embedded.csv", index= False)
test_data.to_csv("test_embedded.csv", index= False)
val_data.to_csv("validation_embedded.csv", index= False)

In [None]:
train_data.head()

In [None]:
train_data['embeddings']

In [None]:
train_data = pd.read_csv("train_embedded.csv")
train_data.head()

In [None]:
train_data.iloc[0]

In [None]:
print(train_data['embeddings'][0])

In [None]:
test['embeddings'] # format is slightly different 

In [None]:
### Testing a small model

In [None]:
from transformers import DistilBertForSequenceClassification, AdamW

# Load the pre-trained DistilBERT model with a classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


DON't know about the code below......