Created by: 

Date: 2024-12-07 

Latest change when and what:

Notes:

# 2. Preprocessing

Cleaning the data, tokenizing it, splitting it into test, train and validation, and finally embedding the data.

In [25]:
# importing packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import kagglehub
import shutil
import seaborn as sns
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch



In [17]:
# load data
data = pd.read_csv("../1_data_acquisition/data/labels_and_scripts.csv")
data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,\n\n 1922\n\n\n\n...
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,The Phantom of the Opera\n\nTHE PHANTOM OF THE...
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,Battleship Potemkin\n\nScenario and script by ...
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,THE LOST WORLD\nJURASSIC PARK\n\nscreenplay by...
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n ...


## 2.1 Cleaning the data

Removing '/n', lowercasing, removing special characters, etc

In [18]:
data["script"] = (
    data["script"]
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
    .str.replace('\n', ' ')                   # Remove newlines
    .str.lower()                             # Convert to lowercase
    .str.replace(r'\s+', ' ', regex=True)    # Replace multiple spaces with a single space
    .str.strip()                             # Remove leading/trailing spaces
)

data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...


## 2.2 Tokenizing (500 tokens)

Tokenizing using a pre-trained BERT tokenizer from transformers

max_length=500: Specifies the maximum number of tokens to include.


truncation=True: Ensures that if the text exceeds 500 tokens, it will be truncated to fit the specified length.


add_special_tokens=True: Includes any special tokens required by the model, such as [CLS] and [SEP] for BERT.

In [19]:

# Load the tokenizer (use any pre-trained tokenizer, e.g., BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize each row in the 'script' column
data["script_tokens"] = data["script"].apply( 
    lambda x: tokenizer.tokenize(x, add_special_tokens=True, max_length=500, truncation = True)) # only takes the first 500 tokens

# Display the DataFrame with the new column
data.head()


Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script,script_tokens
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...,"[[CLS], 1922, nos, ##fera, ##tu, cast, count, ..."
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...,"[[CLS], the, phantom, of, the, opera, the, pha..."
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...,"[[CLS], battleship, pot, ##em, ##kin, scenario..."
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...,"[[CLS], the, lost, world, jurassic, park, scre..."
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...,"[[CLS], metropolis, by, corey, man, ##dell, fa..."


In [None]:
#checking
len(data['script_tokens'][0])

## 2.3 Splitting the data into test, train and validation

In [21]:

# Step 1: Split the data into train and temp (validation+test) sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)  # 20% for validation+test

# Step 2: Split temp_data into validation and test sets (10% each)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # 50% of temp (10% of original)

# Display the sizes of each set
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")


Train size: 1424
Validation size: 178
Test size: 179


## 2.4 Embedding the data

Because we are using a pre-trained model and tokenizer, we need to first get the token IDs

In [53]:
from transformers import AutoTokenizer
import torch

# Load the pre-trained tokenizer (e.g., BERT tokenizer)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a function to convert words to token IDs using the tokenizer
def words_to_token_ids(words):
    # Tokenize the words into token IDs
    return tokenizer.convert_tokens_to_ids(words)

# Apply the function to your tokenized column (assuming `script_tokens` contains words)
train_data["script_token_ids"] = train_data["script_tokens"].apply(words_to_token_ids)
val_data["script_token_ids"] = val_data["script_tokens"].apply(words_to_token_ids)
test_data["script_token_ids"] = test_data["script_tokens"].apply(words_to_token_ids)

# Check the result
print(train_data["script_token_ids"].head())


175     [101, 3899, 2154, 5027, 2011, 3581, 16067, 223...
629     [101, 8942, 5863, 1015, 20014, 7151, 27454, 89...
1105    [101, 2689, 2989, 1037, 2995, 2466, 2517, 2011...
1740    [101, 2005, 2115, 9584, 21198, 5151, 2434, 900...
1013    [101, 1996, 3035, 2517, 2011, 2848, 5253, 1015...
Name: script_token_ids, dtype: object


In [54]:
train_data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script,script_tokens,script_token_ids
175,Dog Day Afternoon,3,72890,1975,3799,1,Dog Day Afternoon_0072890.txt,dog day afternoon by frank pierson final draft...,"[[CLS], dog, day, afternoon, by, frank, piers,...","[101, 3899, 2154, 5027, 2011, 3581, 16067, 223..."
629,Batman &amp; Robin,2,118688,1997,1026,0,Batman Robin_0118688.txt,batman robin 1 int batcave batmans costume vau...,"[[CLS], batman, robin, 1, int, bat, ##cave, ba...","[101, 8942, 5863, 1015, 20014, 7151, 27454, 89..."
1105,Changeling,3,824747,2008,286,1,Changeling_0824747.txt,changeling a true story written by j michael s...,"[[CLS], change, ##ling, a, true, story, writte...","[101, 2689, 2989, 1037, 2995, 2466, 2517, 2011..."
1740,Gisaengchung,3,6751668,2019,8768,1,Parasite_6751668.txt,for your consideration parasite outstanding or...,"[[CLS], for, your, consideration, parasite, ou...","[101, 2005, 2115, 9584, 21198, 5151, 2434, 900..."
1013,"Queen, The",3,436697,2006,1412,1,The Queen_0436697.txt,the queen written by peter morgan 1 archive te...,"[[CLS], the, queen, written, by, peter, morgan...","[101, 1996, 3035, 2517, 2011, 2848, 5253, 1015..."


In [55]:
# embedding the script token ids

# Load the pre-trained BERT model
#model_name = 'bert-base-uncased'
#model = AutoModel.from_pretrained(model_name)

# Define a function to obtain embeddings for token IDs
def get_embeddings(token_ids):
    # Convert list of token IDs to a tensor
    tokens_tensor = torch.tensor([token_ids])  # Add a batch dimension
    
    # Create an attention mask (1 for real tokens, 0 for padding)
    attention_mask = (tokens_tensor != tokenizer.pad_token_id).int()
    
    # Pass through the model (no gradient computation to save memory)
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
    
    # Extract the embeddings (usually from the last hidden layer)
    # The shape of outputs.last_hidden_state is (batch_size, sequence_length, hidden_size)
    # We usually take the embeddings of the [CLS] token (index 0)
    cls_embedding = outputs.last_hidden_state[0, 0, :]  # [CLS] token's embedding

    return cls_embedding

# Apply the function to the tokenized column
train_data["embeddings"] = train_data["script_token_ids"].apply(get_embeddings)
val_data["embeddings"] = val_data["script_token_ids"].apply(get_embeddings)
test_data["embeddings"] = test_data["script_token_ids"].apply(get_embeddings)

# Check the result
print(train_data["embeddings"].head())


175     [tensor(-0.4918), tensor(0.2709), tensor(0.378...
629     [tensor(-0.9178), tensor(0.4156), tensor(-0.00...
1105    [tensor(-0.2787), tensor(-0.0085), tensor(0.75...
1740    [tensor(-0.3097), tensor(0.3638), tensor(0.188...
1013    [tensor(-0.4474), tensor(0.0109), tensor(0.446...
Name: embeddings, dtype: object


In [56]:
# save embeddings
train_data.to_csv("train_embedded.csv", index= False)
test_data.to_csv("test_embedded.csv", index= False)
val_data.to_csv("validation_embedded.csv", index= False)

In [57]:
train_data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script,script_tokens,script_token_ids,embeddings
175,Dog Day Afternoon,3,72890,1975,3799,1,Dog Day Afternoon_0072890.txt,dog day afternoon by frank pierson final draft...,"[[CLS], dog, day, afternoon, by, frank, piers,...","[101, 3899, 2154, 5027, 2011, 3581, 16067, 223...","[tensor(-0.4918), tensor(0.2709), tensor(0.378..."
629,Batman &amp; Robin,2,118688,1997,1026,0,Batman Robin_0118688.txt,batman robin 1 int batcave batmans costume vau...,"[[CLS], batman, robin, 1, int, bat, ##cave, ba...","[101, 8942, 5863, 1015, 20014, 7151, 27454, 89...","[tensor(-0.9178), tensor(0.4156), tensor(-0.00..."
1105,Changeling,3,824747,2008,286,1,Changeling_0824747.txt,changeling a true story written by j michael s...,"[[CLS], change, ##ling, a, true, story, writte...","[101, 2689, 2989, 1037, 2995, 2466, 2517, 2011...","[tensor(-0.2787), tensor(-0.0085), tensor(0.75..."
1740,Gisaengchung,3,6751668,2019,8768,1,Parasite_6751668.txt,for your consideration parasite outstanding or...,"[[CLS], for, your, consideration, parasite, ou...","[101, 2005, 2115, 9584, 21198, 5151, 2434, 900...","[tensor(-0.3097), tensor(0.3638), tensor(0.188..."
1013,"Queen, The",3,436697,2006,1412,1,The Queen_0436697.txt,the queen written by peter morgan 1 archive te...,"[[CLS], the, queen, written, by, peter, morgan...","[101, 1996, 3035, 2517, 2011, 2848, 5253, 1015...","[tensor(-0.4474), tensor(0.0109), tensor(0.446..."


In [64]:
train_data['embeddings']

175     [tensor(-0.4918), tensor(0.2709), tensor(0.378...
629     [tensor(-0.9178), tensor(0.4156), tensor(-0.00...
1105    [tensor(-0.2787), tensor(-0.0085), tensor(0.75...
1740    [tensor(-0.3097), tensor(0.3638), tensor(0.188...
1013    [tensor(-0.4474), tensor(0.0109), tensor(0.446...
                              ...                        
1130    [tensor(0.0016), tensor(0.2415), tensor(0.6381...
1294    [tensor(-0.0257), tensor(0.1727), tensor(0.732...
860     [tensor(-0.8456), tensor(0.1440), tensor(0.049...
1459    [tensor(-0.5754), tensor(0.3761), tensor(0.309...
1126    [tensor(-0.5921), tensor(0.1790), tensor(0.261...
Name: embeddings, Length: 1424, dtype: object

In [58]:
test = pd.read_csv("train_embedded.csv")
test.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script,script_tokens,script_token_ids,embeddings
0,Dog Day Afternoon,3,72890,1975,3799,1,Dog Day Afternoon_0072890.txt,dog day afternoon by frank pierson final draft...,"['[CLS]', 'dog', 'day', 'afternoon', 'by', 'fr...","[101, 3899, 2154, 5027, 2011, 3581, 16067, 223...","tensor([-4.9176e-01, 2.7094e-01, 3.7832e-01,..."
1,Batman &amp; Robin,2,118688,1997,1026,0,Batman Robin_0118688.txt,batman robin 1 int batcave batmans costume vau...,"['[CLS]', 'batman', 'robin', '1', 'int', 'bat'...","[101, 8942, 5863, 1015, 20014, 7151, 27454, 89...","tensor([-9.1781e-01, 4.1560e-01, -8.7145e-03,..."
2,Changeling,3,824747,2008,286,1,Changeling_0824747.txt,changeling a true story written by j michael s...,"['[CLS]', 'change', '##ling', 'a', 'true', 'st...","[101, 2689, 2989, 1037, 2995, 2466, 2517, 2011...","tensor([-2.7869e-01, -8.5252e-03, 7.5140e-01,..."
3,Gisaengchung,3,6751668,2019,8768,1,Parasite_6751668.txt,for your consideration parasite outstanding or...,"['[CLS]', 'for', 'your', 'consideration', 'par...","[101, 2005, 2115, 9584, 21198, 5151, 2434, 900...","tensor([-3.0974e-01, 3.6384e-01, 1.8883e-01,..."
4,"Queen, The",3,436697,2006,1412,1,The Queen_0436697.txt,the queen written by peter morgan 1 archive te...,"['[CLS]', 'the', 'queen', 'written', 'by', 'pe...","[101, 1996, 3035, 2517, 2011, 2848, 5253, 1015...","tensor([-4.4745e-01, 1.0884e-02, 4.4611e-01,..."


In [None]:
test['embeddings'] # format is slightly different 

0       tensor([-4.9176e-01,  2.7094e-01,  3.7832e-01,...
1       tensor([-9.1781e-01,  4.1560e-01, -8.7145e-03,...
2       tensor([-2.7869e-01, -8.5252e-03,  7.5140e-01,...
3       tensor([-3.0974e-01,  3.6384e-01,  1.8883e-01,...
4       tensor([-4.4745e-01,  1.0884e-02,  4.4611e-01,...
                              ...                        
1419    tensor([ 1.6356e-03,  2.4148e-01,  6.3814e-01,...
1420    tensor([-2.5687e-02,  1.7268e-01,  7.3295e-01,...
1421    tensor([-8.4565e-01,  1.4402e-01,  4.9040e-02,...
1422    tensor([-5.7536e-01,  3.7615e-01,  3.0960e-01,...
1423    tensor([-5.9207e-01,  1.7896e-01,  2.6125e-01,...
Name: embeddings, Length: 1424, dtype: object

In [65]:
### Testing a small model

In [66]:
from transformers import DistilBertForSequenceClassification, AdamW

# Load the pre-trained DistilBERT model with a classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

# Create DataLoader for the training set
train_data = TensorDataset(train_embeddings, train_labels)  # Use embeddings if you already have them
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move batch to the device (GPU or CPU)
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        
        # Calculate loss
        loss = loss_fn(logits, labels)
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Print the loss after each epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")


NameError: name 'train_loader' is not defined