Date: 2024-12-07 

Latest change when and what: 
- 18/12 2024: removed old code

Notes:

# 2. Preprocessing

Cleaning the data, tokenizing it, splitting it into test, train and validation, and finally embedding the data.

In [None]:
# importing packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import kagglehub
import shutil
import seaborn as sns
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import torch

In [2]:
# load data
data = pd.read_csv("../1_data_acquisition/data/labels_and_scripts.csv")
data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,\n\n 1922\n\n\n\n...
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,The Phantom of the Opera\n\nTHE PHANTOM OF THE...
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,Battleship Potemkin\n\nScenario and script by ...
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,THE LOST WORLD\nJURASSIC PARK\n\nscreenplay by...
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n ...


## 2.1 Cleaning the data

Removing '/n', lowercasing, removing special characters, etc

In [8]:
data["script"] = (
    data["script"]
    .str.replace(r'[^\w\s]', '', regex=True)  # Remove special characters
    .str.replace('\n', ' ')                   # Remove newlines
    .str.replace(r'\s+', ' ', regex=True)    # Replace multiple spaces with a single space
    .str.strip()                             # Remove leading/trailing spaces
)

data.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,1922 NOSFERATU cast Count Dracula the vampireM...
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,The Phantom of the Opera THE PHANTOM OF THE OP...
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,Battleship Potemkin Scenario and script by Ser...
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,THE LOST WORLD JURASSIC PARK screenplay by Dav...
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,METROPOLIS by Corey Mandell FADE IN EXT MANHAT...


In [9]:
# for lowercasing 
data_lowercase = data.copy()
data_lowercase["script"] = (data['script'].str.lower())
data_lowercase.head()

Unnamed: 0,title,rating,imdbid,year,id,passed_bechdel,script_filename,script
0,"Nosferatu, eine Symphonie des Grauens",2,13442,1922,1307,0,Nosferatu_0013442.txt,1922 nosferatu cast count dracula the vampirem...
1,"Phantom of the Opera, The",2,16220,1925,1305,0,The Phantom of the Opera_0016220.txt,the phantom of the opera the phantom of the op...
2,Battleship Potemkin,0,15648,1925,1308,0,Battleship Potemkin_0015648.txt,battleship potemkin scenario and script by ser...
3,"Lost World, The",2,16039,1925,5514,0,The Lost World_0016039.txt,the lost world jurassic park screenplay by dav...
4,Metropolis,1,17136,1927,1267,0,Metropolis_0017136.txt,metropolis by corey mandell fade in ext manhat...


# 2.1 Splitting the data into train, test and validation


Stratified train, test, validation split. Won't make a huge difference because the data is already very balanced, but it is good practice.

In [10]:
########## splitting the dataset which is not lower case
# Step 1: Split the data into train and temp (validation+test) sets
train_data, temp_data = train_test_split(data, stratify = data['passed_bechdel'] , test_size=0.2, random_state=42)  # 20% for validation+test

# Step 2: Split temp_data into validation and test sets (10% each)
val_data, test_data = train_test_split(temp_data, stratify = temp_data['passed_bechdel'], test_size=0.5, random_state=42)  # 50% of temp (10% of original)

# Display the sizes of each set
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1418
Validation size: 177
Test size: 178


In [11]:
print(sum(val_data['passed_bechdel'] ==  1)) # checking
print(sum(val_data['passed_bechdel'] ==  0)) 

89
88


In [14]:
# saving the test, train and val datasets
train_data.to_csv("train_case_sensitive.csv", index= False)
test_data.to_csv("test_case_sensitive.csv", index= False)
val_data.to_csv("validation_case_sensitive.csv", index= False)

In [12]:
########## splitting the dataset which IS lowercase
# Step 1: Split the data into train and temp (validation+test) sets
train_data, temp_data = train_test_split(data_lowercase, stratify = data['passed_bechdel'] , test_size=0.2, random_state=42)  # 20% for validation+test

# Step 2: Split temp_data into validation and test sets (10% each)
val_data, test_data = train_test_split(temp_data, stratify = temp_data['passed_bechdel'], test_size=0.5, random_state=42)  # 50% of temp (10% of original)

# Display the sizes of each set
print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1418
Validation size: 177
Test size: 178


In [13]:
print(sum(val_data['passed_bechdel'] ==  1)) # checking
print(sum(val_data['passed_bechdel'] ==  0)) 

89
88


In [7]:
# saving the test, train and val datasets
train_data.to_csv("train_lowercase.csv", index= False)
test_data.to_csv("test_lowercase.csv", index= False)
val_data.to_csv("validation_lowercase.csv", index= False)

In [None]:
### reloading the datasets to check 
#train_data = pd.read_csv("train_lowercase.csv")
#test_data = pd.read_csv("test_lowercase.csv")
#val_data = pd.read_csv('validation_lowercase.csv')