In [1]:
import polars as pl
from polars import col
from pathlib import Path
import shutil

In [2]:
# Folder holding the raw input tables (one CSV per entity); "." is the working directory.
DATA_DIR = Path(".")

actors = pl.read_csv(DATA_DIR / "actors.csv")
movies = pl.read_csv(DATA_DIR / "movies.csv")
countries = pl.read_csv(DATA_DIR / "countries.csv")
crew = pl.read_csv(DATA_DIR / "crew.csv")
genres = pl.read_csv(DATA_DIR / "genres.csv")
languages = pl.read_csv(DATA_DIR / "languages.csv")
releases = pl.read_csv(DATA_DIR / "releases.csv")
studios = pl.read_csv(DATA_DIR / "studios.csv")
themes = pl.read_csv(DATA_DIR / "themes.csv")

# Filtering

* Only movies tagged with country: USA
* Only the top 200 studios in terms of number of unique titles
* No null values for
    * Rating
    * Date
    * Tagline
    * Description
    * Minutes

Subsequently:
* Only actors who appear in more than 5 movies in the sampled dataset
* Only directors who have directed more than 5 movies in the sampled dataset



In [3]:
# Rank studios by the number of distinct movie ids credited to them and keep the top 200.
# Distinct ids are counted (not rows) so a repeated (id, studio) row cannot inflate a
# studio's rank. The secondary sort on the studio name makes the cut-off reproducible
# when several studios tie at the boundary (group_by output order is not guaranteed).
top_studios = (
    studios
    .group_by("studio")
    .agg(col("id").n_unique().alias("n_unique_movies"))
    .sort(["n_unique_movies", "studio"], descending=[True, False])
    .limit(200)
)

# Semi join: keep every (id, studio) row of the selected studios without adding columns.
filtered_studios = studios.join(
    top_studios,
    on="studio",
    how="semi",
)

In [4]:
# Country tags restricted to the USA; used below as a semi-join filter on the movie id.
filtered_countries = countries.filter(pl.col("country").eq("USA"))

In [5]:
# Columns that must be populated for a movie to be kept.
required_columns = ["rating", "date", "tagline", "description", "minute"]

# Keep movies credited to one of the selected studios AND tagged with the USA,
# then drop every row that is missing any of the required columns.
filtered_movies = (
    movies
    .join(filtered_studios, on="id", how="semi")
    .join(filtered_countries, on="id", how="semi")
    .filter(pl.all_horizontal([col(name).is_not_null() for name in required_columns]))
)

# Per movie id: the age rating of the earliest-dated USA theatrical release that has a rating.
usa_theatrical_ratings = (
    releases
    .rename({"rating": "theatrical_release_age_rating"})
    .filter(
        (col("country") == "USA")
        & (col("type") == "Theatrical")
        & col("theatrical_release_age_rating").is_not_null()
    )
    .sort("date", descending=False, nulls_last=True)
    .group_by("id", maintain_order=True)
    .first()
    .select(["id", "theatrical_release_age_rating"])
)

# Left join: movies without such a release stay in the frame with a null age rating.
filtered_movies = filtered_movies.join(
    usa_theatrical_ratings,
    on="id",
    how="left",
)

In [6]:
filtered_movies

id,name,date,tagline,description,minute,rating,theatrical_release_age_rating
i64,str,i64,str,str,i64,f64,str
1000001,"""Barbie""",2023,"""She's everything. He's just Ke…","""Barbie and Ken are having the …",114,3.86,"""PG-13"""
1000004,"""Fight Club""",1999,"""Mischief. Mayhem. Soap.""","""A ticking-time-bomb insomniac …",139,4.27,"""R"""
1000006,"""Oppenheimer""",2023,"""The world forever changes.""","""The story of J. Robert Oppenhe…",181,4.23,"""R"""
1000008,"""Joker""",2019,"""Put on a happy face.""","""During the 1980s, a failed sta…",122,3.85,"""R"""
1000010,"""Pulp Fiction""",1994,"""Just because you are a charact…","""A burger-loving hit man, his p…",154,4.26,"""R"""
…,…,…,…,…,…,…,…
1110821,"""Yes, We Have No Bonanza""",1939,"""YES, YOU'LL HAVE PLENTY OF LAU…","""Set in a western town, the sto…",16,3.39,"""NR"""
1111634,"""How High Is Up?""",1940,"""A New High In Hilarity!""","""The stooges are the 'Minute Me…",17,3.46,
1112676,"""No Census, No Feeling""",1940,"""Those madcap merchants of mirt…","""The stooges get jobs as census…",17,3.42,"""NR"""
1114819,"""Deviant Love""",2019,"""Love makes you do crazy things…","""In a tailspin after her marria…",86,2.6,


In [7]:
# Warning: actor names are potentially not unique
# Cast rows (movie id, actor name) restricted to the movies kept above.
filtered_actors = actors.drop("role").join(
    filtered_movies,
    on="id",
    how="semi",
)

# Actors credited in more than 5 distinct movies of the sample. Distinct movie ids are
# counted rather than rows: once "role" is dropped, an actor with several roles in the
# same movie may still occupy several rows, which would overstate n_movies.
frequent_actors = (
    filtered_actors
    .group_by("name")
    .agg(col("id").n_unique().alias("n_movies"))
    .filter(col("n_movies") > 5)
)

# Semi join keeps only the rows of those actors; no sort is needed for a membership test.
filtered_actors = filtered_actors.join(
    frequent_actors,
    on="name",
    how="semi",
).rename({"name": "actor_name"})

In [8]:
# Warning: director names are potentially not unique
# Crew rows with the "Director" role, restricted to the movies kept above.
filtered_directors = crew.filter(
    col("role") == "Director"
).drop(
    "role"
).join(filtered_movies, on="id", how="semi")

# Directors with more than 5 distinct movies in the sample. Distinct movie ids are
# counted rather than rows so a repeated credit on one movie cannot inflate n_movies.
frequent_directors = (
    filtered_directors
    .group_by("name")
    .agg(col("id").n_unique().alias("n_movies"))
    .filter(col("n_movies") > 5)
)

# Semi join keeps only the rows of those directors; no sort is needed for a membership test.
filtered_directors = filtered_directors.join(
    frequent_directors,
    on="name",
    how="semi",
).rename(
    {"name": "director_name"}
)

# Map movie IDs to a "simpler" 0-based index

And reorder columns

In [9]:
# Dense 0-based movie_id, assigned in the current row order of filtered_movies.
id_mapping = pl.DataFrame(
    {
        "original_id": filtered_movies["id"],
        "movie_id": list(range(filtered_movies.height)),
    }
)


def _with_movie_id(frame, columns):
    """Rename `id` to `original_id`, attach `movie_id` via an inner join on it, and return `columns` in the given order."""
    return (
        frame
        .rename({"id": "original_id"})
        .join(id_mapping, on="original_id", how="inner")
        .select(columns)
    )


# The inner join also drops actor/director/studio rows whose movie is not in filtered_movies.
filtered_movies = _with_movie_id(
    filtered_movies,
    ["movie_id", "original_id", "name", "date", "tagline", "description", "minute", "theatrical_release_age_rating", "rating"],
)
filtered_actors = _with_movie_id(filtered_actors, ["movie_id", "original_id", "actor_name"])
filtered_directors = _with_movie_id(filtered_directors, ["movie_id", "original_id", "director_name"])
filtered_studios = _with_movie_id(filtered_studios, ["movie_id", "original_id", "studio"])


In [10]:
filtered_movies

movie_id,original_id,name,date,tagline,description,minute,theatrical_release_age_rating,rating
i64,i64,str,i64,str,str,i64,str,f64
0,1000001,"""Barbie""",2023,"""She's everything. He's just Ke…","""Barbie and Ken are having the …",114,"""PG-13""",3.86
1,1000004,"""Fight Club""",1999,"""Mischief. Mayhem. Soap.""","""A ticking-time-bomb insomniac …",139,"""R""",4.27
2,1000006,"""Oppenheimer""",2023,"""The world forever changes.""","""The story of J. Robert Oppenhe…",181,"""R""",4.23
3,1000008,"""Joker""",2019,"""Put on a happy face.""","""During the 1980s, a failed sta…",122,"""R""",3.85
4,1000010,"""Pulp Fiction""",1994,"""Just because you are a charact…","""A burger-loving hit man, his p…",154,"""R""",4.26
…,…,…,…,…,…,…,…,…
10594,1110821,"""Yes, We Have No Bonanza""",1939,"""YES, YOU'LL HAVE PLENTY OF LAU…","""Set in a western town, the sto…",16,"""NR""",3.39
10595,1111634,"""How High Is Up?""",1940,"""A New High In Hilarity!""","""The stooges are the 'Minute Me…",17,,3.46
10596,1112676,"""No Census, No Feeling""",1940,"""Those madcap merchants of mirt…","""The stooges get jobs as census…",17,"""NR""",3.42
10597,1114819,"""Deviant Love""",2019,"""Love makes you do crazy things…","""In a tailspin after her marria…",86,,2.6


# Save data

In [None]:
# Source folder of the original poster images.
image_folder = Path("posters")

# Output layout: filtered/ for the CSV tables, filtered/posters/ for the copied posters.
filtered_data_folder = Path("filtered")
filtered_posters_folder = filtered_data_folder / "posters"

# Parent first, then the nested folder; directories that already exist are left as they are.
for folder in (filtered_data_folder, filtered_posters_folder):
    folder.mkdir(exist_ok=True)

In [11]:
# Persist the four filtered tables side by side in the output folder.
for table, filename in (
    (filtered_movies, "movies.csv"),
    (filtered_actors, "actors.csv"),
    (filtered_directors, "directors.csv"),
    (filtered_studios, "studios.csv"),
):
    table.write_csv(filtered_data_folder / filename)

# Copy each poster from <original_id>.jpg to <movie_id>.jpg; abort on the first missing file.
for original_movie_id, movie_id in filtered_movies.select(["original_id", "movie_id"]).iter_rows():
    image_path = image_folder / f"{original_movie_id}.jpg"
    if not image_path.exists():
        raise FileNotFoundError(f"ERROR:Image not found for movie {original_movie_id}")
    shutil.copy(image_path, filtered_posters_folder / f"{movie_id}.jpg")

## Produce CLIP embeddings for the movie posters

In [13]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np

In [45]:
# The processor and the model are loaded from the same pretrained CLIP checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_image_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_vision_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def get_image_embeddings(image_paths, batch_size=64):
    """Embed image files with `clip_vision_model.get_image_features`.

    Images are decoded and embedded in chunks so that neither the open file
    handles nor the pixel tensors of the whole collection are held at once.

    Args:
        image_paths: Iterable of paths to image files.
        batch_size: Number of images loaded and passed through the model per
            forward pass. Must be at least 1.

    Returns:
        A tensor with one row per input path, in input order. For empty input
        the tensor has shape (0, projection_dim).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    image_paths = list(image_paths)
    batch_embeddings = []
    for start in range(0, len(image_paths), batch_size):
        images = []
        for image_path in image_paths[start:start + batch_size]:
            # Image.open is lazy and keeps the file open; convert() reads the pixels into
            # a new in-memory image, so the handle can be released right away.
            with Image.open(image_path) as image:
                images.append(image.convert("RGB"))
        inputs = clip_image_processor(images=images, return_tensors="pt")
        with torch.no_grad():
            batch_embeddings.append(clip_vision_model.get_image_features(**inputs))

    if not batch_embeddings:
        # torch.cat rejects an empty list; return an explicitly empty result instead.
        return torch.empty((0, clip_vision_model.config.projection_dim))
    return torch.cat(batch_embeddings, dim=0)

In [53]:
# Posters were copied into filtered_posters_folder under their 0-based movie_id, and after
# the re-indexing step filtered_movies exposes "movie_id"/"original_id" instead of "id",
# so the poster paths have to be built from the "movie_id" column.
image_embeddings = get_image_embeddings(
    [filtered_posters_folder.joinpath(f"{movie_id}.jpg") for movie_id in filtered_movies["movie_id"].to_list()]
)

In [54]:
image_embeddings.numpy()

array([[-0.10932737, -0.5135564 , -0.13299036, ...,  0.37617517,
        -0.03452259,  0.6330602 ],
       [-0.14249939, -0.08135171,  0.09522294, ...,  0.23633456,
         0.11451474,  0.3982417 ],
       [-0.2902962 ,  0.32712936, -0.19116507, ...,  0.21396765,
         0.25395045, -0.24302319],
       ...,
       [ 0.1251586 ,  0.16826448, -0.4882458 , ...,  0.3885356 ,
         0.2684787 , -0.322622  ],
       [ 0.16716644,  0.37810767, -0.34913853, ...,  0.04571151,
         0.0035307 , -0.08718923],
       [-0.36362553, -0.10683084, -0.22944315, ...,  0.57249534,
         0.36928752, -0.21038885]], dtype=float32)

## Produce text embeddings using nomic-embed-text-v1.5

In [58]:
from sentence_transformers import SentenceTransformer

In [60]:
# Sentence-embedding model used by get_text_embeddings.
NOMIC_CHECKPOINT = "nomic-ai/nomic-embed-text-v1.5"
model = SentenceTransformer(NOMIC_CHECKPOINT, trust_remote_code=True)

def get_text_embeddings(texts, task_prefix="classification"):
    """Encode texts with the module-level `model`, prepending a task prefix to each one.

    Args:
        texts: Iterable of strings to embed.
        task_prefix: Instruction prefix placed before every text as
            "<task_prefix>: <text>". Defaults to "classification".

    Returns:
        The result of `model.encode` for the prefixed texts, in input order.
    """
    # The text itself must be interpolated; without the placeholder every input would
    # collapse to the same constant string and yield identical embeddings.
    return model.encode([f"{task_prefix}: {text}" for text in texts])

<All keys matched successfully>


In [66]:
# Embed every movie tagline, in filtered_movies row order.
tagline_texts = filtered_movies.get_column("tagline").to_list()
tagline_embeddings = get_text_embeddings(tagline_texts)

In [67]:
# Embed every movie description, in filtered_movies row order.
description_texts = filtered_movies.get_column("description").to_list()
description_embeddings = get_text_embeddings(description_texts)

In [69]:
description_embeddings.shape

(10599, 768)

## Save embeddings

# Write each embedding matrix to its own .npy file in the filtered data folder.
# NOTE(review): the file names below are a suggestion; rename them to whatever the
# downstream consumer expects.
np.save(filtered_data_folder.joinpath("poster_embeddings.npy"), image_embeddings.numpy())
np.save(filtered_data_folder.joinpath("tagline_embeddings.npy"), tagline_embeddings)
np.save(filtered_data_folder.joinpath("description_embeddings.npy"), description_embeddings)