#### Dataset
https://datasets.imdbws.com/

Just use name.basics.tsv.gz and title.basics.tsv.gz and put into ./development/MovieData directory


#### Embedding Search
https://faiss.ai/index.html

Method developed at Meta, allows similarity search in large databases (both name.basics and title.basics contain >10Mio entries). So we might need such an approach to be sufficiently fast at inference


Note: I haven't generated the embeddings so far. Depends a bit on what you want to do with the graph data we already have.

In [7]:
import pandas as pd

# Load the datasets
df_title_basic = pd.read_csv('MovieData/title.basics.tsv', sep='\t', na_values='\\N')
df_name_basic = pd.read_csv('MovieData/name.basics.tsv', sep='\t', na_values='\\N')

df_title_basic = df_title_basic[['tconst', 'primaryTitle', 'startYear', 'genres']].dropna(subset=['primaryTitle'])
df_name_basic = df_name_basic[['nconst', 'primaryName', 'primaryProfession', 'knownForTitles']].dropna(subset=['primaryName'])

# Normalize text to lowercase, remove duplicats
df_title_basic['primaryTitle'] = df_title_basic['primaryTitle'].str.lower().str.strip()
df_name_basic['primaryName'] = df_name_basic['primaryName'].str.lower().str.strip()
df_title_basic = df_title_basic.drop_duplicates(subset=['primaryTitle'])
df_name_basic = df_name_basic.drop_duplicates(subset=['primaryName'])

df_title_basic.reset_index(drop=True, inplace=True)
df_name_basic.reset_index(drop=True, inplace=True)

print("Movies DataFrame shape:", df_title_basic.shape)
print("Actors DataFrame shape:", df_name_basic.shape)


  df_title_basic = pd.read_csv('MovieData/title.basics.tsv', sep='\t', na_values='\\N')


Movies DataFrame shape: (4961047, 4)
Actors DataFrame shape: (10631575, 4)


In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import pickle

# Sentence Transformer to compute embeddings for actors and movies
model = SentenceTransformer('all-MiniLM-L6-v2') 

## Actors
print("Computing embeddings for actor names...")
actor_names = df_name_basic['primaryName'].tolist()
actor_embeddings = model.encode(actor_names, convert_to_numpy=True, show_progress_bar=True)

## Movies
print("Computing embeddings for movie titles...")
movie_titles = df_title_basic['primaryTitle'].tolist()
movie_embeddings = model.encode(movie_titles, convert_to_numpy=True, show_progress_bar=True)

with open('actor_embeddings.pkl', 'wb') as f:
    pickle.dump((actor_embeddings, actor_names), f)

with open('movie_embeddings.pkl', 'wb') as f:
    pickle.dump((movie_embeddings, movie_titles), f)


  from tqdm.autonotebook import tqdm, trange


Computing embeddings for actor names...


Batches:   2%|▏         | 5062/332237 [00:59<1:03:54, 85.32it/s]


KeyboardInterrupt: 

In [None]:
# Build FAISS index for Embeddings

## Actors
print("Building FAISS index for actors...")
actor_dimension = actor_embeddings.shape[1]
actor_index = faiss.IndexFlatL2(actor_dimension)
actor_index.add(actor_embeddings)

## Movies
print("Building FAISS index for movies...")
movie_dimension = movie_embeddings.shape[1]
movie_index = faiss.IndexFlatL2(movie_dimension)
movie_index.add(movie_embeddings)

faiss.write_index(actor_index, 'actor_index.faiss')
faiss.write_index(movie_index, 'movie_index.faiss')


In [None]:
def load_actor_index():
    actor_index = faiss.read_index('actor_index.faiss')
    with open('actor_embeddings.pkl', 'rb') as f:
        _, actor_names = pickle.load(f)
    return actor_index, actor_names

def load_movie_index():
    movie_index = faiss.read_index('movie_index.faiss')
    with open('movie_embeddings.pkl', 'rb') as f:
        _, movie_titles = pickle.load(f)
    return movie_index, movie_titles

def embedding_search(query, index, labels, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    results = [(labels[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
    return results


In [None]:
actor_index, actor_names = load_actor_index()
movie_index, movie_titles = load_movie_index()
    
queries = [
    "Who starred in the movie Inception?",
    "Tell me about Leonardo DiCaprio's films.",
    "Can you recommend movies directed by Christopher Nolan?",
    "Who are the actors in The Dark Knight?",
    "Give me some information about Emma Stone."
]

for query in queries:
    actor_matches = embedding_search(query, actor_index, actor_names, top_k=5)
    movie_matches = embedding_search(query, movie_index, movie_titles, top_k=5)
    
    print("Actors")
    print(actor_matches)
    print("Movies")
    print(movie_matches)
    print("\n")