In [20]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nomic import embed
import plotly.express as px
from ast import literal_eval

### Calculate embeddings of movie storylines

In [8]:
# Source data: the "IMDB Top 250 Movies" dataset published on Kaggle
# (https://www.kaggle.com/datasets/karkavelrajaj/imdb-top-250-movies).
# Later cells read its 'storyline', 'title' and 'rank' columns.
df = pd.read_csv(filepath_or_buffer='data/movies.csv')

In [9]:
# Embed every movie storyline with the nomic-embed-text-v1.5 model,
# requesting local inference.
storylines = df['storyline'].to_list()
embed_res = embed.text(
    texts=storylines,
    model='nomic-embed-text-v1.5',
    inference_mode='local',
)
embeddings = embed_res['embeddings']

# Attach the embeddings to the frame and save it to CSV; the t-SNE cell
# reloads this file and parses the 'embedding' column with literal_eval.
df['embedding'] = embeddings
df.to_csv(path_or_buf='data/movies_embeddings.csv', index=False)

### Reduce dimensionality to 2D using t-SNE and plot embeddings

In [18]:
# Reload the saved frame. The 'embedding' column is parsed with
# literal_eval, so each cell is expected to hold the text form of a
# Python list; the parsed rows are stacked into one numeric array.
df = pd.read_csv('data/movies_embeddings.csv')
embeddings = np.array([literal_eval(cell) for cell in df['embedding']])

# Project the embeddings down to two components with t-SNE; the fixed
# random_state keeps the layout reproducible between runs.
tsne = TSNE(
    n_components=2,
    random_state=42,
)
embeddings_2d = tsne.fit_transform(embeddings)

In [19]:
# Store the two t-SNE coordinates as plotting columns on the frame.
df['x'], df['y'] = embeddings_2d[:, 0], embeddings_2d[:, 1]

# Scatter plot of the movies: each point is labelled with its title, and
# the hover tooltip shows the title and rank.
fig = px.scatter(
    df,
    x='x',
    y='y',
    text='title',
    hover_data={'title': True, 'rank': True},
    title='Movie Similarity based on Storyline Embeddings',
)
fig.show()