In [6]:
import openai

In [7]:
from dotenv import dotenv_values
# Load the key/value pairs defined in the local ".env" file.
config = dotenv_values(".env")

In [8]:
# Hand the OpenAI client the key stored under "OPENAI_API_KEY" in the loaded
# config; a missing entry raises KeyError here.
openai.api_key = config["OPENAI_API_KEY"]

## Plotting movies with Atlas

In [9]:
import pandas as pd
import numpy as np

In [10]:
# Location of the movie-plots CSV, relative to the notebook's working directory.
dataset_path = "./datasets/movie_plots.csv"
# Read the whole CSV into a DataFrame.
df = pd.read_csv(dataset_path)

In [11]:
# American films only, ordered by release year (newest first), limited to the
# first 50 rows of that ordering.
movies = (
    df[df["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(50)
)

In [12]:
# Pull the "Plot" column out of the selected movies as a plain array of values.
movie_plots = movies["Plot"].values

## Generating the embeddings

In [14]:
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

In [16]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI Embedding endpoint.

    Newline characters in ``text`` are replaced with spaces before the
    request is sent. The function is wrapped by tenacity's ``retry`` with a
    random exponential wait (min=1, max=20) and stops after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    single_line_text = text.replace("\n", " ")

    response = openai.Embedding.create(input=single_line_text, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [17]:
# Tokenizer that tiktoken associates with the "text-embedding-ada-002" model;
# used below to count tokens before calling the API.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [18]:
# Encode every plot and add up the per-plot token counts.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [19]:
# Display the combined token count for all selected plots.
total_tokens

16751

In [20]:
# .0004 is treated as the dollar price per 1,000 tokens (assumed rate --
# confirm against current pricing); dividing by 1000 gives a per-token rate.
price_per_token = .0004 / 1000
cost = total_tokens * price_per_token
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.01


In [27]:
# Embedding cache: avoids asking the API for the same embedding twice.
# It is a dict mapping (text, model) tuples to embeddings and lives on disk
# as a pickle file between sessions.

# where the cache is stored on disk
embedding_cache_path = "movie_embeddings_cache.pkl"

# Start from the cache already on disk when there is one; otherwise start
# empty. Either way, write the cache straight back so the file exists.
# NOTE: unpickling executes code from the file -- only load a cache file
# that this notebook created.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = dict()
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Look up the embedding for ``string``, calling the API only on a cache miss.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Mapping from ``(string, model)`` to embedding.
            Defaults to the module-level cache object that exists when this
            function is defined, so new entries are shared across calls.

    Returns:
        The cached or newly fetched embedding.

    On a miss the embedding is fetched with ``get_embedding``, stored in
    ``embedding_cache``, and the whole cache is re-pickled to
    ``embedding_cache_path``.
    """
    cache_key = (string, model)

    # Cache hit: nothing to fetch, nothing to write.
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    # Cache miss: fetch, remember, then persist the updated cache to disk.
    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)
    return embedding_cache[cache_key]

In [28]:
# This line actually generates the embeddings: plots already in the cache are
# reused, and only cache misses trigger a request to the API.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam
GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle
GOT EMBEDDING FROM OPENAI FOR A card sharp steps i
GOT EMBEDDING FROM OPENAI FOR Template:Section Edi
GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h
GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r
GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn
GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d
GOT EMBEDDING FROM OPENAI FOR The film centers on 
GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi
GOT EMBEDDING FROM OPENAI FOR One year after gradu
GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus
GOT EMBEDDING FROM OPENAI FOR California gubernato
GOT EMBEDDING FROM OPENAI FOR In San Francisco in 
GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma
GOT EMBEDDING FROM OPENAI FOR A radical campus gro
GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li
GOT EMBEDDING FROM OPENAI FOR Broadway star Al How
GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd
GOT EMBEDDING FROM OPENAI FOR W

## Visualizing our embeddings with Atlas

In [18]:
from nomic import atlas

In [21]:
# Per-movie metadata for the map: one {"Title": ..., "Genre": ...} record per
# row of `movies`.
data = movies[["Title", "Genre"]].to_dict("records")

In [None]:
# Send the embeddings (as a NumPy array) and their metadata records to Atlas.
# NOTE(review): presumably row i of the array is paired with data[i] -- confirm
# against the nomic documentation.
atlas.map_embeddings(embeddings=np.array(plot_embeddings), data=data)

## Recommending movies by plot

In [30]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [35]:
def print_recommendations_from_strings(
        strings,
        index_of_source_string,
        k_nearest_neighbors=3,
        model="text-embedding-ada-002"
):
    """Print the strings whose embeddings are closest to a chosen source string.

    Args:
        strings: Indexable collection of texts to compare.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model name, forwarded to ``embedding_from_string``
            for every string.

    Returns:
        None. For each match, its rank, distance and full text are printed.
    """
    # Embed every string with the requested model. Forwarding ``model`` matters:
    # without it the helper's default model is used regardless of the argument.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]
    # get distances between our embedding and all other embeddings
    distances = distances_from_embeddings(query_embedding, embeddings)
    # get indices of the nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # skip the searched movie (and any entry with identical text)
        if query_string == strings[i]:
            continue
        # stop once the requested number of matches has been printed
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]}")
        print(strings[i])

In [36]:
# Print recommendations for the first plot (index 0), using the default
# number of neighbors and the default model.
print_recommendations_from_strings(movie_plots, 0)

Found 1 closest match: 
Distance of: 0.15240804182456968
Belle McGill is unaware of husband Jimmy's gambling problem. First he loses $100 at the racetrack and vows never to place another wager. Then he persuades future son-in-law Ben to bet on a sure thing, Leadpipe, but gets a tip on another horse just before the race, bets Ben's money on that instead, then watches Leadpipe win.
In danger of losing his business, if not his family, Jimmy delays paying off Ben, who excitedly believes his horse was the winner. Unbenknowst to all, Belle has been making bets of her own. When a horse called Honey Girl comes along, Belle and Jimmy risk everything they have, and they come out winners.
Found 2 closest match: 
Distance of: 0.16723019461008426
A racketeer known as "Sunshine Joe" specializes in ticket scalping. His gang of colorfully nicknamed thugs includes Liverlips, Sam the Gonoph and Bennie South Street, as well as "Georgie the Chaser," who was dubbed that way because of his penchant for chas