In [1]:
import openai
from dotenv import dotenv_values
# Read key/value pairs from the local .env file so the API key is not hardcoded in the notebook.
config = dotenv_values(".env")
# The key is looked up by subscript, so a missing OPENAI_API_KEY entry fails here
# rather than later at the first API call.
openai.api_key = config["OPENAI_API_KEY"]

In [2]:
import pandas as pd
import numpy as np

## Load The Movie Data

In [3]:
# Location of the movie-plots CSV, relative to the notebook's working directory.
dataset_path = "./movie_plots.csv"
# Load the full CSV; later cells narrow it down before any embeddings are requested.
dataframe = pd.read_csv(dataset_path)

In [23]:
# Keep only American movies, newest first, capped at 5000 rows
# (fewer plots to embed means a smaller API bill):
movies = (
    dataframe[dataframe["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(5000)
)

In [26]:
# Show the filtered frame as this cell's output.
movies

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17376,2017,Phantom Thread,American,Paul Thomas Anderson,Paul Thomas Anderson (director/screenplay); Da...,drama,https://en.wikipedia.org/wiki/Phantom_Thread,"In 1954 London, renowned fashion designer Reyn..."
17243,2017,"Everything, Everything",American,Stella Meghie,Stella Meghie (director); J. Mills Goodloe (sc...,"romance, drama","https://en.wikipedia.org/wiki/Everything,_Ever...","Eighteen-year-old Maddy suffers from SCID, an ..."
17241,2017,Alien: Covenant,American,Ridley Scott,"Ridley Scott (director); Michael Green, Jack P...","sci-fi, horror",https://en.wikipedia.org/wiki/Alien:_Covenant,"In a prologue, business magnate Peter Weyland ..."
17240,2017,Paris Can Wait,American,Eleanor Coppola,Eleanor Coppola (director/screenplay); Diane L...,"comedy, romance",https://en.wikipedia.org/wiki/Paris_Can_Wait,Anne (Diane Lane) is in Cannes with her husban...
17239,2017,The Wall,American,Doug Liman,Doug Liman (director); Dwain Worrell (screenpl...,"drama, thriller",https://en.wikipedia.org/wiki/The_Wall_(2017_f...,"During the Iraq War, U.S. Army Staff Sergeant ..."
...,...,...,...,...,...,...,...,...
12478,1995,Jury Duty,American,John Fortenberry,"Pauly Shore, Tia Carrere, Abe Vigoda",comedy,https://en.wikipedia.org/wiki/Jury_Duty_(film),Tommy Collins is an unemployed erotic dancer l...
12506,1995,Mortal Kombat,American,Paul W. S. Anderson,"Robin Shou, Linden Ashby, Bridgette Wilson, Ch...",fantasy,https://en.wikipedia.org/wiki/Mortal_Kombat_(f...,Mortal Kombat is a fighting tournament between...
12480,1995,A Kid in King Arthur's Court,American,Michael Gottlieb,"Thomas Ian Nicholas, Joss Ackland",comedy,https://en.wikipedia.org/wiki/A_Kid_in_King_Ar...,Calvin Fuller is a nerdy young boy living in t...
12508,1995,Mr. Holland's Opus,American,Stephen Herek,"Richard Dreyfuss, Glenne Headly, Jay Thomas, O...",drama,https://en.wikipedia.org/wiki/Mr._Holland%27s_...,"In Portland, Oregon in 1965, Glenn Holland is ..."


In [5]:
# Extract the movie plots via `.values` (an array-like, not a Python list),
# in the same row order as `movies`:
movie_plots = movies["Plot"].values  # ["plot1", "plot2", ...] 

## Generating The Embeddings

In [6]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [7]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding vector for `text` from the OpenAI Embedding endpoint.

    The call is wrapped in a tenacity `retry` configured with
    `wait_random_exponential(min=1, max=20)` and `stop_after_attempt(6)`.

    Args:
        text: the string to embed.
        model: name of the embedding model to use.

    Returns:
        The "embedding" field of the first item in the response's "data" list.
    """
    # newlines can negatively affect performance, so flatten them to spaces first
    single_line_text = text.replace("\n", " ")

    response = openai.Embedding.create(input=single_line_text, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

## Estimate The Cost

In [8]:
import tiktoken

In [9]:
# Look up the tiktoken encoding associated with the "text-embedding-ada-002" model.
# Nothing is encoded here; `enc.encode(...)` is called in the next cell to count tokens.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [10]:
# Token count of every plot, added together, as the basis for the cost estimate
# (per-plot counts look like 697, 757, 361, ...)
total_tokens = sum(len(enc.encode(plot_text)) for plot_text in movie_plots)

In [11]:
# NOTE: this bare expression is not the cell's last statement, so it displays nothing.
total_tokens
# Rate used here: $0.0004 per 1,000 tokens -- TODO confirm against current OpenAI pricing.
cost = (.0004 / 1000) * total_tokens
print(f"Estimated cost is ${cost:.2f}")

Estimated cost is $1.45


## Cache To Avoid Re-computing

In [12]:
import pickle 

In [13]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache (relative to the notebook's working directory)
embedding_cache_path = "movie_embeddings.pkl"

# load the cache if the file exists, otherwise start from an empty dict;
# either way, write it straight back so the pickle file exists on disk afterwards
# NOTE(review): unpickling can execute arbitrary code -- only load a cache file you created yourself
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# cache-aware embedding lookup: answer from the cache when possible, otherwise call the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return the embedding of `string` for `model`, consulting the cache first.

    On a cache miss the embedding is fetched with `get_embedding`, stored under
    the `(string, model)` key, a short notice is printed, and the whole cache
    is re-pickled to `embedding_cache_path`.
    """
    cache_key = (string, model)

    # cache hit: nothing to fetch, print, or persist
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"GOT EMBEDDING FROM OPENAI FOR '{string[:20]}'")
    # persist immediately so the new embedding is on disk before the next request
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)
    return embedding_cache[cache_key]

## Visualize Embeddings With Atlas

In [14]:
from nomic import atlas

In [15]:
# Embed every plot, in `movie_plots` order. Plots already in the cache are returned from it;
# the rest are requested through `get_embedding` and added to the cache.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]
# len(plot_embeddings) # 5000 
# len(plot_embeddings[0]) # 1536

In [27]:
# One metadata record (title and genre) per movie, in the same row order as `movies`
# and therefore as `movie_plots`.
data = movies[["Title", "Genre"]].to_dict("records")  # list of dicts: [{'Title': 'Phantom Thread', 'Genre': 'drama'}, ...]
# print(data)

In [17]:
# Send the plot embeddings and their per-movie metadata to Nomic Atlas to build a map.
# `data[i]` describes `plot_embeddings[i]`: both were derived from `movies` in the same row order.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2023-06-02 20:53:05.771[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `yummy-patron` in organization `qmeng222`[0m
[32m2023-06-02 20:53:07.216[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
4it [00:12,  3.02s/it]                             
[32m2023-06-02 20:53:19.452[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-06-02 20:53:19.456[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-06-02 20:53:20.220[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `yummy-patron` in project `yummy-patron`: https://atlas.nomic.ai/map/791f8e14-aaea-42c8-b7b8-d7e8b443ac62/ababbffd-4c84-499e-b7a7-87f05df03366[0m
[32m2023-06-02 20:53:20.221[0m | [1mINFO  

## Recommending Movies By Plot

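1. Enter a movie title.
2. Use the title to look up the corresponding plot.
3. Turn that plot into an embedding.
4. Compute the distances between the query embedding and all other embeddings.
5. Rank the movies by distance and take the indices of the k nearest neighbors (KNN).
6. Take the matching movie plots and look up their corresponding titles.

Note: the function below covers steps 3–5. It takes the index of the source movie in `movie_plots` instead of a title, and it prints the matching plots rather than their titles.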

In [18]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [19]:
def print_recommendations_from_strings(
    strings, # movie_plots
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the strings whose embeddings are nearest to the source string's embedding.

    Args:
        strings: sequence of texts to compare (here, the movie plots).
        index_of_source_string: position in `strings` of the query text.
        k_nearest_neighbors: maximum number of recommendations to print.
        model: embedding model name, forwarded to `embedding_from_string`.

    Returns:
        None; the recommendations are printed.
    """
    # Get all of the embeddings, using the requested model.
    # (`model` must be forwarded here; otherwise a non-default model is silently ignored.)
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # get embedding for the specific query string, for example, the 7th embedding (query embedding)
    query_embedding = embeddings[index_of_source_string]
    # get distances between the query embedding and all other embeddings
    # (metric is the helper's default -- presumably cosine; confirm in openai.embeddings_utils)
    distances = distances_from_embeddings(query_embedding, embeddings)
#     print(distances)  # [0, 0.2215704663073892, 0.25112365731907915, ...] with the 1st movie
    # get indices of the nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
#     print(indices_of_nearest_neighbors)  # [   0 1535 3418 ... 1784 2373 1057] with the 1st movie

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # skip the query itself (and any entry whose text is identical to it):
        if query_string == strings[i]:
            continue
        # limit the number of recommendations:
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        # print out the recommendations:
        print(f"🎉 Found {match_count} closest match: ")
        print(f"Distance to the query embedding: {distances[i]} ")
        print(strings[i])

In [28]:
# call the above function:
# Print recommendations for the first plot in `movie_plots` (index 0),
# using the function's default of 3 nearest neighbors.
print_recommendations_from_strings(movie_plots, 0) 

🎉 Found 1 closest match: 
Distance to the query embedding: 0.1543541303023297 
Alfie (Anthony Hopkins) and Helena (Gemma Jones) divorce. Helena begins seeing a fortune teller, Cristal (Pauline Collins), for spiritual advice. Their daughter Sally (Naomi Watts) has a troubled marriage with author Roy (Josh Brolin), who once wrote a successful book and is now anxiously waiting for response from his publisher about the manuscript of his newest one. Helena helps pay their rent.
Alfie marries a prostitute, Charmaine (Lucy Punch). Roy falls for Dia (Freida Pinto), a musicologist he sees through a window near his and Sally’s flat, who is engaged to another man. Sally considers having an affair with Greg (Antonio Banderas), her new boss at an art gallery. But Greg confesses he is having trouble at home and eventually it turns out he is having an affair with Iris (Anna Friel), Sally's protégée. Helena begins a friendship with Jonathan (Roger Ashton-Griffiths), the proprietor of an occult booksh