In [1]:
import openai
from dotenv import dotenv_values
# Load settings from the local ".env" file so the API key is not hardcoded in the notebook.
config = dotenv_values(".env")
# Hand the key to the openai client; this lookup fails if ".env" has no OPENAI_API_KEY entry.
openai.api_key = config["OPENAI_API_KEY"]

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import tiktoken
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [3]:
# Location of the movie-plot CSV, relative to the notebook's working directory.
dataset_path = "./movie_plots.csv"
# Load the full dataset; later cells read its "Origin/Ethnicity", "Release Year",
# "Plot", "Title" and "Genre" columns.
dataframe = pd.read_csv(dataset_path)

In [4]:
# Keep only American movies, order them newest-first by release year, and take the
# first 5000 rows (a smaller set keeps the embedding bill low).
movies = (
    dataframe.loc[dataframe["Origin/Ethnicity"] == "American"]
    .sort_values(by="Release Year", ascending=False)
    .head(5000)
)

In [5]:
# Pull the plot texts out of the "Plot" column as an array-like of strings,
# one entry per selected movie, in the same order as `movies`:
movie_plots = movies["Plot"].values  # e.g. ["plot1", "plot2", ...]

In [6]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding vector for `text` from the OpenAI embeddings endpoint.

    The call is wrapped in a tenacity retry: randomized exponential waits
    (configured with min=1, max=20) and at most 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.

    Returns:
        The "embedding" field of the first item in the response's "data" list.
    """
    # Newlines can negatively affect embedding quality, so flatten them to spaces first.
    flattened = text.replace("\n", " ")

    response = openai.Embedding.create(input=flattened, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [7]:
# Look up the tiktoken tokenizer that matches the "text-embedding-ada-002" model;
# it is used below to count how many tokens each plot contains.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [8]:
# Tokenize every plot and add up the token counts; the total drives the cost estimate.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)  # e.g. 697 + 757 + 361 + ...

In [9]:
# Price this notebook assumes for text-embedding-ada-002, in USD per 1,000 tokens.
# NOTE(review): pricing changes over time — confirm against OpenAI's current price list.
ADA_002_USD_PER_1K_TOKENS = 0.0004

# Convert the per-1K price to a per-token price and scale by the total token count.
cost = (ADA_002_USD_PER_1K_TOKENS / 1000) * total_tokens
print(f"Estimated cost is ${cost:.2f}")

Estimated cost is $1.45


In [10]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache
embedding_cache_path = "movie_embeddings.pkl"


def _save_embedding_cache(cache, path):
    """Write `cache` to `path` as a pickle without ever leaving a half-written file.

    The data is first dumped to a sibling "<path>.tmp" file and then moved over
    `path` with os.replace, so an interrupted run (e.g. a kernel interrupt during
    a long embedding loop) cannot truncate the existing cache.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "wb") as tmp_file:
        pickle.dump(cache, tmp_file)
    os.replace(tmp_path, path)


# load the cache if it exists, and save a copy to disk
# NOTE: unpickling can execute arbitrary code — only load a cache file this notebook wrote.
try:
    with open(embedding_cache_path, "rb") as embedding_cache_file:
        embedding_cache = pickle.load(embedding_cache_file)
except FileNotFoundError:
    # first run: start with an empty cache
    embedding_cache = {}
_save_embedding_cache(embedding_cache, embedding_cache_path)


# define a function to retrieve embeddings from the cache if present, or otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing.

    Args:
        string: Text to embed.
        model: Embedding model name; part of the cache key.
        embedding_cache: Dict mapping (string, model) -> embedding. Defaults to
            the module-level cache object that existed when this function was
            defined; the dict is updated in place on a cache miss.

    Returns:
        The cached embedding for (string, model), fetched via `get_embedding`
        first if it was not cached yet.

    Side effects:
        On a cache miss, prints a short notice and rewrites the whole cache to
        `embedding_cache_path` (this happens for any cache dict passed in, not
        only the default one).
    """
    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR '{string[:20]}'")
        # persist after every new embedding so paid results survive a crash
        _save_embedding_cache(embedding_cache, embedding_cache_path)
    return embedding_cache[cache_key]

In [11]:
# Embed every plot, in order. Plots already in the cache are served from it; each
# remaining plot triggers one API request and one rewrite of the cache file.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

In [12]:
len(plot_embeddings)  # sanity check: one embedding per selected plot (5000 in the recorded run)

5000

In [13]:
len(plot_embeddings[0])  # length of a single embedding vector (1536 in the recorded run)

1536

In [18]:
# Build one {"Title": ..., "Genre": ...} dict per selected movie.
# NOTE(review): the TypeError recorded under this cell does not look like it was
# produced by this exact expression, and the execution count is out of order —
# presumably stale output from an earlier edit; re-run the notebook to refresh it.
movies[["Title", "Genre"]].to_dict(orient="records")

TypeError: unsupported type: <class 'str'>

In [14]:
from nomic import atlas

In [15]:
# Convert the list of embedding vectors to a NumPy array (one row per plot) and send
# it to Nomic Atlas; per the log output below this creates a project and a map.
# NOTE(review): no per-movie metadata (e.g. Title/Genre) is passed here, so the map
# points presumably carry no labels — confirm whether that is intended.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings)
)

2023-06-01 18:08:41.341 | INFO     | nomic.project:_create_project:965 - Creating project `null-armchair` in organization `qmeng222`
2023-06-01 18:08:43.293 | INFO     | nomic.atlas:map_embeddings:100 - Uploading embeddings to Atlas.
4it [00:15,  3.88s/it]                             
2023-06-01 18:08:58.963 | INFO     | nomic.project:_add_data:1577 - Upload succeeded.
2023-06-01 18:08:58.967 | INFO     | nomic.atlas:map_embeddings:119 - Embedding upload succeeded.
2023-06-01 18:09:01.158 | INFO     | nomic.project:create_index:1282 - Created map `null-armchair` in project `null-armchair`: https://atlas.nomic.ai/map/23ed8f0e-1207-47b5-af40-57a1a9dd2c85/0a8ab829-0f08-46e6-bfe6-685d6f1a89a8
2023-06-01 18:09:01.160 | INF

In [16]:
# Show the selected movies (the recorded output below is an abbreviated preview of the frame).
movies

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17376,2017,Phantom Thread,American,Paul Thomas Anderson,Paul Thomas Anderson (director/screenplay); Da...,drama,https://en.wikipedia.org/wiki/Phantom_Thread,"In 1954 London, renowned fashion designer Reyn..."
17243,2017,"Everything, Everything",American,Stella Meghie,Stella Meghie (director); J. Mills Goodloe (sc...,"romance, drama","https://en.wikipedia.org/wiki/Everything,_Ever...","Eighteen-year-old Maddy suffers from SCID, an ..."
17241,2017,Alien: Covenant,American,Ridley Scott,"Ridley Scott (director); Michael Green, Jack P...","sci-fi, horror",https://en.wikipedia.org/wiki/Alien:_Covenant,"In a prologue, business magnate Peter Weyland ..."
17240,2017,Paris Can Wait,American,Eleanor Coppola,Eleanor Coppola (director/screenplay); Diane L...,"comedy, romance",https://en.wikipedia.org/wiki/Paris_Can_Wait,Anne (Diane Lane) is in Cannes with her husban...
17239,2017,The Wall,American,Doug Liman,Doug Liman (director); Dwain Worrell (screenpl...,"drama, thriller",https://en.wikipedia.org/wiki/The_Wall_(2017_f...,"During the Iraq War, U.S. Army Staff Sergeant ..."
...,...,...,...,...,...,...,...,...
12478,1995,Jury Duty,American,John Fortenberry,"Pauly Shore, Tia Carrere, Abe Vigoda",comedy,https://en.wikipedia.org/wiki/Jury_Duty_(film),Tommy Collins is an unemployed erotic dancer l...
12506,1995,Mortal Kombat,American,Paul W. S. Anderson,"Robin Shou, Linden Ashby, Bridgette Wilson, Ch...",fantasy,https://en.wikipedia.org/wiki/Mortal_Kombat_(f...,Mortal Kombat is a fighting tournament between...
12480,1995,A Kid in King Arthur's Court,American,Michael Gottlieb,"Thomas Ian Nicholas, Joss Ackland",comedy,https://en.wikipedia.org/wiki/A_Kid_in_King_Ar...,Calvin Fuller is a nerdy young boy living in t...
12508,1995,Mr. Holland's Opus,American,Stephen Herek,"Richard Dreyfuss, Glenne Headly, Jay Thomas, O...",drama,https://en.wikipedia.org/wiki/Mr._Holland%27s_...,"In Portland, Oregon in 1965, Glenn Holland is ..."
