In [1]:
import openai

In [2]:
from dotenv import dotenv_values
# Parse the key/value pairs of the local .env file into a mapping, so secrets
# such as the API key stay out of the notebook source.
config = dotenv_values(dotenv_path=".env")

In [3]:
# Fail fast with an actionable message when the key is absent or blank in .env.
# A bare KeyError, or an empty key that only fails on the first API call, is
# much harder to diagnose.
if not config.get("OPENAI_API_KEY"):
    raise KeyError("OPENAI_API_KEY is missing or empty in .env")
openai.api_key = config["OPENAI_API_KEY"]

## Generating a single embedding

In [None]:
# Request an embedding for one short sample phrase.
response = openai.Embedding.create(
    input="candy canes",
    model="text-embedding-ada-002",
)

In [None]:
# Display the "embedding" entry of the first item in the response's "data" list.
response["data"][0]["embedding"]

## Plotting movies with Atlas

In [4]:
import pandas as pd
import numpy as np

In [5]:
# CSV of movie plots, given relative to the notebook's working directory.
dataset_path = "./datasets/movie_plots.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [6]:
# Keep rows whose "Origin/Ethnicity" is "American", order them by
# "Release Year" from highest to lowest, and take the first 50.
movies = (
    df.loc[df["Origin/Ethnicity"] == "American"]
    .sort_values(by="Release Year", ascending=False)
    .head(50)
)

In [7]:
# Pull the "Plot" column out as an array of plot texts.
movie_plots = movies.loc[:, "Plot"].values

## Generating the embeddings

In [8]:
import os
import pickle

import tiktoken
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [9]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of ``text`` from the OpenAI API.

    Newlines in ``text`` are turned into spaces before the request is sent.
    The call is wrapped in tenacity's ``retry`` with a randomized exponential
    wait (min=1, max=20) that stops after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The "embedding" entry of the first item in the response's "data" list.
    """
    # newlines can negatively affect performance, so flatten them to spaces
    single_line_text = text.replace("\n", " ")

    api_response = openai.Embedding.create(input=single_line_text, model=model)
    first_item = api_response["data"][0]
    return first_item["embedding"]

In [10]:
# Tokenizer matching the embedding model, used below to count tokens per plot.
enc = tiktoken.encoding_for_model(model_name="text-embedding-ada-002")

In [11]:
# Sum the encoded length of every plot; a generator avoids building a
# throwaway list of counts.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [12]:
# Show the total token count across all selected plots.
total_tokens

16751

In [13]:
# Rate hard-coded here: 0.0004 dollars per 1000 tokens.
# NOTE(review): confirm this against current pricing for the model in use.
cost = (.0004 / 1000) * total_tokens
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.01


In [16]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file

# set path to embedding cache
embedding_cache_path = "./embeddings/movie_embeddings_cache.pkl"


def _save_embedding_cache(cache, path):
    """Pickle ``cache`` to ``path``, creating the parent directory if needed."""
    cache_dir = os.path.dirname(path)
    if cache_dir:  # a bare filename has no directory component to create
        # Without this, opening the file for writing raises FileNotFoundError
        # when the directory does not exist yet.
        os.makedirs(cache_dir, exist_ok=True)
    with open(path, "wb") as embedding_cache_file:
        pickle.dump(cache, embedding_cache_file)


# load the cache if it exists (otherwise start empty), and save a copy to disk
# NOTE(security): unpickling can execute arbitrary code; only load cache files
# that this notebook wrote itself.
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}
_save_embedding_cache(embedding_cache, embedding_cache_path)


# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing.

    Args:
        string: Text to embed; together with ``model`` it forms the cache key.
        model: Embedding model name, forwarded to ``get_embedding`` on a miss.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding.
            Defaults to the cache loaded above (bound when this function is
            defined).

    Returns:
        The embedding stored under ``(string, model)``. On a cache miss it is
        first requested via ``get_embedding``, stored in ``embedding_cache``,
        and the cache is written to ``embedding_cache_path``.
    """
    key = (string, model)
    if key not in embedding_cache:
        embedding_cache[key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # persist after every new embedding so an interrupted run keeps its progress
        _save_embedding_cache(embedding_cache, embedding_cache_path)
    return embedding_cache[key]

In [15]:
# This line actually generates the embeddings (served from the cache when available)
plot_embeddings = [
    embedding_from_string(plot, model="text-embedding-ada-002")
    for plot in movie_plots
]

GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam
GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle
GOT EMBEDDING FROM OPENAI FOR A card sharp steps i
GOT EMBEDDING FROM OPENAI FOR Template:Section Edi
GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h
GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r
GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn
GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d
GOT EMBEDDING FROM OPENAI FOR The film centers on 
GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi
GOT EMBEDDING FROM OPENAI FOR One year after gradu
GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus
GOT EMBEDDING FROM OPENAI FOR California gubernato
GOT EMBEDDING FROM OPENAI FOR In San Francisco in 
GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma
GOT EMBEDDING FROM OPENAI FOR A radical campus gro
GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li
GOT EMBEDDING FROM OPENAI FOR Broadway star Al How
GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd
GOT EMBEDDING FROM OPENAI FOR W