In [1]:
import openai
from dotenv import dotenv_values
# Load key/value settings from ../.env (path is relative to the working directory)
# so the API key is never written into a notebook cell.
config = dotenv_values("../.env")
# Plain [] lookup: a missing OPENAI_API_KEY entry fails here rather than at the first API call.
openai.api_key = config["OPENAI_API_KEY"]

In [7]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

## Load The Movie Data

In [8]:
# Location of the movie-plot CSV, relative to the working directory.
# Alternative dataset file (currently disabled):
#dataset_path = "./movie_plots.csv"
dataset_path = "wiki_movie_plots_deduped.csv"
# Read the whole CSV into a DataFrame; later cells filter it down.
df = pd.read_csv(dataset_path)

In [9]:
# Keep only American titles, order them newest first by release year, and
# take the first 5000 rows (a smaller set means fewer embedding requests to pay for).
movies = (
    df.loc[df["Origin/Ethnicity"] == "American"]
    .sort_values("Release Year", ascending=False)
    .head(5000)
)

In [10]:
# Pull the "Plot" column out of `movies` as an array of plot texts (via `.values`,
# so this is an array rather than a Python list), in the same row order as `movies`.
movie_plots = movies["Plot"].values

## Generating The Embeddings

In [11]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The ``retry`` decorator re-runs the call on failure, using randomized
    exponential waits (``min=1``, ``max=20``) and stopping after 6 attempts.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``"embedding"`` field of the first item in the response's ``"data"``.
    """
    # Newlines can negatively affect embedding quality, so flatten them first.
    flattened = text.replace("\n", " ")

    response = openai.Embedding.create(input=flattened, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [12]:
# Tokenizer for the embedding model; used below to count tokens before spending money on the API.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [13]:
# Encode every plot with the model's tokenizer and add up the token counts.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [14]:
# Rate used for this estimate, in USD per 1,000 tokens of text-embedding-ada-002.
# NOTE(review): pricing changes over time; verify against the current OpenAI price list.
ADA_002_USD_PER_1K_TOKENS = .0004

# Convert the token count into an estimated dollar cost and report it.
cost = total_tokens * (ADA_002_USD_PER_1K_TOKENS / 1000)
print(f"Estimated cost ${cost:.2f}")

Estimated cost $1.45


In [15]:
# Embedding cache
# ---------------
# A dict keyed by (text, model) tuples whose values are embeddings. It is persisted
# as a pickle file so that reruns do not request the same embedding twice.
# NOTE: unpickling executes code from the file, so only load cache files you created.

# where the pickled cache lives
embedding_cache_path = "movie_embeddings_cache2.pkl"

# start from the cache on disk when there is one, otherwise from an empty dict
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}

# write the cache straight back to disk, which also creates the file on a first run
with open(embedding_cache_path, "wb") as cache_out:
    pickle.dump(embedding_cache, cache_out)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(string,
                          model="text-embedding-ada-002",
                          embedding_cache=embedding_cache):
    """Return the embedding of ``string``, using a cache to avoid recomputing.

    On a cache miss the embedding is requested through ``get_embedding``, stored
    in ``embedding_cache`` and the whole cache is re-pickled to
    ``embedding_cache_path``.

    Args:
        string: Text to embed.
        model: Embedding model name; it is part of the cache key.
        embedding_cache: Dict mapping ``(string, model)`` to an embedding.
            The default is bound when this function is defined, so it is the
            cache object that existed at that moment.

    Returns:
        The embedding stored in the cache for ``(string, model)``.
    """
    import os  # local import: only needed for the atomic rename below

    cache_key = (string, model)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = get_embedding(string, model)
        print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")
        # Pickle to a sibling temp file and rename it over the real cache, so an
        # interrupted cell never leaves a truncated, unreadable pickle behind.
        temp_path = f"{embedding_cache_path}.tmp"
        with open(temp_path, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
        os.replace(temp_path, embedding_cache_path)
    return embedding_cache[cache_key]

In [22]:
# This line actually generates the embeddings: one embedding_from_string call per plot,
# so plots that are already in the cache are not requested from the API again.
plot_embeddings = [embedding_from_string(plot, model="text-embedding-ada-002") for plot in movie_plots]

GOT EMBEDDING FROM OPENAI FOR In 1933, a boy named
GOT EMBEDDING FROM OPENAI FOR In Salem, Massachuse
GOT EMBEDDING FROM OPENAI FOR The movie begins as 
GOT EMBEDDING FROM OPENAI FOR In 1970, 20-year-old
GOT EMBEDDING FROM OPENAI FOR During the Bosnian W
GOT EMBEDDING FROM OPENAI FOR Dave Lizewski, bored
GOT EMBEDDING FROM OPENAI FOR Ellie Klug is a musi
GOT EMBEDDING FROM OPENAI FOR A suicide bomber det
GOT EMBEDDING FROM OPENAI FOR After losing her ban
GOT EMBEDDING FROM OPENAI FOR In 1986, Lorraine La
GOT EMBEDDING FROM OPENAI FOR Salesmen Billy McMah
GOT EMBEDDING FROM OPENAI FOR At a New Year's Eve 
GOT EMBEDDING FROM OPENAI FOR Four couples gather 
GOT EMBEDDING FROM OPENAI FOR After climbing a gia
GOT EMBEDDING FROM OPENAI FOR In the Kingdom of Cl
GOT EMBEDDING FROM OPENAI FOR After winning $10,00
GOT EMBEDDING FROM OPENAI FOR Kevin is throwing a 
GOT EMBEDDING FROM OPENAI FOR The film is set in 1
GOT EMBEDDING FROM OPENAI FOR The film opens in 20
GOT EMBEDDING FROM OPENAI FOR D

In [26]:
# Sanity check: the list holds one embedding per plot.
len(plot_embeddings)

5000

## Plot The Embeddings Using Atlas

In [27]:
# Per-movie metadata (title and genre) as one dict per row of `movies`.
data = movies[["Title", "Genre"]].to_dict("records")

In [30]:
from nomic import atlas

In [31]:
# Send the embeddings and their title/genre records to Nomic Atlas to build a map.
# np.array(...) packs the list of embeddings into a single array for the upload.
# NOTE(review): presumably embeddings[i] is paired with data[i] -- confirm against the Atlas docs.
project = atlas.map_embeddings(
    embeddings=np.array(plot_embeddings),
    data=data
)

[32m2023-07-03 11:34:47.585[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m749[0m - [1mCreating project `quixotic-spleen` in organization `eduardosthory69`[0m
[32m2023-07-03 11:34:49.614[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m107[0m - [1mUploading embeddings to Atlas.[0m
4it [00:05,  1.34s/it]                       
[32m2023-07-03 11:34:56.280[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1371[0m - [1mUpload succeeded.[0m
[32m2023-07-03 11:34:56.282[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m126[0m - [1mEmbedding upload succeeded.[0m
[32m2023-07-03 11:34:57.931[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1081[0m - [1mCreated map `quixotic-spleen` in project `quixotic-spleen`: https://atlas.nomic.ai/map/db79b079-c5af-4296-9195-36eafcadbb90/9f86c512-9947-4fc5-a1e6-98446ba5a893[0m
[32m2023-07-03 11:34:57.933[0m | 

## Recommending Movies By Plot

In [32]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [33]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the strings whose embeddings are nearest to a chosen source string.

    Args:
        strings: Sequence of texts to compare.
        index_of_source_string: Position in ``strings`` of the query text.
        k_nearest_neighbors: Maximum number of matches to print.
        model: Embedding model name, forwarded to ``embedding_from_string``.

    Returns:
        None. For each match, prints its rank, its distance to the query and
        the matching text.
    """
    # Embed every string; pass `model` through so a non-default model is honored.
    embeddings = [embedding_from_string(string, model=model) for string in strings]
    # Embedding of the string we want recommendations for.
    query_embedding = embeddings[index_of_source_string]
    # Distance from the query embedding to every embedding (including itself).
    distances = distances_from_embeddings(query_embedding, embeddings)
    # Indices into `strings`, ordered from nearest to farthest.
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself and any exact duplicates of its text.
        if query_string == strings[i]:
            continue
        # Stop once the requested number of matches has been printed.
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]} ")
        print(strings[i])

In [34]:
# Print the nearest plots (up to 3 by default) to the plot at position 2 of movie_plots.
print_recommendations_from_strings(movie_plots, 2)

Found 1 closest match: 
Distance of: 0.0844882078700182 
As a spacecraft departs a planet, a humanoid alien drinks an iridescent liquid and then dissolves. The remains of the alien cascade into a waterfall. The alien's DNA strands mix with the water.
In 2089, archaeologists Elizabeth Shaw and Charlie Holloway discover a star map in Scotland that matches others from several unconnected ancient cultures. They interpret this as an invitation from humanity's forerunners, the "Engineers". Peter Weyland, the elderly CEO of Weyland Corporation, funds an expedition, aboard the scientific vessel Prometheus, to follow the map to the distant moon LV-223. The ship's crew travels in stasis while the android David monitors their voyage. Arriving in December 2093, mission-director Meredith Vickers informs them of their mission to find the Engineers and not to make contact without her permission.
The Prometheus lands on the barren, mountainous surface near a large, artificial structure, which a team e