Cohere Wikipedia Simple: https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings

MongoDB ARXIV papers subset: https://huggingface.co/datasets/MongoDB/subset_arxiv_papers_with_embeddings

Movie Plots + summaries: https://huggingface.co/datasets/vishnupriyavr/wiki-movie-plots-with-summaries-faiss-embeddings
(PROBABLY GOING TO DO THIS ONE)

# First, we get the dataset and relevant imports. 

In [64]:
# pip install -U datasets

In [135]:
from datasets import load_dataset
# split='train' selects the train split up front; the output shown under this cell
# is a single Dataset (features + num_rows), not a mapping of split names.
dataset = load_dataset("vishnupriyavr/wiki-movie-plots-with-summaries-faiss-embeddings", split='train')
#TODO before writing the tutorial: update this section to just load from the Huggingface database directly. 

# Bare expression: in a notebook this displays the dataset summary.
dataset

Downloading readme: 100%|██████████| 657/657 [00:00<00:00, 656kB/s]
Downloading data: 100%|██████████| 217M/217M [05:30<00:00, 656kB/s]  
Generating train split: 100%|██████████| 33155/33155 [00:00<00:00, 63101.73 examples/s]


Dataset({
    features: ['Release Year', 'Title', 'Cast', 'Wiki Page', 'Plot', 'plot_length', 'text', 'embeddings'],
    num_rows: 33155
})

In [66]:
# `dataset` was loaded with split='train', so it already IS the train split.
# Index the first row directly; dataset['train'] would be looked up as a column
# name, and no such column exists.
dataset[0]

{'Release Year': 1903,
 'Title': 'Alice in Wonderland',
 'Cast': 'May Clark',
 'Wiki Page': 'https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)',
 'Plot': 'Alice follows a large white rabbit down a "Rabbit-hole". She finds a tiny door. When she finds a bottle labeled "Drink me", she does, and shrinks, but not enough to pass through the door. She then eats something labeled "Eat me" and grows larger. She finds a fan when enables her to shrink enough to get into the "Garden" and try to get a "Dog" to play with her. She enters the "White Rabbit\'s tiny House," but suddenly resumes her normal size. In order to get out, she has to use the "magic fan."\r\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. "The Duchess\'s Cheshire Cat" appears and disappears a couple of times to A

In [67]:
# Pull each column out of the dataset as its own sequence.
# `dataset` was loaded with split='train', so columns are indexed on it directly
# (dataset['train'] would be treated as a column name and fail).
release_years = dataset['Release Year']
titles = dataset['Title']
casts = dataset['Cast']
wiki_pages = dataset['Wiki Page']
plots = dataset['Plot']
plot_lens = dataset['plot_length']
texts = dataset['text']
embeddings = dataset['embeddings']

# **Now, insert embeddings into pgvecto.rs instance:**

### Create instance by running: 

docker run \
  --name pgvecto-rs-demo \
  -e POSTGRES_PASSWORD=mysecretpassword \
  -p 5432:5432 \
  -d tensorchord/pgvecto-rs:pg16-v0.3.0

In [68]:
#psycopg installation to interface with postgres in Python:
# pip install -U psycopg2
# pip install "psycopg[binary,pool]"

In [69]:
# Length of the longest embedding vector; stays 0 when there are no embeddings.
size_embeddings = max((len(vec) for vec in embeddings), default=0)
print("size of embeddings: ", size_embeddings)
#seems like dense embeddings?

size of embeddings:  768


Ensure the extension is enabled:

In [70]:
import psycopg

URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"
# Enable the pgvecto.rs extension. It supplies the `vector` column type and the
# `vectors` index method that the later cells rely on.
# IF NOT EXISTS keeps this cell safe to re-run: dropping and re-creating the
# extension is rejected by PostgreSQL once a table column depends on its type.
with psycopg.connect(URL) as conn:
    conn.execute("CREATE EXTENSION IF NOT EXISTS vectors;")

Create table:

In [71]:
import psycopg

URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"
# Width of the embedding column. It must equal the length of the vectors being
# inserted (the size check earlier in the notebook printed 768).
EMBEDDING_DIM = 768

# (Re)create the movies table from scratch: one row per movie, with the
# dataset's fields plus the embedding stored as a fixed-width vector.
# The connection context manager commits when the block exits without error.
with psycopg.connect(URL) as conn:
    conn.execute("DROP TABLE IF EXISTS movies;")
    conn.execute(f"""
        CREATE TABLE movies (
            id SERIAL PRIMARY KEY, 
            Release_Year INT NOT NULL,
            Title TEXT NOT NULL,
            Movie_Cast TEXT,
            Wiki_page TEXT NOT NULL,
            Plot TEXT NOT NULL,
            plot_length INT NOT NULL,
            text TEXT NOT NULL,
            embedding vector({EMBEDDING_DIM}) NOT NULL);
    """
    )

Insert into table:

In [72]:
# %pip install -U pgvecto_rs
import psycopg
from pgvecto_rs.psycopg import register_vector

from tqdm import tqdm
URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"

# The embedding is sent as a plain float array and cast to real[] in SQL.
INSERT_MOVIE_SQL = """INSERT INTO movies (Release_Year, Title, Movie_Cast, Wiki_page, Plot, plot_length, text, embedding) 
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::real[]);"""

# One tuple per movie, in the same order as the columns named in INSERT_MOVIE_SQL.
movie_rows = zip(release_years, titles, casts, wiki_pages, plots, plot_lens, texts, embeddings)

with psycopg.connect(URL) as conn:
    with conn.cursor() as cursor:
        # executemany runs the same statement for every row in a single call
        # instead of issuing one Python-level execute() per movie. It consumes
        # the iterable lazily, so wrapping it in tqdm still reports progress.
        cursor.executemany(INSERT_MOVIE_SQL, tqdm(movie_rows, total=len(titles)))
        conn.commit()

100%|██████████| 33155/33155 [27:45<00:00, 19.91it/s]


In [114]:
#Sanity-check the insert: show a few titles, then the text stored for one movie
import psycopg
URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"

with psycopg.connect(URL) as conn:
    # Connection.execute returns a cursor, so fetch calls can be chained on it.
    first_titles = conn.execute("SELECT Title FROM movies LIMIT 10;").fetchall()
    print(first_titles)
    cat_napping_row = conn.execute("SELECT text FROM movies WHERE Title = 'Cat Napping';").fetchone()
    print(cat_napping_row)
    

[('Alice in Wonderland',), ('Daniel Boone',), ('How Brown Saw the Baseball Game',), ('Laughing Gas',), ('The Adventures of Dollie',), ('David Copperfield',), ('The Black Viper',), ('A Calamitous Elopement',), ('The Call of the Wild',), ('A Christmas Carol',)]
2513


In [75]:
import psycopg
URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"
with psycopg.connect(URL) as conn:
    with conn.cursor() as cursor:
        # Build an HNSW index over the embedding column. vector_l2_ops pairs
        # with the <-> operator used by the search queries in this notebook.
        # The index is named explicitly and guarded with IF NOT EXISTS so that
        # re-running this cell does not pile up duplicate indexes.
        cursor.execute("""
            CREATE INDEX IF NOT EXISTS movies_embedding_idx ON movies
            USING vectors (embedding vector_l2_ops)
            WITH (options = '[indexing.hnsw]');
        """)
        conn.commit()

        # List every index on the table to confirm the new one is present.
        cursor.execute("SELECT * FROM pg_indexes WHERE tablename = 'movies';")
        for index in cursor.fetchall():
            print(index)

('public', 'movies', 'movies_pkey', None, 'CREATE UNIQUE INDEX movies_pkey ON public.movies USING btree (id)')
('public', 'movies', 'movies_embedding_idx', None, "CREATE INDEX movies_embedding_idx ON public.movies USING vectors (embedding vector_l2_ops) WITH (options='[indexing.hnsw]')")


## Now, we start with question-answering. 

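1. Encode the question as an embedding vector.
2. Search the table for the entry/entries whose embeddings are closest to that vector.
3. Use the indicated entry/entries to answer the question. 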

Installing necessary libraries: 

In [76]:
# pip install faiss-cpu
#OPTIONAL: pip install faiss-gpu
# pip install sentence-transformers
# pip install pandas

In [77]:
from sentence_transformers import SentenceTransformer

#some options: 'paraphrase-mpnet-base-v2', 'all-mpnet-base-v2'
# NOTE(review): query vectors are compared against the dataset's precomputed
# embeddings, so this presumably needs to be the same model that produced them
# (or at least one sharing its vector space) -- TODO confirm on the dataset card.
encoder = SentenceTransformer('all-mpnet-base-v2')

In [78]:
def encode_query(query: str):
    """Embed a free-text query with the notebook-level ``encoder``.

    The query is lower-cased and its whitespace is normalized before encoding:
    runs of whitespace collapse to a single space and leading/trailing
    whitespace is dropped.

    Args:
        query: The question or search text.

    Returns:
        The result of ``encoder.encode`` for the normalized text. Callers in
        this notebook take its ``len()`` and call ``.tolist()`` on it.
    """
    # split() with no arguments splits on any whitespace run, so joining the
    # pieces with single spaces performs the whole normalization in one step.
    normalized = ' '.join(query.lower().split())
    return encoder.encode(normalized)

Let's see how it works: 

In [79]:
from sklearn.preprocessing import normalize

# Smoke test: encode a throwaway sentence and check the vector length.
test_query = "Here is my test query about a movie."
test_encoding = encode_query(test_query)

encoding_length = len(test_encoding)  #768, hopefully
print("Length of encoding: ", encoding_length)

Length of encoding:  768


### Search for the embedding(s) that most closely resemble the query encoding. The matching rows are the relevant entry/entries to answer the question with. 

Recall the distance operators:

 - <-> for squared Euclidean distance
 - <#> for negative dot product
 - <=> for cosine distance

In [93]:
import psycopg
URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"

search_text = "What are some popular superhero movies?"
search_vector = encode_query(search_text).tolist()

# Titles ordered by distance between the stored embedding and the query vector.
# Set the LIMIT to your desired number of results.
nearest_titles_sql = "SELECT Title FROM movies ORDER BY embedding <-> %s::real[] LIMIT 20;"

with psycopg.connect(URL) as conn:
    results = conn.execute(nearest_titles_sql, (search_vector,)).fetchall()

print(f"Relevant pages found: {results}")

Relevant pages found: [('Spider-Man',), ('Cat Napping',), ('Batman v Superman: Dawn of Justice',), ('Dark Knight Rises, TheThe Dark Knight Rises',), ('Amazing Spider-Man, TheThe Amazing Spider-Man',), ('Spider-Man: Homecoming',), ('Wolverine, TheThe Wolverine',), ('Freddy vs. Jason',), ('Masked Avengers',), ('Captain America: The Winter Soldier',), (' The Dark Knight',), ('The Dark Knight Rises',), ('Abraham & Lincoln',), ('Silver Hawk',), ('Raksha',), ('Avengers, TheThe Avengers',), ('Griff the Invisible',), ('Captain America',), ('Alien vs Ninja',), ('Superman Returns',)]


### Now that we know which listings are relevant, it's time to ask questions that use them as context. 

In [101]:
#pip install openai
import openai

openai.api_key = "YOUR_API_KEY_HERE"
def complete(prompt):
    """Send ``prompt`` to the OpenAI completions endpoint and return the reply.

    Works with both generations of the ``openai`` package: releases from 1.0
    onwards removed ``openai.Completion``, so the call is routed through
    ``openai.completions`` there and through the legacy class otherwise.
    Either way the API key set on the ``openai`` module is used.

    Args:
        prompt: The full prompt text, including any retrieved context.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    model_name = 'gpt-3.5-turbo-instruct'
    # temperature=0 asks for the most deterministic output the API offers.
    sampling = dict(
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: module-level client, attribute-style response objects.
        result = openai.completions.create(model=model_name, **sampling)
        return result.choices[0].text.strip()
    # openai<1.0: legacy interface, dict-style response.
    result = openai.Completion.create(engine=model_name, **sampling)
    return result['choices'][0]['text'].strip()

Let's get the release year and plot of a popular movie first:

In [108]:
# Baseline: ask the model directly, without any retrieved context.
base_prompt = "What is the release year and overall plot of Skyfall?"
no_context_response = complete(base_prompt)
print(no_context_response)

The release year of Skyfall is 2012. The overall plot follows James Bond as he investigates an attack on MI6 and uncovers a sinister plot by a former agent seeking revenge on M and the entire agency. Bond must confront his past and protect MI6 from destruction.


Accurate for the most part, if somewhat vague in the plot summary. However, what about a movie that's much less known?

In [124]:
# Same baseline, but for a much less well-known title.
base_prompt_2 = "What is the release year and overall plot of Cat Napping?"
no_context_response_2 = complete(base_prompt_2)
print(no_context_response_2)

The release year of Cat Napping is unknown. The overall plot of Cat Napping is about a group of cats who go on a mission to rescue their friend who has been kidnapped by a group of evil dogs. Along the way, they encounter various challenges and obstacles, but with their cleverness and teamwork, they are able to save their friend and defeat the dogs.


Now, the model has invented/"hallucinated" a plot, and stated it doesn't know the release year. Cat Napping is actually a Tom and Jerry movie from 1951. 

### Now, let's give the model some context so it can more accurately answer the question: 

In [128]:
import psycopg
URL = "postgresql://postgres:mysecretpassword@localhost:5432/postgres"

def add_context(prompt, top_k=3):
    """Wrap ``prompt`` in a retrieval-augmented prompt.

    The prompt is embedded with ``encode_query``, the ``top_k`` nearest rows
    of the ``movies`` table (ordered by the ``<->`` distance to the query
    vector) are fetched, and their ``text`` fields are placed ahead of the
    question.

    Args:
        prompt: The user's question.
        top_k: How many rows to retrieve as context. Tune this to trade the
            number of tokens you pay for against the chance that the correct
            movie text is included. Defaults to 3.

    Returns:
        A string made of an instruction line, the retrieved texts separated
        by blank lines, and finally ``Question: <prompt>`` / ``Answer:``.
    """
    search_vector = encode_query(prompt).tolist()
    # TODO: maybe tweak the ordering of this to not directly copy the pinecone version
    prompt_start = "Answer the question based on the following context:\n\n"
    prompt_end = f"\n\nQuestion: {prompt}\nAnswer:"
    with psycopg.connect(URL) as conn:
        rows = conn.execute(
            "SELECT text FROM movies ORDER BY embedding <-> %s::real[] LIMIT %s;",
            (search_vector, top_k),
        ).fetchall()
    # Each row is a 1-tuple holding the text; every entry is followed by a blank line.
    context = "".join(row[0] + "\n\n" for row in rows)
    return prompt_start + context + prompt_end

In [127]:
base_prompt_2 = "What is the release year and overall plot of Cat Napping?"

# Prepend the retrieved movie texts to the question, then ask the model again.
context_prompt_2 = add_context(base_prompt_2)
context_response_2 = complete(context_prompt_2)
print(context_response_2)

The release year of Cat Napping is 1951 and the overall plot involves Tom and Jerry fighting over a hammock, with Jerry ultimately getting revenge on Tom.


And there it is! An accurate answer to the query. We can even ask for more details about the plot if desired:

In [134]:
base_prompt_test = "In Cat Napping, what does Tom do to Jerry?"
# Same retrieve-then-answer pipeline, spelled out one step at a time.
context_prompt_test = add_context(base_prompt_test)
context_response_test = complete(context_prompt_test)
print(context_response_test)

Tom tries to send Jerry sliding into a nearby pond by unhooking the hammock he is sleeping in. He also tries to whack the hammock to send Jerry flying into the air, but Jerry lands in a bird's nest. Tom then picks Jerry up on a spatula and places him onto a walking army of ants, causing Jerry to wake up as he bumps his head on a sprinkler. Tom also tries to chase Jerry with a lawn mower and a baseball bat.
