In [1]:
%%capture
# Below we installed specific versions of the packages
# Feel free to experiment with different versions
# However, the workspace below is only tested with these specific versions
!pip install pinecone-client==2.2.2 openai==0.28.0 tiktoken==0.5.1 langchain==0.0.291

In [1]:
 #Import pandas as pd
import pandas as pd            

# Import IMBD.csv and transform to create the movies dataframe
IMDB_df=pd.read_csv("IMDB.csv")
movies = IMDB_df.rename(columns={
    "primaryTitle": "movie_title",
    "Description" : "movie_description",
})
movies["source"] = "https://www.imdb.com/title/" + movies["tconst"]
# only movies type
movies = movies.loc[
    movies["titleType"] =="movie",
    ["movie_title","movie_description","source","genres"]
]

#show movies
movies.head()


Unnamed: 0,movie_title,movie_description,source,genres
0,The Silence of the Lambs,"Jodie Foster stars as Clarice Starling, a top ...",https://www.imdb.com/title/tt0102926,"Crime,Drama,Thriller"
1,Terminator 2: Judgment Day,"In this sequel set eleven years after ""The Ter...",https://www.imdb.com/title/tt0103064,"Action,Sci-Fi"
2,The Lion King,This Disney animated feature follows the adven...,https://www.imdb.com/title/tt0110357,"Adventure,Animation,Drama"
3,Pulp Fiction,Vincent Vega (John Travolta) and Jules Winnfie...,https://www.imdb.com/title/tt0110912,"Crime,Drama"
4,The Shawshank Redemption,Andy Dufresne (Tim Robbins) is sentenced to tw...,https://www.imdb.com/title/tt0111161,Drama


In [2]:
# Import DataFrameLoader
from langchain.document_loaders import DataFrameLoader

# Build the text that will be embedded for each movie: title, genres and
# description, one per line. The label strings are left exactly as they are
# because they become part of the embedded document text.
movies["page_content"] = (
    "Title: " + movies["movie_title"] + "\n"
    + "Genre:" + movies["genres"] + "\n"
    + "Description:" + movies["movie_description"] + "\n"
)

# Load one Document per row. Only `page_content` and `source` are handed to the
# loader, so `source` is the sole metadata field. The projection is passed
# directly instead of overwriting `movies`: the title/genre/description columns
# stay available, which lets this cell be re-run without a KeyError.
docs = DataFrameLoader(
    movies[["page_content", "source"]],
    page_content_column="page_content",
).load()

# Show the first 3 documents
docs[:3]

[Document(page_content="Title: The Silence of the Lambs\nGenre:Crime,Drama,Thriller\nDescription:Jodie Foster stars as Clarice Starling, a top student at the FBI's training academy. Jack Crawford (Scott Glenn) wants Clarice to interview Dr. Hannibal Lecter (Anthony Hopkins), a brilliant psychiatrist who is also a violent psychopath, serving life behind bars for various acts of murder and cannibalism. Crawford believes that Lecter may have insight into a case and that Starling, as an attractive young woman, may be just the bait to draw him out.\n", metadata={'source': 'https://www.imdb.com/title/tt0102926'}),
 Document(page_content='Title: Terminator 2: Judgment Day\nGenre:Action,Sci-Fi\nDescription:In this sequel set eleven years after "The Terminator," young John Connor (Edward Furlong), the key to civilization\'s victory over a future robot uprising, is the target of the shape-shifting T-1000 (Robert Patrick), a Terminator sent from the future to kill him. Another Terminator, the rev

## Estimate cost of embedding

OpenAI charges for embeddings based on the number of tokens processed. We count the tokens in each document with tiktoken
and derive the estimated cost from the total.

In [3]:
#import tiktoken
import tiktoken

# Build the tokenizer.
# cl100k_base is the encoding for the 'text-embedding-ada-002' model.
encoder = tiktoken.get_encoding("cl100k_base")

# Number of tokens in each document's page content, in document order
tokens_per_doc = [
    len(encoder.encode(document.page_content))
    for document in docs
]

# Cost estimate at $0.0001 per 1,000 tokens
cost_1k_token = 0.0001
total_tokens = sum(tokens_per_doc)
cost = total_tokens / 1000 * cost_1k_token
cost

0.037510100000000005

In [None]:
# Estimated cost is about $0.0375, i.e. roughly 3.75 cents

In [12]:
## Create index on Pinecone



In [11]:
#import pinecone
import os
import pinecone
# Initialise the Pinecone client. The API key is read from the environment so
# it is never stored in the notebook; a missing variable raises KeyError here.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment="gcp-starter",
)

# Fetch the list of indexes once and reuse it for both the printout and the
# existence check, so the two always agree and only one remote call is made.
existing_indexes = pinecone.list_indexes()
print(existing_indexes)

index_name = "imdb-movies"

# Create the index only when it does not exist yet, so re-running is safe
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        metric="cosine",
        # NOTE(review): 1536 presumably matches the vector size of the
        # embedding model used later in the notebook; confirm before changing.
        dimension=1536,
    )

[]
