In [1]:
'''' Objective: To demonstrate RAG techniques using Open AI API and Pinecone'''

#versions used
!pip install pinecone-client==2.2.2 openai==0.28.0 tiktoken==0.5.1 langchain==0.0.291

In [2]:
 #Import pandas as pd
import pandas as pd
import numpy as np

# Read the raw IMDB export
IMDB_df = pd.read_csv("IMDB.csv")

# Rename the title/description columns and derive the IMDB page link
# for every row from its "tconst" identifier
column_renames = {
    "primaryTitle": "movie_title",
    "Description": "movie_description",
}
movies = IMDB_df.rename(columns=column_renames).assign(
    source=lambda frame: "https://www.imdb.com/title/" + frame["tconst"]
)

# Keep only rows whose titleType is "movie", and only the columns used later on
is_movie = movies["titleType"] == "movie"
kept_columns = ["movie_title", "movie_description", "source", "genres"]
movies = movies.loc[is_movie, kept_columns]

# Preview the first rows
movies.head()


Unnamed: 0,movie_title,movie_description,source,genres
0,The Silence of the Lambs,"Jodie Foster stars as Clarice Starling, a top ...",https://www.imdb.com/title/tt0102926,"Crime,Drama,Thriller"
1,Terminator 2: Judgment Day,"In this sequel set eleven years after ""The Ter...",https://www.imdb.com/title/tt0103064,"Action,Sci-Fi"
2,The Lion King,This Disney animated feature follows the adven...,https://www.imdb.com/title/tt0110357,"Adventure,Animation,Drama"
3,Pulp Fiction,Vincent Vega (John Travolta) and Jules Winnfie...,https://www.imdb.com/title/tt0110912,"Crime,Drama"
4,The Shawshank Redemption,Andy Dufresne (Tim Robbins) is sentenced to tw...,https://www.imdb.com/title/tt0111161,Drama


In [3]:
# Import DataFrameLoader
from langchain.document_loaders import DataFrameLoader

# Build the text that is embedded for each movie.
# Missing values are replaced with "" first: concatenating a string with NaN
# yields NaN, which would turn the entire page_content of that row into NaN.
title_text = movies["movie_title"].fillna("")
genre_text = movies["genres"].fillna("")
description_text = movies["movie_description"].fillna("")

movies["page_content"] = (
    "Title: " + title_text + "\n"
    + "Genre:" + genre_text + "\n"
    + "Description:" + description_text + "\n"
)

# Drop all columns except for page_content and source
movies = movies[["page_content", "source"]]

# Load the documents from the dataframe into docs.
# The page content column is 'page_content'; 'source' is carried along as
# document metadata.
docs = DataFrameLoader(movies, page_content_column="page_content").load()

# Show the first 3 documents
docs[:3]

[Document(page_content="Title: The Silence of the Lambs\nGenre:Crime,Drama,Thriller\nDescription:Jodie Foster stars as Clarice Starling, a top student at the FBI's training academy. Jack Crawford (Scott Glenn) wants Clarice to interview Dr. Hannibal Lecter (Anthony Hopkins), a brilliant psychiatrist who is also a violent psychopath, serving life behind bars for various acts of murder and cannibalism. Crawford believes that Lecter may have insight into a case and that Starling, as an attractive young woman, may be just the bait to draw him out.\n", metadata={'source': 'https://www.imdb.com/title/tt0102926'}),
 Document(page_content='Title: Terminator 2: Judgment Day\nGenre:Action,Sci-Fi\nDescription:In this sequel set eleven years after "The Terminator," young John Connor (Edward Furlong), the key to civilization\'s victory over a future robot uprising, is the target of the shape-shifting T-1000 (Robert Patrick), a Terminator sent from the future to kill him. Another Terminator, the rev

## Estimate cost of embedding

OpenAI pricing is based on the number of tokens. We will count the tokens in the text with tiktoken
and derive the cost from that count.

In [5]:
# tiktoken is used to count tokens
import tiktoken

# "cl100k_base" is the encoding for the 'text-embedding-ada-002' model
encoder = tiktoken.get_encoding("cl100k_base")

# Token count of every document's page content
tokens_per_doc = list(
    map(lambda document: len(encoder.encode(document.page_content)), docs)
)

# Price assumed for the estimate: $0.0001 per 1000 tokens
cost_1k_token = 0.0001
total_tokens = sum(tokens_per_doc)
cost = (total_tokens/1000) * cost_1k_token

# Estimated total in dollars; the recorded run gives roughly $0.0375,
# i.e. under 4 cents, so the cost is negligible
cost

0.037510100000000005

## Create index on Pinecone



In [6]:
#import pinecone
import os
import pinecone
# Kept at the top of the cell: the original placement (indented by two spaces
# after the `if` body) raised an IndentationError.
from tqdm.autonotebook import tqdm

# initialize pinecone; the API key is read from the environment so it never
# appears in the notebook
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment="gcp-starter",
)

index_name = "imdb-movies"

# Fetch the list of indexes once and reuse it for the printout and the check
existing_indexes = pinecone.list_indexes()
print(existing_indexes)

# Create the index only when it does not exist yet
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        metric="cosine",
        # NOTE(review): must equal the embedding model's vector size
        # (1536 for text-embedding-ada-002) - confirm if the model changes
        dimension=1536,
    )


['imdb-movies']


In [7]:
# Embedding wrapper, LangChain's Pinecone vector store and the Pinecone index client
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from pinecone.index import Index

# Object used to embed documents and queries
embeddings = OpenAIEmbeddings()

# Handle on the Pinecone index
index = Index(index_name)

# Number of vectors currently stored in the index
stored_vector_count = index.describe_index_stats()['total_vector_count']

# Reuse the stored vectors when the index already holds data;
# otherwise embed the documents and fill the index with them
docsearch = (
    Pinecone.from_existing_index(index_name, embeddings)
    if stored_vector_count > 0
    else Pinecone.from_documents(docs, embeddings, index_name=index_name)
)

question = "What's a good sports movie with cricket?"

# Use the vector database as a retriever and get the relevant documents for the question
docsearch.as_retriever().get_relevant_documents(question)

[Document(page_content='Title: Jersey\nGenre:Drama,Sport\nDescription:A middle-aged cricketer whose career has failed to take off is pressured to give up on his dream.\n', metadata={'source': 'https://www.imdb.com/title/tt8948790'}),
 Document(page_content='Title: Iqbal\nGenre:Drama,Sport\nDescription:A young deaf man wants to play cricket, but his father refuses to let him play. With help from his sister, they find a retired cricket player to teach him how to play.\n', metadata={'source': 'https://www.imdb.com/title/tt0453729'}),
 Document(page_content="Title: Lagaan: Once Upon a Time in India\nGenre:Drama,Musical,Sport\nDescription:The year is 1893 and India is under British occupation. In a small village, the tyrannical Captain Russell (Paul Blackthorne) has imposed an unprecedented land tax on its citizens. Outraged, Bhuvan (Aamir Khan), a rebellious farmer, rallies the villagers to publicly oppose the tax. Russell offers a novel way to settle the dispute: he challenges Bhuvan and 

In [None]:
## Prompt

In [21]:
# Import PromptTemplate
from langchain.prompts import PromptTemplate

# Read/adapt the prompts below at will

# Applied to each retrieved document: its text, its IMDB link, then a separator.
DOCUMENT_TEMPLATE = """{page_content}
IMDB link: {source}
========="""

# Few-shot question prompt. Fixes to the worked example:
#   * the example SOURCE now points at Iqbal (tt0453729), the movie the example
#     answer recommends, instead of a link that is not among the example documents;
#   * the Lagaan entry no longer contains pasted "\nGenre:...\nDescription:" debris;
#   * the stray quote after "FINAL ANSWER:" is removed.
QUESTION_TEMPLATE = """Given the following extracted parts of a movie database and a question, create a final answer with the IMDB link as source ("SOURCE").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCE" part in your answer.

QUESTION: What's a good sports movie with cricket to watch for kids?
=========
Title: Jersey
Genre: Drama,Sport
Description: A middle-aged cricketer whose career has failed to take off is pressured to give up on his dream.
IMDB link: https://www.imdb.com/title/tt8948790
=========
Title: Iqbal
Genre: Drama,Sport
Description: A young deaf man wants to play cricket, but his father refuses to let him play. With help from his sister, they find a retired cricket player to teach him how to play.
IMDB link: https://www.imdb.com/title/tt0453729
=========
Title: Lagaan: Once Upon a Time in India
Genre: Drama,Musical,Sport
Description: The year is 1893 and India is under British occupation. In a small village, the tyrannical Captain Russell (Paul Blackthorne) has imposed an unprecedented land tax on its citizens. Outraged, Bhuvan (Aamir Khan), a rebellious farmer, rallies the villagers to publicly oppose the tax. Russell offers a novel way to settle the dispute: he challenges Bhuvan and his men to a game of cricket, a sport completely foreign to India. If Bhuvan and his men can defeat Russell's team, the tax will be repealed.
IMDB link: https://www.imdb.com/title/tt0169102
=========
FINAL ANSWER: Iqbal is a movie about a kid's challenges and how he overcomes them. It would be a good movie to watch with a kid.
SOURCE: https://www.imdb.com/title/tt0453729

QUESTION: {question}
=========
{summaries}
FINAL ANSWER:"""

# Create prompt template objects. The raw strings keep their own names so that
# DOCUMENT_PROMPT / QUESTION_PROMPT are only ever bound to PromptTemplate objects.
DOCUMENT_PROMPT = PromptTemplate.from_template(DOCUMENT_TEMPLATE)

QUESTION_PROMPT = PromptTemplate.from_template(QUESTION_TEMPLATE)

In [23]:
# First attempt: ask before a movie from 2023 has been added to the documents
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI

# Chat model that writes the final answer
chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Prompts handed to the "stuff" chain: one for the overall question,
# one for formatting each retrieved document
prompt_settings = dict(prompt=QUESTION_PROMPT, document_prompt=DOCUMENT_PROMPT)

# Question-answering bot backed by the vector store retriever
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs=prompt_settings,
)

# Ask the question
question_new = "What is a good movie on cricketer Muttiah Muralitharan?"
qa_with_sources(question_new)

{'question': 'What is a good movie on cricketer Muttiah Muralitharan?',
 'answer': "I don't know of any specific movies about cricketer Muttiah Muralitharan.",
 'sources': ''}

In [None]:
# Next, we try again after adding a new movie from 2023



In [25]:
# Import PromptTemplate
from langchain.prompts import PromptTemplate

# Add the 2023 movie to the vector store so that it is actually *retrieved*
# for a matching question. Pasting it into the few-shot example of the prompt
# (as was done before) hands it to the model on every call and bypasses retrieval.
NEW_MOVIE_ID = "tt27539086"
NEW_MOVIE_TEXT = (
    "Title: 800\n"
    "Genre:Sports,Biopic\n"
    "Description:The biopic promises to bring Muttiah Muralitharan's inspirational journey "
    "to a global audience, shedding light on the challenges and triumphs that defined his "
    "illustrious cricket career.\n"
)

# A fixed id is passed so that re-running the cell is intended to overwrite the
# same vector instead of inserting a duplicate.
docsearch.add_texts(
    texts=[NEW_MOVIE_TEXT],
    metadatas=[{"source": "https://www.imdb.com/title/" + NEW_MOVIE_ID}],
    ids=[NEW_MOVIE_ID],
)

# Read/adapt the prompts below at will

# Applied to each retrieved document: its text, its IMDB link, then a separator.
DOCUMENT_TEMPLATE = """{page_content}
IMDB link: {source}
========="""

# Few-shot question prompt. Fixes to the worked example:
#   * the example SOURCE now points at Iqbal (tt0453729), the movie the example
#     answer recommends, instead of a link that is not among the example documents;
#   * the Lagaan entry no longer contains pasted "\nGenre:...\nDescription:" debris;
#   * the stray quote after "FINAL ANSWER:" is removed.
QUESTION_TEMPLATE = """Given the following extracted parts of a movie database and a question, create a final answer with the IMDB link as source ("SOURCE").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCE" part in your answer.

QUESTION: What's a good sports movie with cricket to watch for kids?
=========
Title: Jersey
Genre: Drama,Sport
Description: A middle-aged cricketer whose career has failed to take off is pressured to give up on his dream.
IMDB link: https://www.imdb.com/title/tt8948790
=========
Title: Iqbal
Genre: Drama,Sport
Description: A young deaf man wants to play cricket, but his father refuses to let him play. With help from his sister, they find a retired cricket player to teach him how to play.
IMDB link: https://www.imdb.com/title/tt0453729
=========
Title: Lagaan: Once Upon a Time in India
Genre: Drama,Musical,Sport
Description: The year is 1893 and India is under British occupation. In a small village, the tyrannical Captain Russell (Paul Blackthorne) has imposed an unprecedented land tax on its citizens. Outraged, Bhuvan (Aamir Khan), a rebellious farmer, rallies the villagers to publicly oppose the tax. Russell offers a novel way to settle the dispute: he challenges Bhuvan and his men to a game of cricket, a sport completely foreign to India. If Bhuvan and his men can defeat Russell's team, the tax will be repealed.
IMDB link: https://www.imdb.com/title/tt0169102
=========
FINAL ANSWER: Iqbal is a movie about a kid's challenges and how he overcomes them. It would be a good movie to watch with a kid.
SOURCE: https://www.imdb.com/title/tt0453729

QUESTION: {question}
=========
{summaries}
FINAL ANSWER:"""

# Create prompt template objects. The raw strings keep their own names so that
# DOCUMENT_PROMPT / QUESTION_PROMPT are only ever bound to PromptTemplate objects.
DOCUMENT_PROMPT = PromptTemplate.from_template(DOCUMENT_TEMPLATE)

QUESTION_PROMPT = PromptTemplate.from_template(QUESTION_TEMPLATE)

In [26]:
# Rebuild the question bot so that it uses the prompt objects currently bound
# to QUESTION_PROMPT and DOCUMENT_PROMPT
qa_chain_options = {
    "chain_type": "stuff",
    "llm": ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    "chain_type_kwargs": {
        "prompt": QUESTION_PROMPT,
        "document_prompt": DOCUMENT_PROMPT,
    },
    "retriever": docsearch.as_retriever(),
}
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(**qa_chain_options)

# Ask the same question as before
question_new = "What is a good movie on cricketer Muttiah Muralitharan?"
qa_with_sources(question_new)

{'question': 'What is a good movie on cricketer Muttiah Muralitharan?',
 'answer': "'800' is a biopic about cricketer Muttiah Muralitharan. It showcases his inspirational journey and highlights the challenges and triumphs of his cricket career.\n",
 'sources': 'https://www.imdb.com/title/tt27539086'}

In [None]:
# Using RAG, we added a relevant document from beyond ChatGPT's training cutoff, and ChatGPT was able to return the desired response.
