### Install dependent packages

In [14]:
! pip install -qq datasets langchain_community langchain_text_splitters langchain_pinecone langchain_openai sentence_transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h

### Imports

In [48]:
import pandas as pd
import re
from datasets import load_dataset
from tqdm import tqdm

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage



In [7]:
# Load the raw movie articles; later cells read the "text" column and expose
# "id", "url", "title" and "release_year" as document metadata.
df = pd.read_csv("tamil-movies.csv")

In [15]:
# Keep only the part of each article that precedes the first "External links"
# marker; everything from the marker onward is discarded.
# Note: this overwrites df["text"] in place, so the untrimmed text is no longer
# available from `df` after this cell runs.
df["text"] = df["text"].str.split("External links").str[0]

In [17]:
# Write the trimmed frame to disk (without the pandas index column) so that the
# CSV loader below can read it back from a file path.
df.to_csv("movies_update.csv", index=False)

In [18]:
import sys
import csv

# Workaround for reading larger articles: raise the csv module's per-field size
# cap so very long "text" values can be parsed.
# sys.maxsize does not fit into a C long on every platform (e.g. Windows, where
# a C long is 32 bits) and field_size_limit then raises OverflowError, so fall
# back to the largest 32-bit signed value, which is accepted everywhere.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2**31 - 1)



# Read the trimmed CSV into LangChain documents. The columns named in
# `metadata_columns` are attached to each document's metadata instead of
# being part of its text content.
loader = CSVLoader(
    file_path="movies_update.csv",
    metadata_columns=["id", "url", "title", "release_year"],
)
langchain_docs = loader.load()

In [19]:
# Sanity check: number of documents returned by the loader.
len(langchain_docs)

4424

### Chunking

In [21]:
# Configure the chunker. Sizes are measured with `len`, i.e. in characters:
# chunks target at most 512 characters and neighbouring chunks may share up to
# 20 characters. No separators are passed, so the splitter's defaults are used,
# and they are treated as literal strings (is_separator_regex=False).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

In [22]:
# Split every loaded article into chunks; these chunks are what gets embedded
# and stored in the vector index.
docs = text_splitter.split_documents(documents=langchain_docs)

In [23]:
# Sanity check: number of chunks produced by the splitter.
len(docs)

64068

### Generate embeddings

In [5]:
import torch

# Pick the device at runtime instead of hard-coding 'cuda', so the notebook
# also runs (more slowly) on CPU-only machines where requesting CUDA would fail.
device = "cuda" if torch.cuda.is_available() else "cpu"

# No model name is passed, so the library's default sentence-transformers model
# is used; the same embeddings object must be used for indexing and retrieval.
embeddings = HuggingFaceEmbeddings(model_kwargs={"device": device})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Save into Pinecone

In [7]:
import os
from getpass import getpass

# Never write the key into the notebook: reuse PINECONE_API_KEY if it is already
# set in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Pinecone API key: ")

In [26]:
# Name of the Pinecone index that stores the chunk vectors.
index_name = "tamil-movies-4k"

# Embed every chunk with `embeddings` and store the result in the index above.
# NOTE(review): presumably the index must already exist with a dimension that
# matches the embedding model, and re-running this cell may upload the same
# chunks again — confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

### Retrieval

In [8]:
# Reconnect to the already-populated index. `index_name` is repeated here so the
# retrieval section does not depend on the upload cell having been run.
# The same `embeddings` object is passed so queries are embedded the same way
# as the stored chunks.
index_name = "tamil-movies-4k"
docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)

In [9]:
# Expose the vector store as a retriever; no arguments are passed, so the
# library's default search settings apply.
retriever = docsearch.as_retriever()

In [10]:
# Prompt text with two placeholders: {context} receives the retrieved passages
# and {question} receives the user's question.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

In [16]:
# Turn the raw template string into a chat prompt that the chains below fill in.
prompt = ChatPromptTemplate.from_template(template)

In [1]:
from getpass import getpass

# Ask for the OpenAI key interactively (input is not echoed) so the secret is
# never stored in the notebook source.
OPENAI_API_KEY = getpass()

In [20]:
import os

# Publish the key through the environment variable for the OpenAI client created below.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [27]:
# Chat model shared by every chain below; no arguments are passed, so the
# library's default model and sampling settings are used.
model = ChatOpenAI()

In [28]:
# Fan the incoming question out into the two prompt variables: "context" is
# filled by running the retriever on it, "question" passes it through unchanged.
rag_inputs = RunnableParallel(context=retriever, question=RunnablePassthrough())

# Basic RAG pipeline: build inputs -> fill prompt -> call model -> plain string.
chain = rag_inputs | prompt | model | StrOutputParser()

In [29]:
# Ask a single question; the chain returns the model's answer as a plain string.
chain.invoke("Which vijay movie is best ?")

'Based on the provided context, the movie "Thuppakki" seems to be considered the best Vijay movie, as it received positive reviews for Vijay\'s performance, the fast-paced narrative, and the engaging storyline.'

### Returning Sources

In [36]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    passages = [document.page_content for document in docs]
    return "\n\n".join(passages)

# Answering step: expects a dict that already holds the retrieved documents
# under "context"; replaces them with their joined text, then fills the prompt,
# calls the model and returns the answer as a plain string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"])
    )
    | prompt
    | model
    | StrOutputParser()
)

# Retrieval step plus answer: the result dict keeps the retrieved documents
# ("context") and the original "question", and gains an "answer" key.
rag_chain_with_source = RunnableParallel(
    context=retriever,
    question=RunnablePassthrough(),
).assign(answer=rag_chain_from_docs)

In [37]:
# The result is a dict holding the retrieved documents under "context" next to
# the "question" and the generated "answer".
rag_chain_with_source.invoke("What are the last 3 Vijay movie names?")

{'context': [Document(page_content="grossed  worldwide, becoming Vijay's third film to do so, following Thuppakki (2012), Kaththi (2014) and Puli (2015).", metadata={'id': '47133788', 'release_year': '2016', 'row': 2787.0, 'source': 'movies_update.csv', 'title': 'Theri (film)', 'url': 'https://en.wikipedia.org/wiki/Theri%20%28film%29'}),
  Document(page_content='Vijayan. The film released on 18 December 1992.', metadata={'id': '24199883', 'release_year': '1992', 'row': 1146.0, 'source': 'movies_update.csv', 'title': 'Meera (1992 film)', 'url': 'https://en.wikipedia.org/wiki/Meera%20%281992%20film%29'}),
  Document(page_content='films earlier made by director Priyadarshan.', metadata={'id': '49974654', 'release_year': '2017', 'row': 2875.0, 'source': 'movies_update.csv', 'title': 'Sangili Bungili Kadhava Thorae', 'url': 'https://en.wikipedia.org/wiki/Sangili%20Bungili%20Kadhava%20Thorae'}),
  Document(page_content="Casting \nFor the lead role, played by Ravi Teja in the original version

In [38]:
# This chain takes only the question string and has no chat-history input, so
# the follow-up below is sent to the retriever verbatim, without any knowledge
# of the previous question.
rag_chain_with_source.invoke("Tell me about the second movie")

{'context': [Document(page_content='Sequel\nA sequel of the film is being announced and also in development\n\nReferences', metadata={'id': '68723994', 'release_year': '2021', 'row': 4103.0, 'source': 'movies_update.csv', 'title': 'Kodiyil Oruvan', 'url': 'https://en.wikipedia.org/wiki/Kodiyil%20Oruvan'}),
  Document(page_content='had to be shown and this may overshoot the proposed budget", hence he decided to make the sequel into a digital format. Vetrimaaran also revealed that he planned the first film into a web series instead of making into a feature.', metadata={'id': '33933567', 'release_year': '2018', 'row': 1550.0, 'source': 'movies_update.csv', 'title': 'Vada Chennai', 'url': 'https://en.wikipedia.org/wiki/Vada%20Chennai'}),
  Document(page_content='this film as his second project".', metadata={'id': '14230594', 'release_year': '2008', 'row': 686.0, 'source': 'movies_update.csv', 'title': 'Jayamkondaan', 'url': 'https://en.wikipedia.org/wiki/Jayamkondaan'}),
  Document(page_co

### Chat History

In [41]:
# System instruction for the question-rewriting step: turn a follow-up that
# relies on earlier turns into a standalone question, without answering it.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
# Message layout: system instruction, then the prior turns supplied under
# "chat_history", then the latest user question.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
# Rewriting chain: expects "chat_history" and "question" and yields the
# model's reply as a plain string.
contextualize_q_chain = contextualize_q_prompt | model | StrOutputParser()

In [42]:
# System instruction for the answering step; {context} is replaced with the
# joined text of the retrieved passages.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Message layout: system instruction (with context), then the prior turns
# supplied under "chat_history", then the latest user question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def contextualized_question(input: dict):
    """Choose how the retrieval query is obtained for one chain invocation.

    Args:
        input: Chain input; must contain "question" when there is no chat
            history, and may contain "chat_history".

    Returns:
        The raw ``input["question"]`` when "chat_history" is missing or empty;
        otherwise the module-level ``contextualize_q_chain`` runnable, which
        rewrites the question using the history.

    Raises:
        KeyError: If there is no chat history and "question" is absent.
    """
    # Guard clause: with no prior turns there is nothing to resolve.
    if not input.get("chat_history"):
        return input["question"]
    return contextualize_q_chain


# Context branch: decide on the retrieval query (raw or rewritten question),
# fetch matching chunks, and join their text into a single string.
retrieve_context = RunnableLambda(contextualized_question) | retriever | format_docs

# History-aware RAG chain: add "context" to the input dict, fill the QA prompt
# and call the model. No output parser is applied, so the result is the
# model's message object rather than a plain string.
rag_chain = RunnablePassthrough.assign(context=retrieve_context) | qa_prompt | model

In [46]:
# Start a fresh conversation: with an empty history the question is used for
# retrieval as-is (no rewriting step).
chat_history = []

question1 = "What are the last 3 Vijay movies ?"
ai_msg = rag_chain.invoke({"question": question1, "chat_history": chat_history})

In [47]:
# Display the model's reply to the first question.
ai_msg

AIMessage(content='The top 3 Vijay movies are Thuppakki (2012), Kaththi (2014), and Mersal (2017).')

In [49]:
# Record the first exchange so the next question can refer back to it.
# Note: this appends in place, so re-running the cell adds the same pair again.
chat_history.extend([HumanMessage(content=question1), ai_msg])

In [50]:
# Inspect the accumulated conversation.
chat_history

[HumanMessage(content='What are the top 3 Vijay movies ?'),
 AIMessage(content='The top 3 Vijay movies are Thuppakki (2012), Kaththi (2014), and Mersal (2017).')]

In [51]:
# Follow-up that only makes sense with the previous turn: because chat_history
# is now non-empty, the question is first rewritten into a standalone query
# before retrieval.
second_question = "Who directed the second movie"
rag_chain.invoke({"question": second_question, "chat_history": chat_history})

AIMessage(content='The second movie, "Kaththi" (2014), was directed by A. R. Murugadoss.')