In [None]:
# Langchain components to use 
from langchain_community.vectorstores.cassandra import Cassandra
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset
# DB connection
import cassio

from PyPDF2 import PdfReader
from typing import Concatenate

import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups in later cells can find the API keys.
load_dotenv()


In [None]:
# Open the source PDF. The path is relative to the notebook's working directory.
pdfreader = PdfReader("Budget_Speech_English.pdf")

In [None]:
# Extract the text of every page, then concatenate the non-empty results into a
# single string. Pages for which extract_text() returns nothing are skipped.
# No separator is inserted between pages (same as the original += loop).
page_texts = [page.extract_text() for page in pdfreader.pages]
row_text = "".join(text for text in page_texts if text)

In [None]:
# Sanity-check the extraction with a short preview only: printing the whole
# document floods the cell output and bloats the saved notebook.
print(row_text[:1000])
print("... [%d characters extracted in total]" % len(row_text))

In [None]:
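# Astra DB credentials for the cassio.init() call in the next cell.
# Prefer supplying them through environment variables / the .env file instead of
# hard-coding real values here, and never commit real secrets.
#ASTRA_DB_APPLICATION_token=""
#ASTRA_DB_ID=""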

In [None]:
# Connect cassio to Astra DB.
# The two credentials are read from the environment (populated from .env by
# load_dotenv() in the first cell) so that a fresh "Restart & Run All" works;
# previously they were only defined in commented-out lines, which made this
# cell fail with NameError.
ASTRA_DB_APPLICATION_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")

if not ASTRA_DB_APPLICATION_token or not ASTRA_DB_ID:
    # Fail early with an actionable message instead of an opaque driver error.
    raise RuntimeError(
        "Set ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_ID in the environment "
        "(or in the .env file) before initialising cassio."
    )

cassio.init(token=ASTRA_DB_APPLICATION_token, database_id=ASTRA_DB_ID)

In [None]:
# API key for the Groq chat model; None when GROQ_API_KEY is not set.
GUROQ_API = os.getenv("GROQ_API_KEY")

# Re-export HF_TOKEN only when it is actually present. os.environ accepts str
# values only, so assigning the None that os.getenv() returns for a missing
# variable raised TypeError and stopped the notebook.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

In [None]:
# Model name passed to ChatGroq when the LLM is created in the next cell.
GROQ_MODEL= "llama-3.1-8b-instant"
# Model name passed to HuggingFaceEmbeddings when the embedder is created.
HF_MODEL = "all-MiniLM-L6-v2"

In [None]:
# Chat model client: answers the final, context-augmented prompt.
llm = ChatGroq(
    model=GROQ_MODEL,
    groq_api_key=GUROQ_API,
)

# Embedding model: used by the vector store to embed chunks and queries.
embedding = HuggingFaceEmbeddings(model=HF_MODEL)

In [None]:
# Vector store over the "qa_mini_demo" table, embedding with `embedding`.
# session and keyspace are passed as None explicitly; presumably the store then
# falls back to the connection set up by cassio.init() -- confirm against the
# Cassandra vector store documentation.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Split the extracted text on newlines into chunks of at most 800 characters
# (measured with len), where consecutive chunks overlap by up to 200 characters.
splitter_options = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 200,
    "length_function": len,
}
text_spilitter = CharacterTextSplitter(**splitter_options)
texts = text_spilitter.split_text(row_text)

In [None]:
# Display the first 50 chunks -- the same slice the next cell inserts into the
# vector store.
texts[:50]

In [None]:
# Index only the first 50 chunks to keep the demo small.
# NOTE(review): re-running this cell calls add_texts() again; presumably that
# inserts the same chunks a second time -- clear the table or pass stable `ids`
# if duplicates matter.
texts_to_insert = texts[:50]
astra_vector_store.add_texts(texts_to_insert)

print("Inserted %i text chunks." % len(texts_to_insert))

# Build the retriever from the vector store itself.
# Search parameters must be passed through `search_kwargs`: a bare `k=1`
# keyword is not a retriever setting and does not limit the number of
# documents returned, so the intended top-1 retrieval was never applied.
astra_vecto_retriever = astra_vector_store.as_retriever(search_kwargs={"k": 1})

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# 1. Define the prompt template.
# It has two placeholders: {context} receives the retrieved text and
# {question} receives the user's query. The wording tells the model to admit
# when it does not know the answer.
prompt_template = """
Use the following context to answer the user's question.
If you don't know the answer, just say you don't know.

Context:
{context}

Question: {question}
"""
# Wrap the raw string in a chat prompt so it can be piped into the LLM.
rag_prompt = ChatPromptTemplate.from_template(prompt_template)

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined with one blank line between them;
        an empty string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# 2. Build the LCEL chain, one named stage at a time.

# Retrieval branch: fetch documents for the query and flatten them to text.
retrieve_context = astra_vecto_retriever | format_docs

# Fan the incoming query out into the two variables the prompt expects:
# "context" comes from retrieval, "question" is the query passed through as-is.
chain_inputs = RunnableParallel(
    {
        "context": retrieve_context,
        "question": RunnablePassthrough(),
    }
)

# Fill the prompt, call the LLM, and reduce the reply to a plain string.
rag_chain = chain_inputs | rag_prompt | llm | StrOutputParser()

In [None]:
# Interactive question/answer loop.
# Reads questions until the user types "quit"; blank input is ignored. For each
# question it prints the chain's answer followed by the four most similar
# stored chunks with their similarity scores.
first_question = True
while True:
    # The first prompt previously started with "/n", which printed the two
    # literal characters instead of a line break.
    if first_question:
        prompt_text = "\nEnter Your question (or type 'quit' to exit)"
    else:
        prompt_text = "\nWhat's your next question (or type 'quit' to exit)"
    query_text = input(prompt_text).strip()

    if query_text.lower() == "quit":
        break
    if query_text == "":
        continue
    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = rag_chain.invoke(query_text).strip()
    print("ANSWER: \"%s\"" % answer)

    # Show the supporting chunks so the answer can be checked against the source.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        # Only the first 84 characters of each chunk are shown.
        print(" [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))