In [None]:
import hashlib
import os
from collections import OrderedDict

from dotenv import load_dotenv ## loads API keys
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Pull variables from a local .env file (if any) into os.environ.
load_dotenv()

# The Hugging Face Hub LLM used later in this notebook reads its token from the
# HUGGINGFACEHUB_API_TOKEN environment variable. The previous self-assignment
# (os.environ[k] = os.getenv(k)) was a no-op when the token was set and raised an
# opaque "TypeError: str expected, not NoneType" when it was not, so check
# explicitly and fail with an actionable message instead.
if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set. Add it to your .env file or "
        "export it in the environment before running this notebook."
    )

In [None]:
# Source PDF for the knowledge base.
# Other inputs tried earlier: "../grades_trim.pdf", "../speech.pdf"
loader = PyPDFLoader("../econs.pdf")
docs = loader.load()

# Break the loaded pages into overlapping chunks
# (chunk_size=2000, chunk_overlap=300).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=2000,
)
texts = text_splitter.split_documents(docs)

## Initialize embedding model
# Alternatives tried earlier:
#   HuggingFaceInstructEmbeddings(model_name="minishlab/potion-base-4M")
#   HuggingFaceEmbeddings(model_name="meta-llama/Meta-Llama-3-8B-Instruct")  -- errored out
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


In [None]:
# Re-create the splitter and re-split `docs` with the same settings as the
# loading cell (chunk_size=2000, chunk_overlap=300).
# NOTE(review): presumably kept as a separate cell so chunking can be re-run
# without reloading the PDF -- confirm, otherwise this duplicates the cell above.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=2000,
)
texts = text_splitter.split_documents(docs)

# To eyeball chunk boundaries and overlap, print `texts[i].page_content` and
# `len(texts[i].page_content)` for a few consecutive values of i (e.g. 1, 2, 3).

In [None]:
# Smoke test: embed the text of every chunk with `embed_model`.
# A failure is reported on stdout instead of being raised; `embeds` is not
# referenced by any other cell shown in this notebook.
try:
    chunk_texts = [chunk.page_content for chunk in texts]
    embeds = embed_model.embed_documents(chunk_texts)
    print("Vectors done!!!")
except Exception as err:
    print(f"Error in embed process: {err}")

In [None]:
## Vector store
# Chroma collection persisted under ./data; embeddings come from `embed_model`.
vector_store = Chroma(embedding_function=embed_model, persist_directory="data")

# Because the store is persistent, adding the chunks without explicit ids gave
# every chunk a fresh random id on each run, so re-running the notebook kept
# appending duplicate copies (which the test-query cell then has to filter out).
# Derive a deterministic id from each chunk's source, page and text instead:
# the Chroma wrapper writes with upsert semantics, so a repeated id overwrites
# the existing entry rather than adding a new one.
# NOTE: copies already stored by earlier runs keep their random ids; delete the
# "data" directory once to start from a clean store.
chunks_by_id = {}
for chunk in texts:
    fingerprint = "|".join(
        [
            str(chunk.metadata.get("source")),
            str(chunk.metadata.get("page")),
            chunk.page_content,
        ]
    )
    chunk_id = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
    # Identical chunks collapse to one entry; ids must be unique within a batch.
    chunks_by_id.setdefault(chunk_id, chunk)

# Skip the write entirely when there is nothing to add.
if chunks_by_id:
    _ = vector_store.add_documents(
        documents=list(chunks_by_id.values()),
        ids=list(chunks_by_id.keys()),
    )

In [None]:
# Retrieval smoke test: run one similarity search, drop hits whose text is
# identical, and keep at most three.
# `final_results` is initialised before the try block so the cells below still
# run (with an empty list) when the query fails, instead of hitting a NameError
# or silently reusing results from an earlier run.
final_results = []
try:
    test_query = "what is the view on inflation?"#"what is the state of the economy"
    # test_query = "what are the stress levels for the class?"
    # test_query = "whats current liquidity landscape?"
    results = vector_store.search(query=test_query, search_type='similarity')

    # Keyed by chunk text: the first occurrence wins and ranking order is kept.
    unique_results = OrderedDict()
    for doc in results:
        unique_results.setdefault(doc.page_content, doc)

    final_results = list(unique_results.values())[:3]
    if final_results:
        print(f"Top query results:\n{final_results[0].page_content}")
    else:
        # Previously an empty result surfaced as "list index out of range".
        print("No results returned for the test query.")
except Exception as e:
    print(f"Error during test query: {e}")

In [None]:
# Number of unique hits kept by the test-query cell (it slices to at most 3).
len(final_results)

In [None]:
# Dump the full text of every retained hit, each preceded by a "Content" banner.
for hit in final_results:
    print("\n\tContent\n", hit.page_content, sep="\n")

In [None]:
from langchain.llms import HuggingFaceHub

# LLM used by the RAG chain: google/flan-t5-xxl accessed through the Hugging Face
# Hub wrapper, with temperature 0.5 and max_length 512 passed as model kwargs.
# NOTE(review): HuggingFaceHub appears to be deprecated in recent LangChain
# releases in favour of HuggingFaceEndpoint -- confirm against the installed version.
hf_hub_llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.5, "max_length": 512},
)

In [None]:
# from langchain_huggingface import HuggingFaceEndpoint

# hf_hub_llm = HuggingFaceEndpoint(repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
#                                  temperature= 0.8, 
#                                  max_new_tokens= 1024)

In [None]:
# from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings

# help(HuggingFaceEmbeddings)

In [None]:
# from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings

# hf_hub_llm = HuggingFaceEmbeddings(repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
#                                            model_kwargs={"temperature": 0.8, "max_new_tokens": 1024}
#                                            )

In [None]:
# help(HuggingFaceEndpointEmbeddings)

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt given to the LLM. {context} and {question} are the two template
# variables declared on the PromptTemplate below; the trailing "Answer:" marker
# is what get_response() looks for when extracting the answer.
# Fixed the grammar of the third instruction line ("Provided your analysis as
# text response" -> "Provide your analysis as a text response") so the model
# receives a well-formed imperative.
prompt_template = """
You are a highly educated economist and central bank policy assistant, your role is to accurately interpret queries on the economy 
and provide responses using the specialized database provided in the Context.
Do not mention anything about charts or graphs. Provide your analysis as a text response with facts from the data provided.

Context: {context}

Question: {question}

Answer:
"""

# Alternative prompt used with the class-grades PDF:
# prompt_template = """
# You are a highly educated class teacher and your role is to accurately interpret situation in the class based on the specialized database.

# Query: {context}

# Question: {question}

# Answer:
# """

custom_prompt = PromptTemplate(input_variables=["context","question"], template=prompt_template)

In [None]:
from langchain.chains.retrieval_qa.base import RetrievalQA

# Assemble the question-answering chain:
#   - llm: the Hugging Face Hub model created above
#   - retriever: the Chroma store, asked for 3 documents per query (k=3)
#   - prompt: the custom economist prompt
#   - return_source_documents=True is passed so the chain output also carries
#     the retrieved documents (get_response only reads the "result" key).
rag_chain = RetrievalQA.from_chain_type(
    llm=hf_hub_llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={'k': 3}),
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True,
)

In [None]:
def get_response(question):
    """Run the RAG chain on ``question`` and return the answer text.

    The chain's ``"result"`` string may or may not contain the prompt's
    ``"Answer:"`` marker. When the marker is present, the text after its first
    occurrence is returned. When it is absent, the whole result is returned.
    The previous implementation computed ``find("Answer:") + len("Answer:")``,
    which evaluates to 6 when the marker is missing and therefore silently
    dropped the first six characters of the answer.

    Args:
        question: Query string passed to ``rag_chain`` under the ``"query"`` key.

    Returns:
        The extracted answer with leading/trailing whitespace stripped.
    """
    # `.invoke` replaces the deprecated direct call `rag_chain({...})`.
    result = rag_chain.invoke({"query": question})
    response_text = result["result"]
    # partition() yields an empty separator when the marker is not found.
    _, marker, tail = response_text.partition("Answer:")
    answer = tail if marker else response_text
    return answer.strip()

In [None]:
# print(rag_chain({"query": "what is the view on inflation?"}))

In [None]:
# Ask the chain a single question and print the extracted answer.
# Other questions tried earlier:
#   "whats current liquidity landscape?"
#   "what are the stress levels for the class?"
#   "what does the document say about union budget 2025-26?"
inflation_answer = get_response("what is the view on inflation?")
print(inflation_answer)