# In [None]:
import os
import pdfplumber
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS 
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

from dotenv import load_dotenv
load_dotenv()


files_directory = 'files'

# Collect the text of every page of every PDF found in files_directory.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
#   combined text (and therefore the chunk boundaries) reproducible between runs.
# - lower(): also picks up files whose extension is written as ".PDF".
# - list + join: avoids re-copying an ever-growing string on every page.
page_texts = []

for filename in sorted(os.listdir(files_directory)):
    if not filename.lower().endswith('.pdf'):
        continue
    # Open the pdf file; the context manager closes it again afterwards
    with pdfplumber.open(os.path.join(files_directory, filename)) as pdf:
        # Loop through all pages in the pdf file
        for page in pdf.pages:
            # Skip pages for which no text was extracted, so empty content cannot crash the run
            text = page.extract_text()
            if text:
                page_texts.append(text)

# Every page's text is followed by a single space, exactly as before.
combined_text = ''.join(text + ' ' for text in page_texts)

# print(combined_text)
# print(len(combined_text))
  

# Split the combined text into overlapping chunks (chunk_size=800, chunk_overlap=200)
# so that each chunk can be embedded and retrieved on its own.
splitter_options = {"chunk_size": 800, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
text_chunks = text_splitter.split_text(combined_text)

# print(len(text_chunks))

# Fail early with a clear message: with no chunks there is nothing to index, and
# building the FAISS store from an empty list would otherwise fail with an opaque error.
if not text_chunks:
    raise ValueError("No text chunks to index: no text was extracted from the PDF files.")

# Create an embeddings object using the HuggingFace model sentence-transformers/all-mpnet-base-v2
# Alternative model:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Ingest the chunks into the vector store (FAISS)

db = FAISS.from_texts(text_chunks, embeddings)

# Verify the index of the FAISS vector store to confirm it is populated
# print(db.index.ntotal)    

# Expose the FAISS vector store as a retriever that returns documents for a given query.
# search_kwargs={"k": 12} makes the retriever return the 12 most similar chunks.
retriever = db.as_retriever(search_kwargs={"k": 12})

# Check the default top_k value set at langchain which defines how many chunks it will return (in case it is set)
# print(retriever.search_kwargs)

# Test the retriever
# print(retriever.invoke("Who is the CEO of InnoEnergy?"))

# Create a RAG prompt template that includes both the context and the question posted by the user.
# The goal of the prompt is to translate user input and parameters into clear instructions for the OpenAI model.
# It helps the model to understand the context and answer the question posted by the user.
# The {context} and {input} placeholders are filled in when the prompt is formatted.

template = """
    Answer the question based only on the following context:
    {context}

    Question: {input}
"""

# Create the prompt based on the template. This converts the raw string into a LangChain prompt object that can be fed into an LLM chain.

prompt = ChatPromptTemplate.from_template(template)

# Set up the chat model (gpt-4o; gpt-3.5-turbo is the alternative) used by the RAG chain.
# An API key for the model (OPENAI_API_KEY; available through .env) is required.

chat_model_name = "gpt-4o"  # alternative: gpt-3.5-turbo
llm = ChatOpenAI(model=chat_model_name)

# Construct the chain that will be used to generate a response based on a set of documents and a question
# In LangChain, a "stuff" chain takes all the retrieved documents, "stuffs" them into the prompt, and sends that prompt to the LLM to get an answer.
# It does not retrieve documents itself; it only formats and combines them and sends them to the LLM.

# combine_docs_chain = create_stuff_documents_chain(llm, prompt)

# Create the retrieval chain(RAG pipeline).
# It connects two things: Retriever fetches relevant documents from the vector store and combine_docs_chain uses those documents plus prompt to answer the question
# The final chain does:
#    - Take user question
#    - Use the retriever to get relevant chunks
#    - Pass those chunks into the "stuff" chain
#    - Generate the final answer

# rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

# LangChain Expression Language (LCEL) RAG pipeline


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt receives the retriever's raw output (a list
    of Document objects), so ``{context}`` is filled with that list's string
    form, metadata and escaped newlines included, instead of the chunk text.

    Args:
        docs: Documents returned by the retriever; each exposes ``page_content``.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {
        # Retrieve the relevant chunks and keep only their text.
        "context": retriever | _format_docs,
        # Pass the user question through unchanged.
        "input": RunnablePassthrough(),
    }
    | prompt  # formats prompt
    | llm  # generates answer
)

# Invoke the chain by submitting queries.
# The chain ends at the chat model, so each result is a message object; print only
# its text (.content) instead of the whole object with its metadata.

questions = [
    "What did the Diego Pavia (D.P.), CEO of InnoEnergy mention about the expected dynamics in climate tech in the next few months?",
    "What is mentioned on Capgemini's net-zero goals in the report?",
]

for question in questions:
    print(rag_chain.invoke(question).content)
