In [None]:
!pip install -q langchain faiss-cpu sentence-transformers==2.2.2 InstructorEmbedding pypdf
!pip install langchain PyPDF2 faiss-cpu huggingface-hub pandas
!pip install -U langchain-community

In [None]:
!pip install -U langchain-community faiss-cpu
!pip install --upgrade --quiet langchain-google-genai
!pip install gradio

from langchain.document_loaders import TextLoader
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
import pickle

In [25]:
from langchain_google_genai import GoogleGenerativeAI

import getpass
import os
import gradio as gr

# Prompt for the Gemini key only when it is not already present, so that
# re-running this cell (or running with a pre-configured environment) does
# not ask again. To enter a different key, remove GOOGLE_API_KEY from
# os.environ (or restart the kernel) and run the cell once more.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

··········


In [26]:
def read_pdf(file):
    """Return the text of every page of a PDF as a single string.

    Args:
        file: Passed straight to ``PdfReader``; this notebook supplies a
            file path.

    Returns:
        str: The extracted page texts concatenated in page order, with no
        separator inserted between pages. Empty when the PDF has no pages.
    """
    reader = PdfReader(file)
    # A page whose extraction yields None (or an empty value) contributes
    # nothing instead of raising TypeError on concatenation. Joining once
    # also avoids rebuilding the growing string on every page.
    return "".join(page.extract_text() or "" for page in reader.pages)

def read_txt(file):
    """Return the contents of an in-memory text file as a string.

    Args:
        file: Object exposing ``getvalue()`` (for example ``io.BytesIO``,
            ``io.StringIO`` or an uploaded-file wrapper).

    Returns:
        str: The file text with a single space inserted after every
        ``\\n`` and ``\\r`` character.
    """
    raw = file.getvalue()
    if isinstance(raw, (bytes, bytearray)):
        # str(bytes) would return the "b'...'" repr (escaped newlines,
        # mangled non-ASCII characters); decode the payload instead.
        document = bytes(raw).decode("utf-8", errors="replace")
    else:
        document = str(raw)

    # Pad each line break with a space so words on adjacent lines stay
    # separated once the text is chunked.
    return document.replace("\n", "\n ").replace("\r", "\r ")

# Document Splitting
def split_doc(document, chunk_size, chunk_overlap):
    """Cut ``document`` into chunks and return them as splitter documents.

    Args:
        document: The full text to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        Whatever ``create_documents`` produces for the text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    # First obtain the plain-text chunks, then hand them to
    # create_documents to get document objects.
    chunks = text_splitter.split_text(document)
    return text_splitter.create_documents(chunks)

In [5]:
# def embedding_storing(model_name, split, new_vs_name):
#         embeddings = GoogleGenerativeAIEmbeddings(model=model_name)

#         db = FAISS.from_documents(split, embeddings)
#         db.save_local("vector_store/" + new_vs_name)


In [27]:
def prepare_rag_llm(
    token, llm_model, embeddings_name, temperature, max_length, split, new_vs_name
):
    """Build a conversational retrieval chain over the given document chunks.

    The chunks are embedded into a FAISS index, the index is persisted under
    ``vector_store/<new_vs_name>``, and a ``ConversationalRetrievalChain`` is
    created on top of it with a two-turn window memory.

    Args:
        token: Google API key handed to the LLM (and to the embedding client
            when it is non-empty).
        llm_model: Model name for ``GoogleGenerativeAI``.
        embeddings_name: Model name for ``GoogleGenerativeAIEmbeddings``.
        temperature: Sampling temperature for the LLM.
        max_length: Upper bound on generated tokens per answer.
        split: Documents to index (the output of ``split_doc``).
        new_vs_name: Folder name of the persisted index inside
            ``vector_store/``.

    Returns:
        The configured ``ConversationalRetrievalChain``; its result dict
        carries the reply under ``"answer"`` and includes source documents.
    """
    # Use the caller's key for embeddings as well as for the LLM. Without a
    # token the constructor is called exactly as before.
    embedding_kwargs = {"model": embeddings_name}
    if token:
        embedding_kwargs["google_api_key"] = token
    embeddings = GoogleGenerativeAIEmbeddings(**embedding_kwargs)

    db = FAISS.from_documents(split, embeddings)
    # Persist the index for later reuse. The freshly built store is used
    # directly below, so there is no need to read it back from disk through
    # pickle (allow_dangerous_deserialization).
    db.save_local(f"vector_store/{new_vs_name}")

    llm = GoogleGenerativeAI(
        model=llm_model,
        google_api_key=token,
        temperature=temperature,
        # The output-length field of the Gemini wrapper is
        # `max_output_tokens`; `max_length` is not a field of this class.
        max_output_tokens=max_length,
    )

    # Keep only the last two exchanges. output_key tells the memory which of
    # the chain's outputs (answer vs. source documents) to store.
    memory = ConversationBufferWindowMemory(
        k=2,
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(),
        return_source_documents=True,
        memory=memory,
    )

In [28]:
# Model and generation settings for this run.
llm_model = "gemini-pro"
instruct_embeddings = "models/embedding-001"
vector_store_list = "insti_vector_store"
temperature = 0.7
max_length = 512
token = os.getenv("GOOGLE_API_KEY")

# Read the source PDF and cut its text into overlapping chunks.
pdf_file_path = "/content/drive/MyDrive/FINAL_SCRAPED_DATA.pdf"
document = read_pdf(pdf_file_path)
split = split_doc(document, chunk_size=1000, chunk_overlap=200)

# Embedding and persisting the chunks happens inside prepare_rag_llm, which
# returns the chain used by the chat UI further down.
qa_conversation = prepare_rag_llm(
    token=token,
    llm_model=llm_model,
    embeddings_name=instruct_embeddings,
    temperature=temperature,
    max_length=max_length,
    split=split,
    new_vs_name="insti_vector_store",
)

# Save the necessary objects for reuse
# with open('/content/drive/MyDrive/qa_conversation.pkl', 'wb') as f:
#     pickle.dump(qa_conversation, f)

In [31]:
# def test_qa_conversation(qa_conversation, query="What is the document above?"):
#   input_dict = {
#       "question": query,
#       "chat_history": []
#   }
#   response = qa_conversation(input_dict)
#   return response

# response = test_qa_conversation(qa_conversation)

# print(response)

In [29]:
def chatbot_interface(user_input, history):
    """Answer one chat turn and return the updated conversation.

    Args:
        user_input: Text typed by the user.
        history: List of ``(question, answer)`` tuples from earlier turns,
            or ``None``/empty on the first turn.

    Returns:
        tuple: ``(history, history)`` — the same updated list twice, once for
        the chatbot display and once for the session state.
    """
    # Work on a copy so the list object passed in by the caller is never
    # mutated in place.
    turns = list(history or [])

    # Nothing to ask: leave the conversation unchanged instead of sending an
    # empty question to the model.
    if not user_input or not user_input.strip():
        return turns, turns

    response = qa_conversation({"question": user_input, "chat_history": turns})
    updated = turns + [(user_input, response["answer"])]
    return updated, updated

In [30]:
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown("# InstiGPT")

    # Conversation display plus the per-session history it is fed from.
    chatbot = gr.Chatbot()
    state = gr.State([])

    # Prompt box and send button side by side.
    with gr.Row():
        txt = gr.Textbox(
            placeholder="Enter prompt.",
            show_label=False,
            scale=3,
            lines=1,
        )
        btn = gr.Button("Send")

    # Pressing Enter in the box and clicking the button run the same handler
    # with the same inputs and outputs.
    for register in (txt.submit, btn.click):
        register(chatbot_interface, [txt, state], [chatbot, state])

# Start the app with a public share link.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://319efb3d57b868481a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


