In [None]:
%pip install git+https://github.com/huggingface/transformers
%pip install langchain chromadb pypdf openai sentence-transformers accelerate langchain-community python-docx bitsandbytes streamlit pyngrok huggingface_hub

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-vg3pc7bd
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-vg3pc7bd
  Resolved https://github.com/huggingface/transformers to commit 01be5b48790f113b7d71943b580c842e3e097988
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.44.0.dev0-py3-none-any.whl size=9412956 sha256=fbd1174c5515cab008b003f01a78e173d199dd61f766f0f0be6f4d455efc3013
  Stored in directory: /tmp/pip-ephem-wheel-cache-rlqbi68t/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers

In [None]:
# !pip install huggingface_hub
from huggingface_hub import notebook_login
import transformers

notebook_login()

# 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

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Streamlit interface

In [None]:
%%writefile app.py

import streamlit as st
import warnings
warnings.filterwarnings("ignore")

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import torch
import pandas as pd
import pathlib
import docx
from langchain.docstore.document import Document
import os
import time
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import bitsandbytes as bnb

def clear_memory():
    torch.cuda.empty_cache()

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])

def load_all_files(directory_path):
    data = []
    for file_path in pathlib.Path(directory_path).glob("*"):
        if file_path.suffix == '.csv':
            df = pd.read_csv(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
        elif file_path.suffix == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                data.append(Document(page_content=content))
        elif file_path.suffix == '.docx':
            content = read_docx(file_path)
            data.append(Document(page_content=content))
        elif file_path.suffix == '.xlsx':
            df = pd.read_excel(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
    return data

def interpret_files(documents):
    print(f"Total documents loaded: {len(documents)}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    return texts

def create_embeddings():
    print("Creating embeddings")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cuda'}
    )
    return embeddings

def save(texts, embeddings):
    print("Saving data")
    db=Chroma.from_documents(texts,embedding=embeddings,persist_directory="test_index")
    db.persist()

def load_llm(model_name):
    model_name = model_name.lower()
    print("Loading LLM")
    if model_name == "phi3" or model_name == "phi-3":
        print("Loading Phi-3 model")
        tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
        model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", device_map='auto', torch_dtype="auto", trust_remote_code=True,)

    elif model_name == "llama":
        print("Loading Llama model")
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
        model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map='auto', torch_dtype=torch.float16, trust_remote_code=True,)

    elif model_name == "gemma":
        print("Loading GEMMA model")
        model_name = "google/gemma-2-9b-it"
        print("Loading Gemma model")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map='auto',
            load_in_8bit=True,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )

    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

def retrieve_docs(embeddings, llm):
    print("Retrieving documents")

    vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)
    retriever = vectordb.as_retriever(search_kwargs={"k": 2})

    print("Retrieved documents")

    qna_prompt_template = """Use the following pieces of information to answer the user's question. 
    If the provided context does not contain the answer, use your general knowledge to provide a helpful response.
    Context: {context}
    Question: {question}
    Answer:"""

    PROMPT = PromptTemplate(
       template=qna_prompt_template, input_variables=["context","question"]
    )

    print("Sending the chain")
    chain = RetrievalQA.from_chain_type(llm=llm,
                                        chain_type='stuff',
                                        retriever=retriever,
                                        return_source_documents=True,
                                        chain_type_kwargs={'prompt': PROMPT})

    if chain:
        print("Chain created")
    return chain

def answer_question(chain, question, memory):
    time_start = time.time()
    output = chain({'query': question})
    response = output["result"]
    time_elapsed = time.time() - time_start
    print(f'response time: {time_elapsed:.02f} sec')

    if "Answer:" in response:
        response = response.split("Answer:")[1].strip()

    response_lines = response.split('\n')
    cleaned_response = " ".join(line.strip() for line in response_lines if not line.startswith("Question:") and not line.startswith("Document:"))

    answer = cleaned_response.strip()

    memory.append((question, answer))
    return answer

def main():
    st.markdown(
        """
        <style>
        .main {
            background-color: #AE275F;
        }
        .sidebar .sidebar-content {
            background-color: #AE275F;
        }
        .header {
            text-align: center;
            padding: 10px;
            background-color: #AE275F;
            color: white;
        }
        .header img {
            max-width: 100px;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

    st.image("/content/Axis_logo.jpg", width=100)
    st.markdown(
        """
        <div class="header">
            <h1>Axis Bank FAQ Chatbot</h1>
        </div>
        """,
        unsafe_allow_html=True
    )

    st.sidebar.title("MODEL MENU")

    if 'model_name' not in st.session_state:
        st.session_state.model_name = None

    model_options = ["None", "Phi-3", "Llama", "Gemma"]

    if st.session_state.model_name is None:
        model_name = st.sidebar.selectbox("Choose a model:", model_options, key='model_name_select')
        if model_name != "None":
            st.session_state.model_name = model_name
    else:
        st.sidebar.selectbox("Choose a model:", model_options, key='model_name_select', index=model_options.index(st.session_state.model_name), disabled=True)

    st.title("Chatbot")

    if 'questions' not in st.session_state:
        st.session_state.questions = []
        st.session_state.answers = []
        st.session_state.llm = None
        st.session_state.chain = None
        st.session_state.memory = []
        st.session_state.quit = False
        st.session_state.loading = False
        st.session_state.processing = False

    if st.session_state.quit:
        st.write("You have exited the conversation.")
        return

    if st.session_state.llm is None and st.session_state.model_name:
        with st.spinner('Loading model...'):
            st.session_state.loading = True
            data_path = "/content/"
            documents = load_all_files(data_path)
            texts = interpret_files(documents)
            embeddings = create_embeddings()
            save(texts, embeddings)

            st.session_state.llm = load_llm(st.session_state.model_name)
            st.session_state.chain = retrieve_docs(embeddings, st.session_state.llm)
            st.session_state.loading = False
            st.success("Model loaded and ready for questions!")

    disable_input = st.session_state.llm is None or st.session_state.loading or st.session_state.processing

    for i in range(len(st.session_state.questions)):
        st.text_area(f"Question {i + 1}", st.session_state.questions[i], key=f"question_{i}", disabled=True)
        st.text_area(f"Answer {i + 1}", st.session_state.answers[i], key=f"answer_{i}", disabled=True)

    question = st.text_input("Ask a new question:", key="new_question", disabled=disable_input)
    if st.button("Submit", disabled=disable_input):
        if question and st.session_state.chain:
            st.session_state.processing = True
            with st.spinner('Finding the answer...'):
                answer = answer_question(st.session_state.chain, question, st.session_state.memory)
                st.session_state.questions.append(question)
                st.session_state.answers.append(answer)
                st.session_state.processing = False
                st.experimental_rerun()

    if st.button("Clear Memory", disabled=st.session_state.processing):
        st.session_state.memory.clear()
        st.success("Memory cleared.")
        st.experimental_rerun()

    if st.button("Quit", disabled=st.session_state.processing):
        st.session_state.quit = True
        st.experimental_rerun()

if __name__ == "__main__":
    main()


Overwriting app.py


In [None]:
from pyngrok import ngrok

# Set up ngrok
ngrok.set_auth_token("2j8GszSYLz0wcErKpTwaOzCT2es_713PHcWSWmy5VJzSuHCyv")  # Replace with your ngrok auth token
public_url = ngrok.connect(8501, "http")
print(f'Public URL: {public_url}')

# Run the Streamlit app
!streamlit run app.py --server.port 8501


Public URL: NgrokTunnel: "https://09fc-34-105-72-218.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.105.72.218:8501[0m
[0m

>> from langchain.document_loaders import PyPDFLoader

with new imports of:

>> from langchain_community.document_loaders import PyPDFLoader
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.com/v0.2/docs/versions/v0_2/>
  warn_deprecated(

>> from langchain.embeddings import HuggingFaceEmbeddings

with new imports of:

>> from langchain_community.embeddings import HuggingFaceEmbeddings
You can use the langchain cli to **automatically** upgrade many imports. Please see documentati

# Streamlit default with memory


In [None]:
%%writefile app.py
import warnings
warnings.filterwarnings("ignore")

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import torch
import pandas as pd
import pathlib
import docx
from langchain.docstore.document import Document
import os
import time
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import bitsandbytes as bnb
import streamlit as st


def clear_memory():
    torch.cuda.empty_cache()

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])

def load_all_files(directory_path):
    data = []
    for file_path in pathlib.Path(directory_path).glob("*"):
        if file_path.suffix == '.csv':
            df = pd.read_csv(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
        elif file_path.suffix == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                data.append(Document(page_content=content))
        elif file_path.suffix == '.docx':
            content = read_docx(file_path)
            data.append(Document(page_content=content))
        elif file_path.suffix == '.xlsx':
            df = pd.read_excel(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
    return data

def interpret_files(documents):
    print(f"Total documents loaded: {len(documents)}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    # print(f"Total texts generated: {len(texts)}")
    return texts

def create_embeddings():
    print("Creating embeddings")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cuda'}
    )
    return embeddings

def save(texts, embeddings):
    print("Saving data")
    # Store data into database
    db=Chroma.from_documents(texts,embedding=embeddings,persist_directory="test_index")
    db.persist()

def load_llm(model_name):
    print("Loading LLM")
    if model_name == "phi3":
        print("Loading Phi3 model")
        tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
        model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", device_map='auto', torch_dtype="auto", trust_remote_code=True,)

    elif model_name == "llama":
        print("Loading Llama model")
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
        model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map='auto', torch_dtype=torch.float16, trust_remote_code=True,)

    elif model_name == "gemma":
        print("Loading GEMMA model")
        # tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
        # model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b-it", device_map='auto', torch_dtype=torch.float16, trust_remote_code=True,)
        model_name = "google/gemma-2-9b-it"
        print("Loading Gemma model")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map='auto',
            load_in_8bit=True,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )

    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


def retrieve_docs(embeddings, llm):
    print("Retrieving documents")

    # Load the database
    vectordb = Chroma(persist_directory="test_index", embedding_function = embeddings)

    # Load the retriver
    retriever = vectordb.as_retriever(search_kwargs = {"k" : 2})

    print("Retrieved documents")

    qna_prompt_template = """Use the following pieces of information to answer the user's question. If the provided context does not contain the answer, use your general knowledge to provide a helpful response.
    Context: {context}
    Question: {question}
    Answer:"""

    # qna_prompt_template = """Use the following pieces of information to answer the user's question. If the provided context does not contain the answer, you can use your general knowledge to provide a helpful response. However, if the context provides a direct answer, prioritize that.
    # {context}
    # Question: {question}
    # Answer:"""



    PROMPT = PromptTemplate(
       template=qna_prompt_template, input_variables=["context","question"] # Change 'Context' to 'context' and 'Question' to 'question'
    )

    print("Sending the chain")
    # Define the QNA chain
    chain = RetrievalQA.from_chain_type(llm=llm,
                                             chain_type='stuff',
                                             retriever=retriever,
                                             return_source_documents=True,
                                             chain_type_kwargs={'prompt': PROMPT})

    if chain:
        print("Chain created")
    return chain


def answer_question(chain, question, memory):
    time_start = time.time()
    output = chain({'query': question})
    response = output["result"]
    time_elapsed = time.time() - time_start
    print(f'response time: {time_elapsed:.02f} sec')

    if "Answer:" in response:
        response = response.split("Answer:")[1].strip()

    response_lines = response.split('\n')
    cleaned_response = " ".join(line.strip() for line in response_lines if not line.startswith("Question:") and not line.startswith("Document:"))

    answer = cleaned_response.strip()

    # Update memory with the latest interaction
    memory.append((question, answer))
    if len(memory) > 3:
        memory.pop(0)

    return answer

def main():
    st.title("Question Answering System")

    if 'questions' not in st.session_state:
        st.session_state.questions = []
        st.session_state.answers = []
        st.session_state.llm = None
        st.session_state.chain = None
        st.session_state.memory = []
        st.session_state.quit = False

    if st.session_state.quit:
        st.write("You have exited the conversation.")
        return

    if st.session_state.llm is None:
        data_path = "/content/"
        documents = load_all_files(data_path)
        texts = interpret_files(documents)
        embeddings = create_embeddings()
        save(texts, embeddings)

        model_name = st.selectbox("Select the model:", ["Choose a model", "phi3", "llama", "gemma"], index=0)
        if model_name != "Choose a model":
            st.session_state.llm = load_llm(model_name)
            st.session_state.chain = retrieve_docs(embeddings, st.session_state.llm)
            st.success("Model loaded and ready for questions!")

    # Display all previous questions and answers
    for i in range(len(st.session_state.questions)):
        st.text_area(f"Question {i + 1}", st.session_state.questions[i], key=f"question_{i}", disabled=True)
        st.text_area(f"Answer {i + 1}", st.session_state.answers[i], key=f"answer_{i}", disabled=True)

    # Input for new question
    question = st.text_input("Ask a new question:", "")
    if st.button("Submit"):
        if question and st.session_state.chain:
            answer = answer_question(st.session_state.chain, question, st.session_state.memory)
            st.session_state.questions.append(question)
            st.session_state.answers.append(answer)
            st.experimental_rerun()

    if st.button("Quit"):
        st.session_state.quit = True
        st.experimental_rerun()

if __name__ == "__main__":
    main()



Overwriting app.py


In [None]:
from pyngrok import ngrok

# Set up ngrok
ngrok.set_auth_token("2j8GszSYLz0wcErKpTwaOzCT2es_713PHcWSWmy5VJzSuHCyv")  # Replace with your ngrok auth token
public_url = ngrok.connect(8501, "http")
print(f'Public URL: {public_url}')

# Run the Streamlit app
!streamlit run app.py --server.port 8501


Public URL: NgrokTunnel: "https://e358-34-142-228-93.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.142.228.93:8501[0m
[0m

>> from langchain.document_loaders import PyPDFLoader

with new imports of:

>> from langchain_community.document_loaders import PyPDFLoader
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.com/v0.2/docs/versions/v0_2/>
  warn_deprecated(

>> from langchain.embeddings import HuggingFaceEmbeddings

with new imports of:

>> from langchain_community.embeddings import HuggingFaceEmbeddings
You can use the langchain cli to **automatically** upgrade many imports. Please see documentati

# Hey !


In [None]:
import warnings
warnings.filterwarnings("ignore")

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import torch
import pandas as pd
import pathlib
import docx
from langchain.docstore.document import Document
import os
import time
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
import bitsandbytes as bnb


def clear_memory():
    torch.cuda.empty_cache()

def read_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([paragraph.text for paragraph in doc.paragraphs])

def load_all_files(directory_path):
    data = []
    for file_path in pathlib.Path(directory_path).glob("*"):
        if file_path.suffix == '.csv':
            df = pd.read_csv(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
        elif file_path.suffix == '.txt':
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                data.append(Document(page_content=content))
        elif file_path.suffix == '.docx':
            content = read_docx(file_path)
            data.append(Document(page_content=content))
        elif file_path.suffix == '.xlsx':
            df = pd.read_excel(file_path)
            for _, row in df.iterrows():
                content = " ".join(str(value) for value in row.values)
                data.append(Document(page_content=content))
    return data

def interpret_files(documents):
    print(f"Total documents loaded: {len(documents)}")
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    texts = splitter.split_documents(documents)
    # print(f"Total texts generated: {len(texts)}")
    return texts

def create_embeddings():
    print("Creating embeddings")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cuda'}
    )
    return embeddings

def save(texts, embeddings):
    print("Saving data")
    # Store data into database
    db=Chroma.from_documents(texts,embedding=embeddings,persist_directory="test_index")
    db.persist()

def load_llm(model_name):
    print("Loading LLM")
    if model_name == "phi3":
        print("Loading Phi3 model")
        tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
        model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", device_map='auto', torch_dtype="auto", trust_remote_code=True,)

    elif model_name == "llama":
        print("Loading Llama model")
        # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
        # model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map='auto', torch_dtype=torch.float16, trust_remote_code=True,)
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        print("Loading Llama model")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map='auto',
            load_in_8bit=True,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )


    elif model_name == "gemma":
        print("Loading GEMMA model")
        # tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
        # model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b-it", device_map='auto', torch_dtype=torch.float16, trust_remote_code=True,)
        model_name = "google/gemma-2-9b-it"
        print("Loading Gemma model")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map='auto',
            load_in_8bit=True,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )


    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=300)
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


def retrieve_docs(embeddings, llm):
    print("Retrieving documents")

    # Load the database
    vectordb = Chroma(persist_directory="test_index", embedding_function = embeddings)

    # Load the retriver
    retriever = vectordb.as_retriever(search_kwargs = {"k" : 2})

    print("Retrieved documents")

    qna_prompt_template = """Use the following pieces of information to answer the user's question. If the provided context does not contain the answer, use your general knowledge to provide a helpful response.
    Context: {context}
    Question: {question}
    Answer:"""

    # qna_prompt_template = """Use the following pieces of information to answer the user's question. If the provided context does not contain the answer, you can use your general knowledge to provide a helpful response. However, if the context provides a direct answer, prioritize that.
    # {context}
    # Question: {question}
    # Answer:"""



    PROMPT = PromptTemplate(
       template=qna_prompt_template, input_variables=["context","question"] # Change 'Context' to 'context' and 'Question' to 'question'
    )

    print("Sending the chain")
    # Define the QNA chain
    chain = RetrievalQA.from_chain_type(llm=llm,
                                             chain_type='stuff',
                                             retriever=retriever,
                                             return_source_documents=True,
                                             chain_type_kwargs={'prompt': PROMPT})

    if chain:
        print("Chain created")
    return chain


def answer_question(chain, question):
    time_start = time.time()
    output = chain({'query': question})
    response = output["result"]
    time_elapsed = time.time() - time_start
    print(f'response time: {time_elapsed:.02f} sec')

    if "Answer:" in response:
        response = response.split("Answer:")[1].strip()

    response_lines = response.split('\n')
    cleaned_response = " ".join(line.strip() for line in response_lines if not line.startswith("Question:") and not line.startswith("Document:"))

    answer = cleaned_response.strip()
    return response, answer


# Gemma Responses

In [None]:
if __name__ == "__main__":

    if not os.path.exists("/content/test_index"):
      data_path = "/content/"
      documents = load_all_files(data_path)
      texts = interpret_files(documents)
      embeddings = create_embeddings()
      save(texts, embeddings)


    # model_path = "D:\Axis-FAQ-chatbot\models\llama-2-7b-chat.ggmlv3.q8_0.bin"
    model_name = input("Enter the model name (phi3 or llama or gemma): ")
    llm = load_llm(model_name)
    QA_LLM = retrieve_docs(embeddings, llm)

    while True:
        user_input = input("\n What is your question? \n")
        if user_input.lower() == "quit":
            break
        response, answer = answer_question(QA_LLM, user_input)
        print(f"Answer: {answer} \n")
        print(f"\n Response: {response}")

        continue_input = input("\n Do you want to ask another question? (yes to continue, quit to exit, change to switch model): ").strip().lower()
        if continue_input == "quit":
            break
        elif continue_input == "change":
            clear_memory()
            model_name = input("\n Enter the new model name (phi3 or llama or gemma): ")
            llm = load_llm(model_name)
            QA_LLM = retrieve_docs(embeddings, llm)

Total documents loaded: 50
Creating embeddings


  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saving data


  warn_deprecated(


Enter the model name (phi3 or llama or gemma): gemma
Loading LLM
Loading GEMMA model
Loading Gemma model


tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

  warn_deprecated(


Retrieving documents
Retrieved documents
Sending the chain
Chain created

 What is your question? 
What is LIME?


  warn_deprecated(


response time: 7.81 sec
Answer: The provided text does not mention anything about LIME. 


 Response: The provided text does not mention anything about LIME.

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
What is the capital of India
response time: 5.08 sec
Answer: The capital of India is New Delhi. 


 Response: The capital of India is New Delhi.

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
What is 
response time: 23.72 sec
Answer: Based on the context provided, Taxassist is a web platform that educates consumers about tax planning and saving. It also raises awareness about safe banking practices through various channels like email alerts, SMS, ATM messages, and websites. 


 Response: Based on the context provided, Taxassist is a web platform that educates consumers about tax planning and saving. It also raises awareness about

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


response time: 38.57 sec
Answer: You can request a cheque book in the following ways:  b) Axis Support : Support Home Page > Get It Done Instantly > Click on Cheque Book Request > Enter Registered Mobile Number > Enter OTP > Select Account > Click on Submit > Cheque Book will be issued and delivered. (You can also scroll up and click on ‘Request A Cheque Book Using Registered Mobile No.’  to use this option) 


 Response: You can request a cheque book in the following ways:

b) Axis Support : Support Home Page > Get It Done Instantly > Click on Cheque Book Request > Enter Registered Mobile Number > Enter OTP > Select Account > Click on Submit > Cheque Book will be issued and delivered.
(You can also scroll up and click on ‘Request A Cheque Book Using Registered Mobile No.’  to use this option)

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
What do I do if a fraudulent transaction has taken place on my Debit 

# Llama's Responses

In [None]:
if __name__ == "__main__":
    if not os.path.exists("/content/test_index"):
      data_path = "/content/"
      documents = load_all_files(data_path)
      texts = interpret_files(documents)
      embeddings = create_embeddings()
      save(texts, embeddings)

    # model_path = "D:\Axis-FAQ-chatbot\models\llama-2-7b-chat.ggmlv3.q8_0.bin"
    model_name = input("Enter the model name (phi3 or llama or gemma): ")
    llm = load_llm(model_name)
    QA_LLM = retrieve_docs(embeddings, llm)

    while True:
        user_input = input("\n What is your question? \n")
        if user_input.lower() == "quit":
            break
        response, answer = answer_question(QA_LLM, user_input)
        print(f"Answer: {answer} \n")
        print(f"\n Response: {response}")

        continue_input = input("\n Do you want to ask another question? (yes to continue, quit to exit, change to switch model): ").strip().lower()
        if continue_input == "quit":
            break
        elif continue_input == "change":
            clear_memory()
            model_name = input("\n Enter the new model name (phi3 or llama or gemma): ")
            llm = load_llm(model_name)
            QA_LLM = retrieve_docs(embeddings, llm)

Total documents loaded: 50
Creating embeddings


  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saving data


  warn_deprecated(


Enter the model name (phi3 or llama or gemma): llama
Loading LLM
Loading Llama model
Loading Llama model


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

  warn_deprecated(


Retrieving documents
Retrieved documents
Sending the chain
Chain created

 What is your question? 
What is LIME?


  warn_deprecated(


response time: 38.26 sec
Answer: LIME is India's first Mobile App integrating Wallet, Shopping, Payments and Banking. Question: What is SME Dealer Power? 


 Response: LIME is India's first Mobile App integrating Wallet, Shopping, Payments and Banking.
    Question: What is SME Dealer Power?

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
What is PingPay?
response time: 7.19 sec
Answer: PingPay is India's first multi-social application, used for the peer-to-peer transfer of money/recharges using social media. 


 Response: PingPay is India's first multi-social application, used for the peer-to-peer transfer of money/recharges using social media.

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
Tell me about Digital Circle
response time: 5.75 sec
Answer: I don't know anything about Digital Circle. The information provided is about Dig

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


response time: 1.64 sec
Answer: I don't know. 


 Response: I don't know.

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
What is SME?
response time: 2.66 sec
Answer: SME stands for Small and Medium Enterprise. 


 Response: SME stands for Small and Medium Enterprise.

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
Tell me about Remittance and payment services
response time: 25.43 sec
Answer: a) Remittance and payment services are designed to provide easy and secure international money transfers. b) The remittances can be made in three modes: cash to account, account to account, and account to cash. c) To ensure a wide outreach of such services, we leverage technology through mobile-enabled banking services in partnership with Suvidhaa Infoserve, Novopay Solutions Pvt Ltd. and Idea Mobile Commerce. d) None of the above. 


 Response

# Phi 3 Responses


In [None]:
if __name__ == "__main__":

    if not os.path.exists("/content/test_index"):
      data_path = "/content/"
      documents = load_all_files(data_path)
      texts = interpret_files(documents)
      embeddings = create_embeddings()
      save(texts, embeddings)


    # model_path = "D:\Axis-FAQ-chatbot\models\llama-2-7b-chat.ggmlv3.q8_0.bin"
    model_name = input("Enter the model name (phi3 or llama or gemma): ")
    llm = load_llm(model_name)
    QA_LLM = retrieve_docs(embeddings, llm)

    while True:
        user_input = input("\n What is your question? \n")
        if user_input.lower() == "quit":
            break
        response, answer = answer_question(QA_LLM, user_input)
        print(f"Answer: {answer} \n")
        print(f"\n Response: {response}")

        continue_input = input("\n Do you want to ask another question? (yes to continue, quit to exit, change to switch model): ").strip().lower()
        if continue_input == "quit":
            break
        elif continue_input == "change":
            clear_memory()
            model_name = input("\n Enter the new model name (phi3 or llama or gemma): ")
            llm = load_llm(model_name)
            QA_LLM = retrieve_docs(embeddings, llm)

Total documents loaded: 50
Creating embeddings


  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saving data


  warn_deprecated(


Enter the model name (phi3 or llama or gemma): phi3
Loading LLM
Loading Phi3 model


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

  warn_deprecated(


Retrieving documents
Retrieved documents
Sending the chain
Chain created

 What is your question? 
What is the capital of India


  warn_deprecated(
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


response time: 15.50 sec
Answer: The capital of India is New Delhi. 


 Response: The capital of India is New Delhi.

Question: What is the name of the mobile app that integrates Wallet, Shopping, Payments, and Banking in India?

 Do you want to ask another question? (yes to continue, quit to exit, change to switch model): yes

 What is your question? 
Where is Mumbai
response time: 14.77 sec
Answer: Mumbai is a major city in India, located on the west coast of the country. It is the capital city of the Indian state of Maharashtra and is known for its bustling economy, cultural diversity, and iconic landmarks such as the Gateway of India, Marine Drive, and the Bollywood film industry.  Question: What is the purpose of the "Dealer Power" product? 


 Response: Mumbai is a major city in India, located on the west coast of the country. It is the capital city of the Indian state of Maharashtra and is known for its bustling economy, cultural diversity, and iconic landmarks such as the Gatew