In [26]:
%%writefile app.py

import os
import streamlit as st
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import torch.nn.functional as F
import torch
from transformers import AutoTokenizer, AutoModel
import os
import mysql.connector
import json
import singlestoredb as s2





# Debug marker written to stdout each time this script is executed.
print('hi')
from dotenv import load_dotenv
load_dotenv()  # load environment variables from .env (API keys and similar settings)

# Main area: page title, question input, search button and a status placeholder.
st.title("BeSearcher: Search Tool For CS Articles 🐝👨‍🎓")
# The question text box is rendered inside this placeholder.
main_placeholder = st.empty()
question = main_placeholder.text_input("Question: ")
search_question_clicked = st.button("Search Question")
# Placeholder whose text is overwritten with progress messages by the search flow.
loading_placeholder = st.empty()
# Sidebar: three article buttons.
# NOTE(review): the three *_clicked values below are not read anywhere in the
# visible code — confirm whether these buttons are still needed.
st.sidebar.title("Multiple Articles")
first_article_clicked = st.sidebar.button("First Article")
second_article_clicked = st.sidebar.button("2nd Article")
third_article_clicked = st.sidebar.button("Third Article")

#codes
def connect_to_db():
    """Open a new database connection with ``s2.connect()`` and return it.

    No arguments are passed to ``s2.connect()``.
    NOTE(review): presumably the connection settings come from the
    environment — confirm against the deployment configuration.
    The caller is responsible for closing the returned connection.
    """
    return s2.connect()


# Checkpoint used to embed the user's question. It is the same model name that
# is assigned to `model_name` for HuggingFaceEmbeddings further down the file.
sentence_emmbeder_model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
# Tokenizer and model are loaded once at module level and used as the default
# arguments of sentence_emmbeder().
sentence_emmbeder_tokenizer = AutoTokenizer.from_pretrained(sentence_emmbeder_model_ckpt)
sentence_emmbeder_model = AutoModel.from_pretrained(sentence_emmbeder_model_ckpt)

def sentence_emmbeder(sent, tokenizer=sentence_emmbeder_tokenizer, model=sentence_emmbeder_model):
    """Embed one sentence as an L2-normalised, mean-pooled vector.

    Args:
        sent: The sentence to embed.
        tokenizer: Callable that tokenises a list of sentences into PyTorch
            tensors; defaults to the module-level tokenizer.
        model: Model whose output exposes ``last_hidden_state``; defaults to
            the module-level model.

    Returns:
        The embedding of ``sent`` as a plain Python list of floats.
    """
    batch = tokenizer([sent], padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**batch)

    hidden_states = outputs.last_hidden_state
    print(f"Token embeddings shape: {hidden_states.size()}")

    # Mean pooling: average the token vectors, weighting each token by its
    # attention-mask value so masked-out positions do not contribute.
    mask = batch["attention_mask"].unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the divisor so an all-zero mask cannot cause a division by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    pooled = weighted_sum / token_counts

    # Scale each pooled vector to unit L2 norm.
    unit_vectors = F.normalize(pooled, p=2, dim=1)
    print(f"Sentence embeddings shape: {unit_vectors.size()}")

    # Only one sentence was passed in, so return the first (only) row.
    return unit_vectors[0].tolist()


def find_similar_documents(embedding):
    """Return the stored article that scores highest against ``embedding``.

    Ranks the rows of the ``articles`` table by ``dot_product`` between the
    stored ``vector`` column and the query vector, and keeps the top row.
    NOTE(review): the dot product equals cosine similarity only if the stored
    vectors are unit-normalised like the query vector — confirm at ingestion.

    Args:
        embedding: Iterable of numbers forming the query vector.

    Returns:
        The rows from ``cursor.fetchall()`` (columns ``name``, ``score``,
        ``text`` in that order; at most one row), or ``None`` if building or
        executing the query fails. The error is printed in that case.
    """
    connection = connect_to_db()
    try:
        cursor = connection.cursor()
        try:
            # Coerce every component to float and serialise it ourselves, so
            # that only a JSON array of finite numbers can ever reach the SQL
            # text. Arbitrary strings are rejected here instead of being
            # interpolated into the statement.
            vector_json = json.dumps(
                [float(component) for component in embedding],
                allow_nan=False,
            )
            query = f"""
                SELECT name, dot_product(vector, JSON_ARRAY_PACK("{vector_json}")) AS score, text
                FROM articles
                ORDER BY score DESC
                LIMIT 1
            """
            cursor.execute(query)
            return cursor.fetchall()
        except Exception as error:
            # Best-effort lookup: report the failure and let the caller treat
            # None as "nothing found".
            print("Error:", error)
            return None
        finally:
            cursor.close()
    finally:
        # Nested so the connection is closed even if creating or closing the
        # cursor raises.
        connection.close()

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4
import langchain
from langchain_core.documents import Document
from langchain_community.llms import HuggingFaceHub
import os



# Splitter used to cut the matched article into chunks of up to 1000
# characters, with 200 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# NOTE: the search flow below passes the article text as a plain string, so it calls split_text rather than split_documents.



# Embedding model used by the in-memory vector store for the article chunks.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}  # run the embedding model on the CPU
encode_kwargs = {'normalize_embeddings': False}  # embeddings are left un-normalised
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)




# The index dimension is taken from the length of a probe embedding, so it
# always matches whatever model is configured above.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Vector store created empty (fresh docstore, empty id map); the search flow
# adds the article chunks to it with add_documents().
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)



# The Hugging Face Hub token must come from the environment (for example the
# .env file read by load_dotenv() near the top of this script). Credentials
# must never be committed in source; any token that was previously committed
# here should be revoked and replaced.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; "
        "add it to the environment or to the .env file."
    )

# Hosted LLM used to generate the answer from the retrieved chunks.
hf=HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    model_kwargs={"temperature":0.1,"max_length":500}
)



# Prompt given to the LLM: {context} receives the retrieved chunks and
# {question} the user's question.
# NOTE: the search flow looks for the literal text 'Helpful Answers:\n' in the
# model output to find where the answer starts, so that line of the template
# must stay in sync with the search flow.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context
{context}
Question:{question}
Helpful Answers:
 """
prompt=PromptTemplate(template=prompt_template,input_variables=["context","question"])



#
def _strip_prompt_echo(generated_text, marker='Helpful Answers:\n'):
    """Return the text after the first ``marker``, or all of it if absent."""
    _, found, tail = generated_text.partition(marker)
    return tail if found else generated_text


def _answer_question(user_question):
    """Find the best-matching article and display an answer drawn from it.

    Steps: embed the question, fetch the best-matching article row, split its
    text into chunks, index the chunks in ``vector_store``, run the
    RetrievalQA chain, then render the answer and its source.

    Args:
        user_question: The question text entered by the user.

    Returns:
        None. Results and status messages are written to the Streamlit page.
        The function returns early, with a status message, when no article is
        found or when the matched article yields no text chunks.
    """
    loading_placeholder.text("Article Searching...Started...✅✅✅")
    token_vector = sentence_emmbeder(user_question)

    # Find similar documents based on the question embedding.
    similar_docs = find_similar_documents(token_vector)
    if not similar_docs:
        # None (query failed) or an empty result: there is nothing to answer
        # from, so report it instead of failing on an unbound row below.
        print("No similar documents found.")
        loading_placeholder.text("No similar documents found.")
        return

    print("Similar Documents:")
    for row in similar_docs:
        print(row)

    # Rows are ordered by score, best first, and the query keeps one row.
    # Columns: [0] name, [1] score, [2] text.
    doc = similar_docs[0]

    loading_placeholder.text("Generating Answer...Started...✅✅✅")
    docs = text_splitter.split_text(doc[2])
    if not docs:
        loading_placeholder.text("The matched article has no text to answer from.")
        return

    documents = [
        Document(page_content=chunk, metadata={"source": "article"})
        for chunk in docs
    ]
    uuids = [str(uuid4()) for _ in documents]
    # NOTE(review): chunks are added to the module-level vector_store;
    # presumably it is rebuilt on every script run — confirm it cannot
    # accumulate chunks from earlier questions.
    vector_store.add_documents(documents=documents, ids=uuids)
    retriever = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    )

    retrievalQA = RetrievalQA.from_chain_type(
        llm=hf,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    # Call the QA chain with the user's question.
    result = retrievalQA.invoke({"query": user_question})

    loading_placeholder.text("Answer...Found...✅✅✅")
    # The output may repeat the prompt before the answer; show only what
    # follows the marker, or the whole output when the marker is missing.
    st.write(_strip_prompt_echo(result['result']))
    st.subheader("Sources:")
    st.write(f"Document Id : {doc[0]}")
    st.write(f"Document relevance : {doc[1]}")
    st.write(docs[0])


if search_question_clicked and question:
    _answer_question(question)

Writing app.py
