In [1]:
# Install the third-party packages this notebook relies on (one pip call per package).
# The final line upgrades langchain-community to its latest release (-U).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones shown in the captured output below — consider pinning.
!pip install langchain
!pip install torch
!pip install accelerate
!pip install sentence-transformers
!pip install streamlit
!pip install streamlit-chat
!pip install faiss-cpu
!pip install tiktoken
!pip install huggingface-hub
!pip install pypdf
!pip install llama-cpp-python
!pip install -U langchain-community

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/975.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/975.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m655.4/975.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/975.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.11-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.4/337.4 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text

In [2]:
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import PyPDFLoader
import os
import tempfile
import openpyxl
from openpyxl import Workbook,load_workbook
from langchain.llms import DeepInfra
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import spacy

  from tqdm.autonotebook import tqdm, trange


In [25]:
import re

In [3]:
def initialize_session_state():
    """Create a brand-new chat session dictionary.

    Returns:
        dict: three freshly allocated lists —
            "history":   empty list that will hold (question, answer) pairs,
            "generated": the bot's opening greeting,
            "past":      the user's opening greeting.
    """
    bot_greeting = "Hello! Ask me anything about 🤖"
    user_greeting = "Hey! 👋"
    return dict(
        history=[],
        generated=[bot_greeting],
        past=[user_greeting],
    )

In [4]:
def conversation_chat(query, chain, session_state):
    """Send one question to the chain and record the exchange in the session.

    The caller's ``session_state`` is used as-is. Previously it was replaced by
    a freshly initialised state on every call, which silently ignored the
    history passed in and threw away the exchange that was appended.

    Args:
        query: The user's question text.
        chain: Callable accepting a dict with "question" and "chat_history"
            keys and returning a mapping that has an "answer" entry.
        session_state: Mutable session dict. The ``(query, answer)`` pair is
            appended to its "history" list (created if missing).

    Returns:
        The value stored under "answer" in the chain's result.
    """
    # setdefault keeps the function usable with a session dict that has no history yet.
    history = session_state.setdefault("history", [])
    result = chain({"question": query, "chat_history": history})
    answer = result["answer"]
    history.append((query, answer))
    return answer

In [5]:
def display_chat_history(chain, session_state, questions, output_path="ChatHistory.xlsx"):
    """Ask every question the requested number of times and log the results.

    Each answer is scored with ``check_relevance``, printed, and appended as a
    row to an Excel workbook. The workbook is written in a ``finally`` block so
    that rows collected before an error or a manual interrupt are not lost.

    Args:
        chain: Conversational chain forwarded to ``conversation_chat``.
        session_state: Session dictionary forwarded to ``conversation_chat``.
        questions: Iterable of ``(question, repeats)`` pairs.
        output_path: Destination of the workbook; defaults to the original
            hard-coded "ChatHistory.xlsx".
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.append(["Question", "Answer", "Relevance Score", "Status"])

    try:
        for question, repeats in questions:
            for _ in range(repeats):
                output = conversation_chat(question, chain, session_state)
                score, status = check_relevance(question, output)
                sheet.append([question, output, score, status])
                print(f"User: {question}\nBot: {output}\nScore: {score}\nStatus: {status}")
    finally:
        # Persist whatever was gathered, even when the loop is cut short.
        workbook.save(output_path)
        print(f"Chat history saved to {output_path}")



In [6]:
# Never hard-code API tokens in a notebook: anything saved here is shared with
# every reader. The token that used to be on this line must be treated as
# leaked and revoked. Use the value already present in the environment, or ask
# for it interactively without echoing it.
if not os.environ.get("DEEPINFRA_API_TOKEN"):
    from getpass import getpass  # local import: only needed when prompting

    os.environ["DEEPINFRA_API_TOKEN"] = getpass("DeepInfra API token: ")

In [7]:

def create_conversational_chain(vector_store):
    """Build the retrieval-augmented conversation chain.

    A DeepInfra-hosted Mistral-7B-Instruct model is combined with a retriever
    over ``vector_store`` (top 2 matches per query) and a buffer memory stored
    under the "chat_history" key.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        The ``ConversationalRetrievalChain`` produced by ``from_llm``.
    """
    # NOTE(review): "streaming", "verbose" and "n_ctx" look like llama.cpp-style
    # options — confirm that the DeepInfra endpoint actually honours them.
    generation_settings = {
        "streaming": True,
        "temperature": 0.5,
        "top_p": 0.7,
        "verbose": True,
        "n_ctx": 4096,
    }
    llm = DeepInfra(model_id="mistralai/Mistral-7B-Instruct-v0.1")
    llm.model_kwargs = generation_settings

    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    top_two_retriever = vector_store.as_retriever(search_kwargs={"k": 2})

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=top_two_retriever,
        memory=chat_memory,
    )



In [28]:
# Load spaCy model for named entity recognition.
# `nlp` is a module-level pipeline read by contains_named_entity() below; the
# "en_core_web_sm" model must already be installed or spacy.load will fail.
nlp = spacy.load("en_core_web_sm")

# Define functions for computing similarities
def compute_tfidf_similarity(text1, text2):
    """Cosine similarity between the TF-IDF vectors of two texts.

    A new ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and weights come only from the pair being compared.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf_matrix.toarray()
    pairwise = cosine_similarity([first_row], [second_row])
    return pairwise[0][0]

# Sentence-embedding models that have already been loaded, keyed by model name.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def compute_bert_similarity(text1, text2):
    """Cosine similarity between the sentence embeddings of two texts.

    The 'all-MiniLM-L6-v2' model is loaded once and reused. Constructing it
    inside this function on every call made each scored answer pay the full
    model-loading cost again.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The similarity as a plain Python number (``.item()`` of the 1x1 result).
    """
    model = _get_sentence_model('all-MiniLM-L6-v2')
    embeddings1 = model.encode(text1, convert_to_tensor=True)
    embeddings2 = model.encode(text2, convert_to_tensor=True)
    return util.pytorch_cos_sim(embeddings1, embeddings2).item()

def compute_combined_similarity(text1, text2, weight_tfidf=0.3, weight_bert=0.7):
    """Weighted blend of the TF-IDF and sentence-embedding similarities.

    Args:
        text1: First text.
        text2: Second text.
        weight_tfidf: Multiplier applied to the TF-IDF similarity.
        weight_bert: Multiplier applied to the embedding similarity.

    Returns:
        ``weight_tfidf * tfidf + weight_bert * bert`` for the two texts.
    """
    lexical_score = compute_tfidf_similarity(text1, text2)
    semantic_score = compute_bert_similarity(text1, text2)
    return weight_tfidf * lexical_score + weight_bert * semantic_score

def contains_named_entity(text):
    """Tell whether spaCy finds a PERSON, ORG or GPE entity in ``text``.

    Uses the module-level ``nlp`` pipeline.

    Returns:
        True as soon as one entity carries one of the three labels, else False.
    """
    wanted_labels = ("PERSON", "ORG", "GPE")
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            return True
    return False

def check_relevance(question, generated_answer, threshold=0.4):
    """Score an answer against its question and label it relevant or not.

    Args:
        question: The question that was asked.
        generated_answer: The answer produced for it.
        threshold: Minimum combined similarity for "Relevant" when the
            named-entity shortcut does not apply. Defaults to 0.4, the value
            that used to be hard-coded.

    Returns:
        tuple: ``(similarity_score, relevance)`` where relevance is either
        "Relevant" or "Not Relevant".
    """
    similarity_score = compute_combined_similarity(question, generated_answer)

    # Shortcut: when both texts contain a PERSON/ORG/GPE entity the answer is
    # accepted regardless of the score.
    # NOTE(review): the entities are not compared with each other, so an answer
    # naming an unrelated entity is also accepted — confirm this is intended.
    if contains_named_entity(question) and contains_named_entity(generated_answer):
        relevance = "Relevant"
    elif similarity_score >= threshold:
        relevance = "Relevant"
    else:
        relevance = "Not Relevant"

    return similarity_score, relevance



In [33]:

#many questions added manually
def _load_pdf_documents(file_paths):
    """Load every PDF in ``file_paths`` with PyPDFLoader and return all documents."""
    documents = []
    for file_path in file_paths:
        # Compare case-insensitively so "REPORT.PDF" is accepted too.
        extension = os.path.splitext(file_path)[1].lower()
        if extension != ".pdf":
            print("skipping unsupported file: ", file_path)
            continue
        print("loading: ", file_path)
        # Read the file in place. The earlier temp-file copy added nothing and
        # was left behind on disk whenever the extension was not ".pdf".
        documents.extend(PyPDFLoader(file_path).load())
    return documents


def _ask_repeat_count(question):
    """Prompt until the user enters an integer repeat count for ``question``."""
    while True:
        raw_count = input(f"How many times to repeat '{question}': ")
        try:
            return int(raw_count)
        except ValueError:
            print(f"'{raw_count}' is not a whole number, please try again.")


def _collect_questions():
    """Interactively gather ``(question, repeats)`` pairs until the user types 'done'."""
    questions = []
    while True:
        user_input = input("Enter your question (or type 'done' to finish): ")
        if user_input.strip().lower() == 'done':
            break
        if not user_input.strip():
            # A blank line is not a question; ask again.
            continue
        questions.append((user_input, _ask_repeat_count(user_input)))
    return questions


def main(uploaded_files=None):
    """Index the given PDFs, collect questions interactively and log the answers.

    Args:
        uploaded_files: Paths of the PDF files to index. Defaults to
            ``["/content/2023.pdf"]``, the path that used to be hard-coded.
            An empty list means there is nothing to do.
    """
    session_state = initialize_session_state()
    print("ChatBot using Mistral-7B-Instruct LLM :books:")

    if uploaded_files is None:
        uploaded_files = ["/content/2023.pdf"]
    if not uploaded_files:
        return

    text = _load_pdf_documents(uploaded_files)
    if not text:
        # Building a FAISS index from zero documents fails, so stop with a clear message.
        print("No PDF content could be loaded; nothing to index.")
        return

    # NOTE(review): the all-MiniLM-L6-v2 model card states that long inputs are
    # truncated, so 10000-character chunks are probably embedded from their
    # first part only — consider a much smaller chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000, chunk_overlap=20
    )
    text_chunks = text_splitter.split_documents(text)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )
    vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)

    chain = create_conversational_chain(vector_store)

    questions = _collect_questions()
    display_chat_history(chain, session_state, questions)

# Start the interactive session when this cell (or the exported script) is
# executed directly; importing the code as a module does not trigger it.
if __name__ == "__main__":
    main()


ChatBot using Mistral-7B-Instruct LLM :books:
loading:  /content/2023.pdf
Enter your question (or type 'done' to finish): What do you know about ave maria convent?
How many times to repeat 'What do you know about ave maria convent?': 10
Enter your question (or type 'done' to finish): done




User: What do you know about ave maria convent?
Bot:  I don't know about Ave Maria convent.
Score: 0.8063467191679605
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the given context.

51  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Score: 0.5790043612226597
Status: Relevant




User: What do you know about ave maria convent?
Bot:   No, Ave Maria convent is not mentioned in the given context.

47  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Score: 0.5773304472579558
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the context provided.

50  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Score: 0.5861622353300205
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the given context.

47  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Score: 0.5888214250311008
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the given context. 

51  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Score: 0.5790043612226597
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the given context. 

51  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Score: 0.5790043612226597
Status: Relevant




User: What do you know about ave maria convent?
Bot:   Ave Maria convent is not mentioned in the given context.

51  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Score: 0.5790043612226597
Status: Relevant


KeyboardInterrupt: 

In [10]:
# Load spaCy model for named entity recognition.
# `nlp` is a module-level pipeline read by contains_named_entity() below; the
# "en_core_web_sm" model must already be installed or spacy.load will fail.
nlp = spacy.load("en_core_web_sm")

# Define functions for computing similarities
def compute_tfidf_similarity(text1, text2):
    """Cosine similarity between the TF-IDF vectors of two texts.

    A new ``TfidfVectorizer`` is fitted on just these two texts, so the
    vocabulary and weights come only from the pair being compared.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf_matrix.toarray()
    pairwise = cosine_similarity([first_row], [second_row])
    return pairwise[0][0]

# Sentence-embedding models that have already been loaded, keyed by model name.
_SENTENCE_MODELS = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use only."""
    if model_name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODELS[model_name]


def compute_bert_similarity(text1, text2):
    """Cosine similarity between the sentence embeddings of two texts.

    The 'all-MiniLM-L6-v2' model is loaded once and reused. Constructing it
    inside this function on every call made each spreadsheet row pay the full
    model-loading cost again.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        The similarity as a plain Python number (``.item()`` of the 1x1 result).
    """
    model = _get_sentence_model('all-MiniLM-L6-v2')
    embeddings1 = model.encode(text1, convert_to_tensor=True)
    embeddings2 = model.encode(text2, convert_to_tensor=True)
    return util.pytorch_cos_sim(embeddings1, embeddings2).item()

def compute_combined_similarity(text1, text2, weight_tfidf=0.3, weight_bert=0.7):
    """Weighted blend of the TF-IDF and sentence-embedding similarities.

    Args:
        text1: First text.
        text2: Second text.
        weight_tfidf: Multiplier applied to the TF-IDF similarity.
        weight_bert: Multiplier applied to the embedding similarity.

    Returns:
        ``weight_tfidf * tfidf + weight_bert * bert`` for the two texts.
    """
    lexical_score = compute_tfidf_similarity(text1, text2)
    semantic_score = compute_bert_similarity(text1, text2)
    return weight_tfidf * lexical_score + weight_bert * semantic_score

def contains_named_entity(text):
    """Tell whether spaCy finds a PERSON, ORG or GPE entity in ``text``.

    Uses the module-level ``nlp`` pipeline.

    Returns:
        True as soon as one entity carries one of the three labels, else False.
    """
    wanted_labels = ("PERSON", "ORG", "GPE")
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            return True
    return False

# Load questions and answers from Excel file
input_file = '/content/Book2.xlsx'
output_file = 'similarity_results.xlsx'

df = pd.read_excel(input_file)

# Ensure the input file has columns 'Question' and 'Answer'
if 'Question' not in df.columns or 'Answer' not in df.columns:
    raise ValueError("Input Excel file must contain 'Question' and 'Answer' columns.")

threshold = 0.4  # Adjust as needed


def _cell_text(value):
    """Return a spreadsheet cell as text; missing cells (NaN/None) become ''."""
    return "" if pd.isna(value) else str(value)


results = []

# Iterate through the questions and answers
for question_cell, answer_cell in zip(df['Question'], df['Answer']):
    # Empty cells arrive as NaN floats, which neither the TF-IDF vectorizer nor
    # spaCy accepts, so every cell is converted to text first.
    question = _cell_text(question_cell)
    generated_answer = _cell_text(answer_cell)

    if not question.strip() or not generated_answer.strip():
        # Nothing to compare: record the row as not relevant instead of
        # crashing on an empty vocabulary.
        similarity_score = 0.0
        relevance = "Not Relevant"
    else:
        similarity_score = compute_combined_similarity(question, generated_answer)

        # Additional check for named entities
        if contains_named_entity(question) and contains_named_entity(generated_answer):
            relevance = "Relevant"
        else:
            relevance = "Relevant" if similarity_score >= threshold else "Not Relevant"

    results.append({
        'Question': question,
        'Generated Answer': generated_answer,
        'Similarity Score': similarity_score,
        'Relevance': relevance
    })

# Create a DataFrame for the results
results_df = pd.DataFrame(results)

# Write the results to an Excel file
results_df.to_excel(output_file, index=False)

print(f"Results have been written to {output_file}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/Book2.xlsx'