### 1. Importing libraries

In [120]:
import os
import langchain 
import langchain_community
import langchain_huggingface
import langchain_pinecone 
import pinecone
import dotenv
import streamlit as st

### 2. Setting Up API Keys

In [121]:
# Replace the placeholders with your own keys before running this cell.
HUGGINGFACE_API_KEY = "enter_your_api_key"
PINECONE_API_KEY = "enter_your_api_key"

# Interpolate the keys into the .env body. The previous f-string had no
# placeholders, so the file was written with empty values and the keys
# entered above never reached the environment.
env_content = (
    f"HUGGINGFACE_API_KEY={HUGGINGFACE_API_KEY}\n"
    f"PINECONE_API_KEY={PINECONE_API_KEY}\n"
)

# Overwrites any existing .env in the current working directory.
with open(".env", "w", encoding="utf-8") as file:
    file.write(env_content)

print("Environment variables are saved to .env file.")

Environment variables are saved to .env file.


### Loading the Environment File

Run the following cell to load the environment file each time you use this notebook.

In [122]:
# Load the variables from the .env file written above into the process
# environment; ResearchChatbot later reads PINECONE_API_KEY and
# HUGGINGFACE_API_KEY via os.environ / os.getenv.
dotenv.load_dotenv()

True

### 3. Imports

In [123]:
import os
import re
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WikipediaLoader
from langchain.retrievers import WikipediaRetriever
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from uuid import uuid4
from langchain_huggingface import HuggingFaceEndpoint
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_core.runnables import RunnableLambda
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache
from langchain_google_genai import ChatGoogleGenerativeAI

### 4. Helper Functions

In [124]:
def create_index(pc, index_name="chatbotresearch", dimension=768, metric="cosine",
                 cloud="aws", region="us-east-1"):
    """Ensure a serverless Pinecone index exists and return its name.

    Args:
        pc: Pinecone client exposing ``list_indexes().names()`` and
            ``create_index(...)``.
        index_name: Name of the index to look up or create.
        dimension: Vector dimension of the index; it must match the size of
            the embeddings that will be stored in it.
        metric: Similarity metric used by the index.
        cloud: Cloud provider passed to ``ServerlessSpec``.
        region: Cloud region passed to ``ServerlessSpec``.

    Returns:
        ``index_name``, whether the index already existed or was just created.
    """
    # Creation is skipped when an index with this name is already present, so
    # the function is safe to call on every notebook run.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )
    return index_name

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

def print_question(question):
    """Print ``question`` to stdout and return it unchanged."""
    print(question)
    return question

def check_string(input_string):
    """Extract the first backtick-quoted phrase from ``input_string``.

    Returns the text between the first pair of backticks (which may be an
    empty string), or the sentinel ``"No"`` when no backtick pair is found.
    """
    quoted = re.search(r'`(.*?)`', input_string)
    return quoted.group(1) if quoted else "No"

In [125]:
class ResearchChatbot:
    """Question-answering chatbot over local PDF papers with a Wikipedia path.

    Typical use: construct, call ``load_papers`` on a directory of PDFs, call
    ``store_papers`` to index them in Pinecone, then call ``generate``.

    Attributes:
        pdfs: File names of every PDF loaded so far.
        documents: Documents returned by the PDF loader (unsplit).
        docs: Chunks produced by splitting ``documents``; these are indexed.
        retriever: Vector-store retriever, ``None`` until ``store_papers`` runs.
        local_cache: Maps each query to its response in insertion order. It is
            both the response cache and the source of the "previous
            query/response" history that ``generate`` puts in the prompt.
    """

    # Instruction sentences for the two prompt families (vector-store context
    # vs. Wikipedia context) and the extra sentence added when history exists.
    _GEN_INSTRUCTION = (
        "You are given a question and its context. If the context and question do not match "
        "in terms of the topic, then ignore the context and answer to the best of your "
        "knowledge. Otherwise, answer the question using the context provided."
    )
    _WIKI_INSTRUCTION = (
        "Using the following context answer the question. Do not make stuff from your on."
    )
    _GEN_HISTORY_NOTE = (
        "You are also given the last query the user asked and the response given to that "
        "query given by you. You are expected to have context of this query when responding "
        "to the question. However, use the context only if it matches the question."
    )
    _WIKI_HISTORY_NOTE = (
        "You are also given the last query the user asked and the response given to that "
        "query. You are expected to have context of this query and response."
    )

    def __init__(self):
        """Connect to Pinecone, ensure the index exists, and set up the LLM clients."""
        self.pdfs = []
        self.documents = []
        self.docs = []
        self.pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
        self.index_name = create_index(self.pc)
        self.retriever = None
        self.wiki_retriever_instance = WikipediaRetriever()
        self.local_cache = {}

        repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

        # The Google key is read from the environment instead of being embedded
        # in the source. This client is not used by generate(), so it is only
        # built when GOOGLE_API_KEY is set.
        google_api_key = os.environ.get("GOOGLE_API_KEY")
        self.llm_setup = (
            ChatGoogleGenerativeAI(
                model="gemini-1.5-pro",
                google_api_key=google_api_key,
                temperature=0.2,
            )
            if google_api_key
            else None
        )

        self.llm = HuggingFaceEndpoint(
            repo_id=repo_id,
            temperature=0.8,
            top_k=50,
            huggingfacehub_api_token=os.getenv('HUGGINGFACE_API_KEY')
        )

    def load_papers(self, filepath):
        """Load every ``.pdf`` file in ``filepath`` and split the result into chunks.

        Only the files found by this call are read, so calling the method again
        (for example on a second directory) does not re-load earlier file names
        against the new path. ``self.docs`` is rebuilt from all documents
        loaded so far.
        """
        # Sorted so the load order does not depend on os.listdir ordering.
        found_pdfs = sorted(name for name in os.listdir(filepath) if name.endswith('.pdf'))

        for pdf in found_pdfs:
            loader = PyMuPDFLoader(os.path.join(filepath, pdf))
            self.documents.extend(loader.load())
        self.pdfs.extend(found_pdfs)

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=4)
        self.docs = text_splitter.split_documents(self.documents)

    def store_papers(self):
        """Embed the split chunks, upload them to Pinecone, and build the retriever."""
        embeddings = HuggingFaceEmbeddings()
        index = self.pc.Index(self.index_name)
        vector_store = PineconeVectorStore(index=index, embedding=embeddings)

        # Index the chunks from load_papers, not the unsplit documents.
        # NOTE(review): ids are random, so every call uploads fresh copies;
        # re-running the notebook adds duplicates to the index.
        if self.docs:
            uuids = [str(uuid4()) for _ in self.docs]
            vector_store.add_documents(documents=self.docs, ids=uuids)

        self.retriever = vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"k": 2, "score_threshold": 0.5},
        )

    @staticmethod
    def _build_template(instruction, context_var, history_note=None):
        """Assemble a prompt template string for one context variable.

        With ``history_note`` the template also carries ``{prev_prompt}`` and
        ``{prev_response}``. The leading whitespace on each line reproduces the
        layout of the prompt strings this class has always sent to the model.
        """
        pad = " " * 12
        if history_note is None:
            lines = [instruction, " " * 16]
        else:
            lines = [
                instruction,
                pad + history_note,
                pad + "Previous Query: {prev_prompt}",
                pad + "Previous Response: {prev_response}",
            ]
        lines.extend([
            pad + "Question: {question}",
            pad + "Context: {" + context_var + "}",
            pad + "Answer:",
        ])
        return "\n".join(lines)

    def generate(self, query):
        """Answer ``query`` and remember the answer in ``local_cache``.

        A query already in the cache is returned as stored and becomes the most
        recent entry. Otherwise a backtick-quoted phrase in the query is looked
        up with the Wikipedia retriever, and a query without one is looked up
        with the paper retriever. When earlier queries exist, the most recent
        query and response are added to the prompt.

        Args:
            query: The user's question.

        Returns:
            The response string produced by the LLM chain (or the cached one).

        Raises:
            RuntimeError: If the paper retriever is needed but ``store_papers``
                has not been called yet.
        """
        if query in self.local_cache:
            # Re-insert so this entry becomes the latest one in the history.
            self.local_cache[query] = self.local_cache.pop(query)
            return self.local_cache[query]

        inputs = {"question": query}
        has_history = bool(self.local_cache)
        if has_history:
            last_query, last_response = next(reversed(self.local_cache.items()))
            inputs["prev_prompt"] = last_query
            inputs["prev_response"] = last_response

        wiki_topic = check_string(query)
        if wiki_topic == "No":
            if self.retriever is None:
                raise RuntimeError(
                    "No paper retriever available; call store_papers() before generate()."
                )
            context_var = "context"
            instruction = self._GEN_INSTRUCTION
            history_note = self._GEN_HISTORY_NOTE
            inputs[context_var] = self.retriever.invoke(query)
        else:
            context_var = "wiki_context"
            instruction = self._WIKI_INSTRUCTION
            history_note = self._WIKI_HISTORY_NOTE
            inputs[context_var] = self.wiki_retriever_instance.invoke(wiki_topic)

        template = self._build_template(
            instruction, context_var, history_note if has_history else None
        )
        prompt = PromptTemplate(template=template, input_variables=list(inputs))
        chain = prompt | self.llm | StrOutputParser()

        # Invoke once and return exactly what is cached. Previously the
        # paper path called the model a second time for the return value.
        response = chain.invoke(inputs)
        self.local_cache[query] = response
        return response

### 5. Testing your ResearchChatbot

In [126]:
# Build the chatbot (its constructor connects to Pinecone and creates the index
# if it is missing), read every .pdf under ./papers/, then upload the papers to
# the vector store so that bot.retriever is available for generate().
bot = ResearchChatbot()
bot.load_papers("./papers/")
bot.store_papers()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\Dell\.cache\huggingface\token
Login successful




In [127]:
# The backtick-quoted phrase is extracted by check_string and used as the
# search term for the Wikipedia retriever instead of the paper retriever.
query = "What is the `Priory of Sion`. Get information from Wikipedia."
response = bot.generate(query)
print(response)

 The Priory of Sion is a fraternal organization founded by Pierre Plantard in 1956 in Annemasse, Haute-Savoie, France. Plantard claimed that the Priory of Sion was the latest front for a secret society founded by crusading knight Godfrey of Bouillon in 1099, with the goal of installing a secret bloodline of the Merovingian dynasty on the thrones of France and the rest of Europe. These claims were later popularized by the authors of the 1982 book The Holy Blood and the Holy Grail and borrowed by Dan Brown for his 2003 mystery thriller novel The Da Vinci Code. However, it was later discovered that the historical existence and activities of the Priory of Sion before 1956 were a hoax created by Plantard as part of his unsuccessful attempt to become a respected, influential and wealthy player in French esotericist and monarchist circles. Despite being debunked as France's greatest 20th-century literary hoax by journalists and scholars, many conspiracy theorists still believe that the Priory

In [128]:
# No backticks in the query, so the context comes from the paper retriever.
query = "Can you explain to me what is Mixed-Domain Pretraining?"
response = bot.generate(query)
print(response)

 Mixed-Domain Pretraining is a method of language model pretraining where out-domain text is assumed to be helpful and domain-specific pretraining is typically initialized with a general-domain language model and inherits its vocabulary. It is different from Domain-Specific Pretraining from Scratch which derives the vocabulary and conducts pretraining using solely in-domain text. According to the study mentioned in the context, for domains with abundant text such as biomedicine, Domain-Specific Pretraining from Scratch can substantially outperform the conventional mixed-domain approach. The authors of the study also question the prevailing assumption that out-domain text is still helpful and suggest that for domains with abundant unlabeled text such as biomedicine, it is unclear that domain-specific pretraining can benefit from transfer from general domains. They also express concern about negative transfer that actually hinders the target performance.


In [129]:
# Follow-up question: local_cache is no longer empty, so the most recent query
# and its response are included in the prompt as conversation history.
query = "Could you please summarize your previous answer in one line?"
response = bot.generate(query)
print(response)

 In the previous answer, I explained that Mixed-Domain Pretraining involves using out-domain text in addition to domain-specific text for language model pretraining, while Domain-Specific Pretraining from Scratch uses only in-domain text. I also mentioned a study that found that for domains with abundant text such as biomedicine, Domain-Specific Pretraining from Scratch can outperform the conventional mixed-domain approach, and raised concerns about negative transfer from general domains to specific domains.


In [130]:
%%time
# Same query string as the first test cell, so generate() returns the stored
# response from local_cache and re-inserts it as the most recent entry.
query = "What is the `Priory of Sion`. Get information from Wikipedia."
response = bot.generate(query)
print(response)

 The Priory of Sion is a fraternal organization founded by Pierre Plantard in 1956 in Annemasse, Haute-Savoie, France. Plantard claimed that the Priory of Sion was the latest front for a secret society founded by crusading knight Godfrey of Bouillon in 1099, with the goal of installing a secret bloodline of the Merovingian dynasty on the thrones of France and the rest of Europe. These claims were later popularized by the authors of the 1982 book The Holy Blood and the Holy Grail and borrowed by Dan Brown for his 2003 mystery thriller novel The Da Vinci Code. However, it was later discovered that the historical existence and activities of the Priory of Sion before 1956 were a hoax created by Plantard as part of his unsuccessful attempt to become a respected, influential and wealthy player in French esotericist and monarchist circles. Despite being debunked as France's greatest 20th-century literary hoax by journalists and scholars, many conspiracy theorists still believe that the Priory

In [131]:
# The cache hit in the previous cell made the Priory of Sion entry the newest
# one in local_cache, so "previous answer" here refers to that response.
query = "Explain previous answer in one line."
response = bot.generate(query)
print(response)

  The Priory of Sion is a fraternal organization founded in France, which was claimed to be a centuries-old cabal concealing a religiously subversive secret, but was later exposed as a hoax.


In [132]:
# No backticks, so the paper retriever supplies the context; the prompt tells
# the model to ignore context that does not match the question's topic.
query = "What is the biggest mammal?"
response = bot.generate(query)
print(response)

 The biggest mammal is the blue whale.


In [133]:
# "it" is resolved through the previous query and response, which generate()
# adds to the prompt from the newest local_cache entry.
query = "Where is it found?"
response = bot.generate(query)
print(response)

 Blue whales are typically found in the ocean. They are known to inhabit a wide range of areas including the Arctic, Antarctic, North Atlantic, and North Pacific oceans. They prefer deep, offshore waters and are known to migrate long distances for feeding and breeding. However, specific locations can vary depending on the time of year and other environmental factors.
