In [16]:
import pinecone
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone
from uuid import uuid4
import yaml
import logging.config
import os
import re

# Configure logging from the ini file; loggers that already exist are left enabled
# (disable_existing_loggers=False). `logger` is the notebook-wide logger used by the classes below.
logging.config.fileConfig(fname='logging_config.ini', disable_existing_loggers=False)
logger = logging.getLogger(__name__)


In [18]:
class CustomLoader():
    '''
    Object class that handles loading of webpages and uploading to PineCone vectorDB

    Typical flow: load_webpages() -> index_db() -> upload_to_server().
    Loaded chunks and their metadata are kept in memory (self.texts / self.metadatas)
    until they are uploaded or cleared.
    '''
    def __init__(self, config: dict)-> None:
        '''
        # Parameters
        -------------
        config : dictionary containing the following info
            embedding_options
                model : name of the embedding model passed to OpenAIEmbeddings
            splitter_options
                chunk_size : maximum size of a chunk
                chunk_overlap : overlap between consecutive chunks
                chunk_separators : separators tried by the recursive splitter
                clean_regex : truthy to enable the regex clean-up step
                regex_patterns : patterns that are replaced by a single space

        '''
        self.vector_db = None
        # Populated by index_db(); upload_to_server() refuses to run while it is None
        self.index = None

        self.embedding_function = OpenAIEmbeddings(
            deployment="SL-document_embedder",
            model=config['embedding_options']['model'],
            show_progress_bar=True)

        # from_tiktoken_encoder: chunk lengths are measured with a tiktoken encoder
        self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                    chunk_size=config['splitter_options']['chunk_size'],
                    chunk_overlap=config['splitter_options']['chunk_overlap'],
                    separators = config['splitter_options']['chunk_separators']
                    )
        self.clean_regex = config['splitter_options']['clean_regex']
        self.regex_patterns = config['splitter_options']['regex_patterns']

        self.texts = []
        self.metadatas = []
        logger.info(f'Initialized loader, current no. of chunks: {len(self.texts), len(self.metadatas)}')

    def __str__(self) -> str:
        '''Returns the length of currently loaded data and metadata as a string, e.g. "(4, 4)"; both should match'''
        # __str__ must return a str; returning the bare tuple raises TypeError
        return str((len(self.texts), len(self.metadatas)))

    def load_webpages(self, url)-> None:
        '''
        Function to load webpage and split into raw texts with metadata.
        The resulting chunks are appended to self.texts / self.metadatas.
        # Parameters
        ------------
        url : str or list of strings containing urls
        '''
        loader = WebBaseLoader(url)
        for record in loader.load():
            # Remove messy delimiters first:
            if self.clean_regex:
                # `or []` tolerates clean_regex being enabled without any patterns configured
                for regex_pattern in self.regex_patterns or []:
                    record.page_content = re.sub(regex_pattern, ' ', record.page_content)

            # Extract metadata, split text and append to main list to upload
            metadata = record.metadata
            record_texts = self.splitter.split_text(record.page_content)
            # The chunk text is stored under the "text" metadata key; page-level metadata is merged in last
            record_metadatas = [{"chunk": chunk_num, "text": text, **metadata} for chunk_num, text in enumerate(record_texts)]
            self.texts.extend(record_texts)
            self.metadatas.extend(record_metadatas)
            logger.info((len(self.texts), len(self.metadatas)))

    def index_db(self, vector_db: "pinecone.Pinecone", index_name: str)-> None:
        '''
        Function to select the index of the vector database that uploads will go to
        # Parameters
        -------------
        vector_db (pinecone.Pinecone) : Pinecone client instance
        index_name (str) : name of index
        '''
        # The annotation is a string so the class can be defined before `pinecone` is resolved
        self.index = vector_db.Index(index_name)
        logger.info(vector_db.describe_index(index_name))


    def upload_to_server(self, namespace : str)-> None:
        '''
        Function to upload any loaded data and metadata to the vector_db index, clears the uploaded text and metadata from instance once completed
        # Parameters
        ------------
        namespace (str) : a namespace for partitioning items in the index
        # Raises
        ------------
        RuntimeError : if index_db() has not been called yet
        '''
        # Check the index first so no embedding request is made for an upload that cannot succeed
        index = getattr(self, 'index', None)
        if index is None:
            raise RuntimeError('No index selected: call index_db() before upload_to_server()')

        if not self.texts:
            logger.warning('No chunks loaded, nothing to upload')
            return

        ids = [str(uuid4()) for _ in self.texts]
        embeddings = self.embedding_function.embed_documents(self.texts)
        # Materialise the (id, vector, metadata) tuples: a zip object is a one-shot iterator without len()
        vectors = list(zip(ids, embeddings, self.metadatas))
        index.upsert(vectors=vectors, namespace=namespace)

        # Only reached when the upsert did not raise, so nothing is dropped on failure
        self.texts = []
        self.metadatas = []


    def clear_docs(self):
        '''Discards all loaded chunks and metadata in place, without uploading them'''
        self.texts.clear()
        self.metadatas.clear()

    def create_embeddings(self):
        '''Returns the embeddings of the currently loaded chunks (one vector per chunk) without uploading them'''
        return self.embedding_function.embed_documents(self.texts)


In [19]:
# Load the splitter / embedding settings consumed by CustomLoader
with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

# Pages to ingest
urls = ['https://www.webmd.com/diabetes/diabetes-basics', 
        'https://www.webmd.com/diabetes/understanding-diabetes-symptoms']

# Fetch the pages and split them into chunks; the chunks stay in memory until uploaded
custom_loader = CustomLoader(config)
custom_loader.load_webpages(urls)

2024-01-27 18:47:22,303 - __main__ - INFO - Initialized loader, current no. of chunks: (0, 0)
2024-01-27 18:47:25,828 - __main__ - INFO - (2, 2)
2024-01-27 18:47:25,862 - __main__ - INFO - (4, 4)


In [20]:
# Embed the currently loaded chunks without uploading them.
# NOTE(review): the returned list of vectors is echoed in full as the cell output.
custom_loader.create_embeddings()

  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 18:47:29,972 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[[-0.0031328726745076057,
  -0.018072456210557582,
  0.023757749699631913,
  -0.03195310442993803,
  -0.020419364653888855,
  0.006312809265043521,
  -0.006915224503106191,
  -0.011100754915085172,
  -0.03225431274746123,
  -0.010303810159001047,
  -0.0014181857468226133,
  0.008935825022330231,
  -0.027108683384513887,
  0.0105046144622532,
  0.012977026992100795,
  0.0033038707001761475,
  0.05692823417615668,
  0.013052329071481595,
  0.019365137405202633,
  -0.02186265186994047,
  -0.042319666166536,
  -0.008270658672331885,
  -0.005264857732910481,
  0.004342409355971278,
  -0.016302861536059976,
  0.00901740235260297,
  -0.006626567153028115,
  -0.02615486015009871,
  -0.004856972478530509,
  0.004712644036322092,
  0.014533266861562368,
  -0.010824647601129734,
  -0.02434761397024946,
  -0.011000352763459094,
  0.008132605015354167,
  0.01634051117876665,
  -0.0029101045274724196,
  -0.013692395349234662,
  0.0019405927001661485,
  -0.035391893489952135,
  0.01885057614528837,
 

In [14]:
# Inspect the metadata dictionaries currently held by the loader
custom_loader.metadatas

[]

In [None]:
# Create the Pinecone client (API key read from the PC_API_KEY environment variable)
# and point the loader at the 'healthhack' index
pc = pinecone.Pinecone(api_key=os.environ['PC_API_KEY'])
custom_loader.index_db(pc, 'healthhack')

In [92]:
# Embed the loaded chunks and upsert them into the 'sienlong-diabetes' namespace;
# the loader's texts and metadatas are emptied afterwards
custom_loader.upload_to_server('sienlong-diabetes')

  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 14:52:23,908 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


# Retrieval

In [2]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_community.vectorstores import Pinecone
import pinecone
from langchain.memory import ConversationBufferMemory
from langchain_core.runnables import RunnableLambda
from langchain_community.callbacks import get_openai_callback
from pprint import pprint

In [3]:
class VectorDB():
    '''
    Class object for retrieval and querying

    Typical flow: create_retriever() -> create_chain() -> query().
    '''
    def __init__(self, config: dict):
        '''
        Connects to the Pinecone index and sets up the embedding function and chat model.
        The Pinecone API key is read from the PC_API_KEY environment variable.
        # Parameters
        -------------
        config : dictionary containing the following info
            db_options
                index_name : name of the Pinecone index to query
            embedding_options
                model : name of the embedding model passed to OpenAIEmbeddings
            llm : name of the chat model passed to ChatOpenAI

        '''
        index_name = config['db_options']['index_name']
        pc = pinecone.Pinecone(api_key=os.environ['PC_API_KEY'])
        self.index = pc.Index(index_name)
        self.embedding_function = OpenAIEmbeddings(
            deployment="SL-document_embedder",
            model=config['embedding_options']['model'],
            show_progress_bar=True)
        logger.info(f"\n{pc.describe_index(index_name)}")
        self.llm = ChatOpenAI(
            model_name=config['llm'],
            temperature=1
            )
        logger.info(f"llm model: {config['llm']}")

        # Populated by create_retriever() / create_chain(); None means "not set up yet"
        self.retriever = None
        self.final_chain = None
        self.memory = None

    def create_retriever(self, namespace):
        '''
        Builds the LangChain Pinecone vector store for the given namespace.
        Despite its name, self.retriever holds the vector store; create_chain() turns it
        into a retriever with as_retriever().
        # Parameters
        -------------
        namespace (str) : namespace of the index to search in
        '''
        # 'text' is the metadata key that holds the chunk text
        self.retriever = Pinecone(self.index, self.embedding_function, 'text', namespace=namespace)

    def create_chain(self, k: int = 1):
        '''
        Creates the conversation chain from scratch.
        A new, empty conversation memory is created on every call.
        # Parameters
        -------------
        k (int) : number of documents retrieved as context for each question (default 1)
        # Raises
        -------------
        RuntimeError : if create_retriever() has not been called yet
        '''
        vector_store = getattr(self, 'retriever', None)
        if vector_store is None:
            raise RuntimeError('Retriever not created: call create_retriever() before create_chain()')

        # First need to summarise the history and create a standalone question using the LLM
        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

        Chat History:
        {chat_history}
        Follow Up Input: {question}
        Standalone question:"""

        self.CONDENSE_QUESTION_PROMPT = ChatPromptTemplate.from_template(_template)

        # Produces {'standalone_question': <str>} from {'question', 'chat_history'}
        _inputs = RunnableParallel(
            standalone_question=RunnablePassthrough.assign(
                chat_history=lambda x: get_buffer_string(x["chat_history"])
            )
            | self.CONDENSE_QUESTION_PROMPT
            | self.llm
            | StrOutputParser(),
        )

        # Next part is similar to a normal RAG, the difference is in the context pipe, we will pipe in the standalone_question

        langchain_retriever = vector_store.as_retriever(
            search_type="similarity", # mmr, similarity_score_threshold, similarity
            search_kwargs = {"k": k}
        )

        template = """Answer the question based only on the following context:
        {context}

        Question: {question}
        """
        ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

        _context = {
            "context": itemgetter("standalone_question") | langchain_retriever ,
            "question": lambda x: x["standalone_question"],
        }

        # First we add a step to load memory
        # This adds a "chat_history" key (the stored messages) to the input object
        self.memory = ConversationBufferMemory(
            return_messages=True, output_key="answer", input_key="question"
        )

        loaded_memory = RunnablePassthrough.assign(
            chat_history=RunnableLambda(self.memory.load_memory_variables) | itemgetter("history"),
        )

        self.final_chain = loaded_memory | _inputs | _context | ANSWER_PROMPT | self.llm
        logger.info('Chain created')

    def query(self, question):
        '''
        Runs the question through the conversation chain, stores the exchange in memory
        and logs the answer together with the OpenAI usage summary.
        # Parameters
        -------------
        question (str) : the user's question
        # Returns
        -------------
        str : the answer text (also written to the log)
        # Raises
        -------------
        RuntimeError : if create_chain() has not been called yet
        '''
        chain = getattr(self, 'final_chain', None)
        if chain is None:
            raise RuntimeError('Chain not created: call create_chain() before query()')

        # Named chain_input so the builtin input() is not shadowed
        chain_input = {'question': question}
        with get_openai_callback() as cb:
            result = chain.invoke(chain_input)
        # The chain does not write to memory itself, so the turn is saved manually
        self.memory.save_context(chain_input, {"answer": result.content})
        logger.info(result.content)
        logger.info(f"\n{cb}")
        return result.content

    def clear_memory(self):
        '''Clears the conversation history; does nothing extra if no chain (and memory) exists yet'''
        memory = getattr(self, 'memory', None)
        if memory is not None:
            memory.clear()
        logger.info('Memory cleared')
        

In [4]:
# Load the configuration (index name, embedding model, chat model)
with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

# Connect to the index, select the namespace to search, then build the conversational chain
vector_db = VectorDB(config)
vector_db.create_retriever(namespace='sienlong-diabetes')
vector_db.create_chain()

2024-01-27 17:11:39,849 - __main__ - INFO - 
{'dimension': 1536,
 'host': 'healthhack-2nyyyl3.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'healthhack',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}
2024-01-27 17:11:39,898 - __main__ - INFO - llm model: gpt-3.5-turbo-1106
2024-01-27 17:11:39,906 - __main__ - INFO - Chain created


In [5]:
# Inspect the conversation memory held by the chain
vector_db.memory.load_memory_variables([])

{'history': []}

In [55]:
# Ask a first question; the answer and the OpenAI usage summary are written to the log
vector_db.query('What is diabetes?')

2024-01-27 16:35:13,921 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 16:35:14,354 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-01-27 16:35:17,343 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-01-27 16:35:17,351 - __main__ - INFO - Diabetes is a number of diseases that involve problems with the hormone insulin. Normally, the pancreas releases insulin to help the body store and use the sugar and fat from the food. Diabetes occurs when the pancreas does not produce enough insulin or when the body does not respond appropriately to insulin, leading to high levels of sugar circulating in the blood.
2024-01-27 16:35:17,354 - __main__ - INFO - 
Tokens Used: 1723
	Prompt Tokens: 1646
	Completion Tokens: 77
Successful Requests: 2
Total Cost (USD): $0.0018


In [46]:
# Follow-up question: "it" is only resolvable through the chat history kept in memory
vector_db.query('Does it have a cure?')

2024-01-27 16:29:27,032 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 16:29:27,473 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-01-27 16:29:28,611 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
'There is no cure for diabetes.'


In [47]:
# Inspect the conversation memory again to see the stored question/answer messages
vector_db.memory.load_memory_variables([])

{'history': [HumanMessage(content='What is diabetes?'),
  AIMessage(content='Diabetes is a number of diseases that involve problems with the hormone insulin. It occurs when the pancreas does not produce insulin, produces very little insulin, or when the body does not respond appropriately to insulin, a condition called "insulin resistance."'),
  HumanMessage(content='Does it have a cure?'),
  AIMessage(content='There is no cure for diabetes.')]}

## Initial testing on Pinecone with Langchain

In [15]:
# Load the configuration (index name, embedding model, chat model)
with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

# Only the vector store is needed here, so no chain is created
vector_db = VectorDB(config)
vector_db.create_retriever(namespace='sienlong-diabetes')

query = "What is diabetes?"

# Query the vector store directly, bypassing the LLM
vector_db.retriever.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

2024-01-27 15:39:00,550 - __main__ - INFO - {'dimension': 1536,
 'host': 'healthhack-2nyyyl3.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'healthhack',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}


  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 15:39:01,327 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='An Overview of Diabetes  Skip to main content  Home Conditions  Back Conditions View All ADD/ADHDAllergiesArthritisAtrial fibrillationBreast CancerCancerCrohn\'s DiseaseDepressionDiabetesDVTEczemaEye HealthHeart DiseaseHIV & AIDSLung DiseaseLupusMental HealthMultiple SclerosisMigrainePain ManagementPsoriasisPsoriatic ArthritisRheumatoid ArthritisSexual ConditionsSkin ProblemsSleep DisordersUlcerative Colitis View All Drugs & Supplements  Back Drugs & SupplementsDrugsSupplementsPill IdentifierInteraction CheckerWell-Being  Back Well-Being View All Aging WellBabyBirth ControlChildren\'s HealthDiet & Weight ManagementFitness & ExerciseFood & RecipesHealthy BeautyMen\'s HealthParentingPet HealthPregnancySex & RelationshipsTeen HealthWomen\'s Health View All Symptom CheckerFind a DoctorMore  Back MoreNewsBlogsPodcastsWebinarsNewslettersWebMD MagazineBest HospitalsSupport GroupsOrthopedics Privacy & More  Subscribe  Log In  Search  Subscribe DiabetesDiabetes GuideOver

In [6]:
# First, use the LLM to condense the chat history and the follow-up into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CONDENSE_QUESTION_PROMPT = ChatPromptTemplate.from_template(_template)

# Yields {'standalone_question': <str>}; the message list is flattened to text with get_buffer_string
_inputs = RunnableParallel(
    standalone_question=RunnablePassthrough.assign(
        chat_history=lambda x: get_buffer_string(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatOpenAI(temperature=1)
    | StrOutputParser(),
)

# Invoking pipes both 'question' and 'chat_history' through the model to condense the question
_inputs.invoke({'question' : 'How do you mean by chronic?', 
                'chat_history': [ HumanMessage(content="What is Diabetes?"), 
                                 AIMessage(content="Diabetes is a form of chronic disease")]})

# The standalone question is then piped on to the retriever in the following cells

2024-01-27 17:11:56,246 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'standalone_question': 'What does the term "chronic" mean in this context?'}

In [7]:
# The next part is similar to a normal RAG chain; the difference is that the context pipe receives the standalone_question

# Depends on `vector_db` (with its retriever created) and `_inputs` from earlier cells
retriever = vector_db.retriever.as_retriever(
    search_type="similarity", # mmr, similarity_score_threshold, similarity
    search_kwargs = {"k": 1}
)

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

# Both prompt variables are derived from the standalone question: it is the retrieval query and the question asked
_context = {
    "context": itemgetter("standalone_question") | retriever ,
    "question": lambda x: x["standalone_question"],
}

# Putting it all together: condense -> retrieve -> answer (chat history is supplied by hand here)
conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
output = conversational_qa_chain.invoke({'question' : 'How are the symptoms?', 
                'chat_history': [ HumanMessage(content="What is Diabetes?"), 
                                 AIMessage(content="Diabetes is a form of chronic disease")]})
pprint(output.content)

2024-01-27 17:12:00,675 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 17:12:01,465 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-01-27 17:12:06,117 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
('The symptoms of diabetes include hunger and fatigue, increased urination and '
 'thirst, dry mouth and itchy skin, blurred vision, yeast infections, '
 'slow-healing sores or cuts, pain or numbness in the feet or legs, unplanned '
 'weight loss, nausea and vomiting, high blood sugar during pregnancy, '
 'slow-healing sores or cuts, itchy skin, frequent yeast infections, recent '
 'weight gain, dark skin changes, numbness and tingling of the hands and feet, '
 'decreased vision, impotence or erectile dysfunction, hypoglycemia (low blood '
 'sugar), hyperglycemia (high blood sugar), and diabetic coma.')


In [8]:
# First we add a step to load memory
# This adds a "chat_history" key (the stored messages) to the input object
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)

# Same chain as before, but the chat history now comes from `memory` instead of being passed in by hand
final_chain = loaded_memory | _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
# NOTE(review): `input` shadows the builtin; it and `result` are reused by the next cell
input = {'question': 'What is diabetes?'}
result = final_chain.invoke(input)
pprint(result.content)

2024-01-27 17:12:12,542 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


  0%|          | 0/1 [00:00<?, ?it/s]

2024-01-27 17:12:12,974 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-01-27 17:12:19,398 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
AIMessage(content='Diabetes is a chronic disease that involves problems with the hormone insulin. Normally, the pancreas releases insulin to help the body store and use the sugar and fat from the food we eat. Diabetes occurs when the pancreas does not produce enough insulin, produces no insulin, or when the body does not respond appropriately to insulin. This results in high levels of sugar circulating in the blood, known as high blood sugar. There are different types of diabetes, including type 1 diabetes, type 2 diabetes, prediabetes, and gestational diabetes. Type 1 diabetes occurs when the insulin-producing cells of the pancreas are destroyed, leading to no insulin production. Type 2 diabetes occurs when the body either does not produce enough insuli

In [None]:
# We need to manually save the exchange (input and output) to memory; this will be improved in the future
# Relies on `input` and `result` from the previous cell
memory.save_context(input, {"answer": result.content})
memory.load_memory_variables([])