In [1]:
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma bs4

Note: you may need to restart the kernel to use updated packages.


In [3]:
import bs4
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [4]:
import requests
from langchain.document_loaders import TextLoader

# Page whose visible text becomes the knowledge base for the RAG chain below.
url = "https://www.legalandgeneral.com/retirement/our-products/"

# requests applies no timeout by default, so an unresponsive server would hang
# this cell indefinitely; bound the wait explicitly.
res = requests.get(url, timeout=30)
# Fail fast on 4xx/5xx so an error page is never indexed as product content.
res.raise_for_status()

from bs4 import BeautifulSoup

soup = BeautifulSoup(res.text, "html.parser")
text = soup.get_text()

# Drop empty and whitespace-only lines; the remaining lines are kept verbatim
# (not stripped) and re-joined with single newlines.
lines = text.split('\n')
non_empty_lines = [line for line in lines if line.strip()]
cleaned_text = '\n'.join(non_empty_lines)

In [23]:
import os
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import Language
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from bs4 import BeautifulSoup
import requests
import re

# Persist the scraped text so it can be read back through TextLoader.
# The encoding is pinned on both the write and the read: without it both sides
# fall back to the platform default, which on non-UTF-8 locales can raise
# UnicodeEncodeError (or mis-decode) for characters outside that code page.
with open("insurance_products.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

loader = TextLoader("insurance_products.txt", encoding="utf-8")
documents = loader.load()

# Plain-prose splitter: chunks of up to 1000 characters, with 200 characters of
# overlap so text cut at a chunk boundary still appears whole in one chunk.
documents_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

texts = documents_splitter.split_documents(documents)


In [25]:
# Embedding model used to index the chunks (and, through the retriever, the queries).
embeddings = OpenAIEmbeddings(disallowed_special=())

# Stable, position-based ids. Without explicit ids every run of this cell gets
# fresh random ids, so re-running against the persisted ./db appends another
# full copy of all chunks and retrieval starts returning duplicates.
# NOTE(review): relies on langchain-chroma writing with upsert semantics so that
# a repeated id overwrites the stored record -- confirm for the installed version.
# Records already stored under random ids by earlier runs are not removed.
chunk_ids = [f"insurance_products-{i}" for i in range(len(texts))]
vectordb = Chroma.from_documents(
    texts,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory='./db',
)

retriever = vectordb.as_retriever()

In [26]:
# Single chat model instance, reused by both the question-rewriting step and
# the answer-generation step defined in the following cells.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
)

In [27]:
### Contextualize question ###
# System instruction for the rewriting step: turn a follow-up question into a
# standalone one, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: instruction, then the prior conversation, then the new question.
contextualize_q_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_q_messages)

history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [28]:
### Answer question ###
# System instruction for the answering step; the `{context}` slot sits on its
# own line after the instructions.
qa_system_prompt = (
    "You are a helpful assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n{context}"
)

# Same message layout as the contextualize prompt: instruction, history, question.
qa_messages = [
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval feeding the answering chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)


In [29]:
### Statefully manage chat history ###
# In-memory registry: session id -> that session's message history.
# Re-running this cell rebinds `store` to an empty dict.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the message history registered for ``session_id``.

    On the first request for an id, a new empty ``ChatMessageHistory`` is
    created, saved in ``store`` under that id, and returned; later requests
    return that same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
        return history


# Pair the RAG chain with the per-session history lookup. The three key names
# match the rest of the notebook: "input" is the prompts' `{input}` variable,
# "chat_history" is the MessagesPlaceholder name, and "answer" is the field
# read from the chain's result.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)

In [31]:
# The session id selects the entry in `store`; "abc123" is created there on
# first use by get_session_history.
session_config = {"configurable": {"session_id": "abc123"}}

first_turn = conversational_rag_chain.invoke(
    {"input": "what are the investment option available?"},
    config=session_config,
)
# Bare last expression so the notebook displays just the answer text.
first_turn["answer"]

Parent run bcbab051-a632-4934-a9dc-069140147874 not found for run 29586f48-14e0-4470-ac43-fef912233c09. Treating as a root run.


'The investment options available include Stocks and Shares ISA, Personal Pension, Pension Annuity, and Cash-Out Retirement Plan. These options offer tax-efficient ways to save for retirement and provide guaranteed income for life or a set period. Customers can also transfer existing pensions or start saving with workplace pensions.'