In [1]:
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [2]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [3]:
MODEL = 'gpt-4o-mini'
db_name = 'vector-db'

In [4]:
load_dotenv(override=True)

True

In [5]:
folders = glob.glob("knowledge-base/*")
folders

['knowledge-base\\company',
 'knowledge-base\\contracts',
 'knowledge-base\\employees',
 'knowledge-base\\products']

In [6]:
text_loader_kwargs = {'encoding': 'utf-8'}
# text_loader_kwargs = {'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    print(doc_type)
    loader = DirectoryLoader(folder, glob= '**/*.md', loader_cls= TextLoader, loader_kwargs= text_loader_kwargs)
    folder_docs = loader.load()
    print(folder_docs)
    for doc in folder_docs:
        doc.metadata['doc_type'] = doc_type
        # print(doc)
        documents.append(doc)

company
[Document(metadata={'source': 'knowledge-base\\company\\about.md'}, page_content="# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices across the US."), Document(metadata={'source': 'knowledge-base\\company\\careers.md'}, page_content='# Careers at Insurellm\n\nInsurellm is hiring! We are looking for talented software engineers, data scientists and account executives to join our growing team. Come be a part of our movement to disrupt the insurance sector.'), Document(metadata={'source': 'knowledge-base\\company\\overview.md'}, page_content='# Overview of Insurellm\n\nInsurellm is an innovative insurance tech firm with 200 employees across the US.\nInsurellm offers 4 insuranc

In [7]:
text_splitter = CharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
chunks = text_splitter.split_documents(documents)
chunks[0]

Created a chunk of size 1088, which is longer than the specified 1000


Document(metadata={'source': 'knowledge-base\\company\\about.md', 'doc_type': 'company'}, page_content="# About Insurellm\n\nInsurellm was founded by Avery Lancaster in 2015 as an insurance tech startup designed to disrupt an industry in need of innovative products. It's first product was Markellm, the marketplace connecting consumers with insurance providers.\nIt rapidly expanded, adding new products and clients, reaching 200 emmployees by 2024 with 12 offices across the US.")

In [8]:
len(chunks)

123

In [9]:
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
doc_types

{'company', 'contracts', 'employees', 'products'}

In [10]:
embeddings = OpenAIEmbeddings()

vector_store = FAISS.from_documents(chunks, embedding=embeddings)

total_vectors = vector_store.index.ntotal
dimensions = vector_store.index.d

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vectorstore")

There are 123 vectors with 1,536 dimensions in the vectorstore


In [11]:
vectors = []
documents = []
doc_types = []
colors = []
color_map = {'products': 'blue', 'employees' : 'green', 'contracts' : 'red', 'company' : 'orange'}

for i in range(total_vectors):
    vectors.append(vector_store.index.reconstruct(i))
    doc_id = vector_store.index_to_docstore_id[i]
    document = vector_store.docstore.search(doc_id)
    documents.append(document.page_content)
    doc_type = document.metadata['doc_type']
    doc_types.append(doc_type)
    colors.append(color_map[doc_type])
vectors = np.array(vectors)

In [12]:
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

fig = go.Figure(data = [go.Scatter(
    x = reduced_vectors[:,0],
    y = reduced_vectors[:,1],
    mode = 'markers',
    marker = dict(size = 5, color = colors, opacity = 0.7),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)
]
)

fig.update_layout(
    title='2D FAISS Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [13]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.7),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D FAISS Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [15]:
llm = ChatOpenAI(temperature=0.7, model=MODEL)
memory = ConversationBufferMemory(memory_key= "chat_history", return_messages=True)
retriever = vector_store.as_retriever()
conversation_chain = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

In [18]:
query = "Can you describe Insurellm in a few sentences?"
result = conversation_chain({"question": query})
print(result['answer'])
# print(result['chat_history'])

Insurellm is an innovative insurance tech startup founded by Avery Lancaster in 2015. The company offers four software products, including Carllm for auto insurance, Homellm for home insurance, Rellm for the reinsurance sector, and Marketllm, a marketplace connecting consumers with insurance providers. With 200 employees and over 300 clients worldwide, Insurellm aims to disrupt the insurance industry through technology and innovation.


In [None]:
memory = ConversationBufferMemory(memory_key= "chat_history", return_messages=True)
conversation_chain = ConversationalRetrie   valChain.from_llm(llm, retriever=retriever, memory=memory)

In [23]:
def chat_with_memory(user_input, history):
    result = conversation_chain({"question": user_input})
    return result['answer']

In [24]:
view = gr.ChatInterface(fn=chat_with_memory).launch(share=True)


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://f025d616d929df7b38.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [25]:
from langchain_core.callbacks import StdOutCallbackHandler

llm = ChatOpenAI(temperature=0.7, model=MODEL)

memory = ConversationBufferMemory(memory_key= "chat_history", return_messages=True)

retriever = vector_store.as_retriever()

conversation_chain = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

query = "Who recieved the prestigious IIOTY award in 2023?"
result = conversation_chain({"question": query})
answer = result['answer']
print(answer)

I don't know.


In [27]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vector_store.as_retriever(search_kwargs={"k": 25})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [28]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [29]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.
