In [2]:
import os
import glob
import gradio as gr
from dotenv import load_dotenv

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_chroma import Chroma
from langchain.vectorstores import FAISS
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import plotly.io as pio
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain


In [4]:
# Chat model identifier; passed to ChatOpenAI when the conversation chain is built.
MODEL_NAME = "gpt-4o-mini"
# Name intended for the vector store; NOTE(review): not referenced by any visible cell — confirm it is still needed.
db_name = "faiss_db"

In [5]:
# Load environment variables via python-dotenv, then read the OpenAI key from the
# process environment. The value is None when the variable is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [6]:
# Each sub-folder of knowledge-base/ is treated as one document category; the folder
# name becomes the `doc_type` metadata on every document loaded from it.
# Plain files sitting directly under knowledge-base/ are skipped, because
# DirectoryLoader only accepts a directory path.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]

# Read the markdown files as UTF-8 explicitly instead of relying on the platform
# default encoding, which is not UTF-8 on every system.
text_loader_kwargs = {"encoding": "utf-8"}

documents = []
for folder in folders:
    # Folder name as document type
    doc_type = os.path.basename(folder)
    print(f"Document type: {doc_type}")

    # Load all markdown files in the folder (recursively)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()
    print(f"Loaded {len(folder_docs)} documents from {folder}")

    # Tag every document with its category so later cells can group/colour by it
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)


Document tpye: products
Loaded 4 documents from knowledge-base/products
Document tpye: contracts
Loaded 12 documents from knowledge-base/contracts
Document tpye: company
Loaded 3 documents from knowledge-base/company
Document tpye: employees
Loaded 12 documents from knowledge-base/employees


In [7]:
# Total number of documents collected across all knowledge-base folders
len(documents)

31

In [8]:
# Peek at the first loaded document (metadata + page content)
documents[0]

Document(metadata={'source': 'knowledge-base/products/Rellm.md', 'doc_type': 'products'}, page_content="# Product Summary\n\n# Rellm: AI-Powered Enterprise Reinsurance Solution\n\n## Summary\n\nRellm is an innovative enterprise reinsurance product developed by Insurellm, designed to transform the way reinsurance companies operate. Harnessing the power of artificial intelligence, Rellm offers an advanced platform that redefines risk management, enhances decision-making processes, and optimizes operational efficiencies within the reinsurance industry. With seamless integrations and robust analytics, Rellm enables insurers to proactively manage their portfolios and respond to market dynamics with agility.\n\n## Features\n\n### AI-Driven Analytics\nRellm utilizes cutting-edge AI algorithms to provide predictive insights into risk exposures, enabling users to forecast trends and make informed decisions. Its real-time data analysis empowers reinsurance professionals with actionable intellige

In [9]:
# Split every loaded document into overlapping chunks for embedding.
# Sizes are in characters: target chunk size 1000 with 200 shared between neighbours.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = CharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1088, which is longer than the specified 1000


In [10]:
# Number of chunks produced by the text splitter
len(chunks)

123

In [11]:
# Embedding client built with library defaults (no arguments are passed here)
embeddings = OpenAIEmbeddings()

In [14]:
# Embed all chunks and build a FAISS vector store from them
vectorstore = FAISS.from_documents(chunks, embedding=embeddings)

# Read the index size and vector dimensionality straight off the FAISS index
faiss_index = vectorstore.index
total_vectors, dimensions = faiss_index.ntotal, faiss_index.d

print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in the vector store")

There are 123 vectors with 1,536 dimensions in the vector store


Visualizing the Vector Store

In [15]:
# Prework: pull every stored vector, its chunk text and its category back out of the
# FAISS store so they can be plotted.
color_map = {'products':'blue', 'employees':'green', 'contracts':'red', 'company':'orange'}

# Look up the stored chunk behind each index position
stored_chunks = [
    vectorstore.docstore.search(vectorstore.index_to_docstore_id[position])
    for position in range(total_vectors)
]

# One reconstructed embedding per index position, stacked into a single array
vectors = np.array(
    [vectorstore.index.reconstruct(position) for position in range(total_vectors)]
)

# NOTE: this rebinds `documents` from the list of loaded Document objects to a list of
# plain chunk texts; re-running the splitter cell after this one will not work without
# reloading the documents first.
documents = [chunk.page_content for chunk in stored_chunks]
doc_types = [chunk.metadata['doc_type'] for chunk in stored_chunks]
colors = [color_map[kind] for kind in doc_types]

In [16]:
# Project the embedding vectors down to two dimensions with t-SNE so they can be
# drawn on a flat scatter plot. The random_state is pinned for reproducibility.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per chunk, coloured by document type,
# with the type and the first 100 characters of the chunk shown on hover.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles are set on the 2D cartesian axes directly; `scene` only applies to
# 3D traces, so putting the titles there leaves a 2D plot unlabelled.
fig.update_layout(
    title='2D FAISS Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

# Render in a browser tab rather than inline in the notebook
pio.renderers.default = 'browser'

fig.show()



In [17]:
# Chat with OpenAI
# Conversation memory: prior turns are kept under the "chat_history" key as messages
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Retriever view over the FAISS vector store
retriver = vectorstore.as_retriever()

# Chat model used to answer questions
llm = ChatOpenAI(model=MODEL_NAME, temperature=0.7)

# Tie model, retriever and memory together into one retrieval-augmented chat chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriver,
    memory=memory,
)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [38]:
# Smoke test: ask the chain a single question and print its answer
question = "What is the company name?"
chain_input = {"question": question}
result = conversation_chain.invoke(chain_input)
print(result["answer"])

The company names mentioned are Insurellm and EverGuard Insurance.


In [44]:
def chat(message, history):
    """Answer a single chat message using the shared conversation chain.

    Args:
        message: The user's latest message; sent to the chain as the "question".
        history: Prior turns supplied by the chat UI. Not read here — the chain is
            invoked with the message only.

    Returns:
        The value stored under "answer" in the chain's result.
    """
    response = conversation_chain.invoke({"question": message})
    answer = response["answer"]
    return answer

In [45]:

# Build the chat UI around `chat`, then launch it and open it in a browser tab.
# `view` holds whatever launch() returns, not the interface object itself.
chat_ui = gr.ChatInterface(fn=chat, type="messages")
view = chat_ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.
