# Use case for using RAG - Creating Knowledge Base - II - Creating vectors

### A question answering agent that is an expert knowledge worker
### To be used by employees of Insurellm, an Insurance Tech company
### The agent needs to be accurate and the solution should be low cost.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

In [None]:
import os 
from dotenv import load_dotenv 
import glob 
import gradio as gr 
from openai import OpenAI

In [None]:
# including langchain imports 
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import DirectoryLoader, TextLoader          # Directory loader will load whole directory and text loader will load whole document
from langchain_text_splitters import CharacterTextSplitter                   # splitting the content in chunks so that there is some meaningful context 

In [None]:
# some more imports 
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

# lets add one more embedding model from local mxbai to compare the output
from langchain_ollama import OllamaEmbeddings, ChatOllama

# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain

In [None]:
# Chat models offered on the Gradio screen: one local (Ollama) and one frontier (OpenAI)
OLLAMA_MODEL, OPENAI_MODEL = "llama3.1", "gpt-4o-mini"

# Persist directories of the two Chroma vector stores, one per embedding model
db_name_ollama, db_name_openai = "vector_db_ollama_mxbai", "vector_db_openai_embed"

In [None]:
# Load variables from the .env file into the environment; override=True lets
# values from .env replace variables that are already set in the process.
load_dotenv(override=True)

In [None]:
# Create two OpenAI-compatible clients: one for the frontier model (OpenAI)
# and one for the local model served by Ollama.
api_key = os.getenv("OPENAI_API_KEY")
openai = OpenAI()

# Fall back to Ollama's default local OpenAI-compatible endpoint and a
# placeholder key when the environment variables are missing or empty.
# Passing None for either argument would make the client fall back to the
# OpenAI settings, so the "local" client would silently call OpenAI instead.
ollama = OpenAI(
    base_url=os.getenv("OLLAMA_BASE_URL") or "http://localhost:11434/v1",
    api_key=os.getenv("OLLAMA_API_KEY") or "ollama",
)

### 1. Now grab the documents and load them with LangChain loaders

In [None]:
context = {}

# Every sub-folder of knowledge-base is one document type (e.g. products, employees).
# Only directories are kept: a stray file sitting directly under knowledge-base
# would otherwise be handed to DirectoryLoader, which is built to walk a folder.
folders = [path for path in glob.glob("knowledge-base/*") if os.path.isdir(path)]

text_loader_kwargs = {"encoding": "utf-8"}

documents = []
for folder in folders:

    # the sub-folder name doubles as the document type
    doc_type = os.path.basename(folder)

    # load every markdown file below this folder as UTF-8 text
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()

    # tag each loaded document with its type so chunks can be grouped and coloured later
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
    documents.extend(folder_docs)

In [None]:
# number of markdown documents loaded from the knowledge-base folders
len(documents)

### Split the documents to manageable chunks 

With `chunk_size=1000`, LangChain does not cut the text at exactly 1000 characters; it splits on a separator and tries to build meaningful chunks of close to 1000 characters.  
Each chunk also overlaps its neighbours (`chunk_overlap=200`) so that context carries across chunk boundaries.

In [None]:
# Cut the loaded documents into chunks of roughly 1000 characters, with 200 characters of overlap
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# number of chunks produced by the text splitter
len(chunks)

## A sidenote on Embeddings, and "Auto-Encoding LLMs"

We will be mapping each chunk of text into a Vector that represents the meaning of the text, known as an embedding.

OpenAI offers a model to do this, which we will use by calling their API with some LangChain code.

This model is an example of an "Auto-Encoding LLM" which generates an output given a complete input.
It's different to all the other LLMs we've discussed today, which are known as "Auto-Regressive LLMs", and generate future tokens based only on past context.

Another example of an Auto-Encoding LLM is BERT from Google. In addition to embedding, Auto-Encoding LLMs are often used for classification.

### Sidenote

In week 8 we will return to RAG and vector embeddings, and we will use an open-source vector encoder so that the data never leaves our computer - that's an important consideration when building enterprise systems and the data needs to remain internal.

In [None]:
# Collect the distinct document types found in the chunk metadata
doc_types = {chunk.metadata["doc_type"] for chunk in chunks}
print(doc_types)

### Using OpenAIEmbeddings

In [None]:
# Embedding model used to turn each chunk into a vector (OpenAI embeddings through LangChain)
openai_embeddings=OpenAIEmbeddings()

# If you would rather use the free vector embeddings from HuggingFace sentence-transformers,
# replace the assignment above (keep the variable name so the later cells still work)
# with:
# from langchain.embeddings import HuggingFaceEmbeddings
# openai_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): newer LangChain releases may expose this class from a different
# package (e.g. langchain_huggingface) - confirm against the installed version.

In [None]:
# Start from a clean slate: when the persist directory is already present,
# open the store and drop its collection before rebuilding it.
if os.path.exists(db_name_openai):
    Chroma(
        persist_directory=db_name_openai,
        embedding_function=openai_embeddings,
    ).delete_collection()

In [None]:
# Embed every chunk with the OpenAI embedding model and store the vectors in Chroma
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=openai_embeddings,
    persist_directory=db_name_openai,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Underlying Chroma collection object (private attribute of the LangChain wrapper)
collection = vectorstore._collection

# Fetch a single record, requesting only its embedding, and keep that one vector
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]

# The length of one vector is the dimensionality of the embedding
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Shape of the sample vector. np.asarray makes this work whether the store
# returned the embedding as a NumPy array or as a plain Python list
# (a list has no .shape attribute).
np.asarray(sample_embedding).shape

In [None]:
# peek at the first 10 components of the sample embedding vector
sample_embedding[:10]

### Visualize the vector store for OpenAI

In [None]:
# Prework for the plots: pull everything out of the vector store
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

# embeddings as a 2D numpy array (one row per stored chunk)
vectors = np.array(result['embeddings'])

# chunk texts from the result set
# NOTE: this rebinds `documents`, which earlier held the loaded Document objects
documents = result['documents']

# document type of every chunk, taken from its metadata
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One colour per known document type. Unknown types fall back to gray so that a
# new knowledge-base folder does not make the lookup raise.
color_by_type = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [color_by_type.get(t, 'gray') for t in doc_types]

In [None]:
# peek at the colours assigned to the first 10 chunks
colors[:10]

In [None]:
# Project the embeddings down to 2 dimensions with t-SNE (fixed seed for a repeatable layout)
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# 2D scatter plot: one marker per chunk, coloured by document type;
# hovering shows the type and the first 100 characters of the chunk
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` only configures 3D plots; a 2D figure takes its axis titles
    # from the top-level xaxis/yaxis layout properties.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Same projection as the 2D plot, but onto three t-SNE components
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# 3D scatter plot: one marker per chunk, coloured by document type;
# hovering shows the type and the first 100 characters of the chunk
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            z=reduced_vectors[:, 2],
            mode='markers',
            marker={'size': 5, 'color': colors, 'opacity': 0.8},
            text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
            hoverinfo='text',
        )
    ]
)

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

# Let's try with a local model

In [None]:
# Local embedding model (mxbai-embed-large via Ollama), used to compare against the OpenAI embeddings
ollama_embeddings=OllamaEmbeddings(model="mxbai-embed-large")

In [None]:
# Drop the existing collection of the local-model store, if its persist directory is present
if os.path.exists(db_name_ollama):
    Chroma(
        persist_directory=db_name_ollama,
        embedding_function=ollama_embeddings,
    ).delete_collection()

In [None]:
# Embed the same chunks with the local Ollama embedding model into a separate Chroma store.
# NOTE: this rebinds `vectorstore`, which previously pointed at the OpenAI-backed store.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=ollama_embeddings,
    persist_directory=db_name_ollama,
)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Underlying Chroma collection of the local-model store (private attribute of the wrapper)
collection = vectorstore._collection

# Fetch a single record, requesting only its embedding, and keep that one vector
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]

# The length of one vector is the dimensionality of the local model's embedding
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Shape of the sample vector. np.asarray makes this work whether the store
# returned the embedding as a NumPy array or as a plain Python list
# (a list has no .shape attribute).
np.asarray(sample_embedding).shape

In [None]:
# peek at the first 10 components of the sample embedding vector
sample_embedding[:10]

In [None]:
# Prework for the local-model plots: pull everything out of the vector store
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

# embeddings as a 2D numpy array (one row per stored chunk)
vectors = np.array(result['embeddings'])

# chunk texts from the result set
documents = result['documents']

# document type of every chunk, taken from its metadata
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One colour per known document type. Unknown types fall back to gray so that a
# new knowledge-base folder does not make the lookup raise.
color_by_type = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [color_by_type.get(t, 'gray') for t in doc_types]

In [None]:
# Project the local-model embeddings down to 2 dimensions with t-SNE (fixed seed for a repeatable layout)
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# 2D scatter plot: one marker per chunk, coloured by document type;
# hovering shows the type and the first 100 characters of the chunk
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization - Local Model',
    # `scene` only configures 3D plots; a 2D figure takes its axis titles
    # from the top-level xaxis/yaxis layout properties.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Project the local-model embeddings onto three t-SNE components (fixed seed for a repeatable layout)
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# 3D scatter plot: one marker per chunk, coloured by document type;
# hovering shows the type and the first 100 characters of the chunk
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    # Same suffix as the 2D local-model plot, so this figure can be told apart
    # from the OpenAI 3D plot drawn earlier in the notebook.
    title='3D Chroma Vector Store Visualization - Local Model',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

# Using these vectors efficiently in our prompt

First run this in a cell: `!pip install langchain-ollama`

Then replace `llm = ChatOpenAI(temperature=0.7, model_name=MODEL)` with:

```python
from langchain_ollama import ChatOllama
llm = ChatOllama(temperature=0.7, model="llama3.2")
```

In [None]:
# create new chat with ollama 
llm=ChatOllama(temperature=0.7, model=OLLAMA_MODEL)

# set up conversation memory for the chat 
memory=Con