### Visualize the vector embeddings in 3D vector space
- We will use llama-index to parse PDF and embed it and load into vector database (Pinecone)
    - Create a pinecone account (free starter available)
    - Get OpenAI API Key
- Then we will fetch all the vectors from that index
- Perform Semantic Search and visualize its Nearest Neighbors 

#### First step is to create a Pinecone Index

In [20]:
from tqdm.autonotebook import tqdm

#setting up pinecone client
from pinecone import Pinecone, PodSpec
pc = Pinecone(api_key='<Your PINECONE API KEY>')

In [24]:
#create a pinecone index
#we will use openai ada 02 embeddings model which has 1536 dimensions
pc.create_index(
name='visualizeindex', 
dimension=1536, 
metric="cosine", 
spec=PodSpec(environment="gcp-starter")
)

In [3]:
#we can see the index details below
pc.describe_index('visualizeindex')

{'dimension': 1536,
 'host': 'visualizeindex-02a0dcb.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'visualizeindex',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}

In [4]:
#set the index
pinecone_index = pc.Index("visualizeindex")

#### Now we will use Llamaindex to parse a sample pdf, embed it using openai and then load it into Pinecone

In [21]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext
from llama_index.core import ServiceContext
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from openai import OpenAI

In [22]:
#setting openai client
openai = OpenAI(api_key="<Your OpenAI API Key>")

In [23]:
#loading documents
documents = SimpleDirectoryReader("Data/").load_data()
documents[0:5]

[Document(id_='e13ee43b-5031-4974-9b68-cde1618f51b5', embedding=None, metadata={'page_label': '1', 'file_name': 'course-catalog.pdf', 'file_path': 'Data\\course-catalog.pdf', 'file_type': 'application/pdf', 'file_size': 1362746, 'creation_date': '2024-02-17', 'last_modified_date': '2023-11-08', 'last_accessed_date': '2024-02-17'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Databricks\nAcademy\nCourse\nCatalog\nUPDATED:\nOCTOBER\n31,\n2023\nWelcome\nto\nthe\nDatabricks\nAcademy\n3\nAbout\nthe\nDatabricks\nAcademy\n3\nTraining\nOfferings\n4\nLearning\npaths\n5\nDatabricks\nLakehouse\nFundamentals\n5\nData\nanalysis\n6\nData\nengineering\n6\nMachine\nlearning\n7\nPlatform\nadministration\n(cloud\nagnostic)\n8\nPlatform\narchitecture\n-\nAzure\n8\nPlatf

In [24]:
#defining embeddings models from OpenAI
embed_model = OpenAIEmbedding( model = "text-embedding-ada-002", api_key = "<Your OpenAI API Key>")

In [25]:
#now creating embeddings and loading into Pinecone Index
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(llm=None, embed_model = embed_model)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context
)


Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.



LLM is explicitly disabled. Using MockLLM.


Upserted vectors:   0%|          | 0/1068 [00:00<?, ?it/s]

#### Now we will query the database against a Question and fetch top 3 vectors
- Question: What is difference between data engineering and Mlops?

In [26]:
#embed the question for semantic search
response = openai.embeddings.create(
    input="What is difference between data engineering and Mlops?",
    model="text-embedding-ada-002"
)

query_emb = response.data[0].embedding
print("Showing only first 5 dimensions of the query embedding:")
print(query_emb[0:5])

Showing only first 5 dimensions of the query embedding:
[0.008855032734572887, 0.004691744223237038, 0.01623309589922428, -0.04208680987358093, -0.016490550711750984]


In [43]:
#now we will find the similar documents
search = pinecone_index.query(
  vector=query_emb,
  top_k=1068,
  include_values=True
  )

embs = []
for i in range(0,1068):
    embs.append(search.matches[i].values)

#### Now we have the vectors with similarity score against the question
- We will add the query vector as well in the stack
- Reduce dimesions to 3D using tSNE
- Color the question vector to Red, top 3 to Blue and rest of them as gray

In [44]:
#adding the question vector
embs.insert(0,query_emb)

In [45]:
import numpy as np
from sklearn.manifold import TSNE

In [46]:
n_components = 3 #3D
embs = np.array(embs) #converting to numpy array

In [47]:
tsne = TSNE(n_components=n_components, random_state=42, perplexity=5)
reduced_vectors = tsne.fit_transform(embs)
reduced_vectors[0:10]

array([[ 23.58361   , -11.59345   ,  -5.1756334 ],
       [ 23.297134  , -11.580782  ,  -4.0102615 ],
       [ 23.254757  , -11.570523  ,  -3.8742952 ],
       [ 23.258215  , -11.569283  ,  -3.8789215 ],
       [ 22.840782  , -15.323355  ,  -5.9736443 ],
       [ 22.819704  , -15.337939  ,  -5.972782  ],
       [ 22.848036  , -15.325046  ,  -5.9678974 ],
       [ 25.10428   , -16.35287   ,   0.44661796],
       [ 27.081112  , -14.252187  ,  -7.466387  ],
       [ 25.091845  , -16.36631   ,   0.44778484]], dtype=float32)

#### Lets Visualize

In [48]:
len(reduced_vectors)

1069

In [62]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Create a 3D scatter plot
scatter_plot = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color='grey', opacity=0.5, line=dict(color='lightgray', width=1)),
    text=[f"Point {i}" for i in range(len(reduced_vectors))]
)

# Highlight the first point with a different color
highlighted_point = go.Scatter3d(
    x=[reduced_vectors[0, 0]],
    y=[reduced_vectors[0, 1]],
    z=[reduced_vectors[0, 2]],
    mode='markers',
    marker=dict(size=8, color='red', opacity=0.8, line=dict(color='lightgray', width=1)),
    text=["Question"]
    
)

blue_points = go.Scatter3d(
    x=reduced_vectors[1:4, 0],
    y=reduced_vectors[1:4, 1],
    z=reduced_vectors[1:4, 2],
    mode='markers',
    marker=dict(size=8, color='blue', opacity=0.8,  line=dict(color='black', width=1)),
    text=["Top 1 Document","Top 2 Document","Top 3 Document"]
)

# Create the layout for the plot
layout = go.Layout(
    scene=dict(
        xaxis=dict(title='X'),
        yaxis=dict(title='Y'),
        zaxis=dict(title='Z'),
    ),
    title=f'3D Representation after t-SNE (Perplexity=5)'
)


fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'scatter3d'}]])

# Add the scatter plots to the Figure
fig.add_trace(scatter_plot)
fig.add_trace(highlighted_point)
fig.add_trace(blue_points)

fig.update_layout(layout)

pio.write_html(fig, 'interactive_plot.html')
fig.show()