## Vectorizing and Embedding

In [52]:
# import and run all cells of the preprocessing file

# import nbformat

# from IPython.core.interactiveshell import InteractiveShell

# with open('data-preprocessing.ipynb') as f:
#     nb = nbformat.read(f, as_version=4)

# shell = InteractiveShell.instance()
# for cell in nb.cells:
#     if cell.cell_type == 'code':
#         shell.run_cell(cell.source)

In [17]:
from dotenv import load_dotenv, find_dotenv
import os
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')

In [None]:
#retrieve dataframe
import pandas as pd
dataframe = pd.read_pickle("preprocessed_dataframe_new.pkl")

print(dataframe["text"])


0       planet become interesting moon\nbecome place g...
1       hand told american\ntrained speak russian\nrus...
2       day okay remember\nkid like hurricane thats pr...
3       driving around submarine ice\nanarctica prepar...
4       10yearolds eye\ncamera put\nhumankind could im...
                              ...                        
1048    bobble around theyre like buoy\nocean yes they...
1049    love look face thank\nhe looking back forth\nl...
1050    really one force magical\nsomething nuclear ra...
1051    thing supposed around\ngravity point crumb\ncr...
1052    like albert einstein\nlaying physical idea\nun...
Name: text, Length: 1053, dtype: object


In [53]:
# %pip install langchain-openai

### Initialize embedding model

In [None]:
from langchain_openai import OpenAIEmbeddings

#initialize embedding model
model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key="OPENAI_API_KEY"
)

#Embedding the text

#Extract the actual text content from each Document
texts = [doc for doc in dataframe["text"]]

#embed the text
res = embed.embed_documents(texts, chunk_size=100) 
len(res), len(res[0])

(1053, 1536)

In [39]:
type(texts)

print(texts)



In [None]:
#initializing Pinecone vector database
import getpass
import os
from pinecone import Pinecone, ServerlessSpec

if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# configure client
pc = Pinecone(api_key=pinecone_api_key)

#define the cloud provider and region where we want to deploy our index
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [None]:
import time
#initialize the index.
index_name = 'langchain-retrieval-augmentation'

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
#create index if does not already exist
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [None]:
# Import tqdm for displaying a progress bar during iteration
from tqdm.auto import tqdm
# Import uuid4 to generate unique IDs for each text chunk
from uuid import uuid4

# Set the maximum number of chunks to process in one batch
batch_limit = 100

# Initialize a list to hold text chunks before embedding
text_chunks= []
# Initialize a list to hold metadata for each text chunk
metadatas = []

# Loop over each record in the dataset with a progress bar
for i, text in enumerate(tqdm(texts)):
    metadata = {"chunk": i}

    # Split text into chunks
    record_texts = text_splitter.split_text(text)

    record_metadatas = [{"chunk": j, **metadata} for j, _ in enumerate(record_texts)]

    text_chunks.extend(record_texts)
    metadatas.extend(record_metadatas)

    if len(text_chunks) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(text_chunks))]
        embeds = embed.embed_documents(text_chunks)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        text_chunks = []
        metadatas = []


# After processing all records, if any chunks are left, upload them too
if len(text_chunks) > 0:
    # Generate unique IDs for the remaining text chunks
    ids = [str(uuid4()) for _ in range(len(text_chunks))]
    # Embed the remaining text chunks
    embeds = embed.embed_documents(text_chunks)
    # Upload the remaining vectors with IDs and metadata
    index.upsert(vectors=zip(ids, embeds, metadatas))

#check number of vectors in index:
index.describe_index_stats()

100%|██████████| 1053/1053 [01:26<00:00, 12.24it/s]


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1053}},
 'total_vector_count': 1053,
 'vector_type': 'dense'}

In [50]:
# print(text_chunks)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1053}},
 'total_vector_count': 1053,
 'vector_type': 'dense'}

### Creating a vector store and querying

In [54]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embed, text_key="texts")

query = "What is the meaning of life?"

results = vector_store.similarity_search(query, k=3)

Found document with no `texts` key. Skipping.
Found document with no `texts` key. Skipping.
Found document with no `texts` key. Skipping.
