In [None]:
import os
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment.
# The call is the cell's last expression, so its return value is what the
# notebook displays underneath.
load_dotenv() # Load environment variables from .env file

True

In [4]:
# Fail fast, with an actionable message, when the OpenAI key is unavailable.
# Copying os.getenv("OPENAI_API_KEY") back into os.environ would raise an opaque
# "TypeError: str expected, not NoneType" when the variable is missing, and is a
# no-op when it is present (os.getenv reads from os.environ), so only the
# presence check is needed here.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [5]:
from langchain_openai import OpenAIEmbeddings

# Embedding client for the "text-embedding-3-small" model, with every other
# option left at its default.
openai_embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Bare last expression so the notebook renders the configured model's repr.
openai_embedding_model

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x110f3b160>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x111862350>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [6]:
# Sample sentence used as the embedding input in this cell and reused later on.
text = "My name is Shivang Singh, I am a Data Scientist at Publicis Sapient"

# Embed the single sentence; the result is a list of floats (see the cell output).
vector_embeddings = openai_embedding_model.embed_query(text)
# NOTE(review): this bare expression renders every component of the vector in
# the cell output; consider displaying a slice such as vector_embeddings[:5].
vector_embeddings

[0.014153925701975822,
 -0.011937323957681656,
 0.015793288126587868,
 0.005391421727836132,
 0.013391968794167042,
 -0.01267619151622057,
 -0.027661342173814774,
 0.028838912025094032,
 -0.019025832414627075,
 -0.07850464433431625,
 -0.02514457516372204,
 -0.04271576181054115,
 -0.04641009867191315,
 -0.00370588107034564,
 0.029416153207421303,
 -0.006326550617814064,
 0.04287739098072052,
 -0.021831218153238297,
 -0.03862890228629112,
 0.008139084093272686,
 0.028838912025094032,
 0.011602524667978287,
 0.019429899752140045,
 0.026021981611847878,
 0.005590569693595171,
 -0.014338642358779907,
 -0.03666628897190094,
 0.013461237773299217,
 0.019903237000107765,
 -0.032602518796920776,
 0.010586582124233246,
 -0.02143869549036026,
 -0.040660787373781204,
 -0.010609671473503113,
 0.007948595099151134,
 0.023689931258559227,
 -0.009230067953467369,
 -0.016924677416682243,
 -0.003030510153621435,
 -0.0006818647962063551,
 0.012618467211723328,
 -0.006713301874697208,
 0.03971411287784576

In [11]:
# Print how many components the embedding has, then show the first component
# as the cell's displayed result.
embedding_length = len(vector_embeddings)
print(embedding_length)
vector_embeddings[0]

1536


0.014153925701975822

In [13]:
# The embedding size can be customised: request 1024-dimensional vectors
# from the same model.
custom_embedding_model_1024 = OpenAIEmbeddings(
    dimensions=1024,
    model="text-embedding-3-small",
)
custom_vector_embeddings_1024 = custom_embedding_model_1024.embed_query(text)

# Confirm the requested size and peek at the first component.
print(len(custom_vector_embeddings_1024))
print(custom_vector_embeddings_1024[0])

1024
0.01642787456512451


In [14]:
# Let's do the same for an entire document.
# Text loader
from langchain_community.document_loaders import TextLoader

# Pin the encoding explicitly: without it the file is decoded with the
# platform's default encoding, and the speech contains non-ASCII characters
# (e.g. an em dash) that would be garbled on systems whose default is not UTF-8.
# NOTE(review): assumes speech.txt is stored as UTF-8 — confirm.
text_loader = TextLoader('speech.txt', encoding='utf-8')
text_loader

<langchain_community.document_loaders.text.TextLoader at 0x111e656c0>

In [16]:
# Text splitting: load the document, then cut it into overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

documents = text_loader.load()

# Chunks of up to 1000 characters, with 100 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
splitter_documents = text_splitter.split_documents(documents)

# Number of chunks produced.
print(len(splitter_documents))

5


In [None]:
# Create embeddings for the chunks and store them in a Chroma vector database.
# Import Chroma from langchain_community directly: the `langchain.vectorstores`
# path is a deprecated re-export of this same class, and the rest of the
# notebook already imports from langchain_community.
from langchain_community.vectorstores import Chroma
vector_db = Chroma.from_documents(
    splitter_documents,
    custom_embedding_model_1024
)
# Inputs: the split documents and the custom 1024-dimension embedding model.
vector_db

<langchain_community.vectorstores.chroma.Chroma at 0x117a308b0>

In [None]:
# Query the vector database; it returns the stored documents most similar to
# the query based on their embeddings.
query = "What is the speech about?"
# k=3 asks for the three closest matches.
results = vector_db.similarity_search(query, k=3)
# NOTE(review): this prints the raw Document list (metadata plus full text) and
# the loop below prints the same text again; consider keeping only one of them.
print(results)
# Print just the text content of each returned document.
for result in results:
    print(result.page_content)

[Document(metadata={'source': 'speech.txt'}, page_content='It is a distressing and oppressive duty, gentlemen of the Congress, which I have performed in thus addressing you. There are, it may be, many months of fiery trial and sacrifice ahead of us. It is a fearful thing to lead this great peaceful people into war, into the most terrible and disastrous of all wars, civilization itself seeming to be in the balance. But the right is more precious than peace, and we shall fight for the things which we have always carried nearest our hearts—for democracy, for the right of those who submit to authority to have a voice in their own governments, for the rights and liberties of small nations, for a universal dominion of right by such a concert of free peoples as shall bring peace and safety to all nations and make the world itself at last free.'), Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested found