### Embedding (OpenAI Embeddings)

In [1]:
import os

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x0000028C205157C0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x0000028C22A55F40>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [5]:
text = "This is a tutorial on OpenAI Embeddings"

query_result = embeddings.embed_query(text)
query_result

[-0.003737016813829541,
 0.03273262828588486,
 -0.01847485825419426,
 -0.0452834814786911,
 0.025428030639886856,
 -0.010536442510783672,
 0.011904485523700714,
 0.04804467037320137,
 -0.007568165194243193,
 0.003316563321277499,
 -0.020884621888399124,
 0.013441964983940125,
 -0.008201982825994492,
 -0.011873108334839344,
 0.008245911449193954,
 0.011879383586347103,
 0.016906000673770905,
 0.03918376564979553,
 -0.028390033170580864,
 -0.023972131311893463,
 0.013002685271203518,
 0.01937851868569851,
 -0.055273961275815964,
 -0.04882282391190529,
 0.022541334852576256,
 -0.025428030639886856,
 0.013818490318953991,
 0.04099109023809433,
 -0.00352679006755352,
 0.051358096301555634,
 0.008616161532700062,
 -0.017106814309954643,
 0.004261014983057976,
 -0.004270428325980902,
 -0.021662773564457893,
 0.022629190236330032,
 0.050128113478422165,
 0.03597074747085571,
 -0.05138319730758667,
 0.026783522218465805,
 0.02911798097193241,
 0.008904831483960152,
 0.00040515727596357465,
 -0.

In [7]:
print(query_result[0])
print(len(query_result))

-0.003737016813829541
3072


### Take Chunks of Documents and Convert to Vectors:

In [11]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('./../3-DataIngestion/attention.pdf')
docs = loader.load()
docs

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
final_docs = text_splitter.split_documents(docs)

In [14]:
final_docs[0:3]

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

### Apply the Embedding Technique to create Vector Store:

In [None]:
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(final_docs, embeddings)
db

<langchain_community.vectorstores.chroma.Chroma at 0x28c44914f50>

In [None]:
# Similarity Search

query = "encoder-decoder attention"
retrieved_results = db.similarity_search(query)
print(retrieved_results)

[Document(metadata={'book': 'Advances in Neural Information Processing Systems 30', 'lastpage': '6008', 'editors': 'I. Guyon and U.V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett', 'type': 'Conference Proceedings', 'publisher': 'Curran Associates, Inc.', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'creationdate': '', 'published': '2017', 'description': 'Paper accepted and presented at the Neural Information Processing Systems Conference (http://nips.cc/)', 'total_pages': 11, 'language': 'en-US', 'created': '2017', 'page_label': '5', 'producer': 'PyPDF2', 'eventtype': 'Poster', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'date': '2017', 'firstpage': '5998', 'moddate': '2018-02-12T21:22:10-08:00', 'creator': 'PyPDF', 'source': './../3-DataIngestion/attention.pdf', 'title': 'Attention is All you Need', 'description-abstract': 'The domin