# Data Embeddings

- Converting **Text** into **Vectors** 

## 1. Ollama Embeddings

- Download Ollama - https://ollama.com/download/
- Embedding models - https://ollama.com/blog/embedding-models
- Ollama supports embedding models, making it possible to build retrieval augmented generation (RAG) application that combine text prompts with existing documents or other data.

In [1]:
from langchain_community.embeddings import OllamaEmbeddings
ollama_embeddings = OllamaEmbeddings(model="gemma2:2b") # Default -> Llama2
ollama_embeddings

  ollama_embeddings = OllamaEmbeddings(model="gemma2:2b") # Default -> Llama2


OllamaEmbeddings(base_url='http://localhost:11434', model='gemma2:2b', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [2]:
text_list = [
    "I am a sentence for which I would like to get its embedding.",
    "And I am another sentence.",
]

In [3]:
embedding1 = ollama_embeddings.embed_documents(text_list)
embedding1

[[1.0798555612564087,
  -0.4122850000858307,
  -1.9241951704025269,
  0.07925327122211456,
  0.859781801700592,
  1.7996304035186768,
  0.12912537157535553,
  0.2891291379928589,
  3.2795467376708984,
  -1.7216293811798096,
  -1.24277925491333,
  1.9579665660858154,
  -3.1202149391174316,
  0.027284879237413406,
  -0.043146874755620956,
  -0.5677456855773926,
  -2.4885504245758057,
  0.8865227103233337,
  -1.3027304410934448,
  0.07325530797243118,
  2.186525821685791,
  -0.446972519159317,
  -1.0450876951217651,
  1.4218370914459229,
  -0.01624675653874874,
  0.8433957099914551,
  -0.836681067943573,
  -0.8123200535774231,
  -2.1960012912750244,
  0.5433127284049988,
  0.6828920841217041,
  -0.026942258700728416,
  -2.325800657272339,
  0.5635003447532654,
  0.21868328750133514,
  0.24671734869480133,
  -1.1509162187576294,
  0.5304752588272095,
  2.150618076324463,
  -2.8590614795684814,
  0.14348046481609344,
  -1.1721694469451904,
  0.9675548076629639,
  -1.7019990682601929,
  1.67

In [4]:
ollama_embeddings.embed_query("What is the embedding of this sentence?")

[0.9740376472473145,
 -2.374103307723999,
 -2.9420180320739746,
 1.228000521659851,
 -0.3235505223274231,
 1.0026825666427612,
 -1.0559073686599731,
 -0.6491212844848633,
 4.876015663146973,
 -2.5531275272369385,
 -1.0996161699295044,
 1.868201494216919,
 -3.4991238117218018,
 2.3828094005584717,
 0.3516286015510559,
 0.6021966934204102,
 -0.24978354573249817,
 0.527446985244751,
 -0.9959125518798828,
 2.2094967365264893,
 3.2360496520996094,
 -1.4906089305877686,
 0.5334116816520691,
 0.3454529643058777,
 -0.3035808503627777,
 -0.32121437788009644,
 0.5400655269622803,
 0.8458496928215027,
 -1.7896784543991089,
 -0.11719764024019241,
 -0.4959355294704437,
 -0.03511287271976471,
 -1.4144530296325684,
 0.8332565426826477,
 0.9241091012954712,
 0.044909149408340454,
 -1.257499098777771,
 0.0691598653793335,
 1.1373660564422607,
 -1.5351136922836304,
 -0.8412286043167114,
 -0.9912355542182922,
 -0.5282731056213379,
 -1.0062191486358643,
 1.8372026681900024,
 -2.54734468460083,
 1.52834355

## 2. HuggingFace Embeddings

In [5]:
import os
from dotenv import load_dotenv 
load_dotenv()

HF_TOKENS = os.getenv("HUGGINGFACE_API_KEY")
os.environ["HF_TOKENS"] = HF_TOKENS

### Sentence Transformer on HuggingFace
- HuggingFace sentence transformer is a Python Framework for state-of-the-art sentence, text and image embeddings.
- We have also added an alias for SentenceTransformerEmbeddinfs for users who are more familiar with directly using that package.

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings 
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") 

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
text = "this a test sentence"
embed_result = hf_embeddings.embed_query(text)
embed_result

[0.051118314266204834,
 0.09269876033067703,
 0.0333590991795063,
 0.10514255613088608,
 0.009567661210894585,
 0.006538490764796734,
 0.07584372907876968,
 -0.030718442052602768,
 0.035187602043151855,
 0.0375276543200016,
 0.1182112842798233,
 -0.061565104871988297,
 -0.023134702816605568,
 0.01695535145699978,
 0.041094742715358734,
 -0.01277181226760149,
 0.00692750234156847,
 -0.022520096972584724,
 -0.07784253358840942,
 0.03591591864824295,
 0.02132968045771122,
 0.0393342450261116,
 0.0017822002992033958,
 0.024385783821344376,
 -0.022503573447465897,
 0.05186379328370094,
 -0.05893256515264511,
 0.007268192246556282,
 0.07900568842887878,
 -0.04176861420273781,
 -0.028312405571341515,
 0.0008944515138864517,
 0.04300723969936371,
 0.07693148404359818,
 0.037935368716716766,
 0.010660264641046524,
 0.04000617936253548,
 0.023425394669175148,
 0.014613120816648006,
 0.0054117231629788876,
 0.013869193382561207,
 -0.14571858942508698,
 0.019580641761422157,
 0.016332358121871948,

In [8]:
## dimentions
len(embed_result) # 343 dimensions

384

## 3. OpenAI Embeddings

In [9]:
import os
from dotenv import load_dotenv 
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [10]:
from langchain_openai import OpenAIEmbeddings
openai_embeddings = OpenAIEmbeddings(model = "text-embeddings-3-large", dimensions=1024) # Creates embeddings with 1024 dimensions -> By default it is 3072 dimensions
openai_embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x17871f530>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x178c47470>, model='text-embeddings-3-large', dimensions=1024, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [11]:
text = "This text is for OpenAI embeddings"
open_ai_vectors = openai_embeddings.embed_query(text)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: "" # htt***************************************************keys. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}