## Embedding Techniques
Convert text into vectors

In [2]:
import os

from dotenv import load_dotenv

# Pull the variables defined in the project's .env file into the process
# environment. The call is left as the cell's last expression, so the notebook
# echoes the boolean it returns.
load_dotenv()

True

## OpenAI

In [3]:
import warnings

# Re-export the OpenAI key into the process environment.
# os.getenv() returns None when the variable is missing, and os.environ only
# accepts str values, so an unguarded assignment would fail with the opaque
# "TypeError: str expected, not NoneType". Check first and report the real
# problem instead.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if _openai_api_key is None:
    warnings.warn(
        "OPENAI_API_KEY is not set; add it to your .env file or shell "
        "environment before running the OpenAI embedding cells."
    )
else:
    os.environ["OPENAI_API_KEY"] = _openai_api_key

In [4]:
from langchain_openai import OpenAIEmbeddings

# Build the OpenAI embedding client for the "text-embedding-3-large" model.
# No key is passed explicitly, so the client is expected to rely on the
# OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Bare expression: show the client's repr (the API key is masked in it).
embeddings



OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x102abeb50>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x108f3ae20>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
# Embed a single string with whichever `embeddings` object is currently bound
# (the OpenAI client when the notebook is run top to bottom).
# NOTE(review): this cell has no execution count, i.e. it was not run in the
# saved notebook, and the resulting vector is stored but never displayed.
text = "This is a tutorial on OPENAI embedding"
query_results = embeddings.embed_query(text)

## Ollama
Ollama supports embedding models, making it possible to build retrieval-augmented generation (RAG) applications that combine text prompts with existing documents or other data.

In [7]:
from langchain_community.embeddings import OllamaEmbeddings

In [8]:
# Rebind `embeddings` to an Ollama-backed client. The model is named
# explicitly because the class falls back to llama2 when `model` is omitted.
embeddings = OllamaEmbeddings(
    model="gemma:2b",
)

In [9]:
# Show the client's repr to inspect its configuration (base URL, model name,
# instruction prefixes and sampling options).
embeddings

OllamaEmbeddings(base_url='http://localhost:11434', model='gemma:2b', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [10]:
# Embed two short documents in a single call; the result holds one vector
# (a list of floats) per input text, in the same order as the inputs.
r1 = embeddings.embed_documents(
    [
        "Alpha is the first letter of greek alphabet",
        "Beta is the second letter of greek alphabet",
    ]
)

# Bare expression: display the nested list of embedding values.
r1

[[-2.7003536224365234,
  -1.0473356246948242,
  0.17994165420532227,
  2.663728952407837,
  -0.28092509508132935,
  0.8820974826812744,
  -0.17843374609947205,
  0.3314308226108551,
  0.7458213567733765,
  -1.2317312955856323,
  0.7281075119972229,
  0.7715346813201904,
  0.5498905181884766,
  -0.5613488554954529,
  -0.6415230631828308,
  0.0058078705333173275,
  2.715690851211548,
  -1.2191450595855713,
  0.8078550100326538,
  0.17958705127239227,
  0.3550061881542206,
  -0.2722943127155304,
  0.8378119468688965,
  0.5964418053627014,
  -0.5585238337516785,
  -1.1163105964660645,
  -0.7309419512748718,
  -0.2126338630914688,
  0.2332524210214615,
  1.200757384300232,
  -0.20445947349071503,
  -0.12089765071868896,
  0.7731602191925049,
  0.5546122789382935,
  -1.6836470365524292,
  -0.20944885909557343,
  -0.8625937700271606,
  0.5442084074020386,
  0.8873997330665588,
  0.49008849263191223,
  1.5818431377410889,
  1.069983720779419,
  1.4978220462799072,
  -0.23023520410060883,
  -0.

In [11]:
# Dimensionality of the first document's embedding vector
# (2048 in the recorded run with gemma:2b).
len(r1[0])

2048

In [12]:
# Embed a single query string; the result is one flat list of floats and is
# displayed directly rather than stored in a variable.
embeddings.embed_query("What is the second letter of greek alphabet?")

[-2.723020553588867,
 0.04448791220784187,
 -2.050244092941284,
 0.09286335855722427,
 -0.806308388710022,
 0.07298683375120163,
 -1.4879645109176636,
 -0.9441307783126831,
 0.10854365676641464,
 -1.5524121522903442,
 0.47428104281425476,
 -0.37064170837402344,
 1.2750109434127808,
 0.955312967300415,
 -0.9475001692771912,
 0.41135916113853455,
 8.045516014099121,
 -0.9787219762802124,
 0.2880857288837433,
 -0.5060679316520691,
 0.7924796342849731,
 -0.7283662557601929,
 2.1703481674194336,
 0.30023637413978577,
 -2.308462619781494,
 -1.2059979438781738,
 0.3655410408973694,
 -0.41892531514167786,
 -0.9990905523300171,
 -0.5528619289398193,
 -1.8181380033493042,
 1.4809050559997559,
 1.2045971155166626,
 0.7968412041664124,
 -0.4050234854221344,
 -1.0517534017562866,
 -1.466113567352295,
 0.031236393377184868,
 -0.16238832473754883,
 0.5751945376396179,
 -0.4457736015319824,
 1.65984046459198,
 -1.7393049001693726,
 -2.5081934928894043,
 -2.8452370166778564,
 0.21070006489753723,
 1.97

## Embedding techniques using Hugging Face

In [13]:
import warnings

# Re-export the Hugging Face token into the process environment.
# os.getenv() returns None when the variable is missing, and os.environ only
# accepts str values, so an unguarded assignment would abort the notebook with
# "TypeError: str expected, not NoneType". Only export the token when it is
# actually present.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token is None:
    warnings.warn(
        "HF_TOKEN is not set; continuing without Hugging Face authentication "
        "(gated or private models will not be downloadable)."
    )
else:
    os.environ["HF_TOKEN"] = _hf_token

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings

# Rebind `embeddings` to a Hugging Face client that uses the
# "all-MiniLM-L6-v2" model.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from tqdm.autonotebook import tqdm, trange


In [16]:
# Embed one sentence with the currently bound `embeddings` client and keep
# both the input text and the resulting vector for the cells that follow.
text = "This is a test document"
query_results = embeddings.embed_query(text)
# Bare expression: display the embedding as a flat list of floats.
query_results

[-0.04820787161588669,
 0.11789613962173462,
 -0.03746980428695679,
 0.056620530784130096,
 0.015501742251217365,
 -0.036749307066202164,
 -0.059571523219347,
 0.057209160178899765,
 -0.020756414160132408,
 0.057084690779447556,
 0.07765142619609833,
 0.018936708569526672,
 0.0006362255080603063,
 -0.00040406998596154153,
 -0.06529416888952255,
 -0.028550060465931892,
 -0.011813553981482983,
 -0.04569051414728165,
 -0.007525664754211903,
 0.08929041773080826,
 0.0531037375330925,
 0.06305599212646484,
 -0.0045524477027356625,
 0.0003609011182561517,
 0.008460184559226036,
 0.030092841014266014,
 -0.06308867037296295,
 0.03805892914533615,
 0.08158417046070099,
 -0.05816444009542465,
 0.032057542353868484,
 0.06851987540721893,
 0.07797114551067352,
 0.0341692715883255,
 0.0647691935300827,
 0.004228704143315554,
 0.07276935130357742,
 0.0025179816875606775,
 0.03840572014451027,
 0.03519796207547188,
 -0.01765647903084755,
 -0.11470238119363785,
 0.009409711696207523,
 0.03478093445301

In [17]:
# Dimensionality of the query embedding (384 in the recorded run).
len(query_results)

384

In [20]:
# Embed the earlier `text` plus a second sentence as a batch; one vector is
# returned per input. In the recorded run the first vector matches
# `query_results` up to small floating-point differences.
docs_result = embeddings.embed_documents(
    [
        text,
        "This is not a test document.",
    ]
)

# Bare expression: display the nested list of embedding values.
docs_result

[[-0.04820790886878967,
  0.11789615452289581,
  -0.03746984899044037,
  0.0566205158829689,
  0.015501810237765312,
  -0.03674936294555664,
  -0.059571556746959686,
  0.05720925331115723,
  -0.02075641229748726,
  0.05708468332886696,
  0.07765144109725952,
  0.01893671602010727,
  0.0006362652638927102,
  -0.00040411201189272106,
  -0.06529419124126434,
  -0.0285500455647707,
  -0.0118135716766119,
  -0.04569047689437866,
  -0.007525731343775988,
  0.08929037302732468,
  0.05310367792844772,
  0.06305602192878723,
  -0.004552455153316259,
  0.0003609296109061688,
  0.008460155688226223,
  0.030092846602201462,
  -0.06308867037296295,
  0.03805895149707794,
  0.08158419281244278,
  -0.058164458721876144,
  0.03205755352973938,
  0.06851985305547714,
  0.07797118276357651,
  0.03416924551129341,
  0.06476929038763046,
  0.004228707868605852,
  0.07276921719312668,
  0.0025179411750286818,
  0.03840567171573639,
  0.03519795835018158,
  -0.017656544223427773,
  -0.11470235139131546,
  0