In [65]:
from dotenv import load_dotenv
import os
# Load settings from a .env file into the process environment; the recorded output
# `True` below is load_dotenv()'s return value.
load_dotenv()

True

In [2]:
# Read the Azure OpenAI settings from the process environment (load_dotenv() runs in
# the first cell). Any variable that is not defined comes back as None.
# Note that the endpoint is stored under the key "AZURE_ENDPOINT" in the environment.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
AZURE_REGION = os.environ.get("AZURE_REGION")
MODEL_NAME = os.environ.get("MODEL_NAME")

In [4]:
# Export the Azure credentials as AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT.
# The embedding client below is built without explicit credentials, so it presumably
# picks these up from the environment — confirm.
#
# os.environ only accepts str values: if a variable is missing from .env the value is
# None and the assignment would fail with an opaque "str expected, not NoneType"
# TypeError. Fail early with a message that names the missing variable instead.
_missing = [
    env_name
    for env_name, value in (
        ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
        ("AZURE_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    )
    if value is None
]
if _missing:
    raise ValueError(
        "Missing required environment variable(s): " + ", ".join(_missing)
    )

os.environ["AZURE_OPENAI_API_KEY"] = AZURE_OPENAI_API_KEY
os.environ["AZURE_OPENAI_ENDPOINT"] = AZURE_OPENAI_ENDPOINT

In [5]:
from langchain_openai import AzureOpenAIEmbeddings

In [7]:
# Azure OpenAI embedding client. No key or endpoint is passed here, so it presumably
# relies on the AZURE_OPENAI_* environment variables exported in the previous cell — confirm.
# NOTE(review): MODEL_NAME is read from the environment above, but the model name is
# hard-coded here — confirm which one is intended.
embeddings = AzureOpenAIEmbeddings(model="text-embedding-ada-002")

In [9]:
# Smoke test: embed one short string and keep the resulting vector for inspection.
sample_text = "I am testing text to embedding"
query_result = embeddings.embed_query(sample_text)

In [10]:
# Dimensionality of the returned vector (1536 in the recorded run).
len(query_result)

1536

In [11]:
# Peek at the first five components rather than echoing the whole vector.
query_result[:5]

[-0.04333774000406265,
 -0.0007516933837905526,
 -0.013608371838927269,
 0.0007650940679013729,
 0.014110896736383438]

In [14]:
from langchain_community.document_loaders import TextLoader

# Read the podcast transcript into a list of Document objects.
# The encoding is pinned so the load does not depend on the platform's default text
# encoding, which is what is used when no encoding is given.
loader = TextLoader('2. DataIngestion/super_DS.txt', encoding="utf-8")
text_docs = loader.load()

In [15]:
# Display the loaded documents.
# NOTE(review): this echoes the entire transcript into the cell output; a short preview
# such as text_docs[0].page_content[:500] would keep the notebook readable.
text_docs

[Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure t

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [20]:
# Splitter configured for chunks of at most 100 characters with a 20-character overlap.
# NOTE(review): the variable name is misspelled ("spitter"); it is left unchanged here
# because the next cell refers to it by this name.
text_spitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

In [25]:
# Split the loaded transcript into chunks; in the recorded output each chunk keeps the
# original 'source' metadata.
docs = text_spitter.split_documents(text_docs)

In [26]:
# Display the chunks.
# NOTE(review): this prints every chunk; docs[:5] would be enough to inspect the split.
docs

[Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED'),
 Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='IN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2'),
 Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='Jon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.'),
 Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content="June episode.\n    00:19 Welcome back to the Super Data Science Podcast. I'm"),
 Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='your host, Jon Krohn. This is an In Case You Missed It'),
 Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='episode that highlights the best parts of conversations we'),
 Document(metadata={'source': '2. DataIngestion/supe

In [29]:
from langchain_community.vectorstores import Chroma

In [53]:
# Embed every chunk and index it in a Chroma vector store.
# Each chunk gets a deterministic id (source path + position). When no ids are passed
# the store generates new ones on every call, so re-running this cell in the same
# kernel presumably appends a second copy of every chunk — the recorded search output
# further down, where the top two hits are the same chunk, is consistent with that.
# With stable ids a re-run writes to the same entries instead of adding new ones.
chunk_ids = [
    f"{doc.metadata.get('source', 'doc')}:{position}"
    for position, doc in enumerate(docs)
]
db = Chroma.from_documents(docs, embeddings, ids=chunk_ids)

In [54]:
# Show the store object's repr to confirm it was created.
db

<langchain_community.vectorstores.chroma.Chroma at 0x7f25a0192da0>

In [55]:
# Retrieve the chunks most similar to a natural-language question. No result count is
# passed, so the store's default applies (four documents in the recorded output).
query = "Who is CEO of the climate technology startup?"
retrieved_results = db.similarity_search(query)

In [56]:
# Print the raw list of retrieved Document objects.
# NOTE(review): in the recorded output the first two hits are identical chunks.
print(retrieved_results)

[Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='CEO of the climate technology startup he founded,'), Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='CEO of the climate technology startup he founded,'), Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='might go is to pilot our technology, start working with'), Document(metadata={'source': '2. DataIngestion/super_DS.txt'}, page_content='recently selected Windscape as one of nine startups for its')]


## Ollama Embeddings

In [57]:
from langchain_community.embeddings import OllamaEmbeddings

In [58]:
# Embedding client backed by an Ollama server, using the "gemma2" model.
# NOTE(review): gemma2 looks like a general-purpose generative model rather than a
# dedicated embedding model — confirm this choice is intentional.
ollama_embeddings = OllamaEmbeddings(model="gemma2")

In [59]:
# Show the client's configuration (base URL, model, instructions and sampling options).
ollama_embeddings

OllamaEmbeddings(base_url='http://localhost:11434', model='gemma2', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [60]:
# Three short statements to embed in one batch; r1 holds one vector per sentence.
ai_hierarchy_sentences = [
    "Machine Learning is subset of AI",
    "Deep Learning is subset of Machine Learning",
    "Generative AI is subset of Deep Learning",
]
r1 = ollama_embeddings.embed_documents(ai_hierarchy_sentences)

In [62]:
# Dimensionality of the first document's embedding (3584 in the recorded run).
len(r1[0])

3584

In [63]:
# First five components of the first document's embedding.
r1[0][:5]

[1.3644047975540161,
 -2.029574155807495,
 0.8655009865760803,
 0.20633915066719055,
 0.13833169639110565]

In [64]:
# Embed a single query string with the Ollama model.
# NOTE(review): the bare call echoes the whole vector into the cell output; assigning
# the result and showing a slice (e.g. the first five values) would be easier to read.
ollama_embeddings.embed_query("What is Fine Tuning in LLM?")

[0.2738792598247528,
 -4.164581775665283,
 0.13975514471530914,
 -1.8949103355407715,
 -0.4075901210308075,
 -0.7781814932823181,
 1.095896601676941,
 -0.1486121416091919,
 0.08774778246879578,
 1.502740502357483,
 -1.679840087890625,
 -0.218009814620018,
 -1.8676831722259521,
 1.6254539489746094,
 -0.5945742726325989,
 -2.2157199382781982,
 -0.432412326335907,
 -1.1882922649383545,
 0.5471755266189575,
 -0.5567122101783752,
 1.947269082069397,
 2.0280091762542725,
 1.109502911567688,
 -4.09214973449707,
 3.0050854682922363,
 -1.4610167741775513,
 0.8770132064819336,
 -1.7370434999465942,
 1.252832293510437,
 -2.20581316947937,
 -1.2523497343063354,
 0.6333433389663696,
 0.458801805973053,
 -0.5503769516944885,
 0.06268345564603806,
 -0.18938124179840088,
 1.6511965990066528,
 -0.621436595916748,
 -0.4570638835430145,
 -0.2568209767341614,
 -0.14027206599712372,
 -0.5444285273551941,
 0.11933310329914093,
 -0.4465180039405823,
 -1.0265681743621826,
 2.3076648712158203,
 1.8012589216232

## Hugging Face Embeddings

In [67]:
# Load .env again.
# NOTE(review): load_dotenv() is already called in the first cell; presumably this was
# re-run after the Hugging Face token was added to .env — confirm, and drop if redundant.
load_dotenv()

True

In [68]:
# Expose the Hugging Face token under the HF_TOKEN environment variable.
# os.environ only accepts str values, so assigning os.getenv(...) directly raises a
# TypeError when HUGGINGFACE_API_TOKEN is not defined. Export it only when present
# and say so when it is not.
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("HUGGINGFACE_API_TOKEN is not set; HF_TOKEN left unchanged.")

In [70]:
from langchain_huggingface import HuggingFaceEmbeddings

In [71]:
# Embedding client for the sentence-transformers/all-MiniLM-L6-v2 model; the recorded
# run downloaded the model files (see the progress output below).
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [72]:
# Embed one sentence with the Hugging Face model and print the vector length
# (384 in the recorded run).
# NOTE(review): query_result rebinds the name that held the Azure embedding earlier in
# the notebook, so re-running earlier display cells would now show this vector.
text = "I am trying to use huggingface embeedings"
query_result = hf_embeddings.embed_query(text)
print(len(query_result))

384


In [73]:
# First five components of the Hugging Face embedding.
query_result[:5]

[-0.05429377779364586,
 0.019458668306469917,
 0.021442962810397148,
 0.006438622251152992,
 -0.01802055723965168]