In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS

In [2]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [3]:
loader = TextLoader("4. Embedding/super_DS.txt")

In [8]:
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)

In [9]:
docs = text_splitter.split_documents(documents)

In [13]:
ollama_embeddings = OllamaEmbeddings(model="gemma2")
db = FAISS.from_documents(docs, ollama_embeddings)

In [14]:
q1 = "What is EDP?"
query_result = db.similarity_search(q1)
query_result

[Document(metadata={'source': '4. Embedding/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure that 

## Retriever

In [17]:
retriever = db.as_retriever()

In [20]:
docs = retriever.invoke(q1)
docs[0].page_content

'Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure that kind of engineering mindset is applicable to\na lot of our listeners, and

## Similarity Searach With Score

In [21]:
docs_and_score = db.similarity_search_with_score(q1)
docs_and_score

[(Document(metadata={'source': '4. Embedding/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure that

In [22]:
embedding_vector = ollama_embeddings.embed_query(q1)

In [25]:
len(embedding_vector)

3584

In [24]:
embedding_vector[:5]

[-0.8230404853820801,
 -1.8964414596557617,
 -0.14863909780979156,
 -0.6310588121414185,
 -3.0927000045776367]

In [27]:
docs_score = db.similarity_search_by_vector(embedding_vector)

In [28]:
docs_score

[Document(metadata={'source': '4. Embedding/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure that 

## Saving and Loading

In [30]:
db.save_local("faiss_index")

In [33]:
new_db = FAISS.load_local("faiss_index", ollama_embeddings,allow_dangerous_deserialization=True)

In [34]:
docs = new_db.similarity_search(q1)

In [35]:
docs

[Document(metadata={'source': '4. Embedding/super_DS.txt'}, page_content='Show Notes: http://www.superdatascience.com/802 1\nSDS PODCAST\nEPISODE 802:\nIN CASE YOU MISSED\nIT IN JUNE 2024\n    Show Notes: http://www.superdatascience.com/802 2\nJon: 00:02 This is episode number 802, our In Case You Missed it in\nJune episode.\n    00:19 Welcome back to the Super Data Science Podcast. I\'m\nyour host, Jon Krohn. This is an In Case You Missed It\nepisode that highlights the best parts of conversations we\nhad on the show in the last month. This first clip you\'ll\nhear is from my interview with Dr. Jason Yosinski, one of\nmy all-time favorite AI researchers. We had a great\nconversation about making your AI and ML models\nattractive to customers.\n    00:40 In this clip, I got him to speak from his experience as\nCEO of the climate technology startup he founded,\nWindscape AI. This is a great case study if you\'re\nplanning to launch your own AI models commercially.\n00:51 I\'m sure that 

## Chroma DB

In [37]:
# building sample vectorDB

from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [38]:
loader = TextLoader("5. VectorStores/vectorStores.ipynb")
data = loader.load()
data

[Document(metadata={'source': '5. VectorStores/vectorStores.ipynb'}, page_content='{\n "cells": [\n  {\n   "cell_type": "code",\n   "execution_count": 1,\n   "metadata": {},\n   "outputs": [],\n   "source": [\n    "from langchain_community.document_loaders import TextLoader\\n",\n    "from langchain_community.vectorstores import FAISS"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 2,\n   "metadata": {},\n   "outputs": [],\n   "source": [\n    "from langchain_community.embeddings import OllamaEmbeddings\\n",\n    "from langchain_text_splitters import CharacterTextSplitter"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 3,\n   "metadata": {},\n   "outputs": [],\n   "source": [\n    "loader = TextLoader(\\"4. Embedding/super_DS.txt\\")"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 8,\n   "metadata": {},\n   "outputs": [],\n   "source": [\n    "documents = loader.load()\\n",\n    "text_splitter = CharacterTextSplitter(chunk_size=100,

In [39]:
# split

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
splits = text_splitter.split_documents(data)

In [40]:
vectorDB = Chroma.from_documents(documents=splits, embedding=ollama_embeddings)

In [None]:
vectorDB

In [None]:
## Query it

q2 = "How do you get safety into your model?"
docs = vectorDB.similarity_search(q2)

In [None]:
docs[0].page_content

In [None]:
## saving in local

vectordb = Chroma.from_documents(documents=splits, embedding=ollama_embeddings, persist_directory="./chroma_db")

In [None]:
# Load from disk

db2 = Chroma(persist_directory="./chroma_db", embedding_function=ollama_embeddings)

In [None]:
docs = db2.similarity_search(q2)
print(docs[0].page_content)

In [None]:
# similary Search with Score

docs = vectorDB.similarity_search_with_score(q2)
docs

In [None]:
## Retriever 

retriever_chroma = vectorDB.retriever()
retriever_chroma.invoke(q2)[0].page_content