# 2.3 Vectorstores and Embeddings - part 2

## Using other embedding models

## Setup

### Install dependencies

In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# Every package is pinned with a compatible-release (~=) specifier.
# NOTE(review): docarray and pypdf are not imported by the cells visible in this
# notebook -- presumably they are needed by sibling notebooks; confirm before removing.
%pip install python-dotenv~=1.0 docarray~=0.40.0 pypdf~=5.1 --upgrade --quiet
# Vector store (chromadb) and the backend for local Hugging Face embedding models.
%pip install chromadb~=0.5.18 sentence-transformers~=3.3 --upgrade --quiet 
# LangChain core plus the OpenAI, community and Hugging Face integration packages.
%pip install langchain~=0.3.7 langchain_openai~=0.2.6 langchain_community~=0.3.5 langchain-huggingface~=0.1.2 --upgrade --quiet
# Markdown support for the UnstructuredMarkdownLoader used when loading the listings.
%pip install unstructured[md]~=0.16.5 --upgrade --quiet

# If running locally, you can do this instead:
#%pip install -r ../requirements.txt

### Load environment variables

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from a .env file (located by find_dotenv) into the process
# environment. The boolean result is bound to `_` so that the cell does not
# echo it as its output.
_ = load_dotenv(find_dotenv())

# If running in Google Colab, you can use this code instead:
# (reads the two Azure OpenAI settings from Colab's secret store and exports
# them as environment variables)
# from google.colab import userdata
# os.environ["AZURE_OPENAI_API_KEY"] = userdata.get("AZURE_OPENAI_API_KEY")
# os.environ["AZURE_OPENAI_ENDPOINT"] = userdata.get("AZURE_OPENAI_ENDPOINT")

### Set up models

In [None]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
# Azure OpenAI API version passed to the embedding client.
api_version = "2024-10-01-preview"

# Embedding client for the "text-embedding-3-large" model.
# NOTE(review): endpoint and key are presumably picked up from the
# AZURE_OPENAI_* environment variables -- confirm against your .env file.
oai_embedding_model = AzureOpenAIEmbeddings(
    model="text-embedding-3-large",
    openai_api_version=api_version,
)

# Embed a throwaway string to check connectivity and report the vector size.
print(f"Dimension in OpenAI embedding model: {len(oai_embedding_model.embed_query('test'))}")

### Set up the path to the data

In [None]:
# Directory (relative path) that holds the listing*.md files loaded later on.
data_path = "../data"

## Set up the Hugging Face embedding model

In [None]:
from langchain_core.embeddings import Embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Try using an open-source embedding function from HuggingFace.
# Candidate models are listed below with the stats noted from the MTEB
# leaderboard (https://huggingface.co/spaces/mteb/leaderboard); keep exactly
# one assignment uncommented.

#hf_model_name = "avsolatorio/GIST-all-MiniLM-L6-v2" # 23M params, 0.08GB mem use, 384 dim, 512 tokens, 59 avg score
#hf_model_name = "intfloat/multilingual-e5-large-instruct" # 560M params, 2.09GB mem use, 1024 dim, 514 tokens, 63.61 avg score
hf_model_name = "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1" # 494M params, 1.84GB mem use, 896 dim, 131k tokens, 64.74 avg score
#hf_model_name = "Salesforce/SFR-Embedding-2_R" # 7B params, 26GB mem use, 4096 dim, 32k tokens, 70.32 avg score
#hf_model_name = "nvidia/NV-Embed-v2" # 7B params, 29GB mem use, 4096 dim, 32k tokens, 72.31 avg score

# Typed as the generic Embeddings interface so it is interchangeable with the
# OpenAI embedding model wherever an embedding function is expected.
hf_embedding_model: Embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)

# Embed a throwaway string to report the vector size of the chosen model.
print(f"Dimension in HF model: {len(hf_embedding_model.embed_query('test'))}")

### Load the documents

In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# One markdown loader per listing file found under data_path.
listing_names = ("listing1.md", "listing2.md", "listing3.md")
loaders = [
    UnstructuredMarkdownLoader(f"{data_path}/{name}")
    for name in listing_names
]

# Collect the documents produced by every loader into a single flat list.
documents = []
for loader in loaders:
    documents += loader.load()


### Split

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Break the loaded documents into chunks before embedding them.
# chunk_size / chunk_overlap are the splitter's size limits
# (NOTE(review): measured in characters by default -- confirm in the
# CharacterTextSplitter documentation).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=32,
)
splitDocs = text_splitter.split_documents(documents)

# Report how many chunks will be embedded and stored.
print(f"splitDocs count: {len(splitDocs)}")

### Set up vector stores

In [None]:
from langchain_community.vectorstores import Chroma

print('Loading the vector store(s)...')

# Give every chunk a deterministic id. Without explicit ids each call to
# from_documents generates fresh random ones, so re-running this cell in the
# same kernel would add the same chunks to the existing in-memory collection
# a second time and similarity searches would return duplicates. With stable
# ids a re-run overwrites the existing entries instead.
# Caveat: if a later run yields fewer chunks, entries with higher indices
# from the earlier run stay in the collection until the kernel is restarted.
chunk_ids = [f"listing-chunk-{i}" for i in range(len(splitDocs))]

# Same chunks, two collections: one embedded with the OpenAI model and one
# with the Hugging Face model, so their search results can be compared.
oai_vectorstore = Chroma.from_documents(
    collection_name="listings_oai",
    documents=splitDocs,
    embedding=oai_embedding_model,
    ids=chunk_ids,
)
hf_vectorstore = Chroma.from_documents(
    collection_name="listings_hf",
    documents=splitDocs,
    embedding=hf_embedding_model,
    ids=chunk_ids,
)

### Query time (similarity search)!

In [None]:
# Pick the question to search for; keep exactly one assignment uncommented.
question = "I'm looking for a 2-bedroom apartment"
#question = "I'm looking for an apartment with a laundry closet and preferably a stackable washer and dryer."
#question = "I'm looking for an electric car with autopilot"
# TODO: Write your own questions

print("Similarity search...")
# Compare results from different embeddings: swap which of the two lines
# below is active to query the other vector store (k=1 asks for the single
# best match).
#docs = oai_vectorstore.similarity_search(question, k=1)
docs = hf_vectorstore.similarity_search(question, k=1)

# Number of documents returned by the search.
length = len(docs)
print(f"Result: {length}")

# Show each hit's metadata followed by its text content.
for d in docs:
    print(d.metadata, f'Content: \n"{d.page_content}"', sep="\n")
    
    