#### Set up ChromaDB locally (to compare models by score)
---
### Run only once, and only if the directory `embeddingtest/chroma` does not exist yet

In [1]:
def get_hf_model_names(path: str = "model/model_list.txt") -> list:
    """Read HuggingFace model names, one per line, from a text file.

    Args:
        path: Location of the model list file. Defaults to
            "model/model_list.txt".

    Returns:
        The stripped, non-empty lines of the file, in file order. If the file
        cannot be opened, a notice is printed and an empty list is returned.
    """
    try:
        with open(file=path, mode="r", encoding="utf-8") as file:
            # Skip blank lines so they do not turn into empty model names.
            return [name for name in (line.strip() for line in file) if name]
    except OSError:
        # A bare `except` used to fall through to `return model_list`, which
        # raised UnboundLocalError because the name had never been assigned.
        print(f"file does not exist. check directory or file: {path}")
        return []

In [2]:
from langchain.vectorstores.chroma import Chroma

### Documents are stored through the from_documents method (documents, embedding, persist_directory, collection_name)
# Create the Chroma object.
chroma = Chroma()

In [3]:
# Pre-load the documents
from document.mdLoader import TeamALoader, TeamBLoader

# The documents are parsed once the max_seq_length is known and then used in
# chroma.from_documents(). For now they are loaded without splitting; they get
# split to the sequence length when inserted into the Chroma DB (loaded up
# front for convenience).
a_loader = TeamALoader(
    path_db="data/teamA",
    path_metadata="document/meta_team_a.json",
    path_url_table="document/url_table_team_a.csv",
    text_splitter=None,
)
b_loader = TeamBLoader(
    path_db="data/teamB",
    path_metadata="document/meta_team_b.json",
    path_url_table="document/url_table_team_b.csv",
    text_splitter=None,
)

a_raw_docs = a_loader.load(
    is_split=False,
    is_regex=False,
    show_progress=True,
)
b_raw_docs = b_loader.load(
    is_split=False,
    is_regex=True,
    show_progress=True,
)

# TODO: pass a splitter in. Getting the sequence length needs the model to be
# loaded, so the model has to be loaded first anyway.
# TODO: start with None, then plug the splitter in so it runs, and later
# refactor everything into a single .py file.

initialize class takes 0.0 seconds.
initialize class takes 0.0 seconds.


100%|██████████| 41/41 [00:05<00:00,  7.31it/s]
100%|██████████| 36/36 [00:00<00:00, 40.79it/s]
100%|██████████| 50/50 [00:01<00:00, 42.12it/s]
100%|██████████| 57/57 [00:01<00:00, 37.18it/s]
100%|██████████| 27/27 [00:00<00:00, 41.98it/s]
100%|██████████| 20/20 [00:00<00:00, 45.49it/s]
100%|██████████| 83/83 [00:01<00:00, 42.88it/s]
100%|██████████| 40/40 [00:00<00:00, 49.27it/s]
100%|██████████| 24/24 [00:00<00:00, 42.53it/s]
100%|██████████| 86/86 [00:02<00:00, 41.11it/s]


loading Documents takes 15.941847 seconds.


  rows = body.findall("tr") if body else []
100%|██████████| 41/41 [00:00<00:00, 42.92it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 36/36 [00:00<00:00, 49.71it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 50/50 [00:00<00:00, 57.47it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 57/57 [00:00<00:00, 67.80it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 27/27 [00:00<00:00, 45.25it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 20/20 [00:00<00:00, 39.93it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 83/83 [00:01<00:00, 57.45it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 40/40 [00:00<00:00, 63.65it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 24/24 [00:00<00:00, 39.61it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 86/86 [00:01<00:00, 51.32it/s]

loading Documents takes 9.086778 seconds.





In [4]:
# Embedding model loading
from embedding import EmbeddingLoader

# Short aliases for the two embedding loaders exposed on EmbeddingLoader.
# Later cells call each alias and then invoke .load() on the result.
ste_embedding = EmbeddingLoader.SentenceTransformerEmbedding
openai_embedding = EmbeddingLoader.OpenAIEmbedding

# Usage:
# ste_embedding()
# openai_embedding()

# Model names returned by get_hf_model_names(); iterated by the HuggingFace setup cell.
model_list = get_hf_model_names()

#### HuggingFace Embedding Setup

In [5]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter, TokenTextSplitter #STE, OpenAIEmbedding(@text-ada-002)
import os
import json

def set_text_splitter(ste_model, max_seq_length)->SentenceTransformersTokenTextSplitter:
    """Build a SentenceTransformers token splitter for one embedding model.

    Args:
        ste_model: Forwarded to the splitter as ``model_name``.
        max_seq_length: Forwarded to the splitter as ``tokens_per_chunk``.

    Returns:
        A SentenceTransformersTokenTextSplitter created with a chunk overlap
        of 10.
    """
    return SentenceTransformersTokenTextSplitter(
        tokens_per_chunk=max_seq_length,
        model_name=ste_model,
        chunk_overlap=10,
    )

def get_max_seq_length(model_path)->int:
    """Return the ``max_seq_length`` value stored with a saved model.

    Args:
        model_path: Directory that contains ``sentence_bert_config.json``.

    Returns:
        The value under the ``"max_seq_length"`` key of that JSON file.
    """
    with open(os.path.join(model_path, "sentence_bert_config.json")) as config_file:
        return json.load(config_file)["max_seq_length"]

In [6]:
## HuggingFaceEmbedding Setup
# For every model name in model_list: load the locally saved embedding model,
# split both document sets to the model's max sequence length, then embed and
# store them in two Chroma collections ("<model>-a" and "<model>-b").

import os
import json

directory = "model/"

for model in model_list:
    # load model from locally saved HuggingFace model
    model_path = os.path.join(directory, model)

    sentenceloader = ste_embedding(model_name=model_path, multi_process=True, encode_kwargs={'normalize_embeddings':True})
    embedding_model = sentenceloader.load()

    # Get the max sequence length from the embedding model's config once.
    # The cell previously re-opened sentence_bert_config.json a second time to
    # recompute this same value; get_max_seq_length() already provides it.
    max_seq_length = get_max_seq_length(model_path=model_path)
    text_splitter = set_text_splitter(model_path, max_seq_length=max_seq_length)

    a_splitted_docs = text_splitter.split_documents(a_raw_docs)
    b_splitted_docs = text_splitter.split_documents(b_raw_docs)

    print(f"max sequence from current model({model_path}) is {max_seq_length}.")

    print(f"Document splitted with SentenceTransformerTokenizer -> length <A:{len(a_splitted_docs)}, B:{len(b_splitted_docs)}>")

    # Use only the last path component as the model name
    # (because of the collection name length limit).
    model_name = model.split("/")[-1]

    # Embed the chunked documents and save them to the persist directory.
    collection_a = chroma.from_documents(documents=a_splitted_docs, embedding=embedding_model, collection_name=model_name+"-a", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")
    collection_a.persist()
    print(collection_a._collection)

    collection_b = chroma.from_documents(documents=b_splitted_docs, embedding=embedding_model, collection_name=model_name+"-b", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")
    collection_b.persist()
    print(collection_b._collection)

    print("="*60, '\n')

embedding model in path <model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2> has been loaded successfully.
Function call load took 4.888586s to run.

max sequence from current model(model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) is 128.
Document splitted with SentenceTransformerTokenizer -> length <A:1782, B:1307>
name='paraphrase-multilingual-mpnet-base-v2-a' id=UUID('02f482bf-1717-4e6c-98a4-af61adc61999') metadata={'hnsw:space': 'cosine'} tenant='default_tenant' database='default_database'
name='paraphrase-multilingual-mpnet-base-v2-b' id=UUID('795c7e3d-61ec-4788-8a64-4d901a64675b') metadata={'hnsw:space': 'cosine'} tenant='default_tenant' database='default_database'

embedding model in path <model/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2> has been loaded successfully.
Function call load took 1.787393s to run.

max sequence from current model(model/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) is 128.
Document split

#### OpenAI Embedding Setup

In [7]:
# Load the OpenAI embedding model.
loader = openai_embedding()
emb_openai = loader.load()

# Token splitter created from the tiktoken encoder for the embedding model name.
openai_text_splitter = TokenTextSplitter.from_tiktoken_encoder(model_name=emb_openai.model)

# Attach the splitter to both loaders and reload the documents with splitting on.
a_loader.text_splitter = openai_text_splitter
b_loader.text_splitter = openai_text_splitter

# NOTE(review): a_raw_docs / b_raw_docs are rebound here to the results of
# load(is_split=True), replacing the values loaded earlier with is_split=False.
a_raw_docs = a_loader.load(
    is_split=True,
    is_regex=False,
)
b_raw_docs = b_loader.load(
    is_split=True,
    is_regex=True,
)

# Embed and store one collection per document set under "chroma",
# with "hnsw:space" set to "cosine".
chroma.from_documents(
    documents=a_raw_docs,
    embedding=emb_openai,
    collection_name=emb_openai.model+"-a",
    collection_metadata={"hnsw:space":"cosine"},
    persist_directory="chroma",
)
chroma.from_documents(
    documents=b_raw_docs,
    embedding=emb_openai,
    collection_name=emb_openai.model+"-b",
    collection_metadata={"hnsw:space":"cosine"},
    persist_directory="chroma",
)

OpenAI Embedding has been activated.
Function call load took 0.0s to run.



100%|██████████| 41/41 [00:01<00:00, 34.16it/s]
100%|██████████| 36/36 [00:00<00:00, 38.49it/s]
100%|██████████| 50/50 [00:01<00:00, 40.70it/s]
100%|██████████| 57/57 [00:01<00:00, 38.27it/s]
100%|██████████| 27/27 [00:00<00:00, 38.75it/s]
100%|██████████| 20/20 [00:00<00:00, 39.59it/s]
100%|██████████| 83/83 [00:02<00:00, 39.03it/s]
100%|██████████| 40/40 [00:00<00:00, 46.50it/s]
100%|██████████| 24/24 [00:00<00:00, 43.11it/s]
100%|██████████| 86/86 [00:02<00:00, 41.14it/s]


loading Documents takes 12.18189 seconds.


  rows = body.findall("tr") if body else []
100%|██████████| 41/41 [00:00<00:00, 43.03it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 36/36 [00:00<00:00, 48.75it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 50/50 [00:00<00:00, 57.32it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 57/57 [00:00<00:00, 72.63it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 27/27 [00:00<00:00, 45.97it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 20/20 [00:00<00:00, 44.62it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 83/83 [00:01<00:00, 59.02it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 40/40 [00:00<00:00, 49.30it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 24/24 [00:00<00:00, 41.56it/s]
  rows = body.findall("tr") if body else []
100%|██████████| 86/86 [00:01<00:00, 51.75it/s]


loading Documents takes 9.260814 seconds.


<langchain_community.vectorstores.chroma.Chroma at 0x15d2891b040>

#### test