#### Setup Chromadb locally(to compare model by score)
---
### RUN ONLY ONCE IF YOU DON'T HAVE DIRECTORY embeddingtest/chroma

In [18]:
def get_hf_model_names() -> list:
    try:
        with open(file="model/model_list.txt", mode="r", encoding="utf-8") as file:
            model_list = [line.strip() for line in file]
    except:
        print("""file not exsist. check directory or file.""")
    return model_list

In [19]:
from langchain.vectorstores.chroma import Chroma

### from_document method 이용해서 저장(document, embedding, persist_directory, collection_name)
#Chroma object 생성.
chroma = Chroma()

In [20]:
#Pre-load document
from document.mdLoader import TeamALoader, TeamBLoader

#document parsing when get max_seq_length -> use in chroma.from_documents()
#일단 split 없이 진행하고, chroma db에 넣을 때 sequence에 맞춰서 split 해 줄 것(편의성을 위해 미리 불러온다.)
a_loader = TeamALoader(path_db="data/teamA", path_metadata="document/meta_team_a.json", path_url_table="document/url_table_team_a.csv", text_splitter=None)
b_loader = TeamBLoader(path_db="data/teamB", path_metadata="document/meta_team_b.json", path_url_table="document/url_table_team_b.csv", text_splitter=None)

### splitter 넣어서 하자........... seq 구하고 -> 이건 model loading 필요하니까 결국... loading하고
## None으로 시작한 다음에 Splitter 넣어서 돌아가게 하고 다음에 수정해서 하나의 .py로

# a_raw_docs = a_loader.load(is_split=False, is_regex=False)
# b_raw_docs = b_loader.load(is_split=False, is_regex=True)

# print(len(a_raw_docs), len(b_raw_docs))

initialize class takes 0.0 seconds.
initialize class takes 0.0 seconds.


In [21]:
# Embedding Model Loading
from embedding import EmbeddingLoader

ste_embedding = EmbeddingLoader.SentenceTransformerEmbedding
openai_embedding = EmbeddingLoader.OpenAIEmbedding

# UseCase
# ste_embedding()
# openai_embedding()

# get model names
model_list = get_hf_model_names()

#### HuggingFace Embedding Setup

In [24]:
from langchain.text_splitter import SentenceTransformersTokenTextSplitter, TokenTextSplitter #STE, OpenAIEmbedding(@text-ada-002)
import os
import json

def set_text_splitter(ste_model, max_seq_length)->SentenceTransformersTokenTextSplitter:
    splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=10, model_name=ste_model, tokens_per_chunk=max_seq_length, seperator="\n\n")
    return splitter

def get_max_seq_length(model_path)->int:
    sentence_bert_config = "sentence_bert_config.json"
    config_path = os.path.join(model_path, sentence_bert_config)

    with open(config_path) as file :
        bert_config = json.load(file)
        
    return bert_config["max_seq_length"]

In [25]:
## HuggingFaceEmbedding Setup

import os
import json

directory = "model/"
sentence_bert_config = "sentence_bert_config.json"

for model in model_list:
    # load model from locally saved HuggingFace model
    model_path = os.path.join(directory, model)
    
    sentenceloader = ste_embedding(model_name=model_path, multi_process=True, encode_kwargs={'normalize_embeddings':True})
    embedding_model = sentenceloader.load()

    max_seq_length = get_max_seq_length(model_path=model_path)
    text_splitter = set_text_splitter(model_path, max_seq_length=max_seq_length)

    a_loader.text_splitter = text_splitter
    b_loader.text_splitter = text_splitter

    a_raw_docs = a_loader.load(is_split=True, is_regex=False)
    b_raw_docs = b_loader.load(is_split=True, is_regex=True)

    print(f"total document length : {len(a_raw_docs)}, {len(b_raw_docs)}")

    # get max sequence length from embedding model
    config_path = os.path.join(model_path, sentence_bert_config)
    with open(config_path) as file :
        bert_config = json.load(file)
        max_seq_length = bert_config["max_seq_length"]

    # set model name(cause collection name length limit)
    model_name = model.split("/")[-1]

    # save document with chunk - embedding calculate and save it to persist directory
    chroma.from_documents(documents=a_raw_docs, embedding=embedding_model, collection_name=model_name+"-a", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")
    chroma.from_documents(documents=b_raw_docs, embedding=embedding_model, collection_name=model_name+"-b", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")


embedding model in path <model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2> has been loaded successfully.
Function call load took 6.481762s to run.



TypeError: TextSplitter.__init__() got an unexpected keyword argument 'seperator'

#### OpenAI Embedding Setup

In [None]:
### 여기도 max sequence 찾아서 해야 할 듯......

In [None]:


loader = openai_embedding()
emb_openai = loader.load()

openai_text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    model_name=emb_openai.model
)

a_loader.text_splitter = openai_text_splitter
b_loader.text_splitter = openai_text_splitter

a_raw_docs = a_loader.load(is_split=True, is_regex=False)
b_raw_docs = b_loader.load(is_split=True, is_regex=True)

chroma.from_documents(documents=a_raw_docs, embedding=emb_openai, collection_name=emb_openai.model+"-a", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")
chroma.from_documents(documents=b_raw_docs, embedding=emb_openai, collection_name=emb_openai.model+"-b", collection_metadata={"hnsw:space":"cosine"}, persist_directory="chroma")

OpenAI Embedding has been activated.
Function call load took 0.0s to run.

<langchain.text_splitter.TokenTextSplitter object at 0x0000017CD7ED8880>


200

#### test