In [2]:
from datetime import datetime
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings as STE
from embedding import EmbeddingLoader

In [3]:
#decorator
def checktime(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function is invoked with the caller's positional and keyword
    arguments and its return value is passed through unchanged. After the call
    returns, one line reporting the elapsed seconds is printed. If ``func``
    raises, the exception propagates and nothing is printed.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same call signature that also carries ``func``'s
        ``__name__`` / ``__doc__`` metadata.
    """
    # Local import keeps this cell self-contained (the import cell only
    # brings in ``datetime``).
    from functools import wraps

    # Without ``wraps`` every decorated function would report its name as
    # "wrapper" and lose its docstring.
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.now()
        result = func(*args, **kwargs)
        end_time = datetime.now()
        print(f"Function call {func.__name__} took {(end_time - start_time).total_seconds()}s to run.\n")
        return result
    return wrapper

#get model file from .txt file
@checktime
def get_hf_model_names() -> list:
    """Read the local model paths listed in ``model/model_list.txt``.

    Every non-blank line of the file is stripped of surrounding whitespace and
    prefixed with ``"model/"``.

    Returns:
        list: the prefixed paths in file order. An empty list is returned
        (after printing a notice) when the file cannot be opened or read.
    """
    # Bound up-front so a failed read returns [] instead of raising
    # UnboundLocalError at the ``return`` below.
    model_list = []
    try:
        with open(file="model/model_list.txt", mode="r", encoding="utf-8") as list_file:
            # Blank lines are skipped; otherwise they would yield a bogus "model/" entry.
            model_list = ["model/" + line.strip() for line in list_file if line.strip()]
    except OSError:
        # Only file-system failures are handled here; anything else
        # (e.g. KeyboardInterrupt, decoding errors) is allowed to surface.
        print("""file not exsist. check directory or file.""")

    return model_list

@checktime
def get_collection_names(model_list:list, if_openai=True) -> list:
    """Derive one collection name per model path.

    The collection name is the text after the last ``/`` of each entry in
    ``model_list``. When ``if_openai`` is truthy, the fixed name
    ``"text-embedding-ada-002"`` is added as the final element.

    Args:
        model_list: iterable of model path strings.
        if_openai: whether to append the OpenAI collection name.

    Returns:
        list: a new list of collection names, in input order.
    """
    names = []
    for model_name in model_list:
        # Keep only the trailing path component.
        names.append(model_name.split("/")[-1])

    if if_openai:
        names.append("text-embedding-ada-002")

    return names

@checktime
def loading_hf_embedding(model_path):
    """Load one embedding object per entry of ``model_path``.

    For every path, an ``EmbeddingLoader.SentenceTransformerEmbedding`` is
    constructed with ``normalize_embeddings`` enabled in its encode kwargs and
    its ``load()`` method is called.

    Args:
        model_path: iterable of model paths.

    Returns:
        list: the values returned by ``load()``, in the same order as the
        input paths.
    """
    return [
        EmbeddingLoader.SentenceTransformerEmbedding(
            model_name=path,
            encode_kwargs={'normalize_embeddings': True},
        ).load()
        for path in model_path
    ]

In [6]:
### Resolve the local model paths and the matching collection names.
### get_collection_names is called with its default if_openai=True, so the
### OpenAI collection name is already the last entry of collection_names.
hf_model_path = get_hf_model_names()
collection_names = get_collection_names(hf_model_path)

### Load one embedding object per path in hf_model_path.
embedding_memorystore = loading_hf_embedding(hf_model_path) # stored as a list

### Also load the OpenAI embedding and append it last, so that its position
### lines up with the OpenAI entry at the end of collection_names.
openai_embedding = EmbeddingLoader.OpenAIEmbedding().load() # OpenAI embedding object (text-embedding-ada-002, per the collection name)
embedding_memorystore.append(openai_embedding) # mutates the list in place; re-running only this cell appends again

Function call get_hf_model_names took 0.000511s to run.

Function call get_collection_names took 0.0s to run.

embedding model in path <model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2> has been loaded successfully.
Function call load took 3.253376s to run.

embedding model in path <model/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2> has been loaded successfully.
Function call load took 1.733356s to run.

embedding model in path <model/sentence-transformers/distiluse-base-multilingual-cased-v2> has been loaded successfully.
Function call load took 1.329167s to run.

embedding model in path <model/sentence-transformers/stsb-xlm-r-multilingual> has been loaded successfully.
Function call load took 3.086755s to run.

embedding model in path <model/jhgan/ko-sroberta-multitask> has been loaded successfully.
Function call load took 0.976828s to run.

embedding model in path <model/snunlp/KR-SBERT-V40K-klueNLI-augSTS> has been loaded successfully.
Function call

In [None]:
# Sanity check: expected count is 12 HF embeddings + 1 OpenAI embedding (text-embedding-ada-002).
print(len(embedding_memorystore))
# Check the collection names before launching; they are paired with the
# embeddings by position later on, so both lists should have the same length.
print(collection_names)

13


In [22]:
def _ab_getter(collection_name:str)->list:
    """Return the two suffixed variants of ``collection_name``.

    Args:
        collection_name: base collection name.

    Returns:
        list: ``[collection_name + "-a", collection_name + "-b"]``.
    """
    variant_a = collection_name + "-a"
    variant_b = collection_name + "-b"
    return [variant_a, variant_b]

#### 여기에서부터 QA query로 넣어서 hitrate 평가(hitrate_test_qa.csv)
- k 개수에 따라서 (@3, @5, @10) hitrate 평가 후 csv file로 저장

In [26]:
# Open every persisted Chroma collection and report its size.
# Embeddings and collection names are paired purely by position; zip() stops
# at the shorter list, so a length mismatch would silently skip entries.
for embedding_function, collection_name in zip(embedding_memorystore, collection_names):
    # Two collections per model: "<name>-a" and "<name>-b".
    ab_names = _ab_getter(collection_name=collection_name)

    for collection_name_complete in ab_names:
        # Opens the collection from the "chroma" persist directory using this model's embedding function.
        db = Chroma(persist_directory="chroma", collection_name=collection_name_complete, embedding_function=embedding_function)
        # The last character of the collection name ("a"/"b") is upper-cased for the team label.
        # NOTE(review): _collection is a private attribute of the Chroma wrapper and may change between versions.
        print(f"with document set <Team {collection_name_complete[-1].upper()}>: {db._collection.name}, {db._collection.count()}")

        ### TODO: the hit-rate evaluation can probably go here (wrap it in a function).
    print("="*80,"\n")
# After the loop, `db` stays bound to the last collection that was opened.

with document set <Team A>: paraphrase-multilingual-mpnet-base-v2-a, 1764
with document set <Team B>: paraphrase-multilingual-mpnet-base-v2-b, 1307

with document set <Team A>: paraphrase-multilingual-MiniLM-L12-v2-a, 1764
with document set <Team B>: paraphrase-multilingual-MiniLM-L12-v2-b, 1307

with document set <Team A>: distiluse-base-multilingual-cased-v2-a, 2170
with document set <Team B>: distiluse-base-multilingual-cased-v2-b, 1596

with document set <Team A>: stsb-xlm-r-multilingual-a, 1764
with document set <Team B>: stsb-xlm-r-multilingual-b, 1307

with document set <Team A>: ko-sroberta-multitask-a, 1732
with document set <Team B>: ko-sroberta-multitask-b, 1250

with document set <Team A>: KR-SBERT-V40K-klueNLI-augSTS-a, 1476
with document set <Team B>: KR-SBERT-V40K-klueNLI-augSTS-b, 1101

with document set <Team A>: moco-sentencedistilbertV2.1-a, 868
with document set <Team B>: moco-sentencedistilbertV2.1-b, 670

with document set <Team A>: kpf-sbert-128d-v1-a, 583
with d

In [25]:
# Ad-hoc smoke test of retrieval. `db` is not defined in this cell: it is
# whatever the collection-opening loop cell last assigned, so that cell must
# have been run first.
db.similarity_search(query="hi")

[Document(page_content='. go. kr 입니다.', metadata={'category': '01 생계 지원', 'source': 'data\\teamA\\01_생계_지원\\01_내집_마련_디딤돌대출(주택구입_시).md', 'tag': '주거 대출,노령층,주거 대출,서민금융,청년,주거,1인가구,중장년,주거자금', 'title': '내집 마련 디딤돌대출(주택구입 시)', 'url': 'https://www.bokjiro.go.kr/ssis-tbu/twataa/wlfareInfo/moveTWAT52011M.do?wlfareInfoId=WLF00003271&wlfareInfoReldBztpCd=01'}),
 Document(page_content=') 혹은 웹사이트 https : / / www. kosaf. go. kr에 접속하여 확인할 수 있다.', metadata={'category': '04 청소년·청년 지원', 'source': 'data\\teamA\\04_청소년·청년_지원\\04_이공계_우수학생_국가장학사업.md', 'tag': '장학금,교육비가 부담될 때(장학금 등 지원),교육,청년', 'title': '이공계 우수학생 국가장학사업', 'url': 'https://www.bokjiro.go.kr/ssis-tbu/twataa/wlfareInfo/moveTWAT52011M.do?wlfareInfoId=WLF00001071&wlfareInfoReldBztpCd=01'}),
 Document(page_content='##업플라자, 사이트 주소는 http : / / www. seoulwomenventure. or. kr 입니다.', metadata={'category': '02 취업 지원', 'source': 'data\\teamA\\02_취업_지원\\02_여성창업액셀러레이팅.md', 'tag': '청년,창업,여성이 기업 운영을 원할 때,고용,중장년', 'title': '여성창업액셀러레이팅', 'url': ''}),
 Document(page