In [1]:
import os
import random
import json

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from transformers import AutoModel, AutoTokenizer
import torch
import cohere
import openai
from openai import OpenAI

os.environ["TOKENIZERS_PARALLELISM"] = "false"

now_path = './'
with open(f'{now_path}/metadata/setting.json', 'r', encoding='utf-8') as file:
    config_dict = json.load(file)

# initialize openai
os.environ["OPENAI_API_KEY"] = config_dict['OPENAI_KEY']
openai.api_key = os.environ["OPENAI_API_KEY"]

# initialize cohere
os.environ["CO_API_KEY"] = config_dict['COHERE_KEY']
co = cohere.Client()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("quora_dataset.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,id,duplicated_questions,length
0,0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1
1,1,"I'm a triple Capricorn (Sun, Moon and ascendan...",12,[11],1
2,2,How can I be a good geologist?,15,[16],1
3,3,What should I do to be a great geologist?,16,[15],1
4,4,How do I read and find my YouTube comments?,23,[24],1


# <font color=yellow>[1] Playground </font>

In [5]:
text1 = df.loc[2, 'text']
print(text1)

How can I be a good geologist?


In [6]:
text2 = df.loc[3, 'text']
print(text2)

What should I do to be a great geologist?


누가봐도 위 두 문장은 유사한 문장임
- 데이터셋에서 봐도 id가 15, 16인데 duplicated_questions 모두 16, 15로 매칭되어 있음

In [7]:
def create_embeddings(txt_list, provider='openai'):
    if provider=='openai':
        client = OpenAI()

        response = client.embeddings.create(
            input=txt_list,
            model="text-embedding-3-small",
            )
        responses = [r.embedding for r in response.data]

        return responses
    
    elif provider=='cohere':
        doc_embeds = co.embed(
            txt_list,
            input_type="search_document",
            model="embed-english-v3.0",
            )
        return doc_embeds.embeddings
    else:
        assert False, "Double check provider name"

두 질문을 임베딩으로 변환하고, 코싸인 유사도를 봐도 유사한가를 확인

In [8]:
emb1 = create_embeddings(df.loc[2, 'text'])
emb2 = create_embeddings(df.loc[3, 'text'])

In [9]:
from utils import cosine_similarity

In [10]:
# simarity between two embeddings
print(f"Cosine 유사도 : {cosine_similarity(emb1[0], emb2[0])}.\n사용된 문장 : \n{text1}\n{text2}")

Cosine 유사도 : 0.9153082312599056.
사용된 문장 : 
How can I be a good geologist?
What should I do to be a great geologist?


In [11]:
text3 = df.loc[4, 'text']

emb3 = create_embeddings(text3)
print(f"Cosine 유사도 : {cosine_similarity(emb1[0], emb3[0])}.\n사용된 문장 : \n{text1}\n{text3}")

Cosine 유사도 : 0.18171728610851576.
사용된 문장 : 
How can I be a good geologist?
How do I read and find my YouTube comments?


In [12]:
text4 = df.loc[6, 'text']

emb4 = create_embeddings(text4)
print(f"Cosine 유사도 : {cosine_similarity(emb1[0], emb4[0])}.\n사용된 문장 : \n{text1}\n{text4}")

Cosine 유사도 : 0.2795684519934309.
사용된 문장 : 
How can I be a good geologist?
What can make Physics easy to learn?


# <font color=yellow>[2] Embedding vector Dataset 만들기 </font>

In [None]:
# create embeddings (openai)
# (비용 발생 주의)
openai_emb = create_embeddings(df.text.tolist(), provider='openai')
df['openai_emb'] = openai_emb

In [None]:
# create embeddings (cohere)
# (비용 발생 주의)
cohere_emb = create_embeddings(df.text.tolist(), 'cohere')
df['cohere_emb'] = cohere_emb

In [4]:
# e5 embeddings
# load gpu if possible
device = "cuda" 
model_id = "intfloat/e5-base-v2"

# init tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
def create_e5_emb(docs, model):
    docs = [f"query: {d}" for d in docs]
    tokens = tokenizer(docs, padding=True, max_length=512, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(**tokens)
        last_hidden = out.last_hidden_state.masked_fill( # from last hidden state
            ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # average out embeddings per token (non-padding)
        doc_embeds = last_hidden.sum(dim=1) / tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy()

In [6]:
data = df.text.tolist()
batch_size = 128

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    data_batch = data[i:i_end]
    # embed current batch
    embed_batch = create_e5_emb(data_batch, model)
    if i == 0:
        emb3 = embed_batch.copy()
    else:
        emb3 = np.concatenate([emb3, embed_batch.copy()])

  0%|          | 0/44 [00:00<?, ?it/s]

In [7]:
emb3

array([[ 0.05987817, -0.15769596, -0.14131545, ...,  0.0830001 ,
         0.23116857,  0.3649252 ],
       [ 0.08937579, -0.29545042, -0.33455297, ...,  0.34410715,
         0.48995113,  0.4949758 ],
       [ 0.08258092, -0.09264588, -0.7805369 , ..., -0.0878003 ,
         0.5608229 ,  0.68429667],
       ...,
       [-0.34141308, -1.1765785 , -0.94045174, ..., -0.03786189,
         0.55909896,  0.30307025],
       [-0.29797608, -0.8888769 , -0.8590143 , ..., -0.13681315,
         0.5658392 ,  0.39265954],
       [ 0.23199065, -0.38657165, -0.4294519 , ..., -0.28575844,
         0.48060724,  0.38324454]], dtype=float32)

In [8]:
emb3.shape

(5539, 768)

In [9]:
emb3 = [list(e) for e in emb3]
df['e5_emb'] = emb3

In [10]:
df

Unnamed: 0.1,Unnamed: 0,text,id,duplicated_questions,length,e5_emb
0,0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1,"[0.059878167, -0.15769596, -0.14131545, -0.546..."
1,1,"I'm a triple Capricorn (Sun, Moon and ascendan...",12,[11],1,"[0.08937579, -0.29545042, -0.33455297, -0.3294..."
2,2,How can I be a good geologist?,15,[16],1,"[0.08258092, -0.092645876, -0.7805369, -0.3241..."
3,3,What should I do to be a great geologist?,16,[15],1,"[-0.16533037, 0.19044484, -0.89066523, -0.3643..."
4,4,How do I read and find my YouTube comments?,23,[24],1,"[0.50644565, -0.62657744, -0.25233957, -0.1711..."
...,...,...,...,...,...,...
5534,33147,How do girl feel after losing virginity?,14989,[14988],1,"[-0.22131807, -0.65976393, -0.6754078, 0.21552..."
5535,33152,What is it like to lose 30 pounds in one month?,14992,"[9736, 9737]",2,"[-0.43680525, -0.62481517, -0.48897994, 0.0090..."
5536,33160,Why is 2000 rupee note released earlier than 5...,14997,[14998],1,"[-0.34141308, -1.1765785, -0.94045174, 0.39267..."
5537,33161,Why does RBI release 2000 note first instead o...,14998,[14997],1,"[-0.29797608, -0.8888769, -0.8590143, 0.385192..."


In [None]:
# df.to_csv("quora_dataset_emb.csv", index=False)

# <font color=yellow>[3-1] Embedding이 이미 처리된 데이터 읽어오기 </font>

In [11]:
df_new = pd.read_csv("quora_dataset_emb.csv")
df_new

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1,"[-0.005765771958976984, -0.018585262820124626,...","[-0.05834961, -0.010795593, -0.04522705, 0.035...","[0.059878636, -0.15769655, -0.14131568, -0.546..."
1,"I'm a triple Capricorn (Sun, Moon and ascendan...",12,[11],1,"[0.026014558970928192, -0.014319832436740398, ...","[-0.022338867, -0.0063285828, -0.057128906, 0....","[0.08937627, -0.2954505, -0.33455396, -0.32940..."
2,How can I be a good geologist?,15,[16],1,"[0.005276682320982218, 0.004194203298538923, 0...","[-0.012535095, 0.005092621, -0.033233643, -0.0...","[0.0825816, -0.09264662, -0.78053623, -0.32416..."
3,What should I do to be a great geologist?,16,[15],1,"[0.015116829425096512, 0.0010464431252330542, ...","[-0.013465881, 0.0018148422, -0.052612305, 0.0...","[-0.1653303, 0.19044468, -0.8906647, -0.364357..."
4,How do I read and find my YouTube comments?,23,[24],1,"[0.03505030274391174, -0.0010134828044101596, ...","[-0.0047836304, 0.028137207, -0.037231445, -0....","[0.50644577, -0.62657785, -0.2523397, -0.17112..."
...,...,...,...,...,...,...,...
5534,How do girl feel after losing virginity?,14989,[14988],1,"[0.014664657413959503, 0.0213747750967741, -0....","[-0.009284973, 0.023544312, -0.024612427, 0.05...","[-0.22131896, -0.6597636, -0.6754084, 0.215526..."
5535,What is it like to lose 30 pounds in one month?,14992,"[9736, 9737]",2,"[0.02287902496755123, 0.030351461842656136, -0...","[-0.03161621, -0.0013971329, -0.029754639, 0.0...","[-0.43680522, -0.6248159, -0.48898008, 0.00900..."
5536,Why is 2000 rupee note released earlier than 5...,14997,[14998],1,"[0.03714467212557793, -0.03145376965403557, 0....","[-0.025436401, 0.01210022, -0.027526855, -0.00...","[-0.34141293, -1.1765785, -0.94045097, 0.39267..."
5537,Why does RBI release 2000 note first instead o...,14998,[14997],1,"[0.05366398021578789, -0.04045843333005905, 0....","[-0.031799316, 0.0077934265, -0.03744507, 0.00...","[-0.29797572, -0.88887674, -0.859015, 0.385191..."


In [12]:
# str -> list 형태로 변환
df_new['openai_emb'] = df_new['openai_emb'].apply(json.loads)
df_new['cohere_emb'] = df_new['cohere_emb'].apply(json.loads)
df_new['e5_emb'] = df_new['e5_emb'].apply(json.loads)
df_new['duplicated_questions'] = df_new['duplicated_questions'].apply(json.loads)

json.loads는 문자열을 JSON 객체(파이썬 객체)로 바꾸는 함수
- 문자열 형태의 리스트를 실제 리스트 객체로 바꿔주는 역할
- 파이썬 리스트, 튜플, 딕셔너리를 CSV에 저장하면 문자열로 변환되어 저장됨

In [15]:
type(df_new.loc[0, 'length'])

numpy.int64

# <font color=yellow>[3-2] Test set 선별 </font>

In [16]:
# now choose random 10 rows of answers
test_query = random.choices(df_new.id, k=1000)
test_query

[8866,
 10561,
 14317,
 8863,
 9164,
 13169,
 6058,
 14007,
 805,
 9455,
 10671,
 3775,
 14926,
 1487,
 1015,
 3943,
 10617,
 12261,
 11653,
 6210,
 7999,
 3370,
 14897,
 10560,
 337,
 12565,
 6232,
 6123,
 12334,
 12054,
 7392,
 1319,
 2935,
 14930,
 306,
 4072,
 10151,
 10598,
 1692,
 9760,
 11701,
 14123,
 8879,
 10055,
 12275,
 13691,
 11512,
 14756,
 13324,
 11437,
 2802,
 6801,
 14890,
 7221,
 1553,
 5533,
 1066,
 6367,
 8756,
 12055,
 2658,
 5289,
 13810,
 11845,
 6520,
 12072,
 10336,
 11081,
 540,
 7286,
 3325,
 10899,
 14626,
 13691,
 5090,
 10799,
 9306,
 6248,
 10318,
 5518,
 5995,
 13184,
 10682,
 3330,
 14227,
 11468,
 443,
 14169,
 7673,
 12651,
 7625,
 6891,
 4734,
 2893,
 290,
 13260,
 7978,
 5868,
 13579,
 2297,
 3802,
 13125,
 8098,
 390,
 4195,
 11113,
 1718,
 9725,
 7860,
 734,
 11291,
 12233,
 1238,
 12213,
 12966,
 3575,
 13594,
 7986,
 12030,
 11168,
 12279,
 3999,
 10829,
 4517,
 8239,
 8510,
 8471,
 6378,
 13491,
 4818,
 3764,
 13953,
 4804,
 6476,
 1012,
 698

In [17]:
test = df_new.loc[df_new.id.isin(test_query)]
test

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1,"[-0.005765771958976984, -0.018585262820124626,...","[-0.05834961, -0.010795593, -0.04522705, 0.035...","[0.059878636, -0.15769655, -0.14131568, -0.546..."
20,How one should know that he/she completely pre...,60,"[59, 3775, 3774]",3,"[0.01716982200741768, -0.0018703066743910313, ...","[-0.0033817291, -0.008735657, -0.0019350052, -...","[0.08099816, -0.080133975, -0.59878486, -0.541..."
25,How do we prepare for UPSC?,77,"[78, 14412, 13689]",3,"[0.003527139313519001, -0.0007861784542910755,...","[0.0046653748, 0.01449585, -0.0017051697, 0.00...","[0.13959141, -0.6108713, -1.0003744, -0.403412..."
37,I was suddenly logged off Gmail. I can't remem...,117,"[3875, 118, 10024, 6953, 12018]",5,"[0.05869226157665253, 0.01173845212906599, 0.0...","[-0.044555664, 0.008895874, -0.044830322, 0.03...","[-0.5211264, -0.4668132, -0.37974587, 0.006814..."
40,How bad is the new book by J.K Rowling?,126,[125],1,"[0.014812997542321682, 0.05930182710289955, 0....","[0.009567261, -0.017654419, 0.00592041, 0.0154...","[-0.3210051, -0.7553193, -0.6274402, -0.071635..."
...,...,...,...,...,...,...,...
5519,How should I learn hacking by myself?,14963,"[8241, 14962]",2,"[0.002407674677670002, -0.02673427388072014, 0...","[-0.043518066, 0.011192322, -0.027282715, 0.02...","[0.016890272, -0.43490824, -0.4932563, 0.08736..."
5528,What are some great mixers for Southern Comfort?,14981,[14980],1,"[0.015770725905895233, -0.05443859100341797, -...","[-0.047088623, 0.019241333, -0.0390625, 0.0248...","[-0.11738305, -0.5468031, -0.710302, -0.301716..."
5529,Mrs. Clinton: What are your goals/plans for NA...,14982,[14983],1,"[0.04022955149412155, -0.01941961795091629, 0....","[-0.032073975, 0.0069007874, -0.027435303, 0.0...","[-0.20345676, 0.107300244, -0.6646933, 0.09437..."
5531,How I stop my hair fall?,14986,[14987],1,"[-0.00802390743046999, -0.014185726642608643, ...","[0.0054016113, 0.017227173, -0.018371582, -0.0...","[-0.056083582, -0.61637473, -0.61591345, -0.31..."


## <font color=green> 가장 유사한 top-5를 우리 VectorDB에서 가져오는 것임 </font>

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def search_top_k(search_df, search_df_column, id, topk):
    """
    search_df : search를 할 대상 dataframe
    search_df_column : search를 위해 사용될 embedding column name (out of openai, cohere, e5)
    id : test query id
    topk : 유사도 기반으로 top-k개 선별
    """
    query = search_df.loc[search_df['id']==id, search_df_column].values[0]
    query_reshaped = np.array(query).reshape(1, -1)
    
    search_df = search_df.loc[search_df['id']!=id]
    # cosine similarity in batch
    similarities = cosine_similarity(query_reshaped, np.vstack(search_df[search_df_column].values)).flatten()
    
    search_df['similarity'] = similarities
    
    # Get top-k indices
    # hence we sort the topk indices again to ensure they are truly the top-k
    topk_indices = np.argpartition(similarities, -topk)[-topk:]
    topk_indices_sorted = topk_indices[np.argsort(-similarities[topk_indices])]
    
    # Retrieve the top-k results
    search_result = search_df.iloc[topk_indices_sorted]
    
    return search_result

- 각 테스트 질문당 데이터 전체를 대상으로 cosine_similarity를 계산하고
- openai embedding, cohere embedding에 대해 각각 질문 k 개씩 진행
- search_result format :
```json
{
    'question id' : cosine_sim 기준 유사한 질문 top-k개를 담은 pd.DataFrame,
    'question id' : ...
}
```

In [None]:
# 각 질문들 중, test 질문과 동일한 질문이 가장 유사하게 도출될 것이기 때문에
# test 질문을 제외한 top-5
query_results_openai = {k:search_top_k(df_new, 'openai_emb', k, 5) for k in test.id}
query_results_cohere = {k:search_top_k(df_new, 'cohere_emb', k, 5) for k in test.id}
query_results_e5 = {k:search_top_k(df_new, 'e5_emb', k, 5) for k in test.id}

In [21]:
test.loc[test.length==3].tail()

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
4692,How do I start preparation for upsc exams?,12397,"[7976, 5556, 7975]",3,"[0.034068383276462555, -0.015171932987868786, ...","[-0.009613037, 0.0048065186, 0.014907837, 0.02...","[0.33539206, -0.5252116, -0.9702325, -0.354469..."
5280,What are some of the best investment plans?,14211,"[12694, 14212, 12695]",3,"[0.01397707685828209, -0.005568686407059431, 0...","[-0.024749756, -0.0109939575, 0.0053482056, 0....","[-0.4714, -0.439601, -0.8333078, -0.062841795,..."
5281,What's the best investment?,14212,"[12695, 14211, 12694]",3,"[0.0035537381190806627, 0.007597768679261208, ...","[0.0030536652, -0.020111084, 0.018203735, -0.0...","[-0.6153567, -0.45850173, -0.8385687, -0.05946..."
5404,How do I my increase memory power?,14637,"[14636, 10681, 10682]",3,"[0.022645525634288788, -0.005303962621837854, ...","[0.0047340393, 0.02658081, -0.052825928, -0.02...","[-0.31909862, -0.46033445, -0.46627238, -0.031..."
5417,How do I make friend?,14678,"[14677, 9642, 9641]",3,"[-0.010315492749214172, -0.03124191053211689, ...","[-0.023132324, 0.013587952, -0.03375244, 0.018...","[-0.008433589, -0.61488116, -0.4545104, -0.532..."


In [25]:
test.loc[test['id']==14212, 'text']

5281    What's the best investment?
Name: text, dtype: object

In [27]:
test.loc[test['id']==14212, 'text'].values

array(["What's the best investment?"], dtype=object)

In [31]:
test.loc[test['id']==14212, 'text'].values[0]

"What's the best investment?"

In [48]:
query_results_openai

{11:                                                    text     id  \
 1     I'm a triple Capricorn (Sun, Moon and ascendan...     12   
 2904                 Why should i believe in astrology?   7641   
 4627       What does my kundali reveal about my career?  12223   
 4628          What does my Kundali reveal about career?  12224   
 3656                       Do you believe in horoscope?   9566   
 
      duplicated_questions  length  \
 1                    [11]       1   
 2904               [7642]       1   
 4627              [12224]       1   
 4628              [12223]       1   
 3656               [9565]       1   
 
                                              openai_emb  \
 1     [0.026014558970928192, -0.014319832436740398, ...   
 2904  [0.0007690633647143841, 0.00936548225581646, 0...   
 4627  [0.008140774443745613, -0.019140049815177917, ...   
 4628  [-0.00570955965667963, -0.008877615444362164, ...   
 3656  [-0.0014365173410624266, 0.0060619148425757885...   
 


In [28]:
query_results_openai[14212]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb,similarity
4784,What is best investment option?,12694,"[12695, 14212, 14211]",3,"[0.0066238767467439175, 0.01729673333466053, 0...","[0.0029945374, -0.028869629, 0.028793335, 0.00...","[-0.87640125, -0.5093946, -0.9197591, -0.14335...",0.86344
4785,What is best option to investment?,12695,"[12694, 14211, 14212]",3,"[0.002989136381074786, 0.009364699013531208, 0...","[0.0018501282, -0.028259277, 0.023361206, 0.00...","[-0.59119123, -0.43359506, -0.9806511, -0.1559...",0.829069
5280,What are some of the best investment plans?,14211,"[12694, 14212, 12695]",3,"[0.01397707685828209, -0.005568686407059431, 0...","[-0.024749756, -0.0109939575, 0.0053482056, 0....","[-0.4714, -0.439601, -0.8333078, -0.062841795,...",0.751761
598,What stocks are the best to invest in right now?,1587,[1588],1,"[0.015089420601725578, -0.013920684345066547, ...","[-0.008598328, -0.007949829, -0.028762817, 0.0...","[-0.3589214, -0.7058543, -0.6543744, 0.2933644...",0.649535
599,What are the best stocks to invest for long te...,1588,[1587],1,"[-0.0008520701667293906, -0.014323555864393711...","[-0.0046310425, 0.0046920776, -0.019592285, 0....","[-0.3834505, -0.70068496, -0.6507094, 0.186360...",0.62257


In [29]:
query_results_cohere[14212]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb,similarity
4784,What is best investment option?,12694,"[12695, 14212, 14211]",3,"[0.0066238767467439175, 0.01729673333466053, 0...","[0.0029945374, -0.028869629, 0.028793335, 0.00...","[-0.87640125, -0.5093946, -0.9197591, -0.14335...",0.893486
4785,What is best option to investment?,12695,"[12694, 14211, 14212]",3,"[0.002989136381074786, 0.009364699013531208, 0...","[0.0018501282, -0.028259277, 0.023361206, 0.00...","[-0.59119123, -0.43359506, -0.9806511, -0.1559...",0.868701
5280,What are some of the best investment plans?,14211,"[12694, 14212, 12695]",3,"[0.01397707685828209, -0.005568686407059431, 0...","[-0.024749756, -0.0109939575, 0.0053482056, 0....","[-0.4714, -0.439601, -0.8333078, -0.062841795,...",0.781877
598,What stocks are the best to invest in right now?,1587,[1588],1,"[0.015089420601725578, -0.013920684345066547, ...","[-0.008598328, -0.007949829, -0.028762817, 0.0...","[-0.3589214, -0.7058543, -0.6543744, 0.2933644...",0.745602
5266,What is the best way to get investment for you...,14165,[14166],1,"[0.012466010637581348, 0.018429160118103027, 0...","[-0.017562866, -0.010498047, -0.0043525696, 0....","[-0.21665119, -0.51234066, -1.08015, -0.090698...",0.70666


In [30]:
query_results_e5[14212]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb,similarity
4784,What is best investment option?,12694,"[12695, 14212, 14211]",3,"[0.0066238767467439175, 0.01729673333466053, 0...","[0.0029945374, -0.028869629, 0.028793335, 0.00...","[-0.87640125, -0.5093946, -0.9197591, -0.14335...",0.944431
4785,What is best option to investment?,12695,"[12694, 14211, 14212]",3,"[0.002989136381074786, 0.009364699013531208, 0...","[0.0018501282, -0.028259277, 0.023361206, 0.00...","[-0.59119123, -0.43359506, -0.9806511, -0.1559...",0.942599
5280,What are some of the best investment plans?,14211,"[12694, 14212, 12695]",3,"[0.01397707685828209, -0.005568686407059431, 0...","[-0.024749756, -0.0109939575, 0.0053482056, 0....","[-0.4714, -0.439601, -0.8333078, -0.062841795,...",0.910198
5266,What is the best way to get investment for you...,14165,[14166],1,"[0.012466010637581348, 0.018429160118103027, 0...","[-0.017562866, -0.010498047, -0.0043525696, 0....","[-0.21665119, -0.51234066, -1.08015, -0.090698...",0.904005
1122,What are some best business ideas with minimum...,2940,[2939],1,"[-0.04373488575220108, 0.06218893453478813, 0....","[-0.010887146, -0.021270752, 0.005241394, 0.02...","[-0.38691205, -0.35685608, -0.93017876, 0.3106...",0.883595


테스트 결과를 보면 test.length==3 (비슷한 문장이 3개 있는 것)의 실제 정답 (duplicated_questions)과 vectorDB로부터 찾을 것을 비교함
- 잘 찾은 것을 알 수 있음
- 그러나 duplicated_questions은 사람이 tag한 것이여서, 데이터셋의 한계가 있음
- 정밀하게 사람이 tag를 하지 않은듯
- 이런 한계점이 실제 pjt에서 보이기도 함 (사람간의 관점 차이가 있음)
- 텍스트가 항상 1과 0으로 분명하게 나뉘는게 아니고, 스펙트럼 상에 있기 때문에, 태깅의 차이는 어느정도 감수해야함

# <font color=yellow>[4] 성능 평가 </font>

In [None]:
def score_accuracy(full_df, tmp_df, test_id):
    """
    각 테스트 질문과 유사하다고 판단된 질문들 중, 실제 duplicated_questions에 들어있는 질문들을 count
    full_df: 우리의 database
    tmp_df: retrieve된 결과
    test_id
    """
    duplicated_questions = full_df.loc[full_df['id'] == test_id, 'duplicated_questions'].values[0]

    # 본인 ID는 제외
    filtered_df = tmp_df[tmp_df['id'] != test_id]
    # 현재 retrieve 해온 ID들이, 테스트 질문 내에 들어있는 아이디들인지 count
    match_count = filtered_df['id'].isin(duplicated_questions).sum()

    # Calculate the accuracy in terms of percentage
    if filtered_df.shape[0] < len(duplicated_questions):
        percentage = (match_count / filtered_df.shape[0])
    else:
        percentage = (match_count / len(duplicated_questions))
    return percentage

현재 우리가 유사하다고 가져온 id들이, 실제 유사하다고 tag된 id들과 얼마나 매칭되는가 계산해야함

In [33]:
accuracy_openai = [score_accuracy(df_new, query_results_openai[i], i) for i in query_results_openai.keys()]
accuracy_cohere = [score_accuracy(df_new, query_results_cohere[i], i) for i in query_results_cohere.keys()]
accuracy_e5 = [score_accuracy(df_new, query_results_e5[i], i) for i in query_results_e5.keys()]

In [34]:
np.mean(accuracy_openai)

0.9561020036429873

In [35]:
np.mean(accuracy_cohere)

0.9538979963570128

In [36]:
np.mean(accuracy_e5)

0.9454280510018216

In [38]:
accuracy_e5

[1.0,
 1.0,
 0.0,
 0.6,
 1.0,
 1.0,
 1.0,
 1.0,
 0.6666666666666666,
 1.0,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.6666666666666666,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.4,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 0.25,
 1.0,
 1.0,
 1.0,
 1.0,
 0.6666666666666666,
 0.6666666666666666,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.4,
 1.0,
 0.4,
 1.0,
 1.0,
 1.0,
 1.0,
 0.6666666666666666,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8,
 1.0,
 0.8,
 1.0,
 1.0,
 1.0,
 0.6,
 0.4,
 1.0,


## <font color=blue> 이전에 말했던 것처럼 웬만한 임베딩 모델 모두 좋다 </font>
- MTEB 순위상으로는 cohere, openai, e5 순이였음
- 그러나 위 결과로 정확도는 거의 비슷하게 보임

아래와 같은 이유면 로컬 모델 (e5)를 사용
- 정보 보안이 매우 중요함
- GPU가 있으면 사용

## <font color=green> 오답 엿보기 </font>

In [37]:
indices = [index for index, value in enumerate(accuracy_openai) if value <= 0.5]
indices

[2,
 3,
 8,
 10,
 51,
 81,
 105,
 120,
 122,
 140,
 154,
 175,
 178,
 179,
 246,
 258,
 299,
 317,
 344,
 378,
 448,
 463,
 470,
 482,
 497,
 527,
 538,
 546,
 556,
 705,
 757,
 871,
 873,
 907,
 908]

In [39]:
list(query_results_openai.keys())[81]

1178

In [40]:
test.loc[test['id']==1178]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
453,What is the meaning of life? Whats our purpose...,1178,"[12131, 159, 1179, 12132, 12591, 160]",6,"[0.0299716554582119, 0.02121357060968876, 0.01...","[-0.07147217, 0.012718201, -0.114990234, 0.004...","[0.42690742, 0.013789681, -0.7896737, -0.13812..."


In [41]:
query_results_openai[1178]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb,similarity
551,What is the meaning and purpose to life?,1448,"[1449, 7295, 599, 7296, 598, 12726]",6,"[0.04635842517018318, -0.008825418539345264, -...","[-0.06225586, 0.015586853, -0.08898926, -0.000...","[0.28515336, 0.14494295, -0.893992, -0.3613152...",0.8114
4791,What is the meaning or purpose of life?,12726,"[1449, 1448, 598, 7295, 599, 7296]",6,"[0.03492662310600281, -0.013805676251649857, 0...","[-0.059509277, 0.012268066, -0.08856201, 0.014...","[0.21815039, 0.15135749, -0.9114527, -0.282442...",0.80069
55,What is purpose of life?,159,"[160, 1178, 1179, 12131, 12591, 12132, 12888]",7,"[0.03259734436869621, -0.0014210438821464777, ...","[-0.053100586, 0.014640808, -0.07556152, 0.001...","[0.1612927, 0.0706275, -0.8870243, -0.34103918...",0.78272
454,What do you feel is the purpose of life?,1179,"[12132, 1178, 12591, 159, 160, 12131]",6,"[0.03397490084171295, -0.0029158813413232565, ...","[-0.04751587, 0.020019531, -0.07183838, 0.0208...","[0.24237813, 0.19882739, -0.78977484, -0.28604...",0.777071
235,What's are the meaning of life?,599,"[7295, 598, 1448, 7296, 12726, 1449]",6,"[0.03313767537474632, -0.019551923498511314, 0...","[-0.053588867, 0.016159058, -0.055480957, 0.01...","[0.3338299, 0.3477132, -0.73120797, -0.3410514...",0.763303


In [49]:
df_new.loc[df_new['id']==160]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
56,What's the purpose of life? What is life actua...,160,"[12591, 1179, 1178, 159, 12131, 12132]",6,"[0.017729053273797035, -0.0027049058116972446,...","[-0.06365967, 0.00097608566, -0.09008789, -0.0...","[0.18751019, -0.16379529, -0.60503745, -0.0456..."


#### 결론

- cohere, openai, e5 모두 굉장히 성능이 좋기 때문에 대부분의 task에 곧바로 활용해도 무방함.
- Local embedding 모델을 활용하고자 할 때 위와 같은 방법으로 classification 성능 & 자원 할당 체크 필요.
- 성능 평가 방법
    - 태깅된 데이터 셋 활용
    - 정성적 평가 (duplicated_questions에 tag하기 너무 어려움 -> 어쩔수 없이 정성적 평가를 해야함)
        - 데이터 태깅을 할 노동력이 부족할 때
        - 태깅을 하기 애매한 분야 (정답이 없는 경우)

### <font color=green> 임베딩이 벡터DB에만 쓰이는게 아니라, LLM에 활용한 서비스에 필수적으로 들어가는 기능에 쓰이기도 하고 다른 머신러닝 통계적 기법도 다양하게 활용할 수 있음 </font>
- 활용할 수 있는 툴이라던지, 확장 가능한 방법에 대해서 어느정도 알면, 우리의 pjt에 활용할 수 있음
- 좀 더 창의적으로 사용할 수 있음