### ChatGPT Chunk

In [174]:
import openai

openai.api_key = ""

def ask_chatgpt(prompt):
    
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      temperature=0,
      messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
    )

    return completion

In [215]:
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

df = pd.read_csv('document/clean_content_4.csv')

loader = DataFrameLoader(df, page_content_column="content")
docs = loader.load()

CHUNK_SIZE = 1024
CHUNK_OVERLAP = 0

child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)

all_splits = child_splitter.split_documents(docs)
faq_split_1024 = all_splits[-226:]
nan_faq_split_1024 = all_splits[:-226]
print(f'len(all_splits): {len(all_splits)}, len(nan_faq_split_1024): {len(nan_faq_split_1024)}')


len(all_splits): 19091, len(nan_faq_split_1024): 18865


In [216]:
faq_split_1024 = all_splits[-226:]
len(faq_split_1024)

226

In [217]:
from tqdm import tqdm
import time

gpt_dic = {'url': [], 'chunks': []}
# ans = []
for idx, doc in enumerate(tqdm(nan_faq_split_1024[10557:])):
    prompt = f'''Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of context.
    1. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.
    2. Remove the irrelevant content. 
    3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.
    4. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.
    5. Present the results as a list of strings, formatted in JSON.
    Input:"English  Version 國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單
                        主選單 學士班獎助學金 僅提供系上開設獎學金消息 圖資系專屬獎學金一覽表 2023.09更新 臺大文學院獎助學金 臺大獎助學金一覽表 臺大弱勢助學金 臺大生活學習獎助金 國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金) LIS, NTU © All rights reserved since 2014"
    Output:["國立臺灣大學圖書資訊學系 Department and Graduate Institute of Library and Information Science, National Taiwan University  選單", "圖書資訊學系學士班獎助學金", "僅提供圖書資訊學系系上開設獎學金消息", "圖資系專屬獎學金一覽表 2023.09更新", "臺大文學院獎助學金", "臺大獎助學金一覽表", "臺大弱勢助學金", "臺大生活學習獎助金", "國立臺灣大學希望助學金 (請以關鍵字搜尋希望助學金)" ]
    Input:"{doc.page_content}"
    Output:
    '''
    # print(prompt)
    try:
        response = ask_chatgpt(prompt)['choices'][0]['message']['content']
        # print(response)
        # ans.append(response)
        gpt_dic['url'].append(nan_faq_split_1024[10557:][idx].metadata['url'])
        gpt_dic['chunks'].append(response)
        gpt_dic_df = pd.DataFrame(gpt_dic)
        gpt_dic_df.to_csv('document/non_faq_ChatGPT_chunk.csv', index=False)
    except:
        time.sleep(10)
        response = ask_chatgpt(prompt)['choices'][0]['message']['content']
        # print(response)
        # ans.append(response)
        gpt_dic['url'].append(nan_faq_split_1024[10557:][idx].metadata['url'])
        gpt_dic['chunks'].append(response)
        gpt_dic_df = pd.DataFrame(gpt_dic)
        gpt_dic_df.to_csv('document/non_faq_ChatGPT_chunk.csv', index=False)

    # if idx>0:
    #     break

100%|██████████| 8308/8308 [29:31:12<00:00, 12.79s/it]    


### CHATGPT Chunk TEST

In [222]:
import json

gpt_chunk_df = pd.read_csv('document/all_chatgpt_chunk.csv')
gpt_sep_chunk = {'url': [], 'chunk': []}
for doc in gpt_chunk_df.iterrows():
    # print(doc)
    try:
        chunks = json.loads(doc[1][1])
        url = doc[1][0]
        # print(url, chunks)
        for chunk in chunks:
            gpt_sep_chunk['url'].append(url)
            gpt_sep_chunk['chunk'].append(chunk)
    except:
        print(doc)

gpt_sep_chunk_df = pd.DataFrame(gpt_sep_chunk)
print(f'len(gpt_all_splits): {len(gpt_sep_chunk_df)} {gpt_sep_chunk_df[:1]}')

(13, url                               http://www.oc.ntu.edu.tw/
chunks    ["臺大海洋研究所謝志豪教授與碩士畢業生楊箴芸，使用新穎的研究船上實驗設計，首次提出海洋野外...
Name: 13, dtype: object)
(24, url                               https://rsprc.ntu.edu.tw/
chunks    ["臺大風險中心（以下簡稱本中心）由本年度（2023年）展開的【青年淨零力實踐平台】活動專案...
Name: 24, dtype: object)
(90, url                             https://liberal.ntu.edu.tw/
chunks    ["臺大首頁", "本院首頁", "網站導覽", "關於本院", "本院簡介（含歷任院長）"...
Name: 90, dtype: object)
(98, url                                  http://ibs.ntu.edu.tw/
chunks    ["ç”\x9fç\x89©ã\x80\x81é\x86«å\xad¸ã\x80\x81å\...
Name: 98, dtype: object)
(123, url       https://www.ieee-pels.org/awards/pels/achievem...
chunks    ["Join IEEE Sign In IEEE Power Electronics Soc...
Name: 123, dtype: object)
(165, url                        http://www.oc.ntu.edu.tw/?cat=30
chunks    ["國立臺灣大學海洋研究所 大型研究計畫", "本所引進水下滑翔觀測載具Seaglider提...
Name: 165, dtype: object)
(193, url       https://www.nature.com/articles/s41586-021-034...
chunks    ["contacts at room t

In [227]:
# gpt_sep_chunk_df.to_csv('document/ChatGPT_washed_data.csv', index=False)

In [2]:
import pandas as pd
gpt_sep_chunk_df = pd.read_csv('document/ChatGPT_washed_data.csv')
gpt_sep_chunk_df

Unnamed: 0,url,chunk
0,https://www.ac.ntu.edu.tw/,國立臺灣大學農業化學系忘記密碼?
1,https://www.ac.ntu.edu.tw/,國立臺灣大學農業化學系首頁
2,https://www.ac.ntu.edu.tw/,國立臺灣大學農業化學系網站導覽
3,https://www.ac.ntu.edu.tw/,國立臺灣大學農業化學系舊網頁
4,https://www.ac.ntu.edu.tw/,國立臺灣大學農業化學系最新消息
...,...,...
399714,https://sec.ntu.edu.tw/News_Content_n_1419_sms...,國立臺灣大學秘書室 職員申訴
399715,https://sec.ntu.edu.tw/News_Content_n_1419_sms...,申訴提出後，申訴人尚需做哪些後續工作？
399716,https://sec.ntu.edu.tw/News_Content_n_1419_sms...,申訴資料送至秘書室後，秘書室會先請相對單位提供書面說明，並籌備召開申評會議等事宜
399717,https://sec.ntu.edu.tw/News_Content_n_1419_sms...,俟時間、地點確定後，發開會通知單邀請申訴人及相對人到會說明


In [3]:
from langchain.document_loaders import DataFrameLoader

loader = DataFrameLoader(gpt_sep_chunk_df, page_content_column="chunk")
docs = loader.load()
# print(docs[0])

print(f'len(gpt_all_splits): {len(docs)} {docs[0]}')


len(gpt_all_splits): 399719 page_content='國立臺灣大學農業化學系忘記密碼?' metadata={'url': 'https://www.ac.ntu.edu.tw/'}


In [4]:
import numpy as np

l = np.array([len(doc.page_content) for doc in docs])
np.min(l), np.mean(l), np.median(l), np.max(l)

(1, 34.97879760531774, 18.0, 1024)

In [5]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings(model_name = "yentinglin/bert-base-zhtw")
faiss = FAISS.from_documents(docs, embeddings)
faiss.save_local(f'embeddings/all_bert_base_zhtw_chatgpt')

No sentence-transformers model found with name /home/ai2lab/.cache/torch/sentence_transformers/yentinglin_bert-base-zhtw. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at /home/ai2lab/.cache/torch/sentence_transformers/yentinglin_bert-base-zhtw and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [229]:
# raw_data = pd.read_csv('document/clean_content_4.csv')
# faq_df = raw_data[-190:]
# faq_df.head()

Unnamed: 0,url,timestamp,content
2914,https://ntubeats.ntu.edu.tw/enews/95,1702927000.0,April 95 Subscribe FEATURES A Warmer Campus: I...
2915,https://www.ch.ntu.edu.tw/esh.html,1704133000.0,跳到主要內容區塊 國立臺灣大學化學系 環境安全衛生 手機版選單 search 搜尋 搜尋 分...
2916,https://sec.ntu.edu.tw/news_content_n_1413_sms...,1704133000.0,跳到主要內容區塊 國立臺灣大學秘書室 校務會議 手機版選單 search 搜尋 搜尋 分享 ...
2917,https://sec.ntu.edu.tw/news_content_n_1422_sms...,1704133000.0,跳到主要內容區塊 國立臺灣大學秘書室 優秀學生出國開會補助 手機版選單 search 搜尋 ...
2918,https://sec.ntu.edu.tw/News_Content_n_1423_s_4...,1704133000.0,跳到主要內容區塊 國立臺灣大學秘書室 臺大講座 手機版選單 search 搜尋 搜尋 分享 ...


In [145]:
# from langchain.document_loaders import DataFrameLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter


# loader = DataFrameLoader(faq_df, page_content_column="content")
# docs = loader.load()

# CHUNK_SIZE = 64
# CHUNK_OVERLAP = 8

# child_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
# )

# all_splits = child_splitter.split_documents(docs)
# print(f'len(all_splits): {len(all_splits)} {all_splits[0]}')


len(all_splits): 2336 page_content='跳到主要內容區塊 國立臺灣大學化學系 環境安全衛生 手機版選單 search 搜尋 搜尋 分享 上方連結 下方連結 分享' metadata={'url': 'https://www.ch.ntu.edu.tw/esh.html', 'timestamp': 1704133444.8363142, 'start_index': 0}


In [146]:
# faiss = FAISS.from_documents(all_splits, embeddings)
# faiss.save_local(f'embeddings/faq_bge_large_64_8')

### Evaluation Function Def

In [168]:
import pandas as pd

# testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset = pd.read_csv('document/FAQ_HyDE.csv')

testset.head(3)

Unnamed: 0,URL,Question,Answer,ChatGPT,retreival doc,retreival url,HyDE
0,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,學生身份別之間有什麽差異？我應該如何查詢我的身份別呢？,學生身份別一共分四種：國際生、僑生、港澳生及陸生。您可以透過境外學位生身份別測驗來查詢自己的...,學生身份別有哪些類別？我該如何確認自己的身份別呢？,['to pay tuition? 我應該去哪裡得到學費繳交訊息？ To access yo...,['http://www.ipcs.ntu.edu.tw/page/about/index....,學生身份有多種類別，主要分為全日制學生和非全日制學生。全日制學生是指在學校正式註冊並按照學校...
1,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,學生身份別的不同會帶來甚麼差異？如果我同時符合兩個身份別，那應該如何處理？,根據學生身份別的不同，入學流程、學費、獎助學金、開放的系所及一些相關限制都會有差異。建議您在...,如果我同時符合兩個身份別，我該如何處理？,['You may have one-on-one or one-on-many consu...,['https://mastertalk.oia.ntu.edu.tw/'],如果你同時符合兩個身份別，你可以根據具體情況來處理。首先，你可以評估這兩個身份別對你的重要性...
2,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,我現在就讀於其他大學，我可以轉學到臺大嗎？,如您為目前正攻讀學士學位的國際生，請參考國際生二年級入學頁面。至於僑、港澳生，如您目前在臺灣...,我目前在其他大學就讀，我可以轉學到臺大嗎？,['to pay tuition? 我應該去哪裡得到學費繳交訊息？ To access yo...,['http://www.ipcs.ntu.edu.tw/page/about/index....,當然可以！臺灣大學是臺灣最著名的大學之一，具有卓越的學術聲譽和優秀的教學資源。如果你目前在其...


In [169]:
def retriever_eval(retriever, k):

    retrieve_urls = []
    
    for idx, query in enumerate(testset['ChatGPT']):
        # print('query:', query)
        docs = retriever.get_relevant_documents(query)
        # print('docs:', docs)
        retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
    correct_cnt = 0
    for idx, d in enumerate(testset.iterrows()):
        if d[1]['URL'] in retrieve_urls[idx]:
            correct_cnt+=1

    print(f'Recall@{k}, ChatGPT Generated Question Accuracy: {correct_cnt/len(testset)}')
    return correct_cnt/len(testset)

def hyde_retriever_eval(retriever, k):

    retrieve_urls = []
    
    for idx, query in enumerate(testset['ChatGPT']):
        # print('query:', query)
        docs = retriever.get_relevant_documents(query+testset['HyDE'][idx])
        # print('docs:', docs)
        retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
    correct_cnt = 0
    for idx, d in enumerate(testset.iterrows()):
        if d[1]['URL'] in retrieve_urls[idx]:
            correct_cnt+=1
    
    print(f'Recall@{k}, ChatGPT Generated Question with HyDE Accuracy: {correct_cnt/len(testset)}')
    return correct_cnt/len(testset)

def retriever_original(retriever, k):

    retrieve_urls = []
    
    for idx, query in enumerate(testset['Question']):
        # print('query:', query)
        docs = retriever.get_relevant_documents(query)
        # print('docs:', docs)
        retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
    correct_cnt = 0
    for idx, d in enumerate(testset.iterrows()):
        if d[1]['URL'] in retrieve_urls[idx]:
            correct_cnt+=1
    # print(f"Correct Answer: {d[1]['URL']}")
    # print(f'Retrieve document urls:')
    # [print(url) for url in retrieve_urls[idx]]
    print(f'Recall@{k}, Original Question Accuracy: {correct_cnt/len(testset)}')
    return correct_cnt/len(testset)


### CHUNK TEST

In [232]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
db = FAISS.load_local('embeddings/all_bge_large_chatgpt', embeddings)

Ks = [1, 5, 10, 20, 50, 100]
# Ks = [100]
# Ks = [1, 5, 10, 20]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    # retriever_original(retriever, K)
    # retriever_eval(retriever, K)
    hyde_retriever_eval(retriever, K)

Recall@1, ChatGPT Generated Question with HyDE Accuracy: 0.36313868613138683
Recall@5, ChatGPT Generated Question with HyDE Accuracy: 0.44343065693430656
Recall@10, ChatGPT Generated Question with HyDE Accuracy: 0.5036496350364964
Recall@20, ChatGPT Generated Question with HyDE Accuracy: 0.5273722627737226
Recall@50, ChatGPT Generated Question with HyDE Accuracy: 0.5802919708029197
Recall@100, ChatGPT Generated Question with HyDE Accuracy: 0.6222627737226277


In [161]:
# from langchain.vectorstores import FAISS
# from langchain.embeddings import HuggingFaceBgeEmbeddings

# db = FAISS.load_local('embeddings/faq_bge_large_chatgpt', embeddings)

# Ks = [1, 5, 10, 20, 50, 100]
# # Ks = [1, 5, 10, 20]

# for K in Ks:
#     retriever = db.as_retriever(search_kwargs={"k": K})
#     retriever_original(retriever, K)
#     retriever_eval(retriever, K)
#     hyde_retriever_eval(retriever, K)
#     # retriever_original(retriever, K)

Recall@1, Original Question Accuracy: 0.5492700729927007
Recall@1, ChatGPT Generated Question Accuracy: 0.5711678832116789
Recall@1, ChatGPT Generated Question with HyDE Accuracy: 0.5091240875912408
Recall@5, Original Question Accuracy: 0.666058394160584
Recall@5, ChatGPT Generated Question Accuracy: 0.6733576642335767
Recall@5, ChatGPT Generated Question with HyDE Accuracy: 0.6332116788321168
Recall@10, Original Question Accuracy: 0.7007299270072993
Recall@10, ChatGPT Generated Question Accuracy: 0.7098540145985401
Recall@10, ChatGPT Generated Question with HyDE Accuracy: 0.6697080291970803
Recall@20, Original Question Accuracy: 0.7408759124087592
Recall@20, ChatGPT Generated Question Accuracy: 0.7299270072992701
Recall@20, ChatGPT Generated Question with HyDE Accuracy: 0.7025547445255474
Recall@50, Original Question Accuracy: 0.7791970802919708
Recall@50, ChatGPT Generated Question Accuracy: 0.7755474452554745
Recall@50, ChatGPT Generated Question with HyDE Accuracy: 0.76094890510948

In [155]:
db = FAISS.load_local('embeddings/faq_bge_large_chatgpt', embeddings)
retriever = db.as_retriever(search_kwargs={"k": 1})
retriever_eval(retriever, 1)

query: 學生身份別有哪些類別？我該如何確認自己的身份別呢？
docs: [Document(page_content='一、學校裡甚麼身分的人應該參加勞保？', metadata={'url': 'http://www.personnel.ntu.edu.tw/cp_n_2778.html'})]
query: 如果我同時符合兩個身份別，我該如何處理？
docs: [Document(page_content='能否只填寫一個或兩個呢', metadata={'url': 'http://www.oc.ntu.edu.tw/?page_id=15221'})]
query: 我目前在其他大學就讀，我可以轉學到臺大嗎？
docs: [Document(page_content='從臺大如何到中研院？', metadata={'url': 'https://visitorcenter.ntu.edu.tw/News_Content_n_55961_sms_15514_s_62775.html'})]
query: 臺大的教學語言有哪些？
docs: [Document(page_content='什麼是臺大線上英語課程(Online English Programs)？', metadata={'url': 'https://fltc.fltc.ntu.edu.tw/index.php?action=about&id=28'})]
query: 臺大提供哪些類別的課程呢？
docs: [Document(page_content='欲知更多資訊，請見臺大線上課程介紹。', metadata={'url': 'https://fltc.fltc.ntu.edu.tw/index.php?action=about&id=28'})]
query: 臺大的學習規劃辦公室提供哪些學習支援服務？
docs: [Document(page_content='國立臺灣大學秘書室 校務發展規劃委員會', metadata={'url': 'https://sec.ntu.edu.tw/News_Content_n_1415_s_4352.html'})]
query: 我可以在入學後改變專業、修讀輔系或雙主修嗎？
docs: [Document(page_content='可否選

0.5711678832116789

### BM25 TEST

In [165]:
# Only BM25
from langchain.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(docs)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    # initialize the bm25 retriever and faiss retriever
    bm25_retriever.k = K
    retriever_original(retriever, K)
    retriever_eval(bm25_retriever, K)
    hyde_retriever_eval(bm25_retriever, K)

Recall@1, Original Question Accuracy: 0.8138686131386861
Recall@1, ChatGPT Generated Question Accuracy: 0.025547445255474453
Recall@1, ChatGPT Generated Question with HyDE Accuracy: 0.016423357664233577
Recall@5, Original Question Accuracy: 0.8138686131386861
Recall@5, ChatGPT Generated Question Accuracy: 0.032846715328467155
Recall@5, ChatGPT Generated Question with HyDE Accuracy: 0.023722627737226276
Recall@10, Original Question Accuracy: 0.8138686131386861
Recall@10, ChatGPT Generated Question Accuracy: 0.032846715328467155
Recall@10, ChatGPT Generated Question with HyDE Accuracy: 0.025547445255474453
Recall@20, Original Question Accuracy: 0.8138686131386861
Recall@20, ChatGPT Generated Question Accuracy: 0.032846715328467155
Recall@20, ChatGPT Generated Question with HyDE Accuracy: 0.02737226277372263
Recall@50, Original Question Accuracy: 0.8138686131386861
Recall@50, ChatGPT Generated Question Accuracy: 0.032846715328467155
Recall@50, ChatGPT Generated Question with HyDE Accuracy

### MultiQ

In [166]:
import pandas as pd

multiQ_df = pd.read_csv('document/FAQ_ChatGPT_MultiQ.csv')
multiQ_df.head()

Unnamed: 0,URL,Question,Answer,ChatGPT_MultiQ
0,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,學生身份別之間有什麽差異？我應該如何查詢我的身份別呢？,學生身份別一共分四種：國際生、僑生、港澳生及陸生。您可以透過境外學位生身份別測驗來查詢自己的...,學生身份別有哪些類別？我該如何確認自己的身份別？\n問：學生身份分為幾種？我要如何查詢自己的...
1,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,學生身份別的不同會帶來甚麼差異？如果我同時符合兩個身份別，那應該如何處理？,根據學生身份別的不同，入學流程、學費、獎助學金、開放的系所及一些相關限制都會有差異。建議您在...,學生身份別的不同會對入學流程有什麼影響？\n 問：如果我同時符合兩個身份別，我可以同時...
2,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,我現在就讀於其他大學，我可以轉學到臺大嗎？,如您為目前正攻讀學士學位的國際生，請參考國際生二年級入學頁面。至於僑、港澳生，如您目前在臺灣...,我目前在其他大學就讀，可以轉學到臺大嗎？\n 問：我是一名國際生，可以轉學到臺大嗎？\...
3,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,臺大的教學語言是什麽？,臺大主要以中文及英文為授課語言。英語授課系所可分為 1. 部分英語授課，即以英語授課的科目數...,臺大的教學語言有哪些？\n問：臺大的授課語言是什麽？\n問：臺大的主要教學語言是什麽？\n問...
4,https://admissions.ntu.edu.tw/zh-hant/apply/faq/,臺大有提供哪些課程呢？,臺大致力於提供全面的教育，因此課程包括系上必修課程、通識教育課程和人文教育課程。通識教育核心...,臺大提供哪些核心通識教育課程？\n 問：臺大的人文教育課程有哪些類別？\n 問：...


In [172]:
def retriever_multiQ_eval(testset, retriever, k):

    retrieve_urls = []
    
    for idx, query in enumerate(testset['ChatGPT_MultiQ']):
        # print('query:', query)
        docs = retriever.get_relevant_documents(query)
        # print('docs:', docs)
        retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
    correct_cnt = 0
    for idx, d in enumerate(testset.iterrows()):
        if d[1]['URL'] in retrieve_urls[idx]:
            correct_cnt+=1

    print(f'Recall@{k}, *Multi Query* ChatGPT Generated Question Accuracy: {correct_cnt/len(testset)}')
    return correct_cnt/len(testset)

# def hyde_retriever_eval(retriever, k):

#     retrieve_urls = []
    
#     for idx, query in enumerate(testset['ChatGPT_MultiQ']):
#         # print('query:', query)
#         docs = retriever.get_relevant_documents(query+testset['HyDE'][idx])
#         # print('docs:', docs)
#         retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
#     correct_cnt = 0
#     for idx, d in enumerate(testset.iterrows()):
#         if d[1]['URL'] in retrieve_urls[idx]:
#             correct_cnt+=1
    
#     print(f'Recall@{k}, ChatGPT Generated Question with HyDE Accuracy: {correct_cnt/len(testset)}')
#     return correct_cnt/len(testset)

# def retriever_original(retriever, k):

#     retrieve_urls = []
    
#     for idx, query in enumerate(testset['Question']):
#         # print('query:', query)
#         docs = retriever.get_relevant_documents(query)
#         # print('docs:', docs)
#         retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
#     correct_cnt = 0
#     for idx, d in enumerate(testset.iterrows()):
#         if d[1]['URL'] in retrieve_urls[idx]:
#             correct_cnt+=1
#     # print(f"Correct Answer: {d[1]['URL']}")
#     # print(f'Retrieve document urls:')
#     # [print(url) for url in retrieve_urls[idx]]
#     print(f'Recall@{k}, Original Question Accuracy: {correct_cnt/len(testset)}')
#     return correct_cnt/len(testset)


In [173]:
from langchain.vectorstores import FAISS

db = FAISS.load_local('embeddings/faq_bge_large_chatgpt', embeddings)

Ks = [1, 5, 10, 20, 50, 100]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K)
    retriever_multiQ_eval(multiQ_df, retriever, K)
    # retriever_original(retriever, K)

Recall@1, ChatGPT Generated Question Accuracy: 0.5711678832116789
Recall@1, *Multi Query* ChatGPT Generated Question Accuracy: 0.5474452554744526
Recall@5, ChatGPT Generated Question Accuracy: 0.6733576642335767
Recall@5, *Multi Query* ChatGPT Generated Question Accuracy: 0.6368613138686131
Recall@10, ChatGPT Generated Question Accuracy: 0.7098540145985401
Recall@10, *Multi Query* ChatGPT Generated Question Accuracy: 0.6788321167883211
Recall@20, ChatGPT Generated Question Accuracy: 0.7299270072992701
Recall@20, *Multi Query* ChatGPT Generated Question Accuracy: 0.7135036496350365
Recall@50, ChatGPT Generated Question Accuracy: 0.7755474452554745
Recall@50, *Multi Query* ChatGPT Generated Question Accuracy: 0.7591240875912408
Recall@100, ChatGPT Generated Question Accuracy: 0.8102189781021898
Recall@100, *Multi Query* ChatGPT Generated Question Accuracy: 0.801094890510949
