## Clean FAQ

In [None]:
import pandas as pd

# Load the raw FAQ spreadsheet; later cells read its 'Answer', 'Appendix' and 'URL' columns.
df = pd.read_excel('document/FAQ.xlsx')
df.head()

In [None]:
from langchain.document_loaders import UnstructuredURLLoader, UnstructuredFileLoader
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.docstore.document import Document

from unstructured.cleaners.core import remove_punctuation, clean, clean_extra_whitespace
from urllib.parse import urlparse

def is_url(url):
    """Return True when ``url`` has both a scheme and a network location.

    Args:
        url: Candidate web address. Values that cannot be parsed at all
            (for example a NaN cell read from a spreadsheet) are treated
            as "not a URL" instead of raising.

    Returns:
        bool: True for strings such as ``https://host/path``; False for
        local paths, malformed URLs and unparseable values.
    """
    try:
        result = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL; AttributeError/TypeError: non-text input.
        return False
    return all([result.scheme, result.netloc])
  

  
def generate_document(url):
    """Load ``url`` (web page, ``.jpg`` image or local file) and return its cleaned text.

    The source is parsed with the matching Unstructured loader in "elements"
    mode. Only elements whose ``category`` metadata is ``NarrativeText`` or
    ``Title`` are kept, and their text is joined with single spaces.

    Args:
        url: Web URL or local file path to load.

    Returns:
        A ``Document`` whose ``page_content`` is the joined text and whose
        metadata is ``{"source": url}``, or ``None`` when loading or parsing
        fails (the failure is printed, not raised).
    """
    # Browser-like request headers handed to every loader.
    fake_head = {
        'User-Agent': 'My User Agent 1.0',
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*"
        ";q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    post_processors = [clean, remove_punctuation, clean_extra_whitespace]
    kept_categories = ("NarrativeText", "Title")

    try:
        if is_url(url):
            loader = UnstructuredURLLoader(urls=[url],
                                           mode="elements",
                                           post_processors=post_processors,
                                           headers=fake_head)
        elif url.endswith('.jpg'):
            loader = UnstructuredImageLoader(url,
                                             mode="elements",
                                             post_processors=post_processors,
                                             headers=fake_head)
        else:
            loader = UnstructuredFileLoader(url,
                                            strategy="fast",
                                            mode="elements",
                                            post_processors=post_processors,
                                            headers=fake_head)
        elements = loader.load()
        selected_elements = [e for e in elements if e.metadata['category'] in kept_categories]
        full_clean = " ".join(e.page_content for e in selected_elements)
        return Document(page_content=full_clean, metadata={"source": url})
    except Exception as exc:
        # Best-effort scraping: report the failing source and its cause, but let
        # KeyboardInterrupt/SystemExit propagate so long runs can be stopped.
        print(f'*ERROR* {url}: {exc!r}')
        return None

In [None]:
# Start 'Updated' as a copy of 'Answer'; the following cell appends appendix text to it.
df['Updated'] = df['Answer']

In [None]:
import numpy as np

# Append the text extracted from each row's 'Appendix' source to that row's answer.
for row_label, item in df['Appendix'].items():
    if pd.isna(item):
        continue  # this row has no appendix

    appendix_doc = generate_document(item)
    if appendix_doc is None:
        # generate_document already printed the failing source; point at the FAQ row too.
        print(f'*ERROR* @ {df.loc[row_label, "URL"]}')
        continue

    try:
        # Single .loc write: chained `df['Updated'][idx] = ...` may update a temporary copy.
        df.loc[row_label, 'Updated'] = df.loc[row_label, 'Answer'] + appendix_doc.page_content
    except Exception as exc:
        # e.g. a missing (NaN) answer cannot be concatenated with text.
        print(f'*ERROR* @ {df.loc[row_label, "URL"]}: {exc!r}')

In [None]:
# Drop rows whose merged answer is missing and renumber the index in place.
df.dropna(subset=['Updated'], inplace=True, ignore_index=True)

In [None]:
# Promote the merged text to 'Answer', drop the helper columns and export.
df['Answer'] = df['Updated']
df = df.drop(columns=['Updated', 'Appendix'])
# NOTE(review): this writes to the working directory, while a later cell reads
# 'document/FAQ.csv' — confirm which path is intended.
df.to_csv('FAQ.csv', index=False)

## Complete trainset

In [None]:
# Load the scraped corpus and compare the row count with the number of distinct URLs.
trainset_df = pd.read_csv('document/clean_content.csv',encoding='utf-8')

train_urls = set(trainset_df['url'])
len(trainset_df['url']), len(train_urls)

In [None]:
# Keep only the last row per URL (in place) and renumber the index.
trainset_df.drop_duplicates(subset=['url'],keep='last',inplace=True,ignore_index=True)
# trainset_df = trainset_df.reset_index()
len(trainset_df)

In [None]:
# Checkpoint the de-duplicated corpus.
trainset_df.to_csv('document/clean_content_2.csv', index=False)

In [None]:
# Reload the checkpoint written by the previous cell and show its row count.
trainset_df = pd.read_csv('document/clean_content_2.csv')
len(trainset_df)

In [None]:
# Number of distinct URLs in the frame currently bound to `df`.
len(set(df['URL']))

In [None]:
import time
# Scrape every FAQ URL that is not yet part of the corpus and append it as a new row.
# Membership is tested against a set of URL *values*: `x in series` checks the
# index labels, so the previous test never matched and every URL was re-scraped.
known_urls = set(trainset_df['url'])
for test_url in set(df['URL']):
    if test_url in known_urls:
        continue
    scraped_doc = generate_document(test_url)
    if scraped_doc is None:
        continue  # generate_document already reported the failure
    # New row in the existing column order: url, scrape timestamp, page text.
    trainset_df.loc[len(trainset_df.index)] = [test_url, time.time(), scraped_doc.page_content]
    known_urls.add(test_url)
len(trainset_df)

In [None]:
# Display the extended corpus for a visual check.
trainset_df

In [None]:
# Checkpoint the corpus after adding the FAQ pages.
trainset_df.to_csv('document/clean_content_3.csv', index=False)

In [None]:
import pandas as pd

# Reload the checkpoint and drop duplicate URLs, printing the row count before and after.
# Note: this rebinds `df`, which earlier cells used for the FAQ sheet.
df = pd.read_csv('document/clean_content_3.csv')
print(len(df))
df.drop_duplicates(subset=['url'],keep='last',inplace=True,ignore_index=True)
# trainset_df = trainset_df.reset_index()
print(len(df))


In [None]:
# Final corpus file; later cells load it to build documents and look up full passages.
df.to_csv('document/clean_content_4.csv', index=False)

## Generate ChatGPT Question

In [None]:
import openai

import os

# Read the key from the environment so it is never committed with the notebook.
# Falls back to the previous empty-string value when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def ask_chatgpt(prompt):
    """Send ``prompt`` as a single user message to gpt-3.5-turbo and return the response.

    Args:
        prompt: Text of the user message.

    Returns:
        The raw completion object returned by ``openai.ChatCompletion.create``;
        callers read ``['choices'][0]['message']['content']`` from it.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,  # deterministic-leaning output for reproducible question sets
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return completion

In [None]:
import pandas as pd

# Load the cleaned FAQ; the generation loop reads its 'Question' and 'Answer' columns.
testset = pd.read_csv('document/FAQ.csv')
testset.head(3)

In [None]:
import tiktoken
from tqdm import tqdm

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# For every FAQ row, ask ChatGPT for five paraphrases of the question.
# The prompt (in Chinese) shows the Q/A pair and asks for five questions with the
# same meaning, without answers. `gpts_ans` always receives exactly one entry per
# row (None on failure) so it can be assigned back as a column afterwards.
gpts_ans = []
for t in tqdm(testset.iterrows(), total=len(testset)):
    text = f"""
    以下是一組問答：
    問： {t[1]['Question']}
    答：{t[1]['Answer']}
    請基於以上問答生成五個意思一樣的問題給我，不需要回答。
    問："""
    token_count = len(encoding.encode(text))
    # NOTE(review): 4097 is used as the prompt budget; it presumably is the model's
    # whole context window (prompt + completion) — confirm and leave headroom.
    if token_count>4097:
        # Crude shortening: drop two trailing answer characters per token over budget.
        text = f"""
            以下是一組問答：
            問： {t[1]['Question']}
            答：{t[1]['Answer'][:-(token_count-4097)*2]}
            請基於以上問答生成五個意思一樣的問題給我，不需要回答。
            問：
        """
        token_count = len(encoding.encode(text))
        print(f'After: token_count:{token_count}')
    try:
        answer = ask_chatgpt(text)['choices'][0]['message']['content']
        print('Original question:', t[1]['Question'])
        print('Multi query:', answer)
        gpts_ans.append(answer)
    except Exception as exc:
        print(f'*ERROR* {t} with {len(encoding.encode(text))} tokens: {exc!r}')
        # Keep the list aligned with the rows of `testset`.
        gpts_ans.append(None)


In [None]:
# Attach the generated paraphrases; requires one `gpts_ans` entry per row of `testset`.
testset['ChatGPT_MultiQ'] = gpts_ans
testset.head(3)

In [None]:
# Persist the FAQ together with its generated paraphrases.
testset.to_csv('document/FAQ_ChatGPT_MultiQ.csv', index=False)

In [None]:
import pandas as pd

# Reload the final corpus that will be chunked and indexed.
df = pd.read_csv('document/clean_content_4.csv')

In [None]:
from langchain.document_loaders import DataFrameLoader

# One document per row: 'content' becomes the page text; the loader receives the whole frame.
loader = DataFrameLoader(df, page_content_column="content")
docs = loader.load()

In [None]:
# Sanity check: number of documents and the first one.
len(docs),docs[0]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter; they also name the saved index below.
CHUNK_SIZE = 64
CHUNK_OVERLAP = 8

# add_start_index=True is requested so each chunk carries its start offset.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [None]:
# Sanity check: chunk count, length of the first chunk, metadata of one sample chunk.
len(all_splits), len(all_splits[0].page_content), all_splits[10].metadata

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Embed every chunk and persist the FAISS index under a name that encodes the chunking.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper (it prepends a
# BGE query instruction at query time) — confirm that is intended for a GTE model.
embeddings = HuggingFaceBgeEmbeddings(model_name = "thenlper/gte-large-zh")
db = FAISS.from_documents(all_splits, embeddings)
db.save_local(f'embeddings/faiss_gte_large_{CHUNK_SIZE}_{CHUNK_OVERLAP}')

## Evaluation Retrieval

In [None]:
import pandas as pd

# Evaluation set; the eval helpers below read its 'Question' and 'URL' columns
# (and 'HyDE' for the HyDE variant).
# testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset = pd.read_csv('document/testset.csv')

testset.head(3)

In [None]:
def retriever_eval(retriever, k, *, dataset=None):
    """Measure how often the expected URL is among the retrieved documents.

    For every row, the 'Question' text is sent to ``retriever`` and the row
    counts as a hit when its 'URL' equals the ``metadata['url']`` of any
    returned document.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        k: Retrieval depth; only used to label the printed result (the
            retriever itself must already be configured with it).
        dataset: Optional frame with 'Question' and 'URL' columns. Defaults
            to the notebook-level ``testset`` so existing calls keep working.

    Returns:
        float: hits divided by the number of rows.
    """
    data = testset if dataset is None else dataset

    correct_cnt = 0
    for query, expected_url in zip(data['Question'], data['URL']):
        docs = retriever.get_relevant_documents(query)
        retrieved_urls = [doc.metadata['url'] for doc in docs]
        if expected_url in retrieved_urls:
            correct_cnt += 1

    accuracy = correct_cnt / len(data)
    print(f'Recall@{k}, ChatGPT Generated Question Accuracy: {accuracy}')
    return accuracy

def hyde_retriever_eval(retriever, k, *, dataset=None):
    """Like ``retriever_eval`` but queries with the question plus its HyDE text.

    For every row, the 'Question' and 'HyDE' strings are concatenated and
    sent to ``retriever``; the row counts as a hit when its 'URL' equals the
    ``metadata['url']`` of any returned document.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        k: Retrieval depth; only used to label the printed result.
        dataset: Optional frame with 'Question', 'HyDE' and 'URL' columns.
            Defaults to the notebook-level ``testset``.

    Returns:
        float: hits divided by the number of rows.
    """
    data = testset if dataset is None else dataset

    correct_cnt = 0
    for query, hyde_text, expected_url in zip(data['Question'], data['HyDE'], data['URL']):
        docs = retriever.get_relevant_documents(query + hyde_text)
        retrieved_urls = [doc.metadata['url'] for doc in docs]
        if expected_url in retrieved_urls:
            correct_cnt += 1

    accuracy = correct_cnt / len(data)
    print(f'Recall@{k}, ChatGPT Generated Question with HyDE Accuracy: {accuracy}')
    return accuracy


In [None]:
# def retriever_original(retriever, k):

#     retrieve_urls = []
    
#     for idx, query in enumerate(testset['Question']):
#         # print('query:', query)
#         docs = retriever.get_relevant_documents(query)
#         # print('docs:', docs)
#         retrieve_urls.append(sorted([(doc.metadata['url'])for doc in docs]))
    
#     correct_cnt = 0
#     for idx, d in enumerate(testset.iterrows()):
#         if d[1]['URL'] in retrieve_urls[idx]:
#             correct_cnt+=1
#     # print(f"Correct Answer: {d[1]['URL']}")
#     # print(f'Retrieve document urls:')
#     # [print(url) for url in retrieve_urls[idx]]
#     # print(f'Recall@{k}, Original Question Accuracy: {correct_cnt/len(testset)}')
#     return correct_cnt/len(testset)

## Chunk Length: BAAI/bge-large-zh-v1.5, chunk by ChatGPT; size=64 is the best

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the TownsWu/PEG index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="TownsWu/PEG")
db = FAISS.load_local("embeddings/all_PEG_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the infgrad/stella-large-zh index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="infgrad/stella-large-zh")
db = FAISS.load_local("embeddings/all_stella_large_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the infgrad/stella-base-zh index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="infgrad/stella-base-zh")
db = FAISS.load_local("embeddings/all_stella_base_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the yentinglin/bert-base-zhtw index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="yentinglin/bert-base-zhtw")
db = FAISS.load_local("embeddings/all_bert_base_zhtw_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the thenlper/gte-large-zh index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="thenlper/gte-large-zh")
db = FAISS.load_local("embeddings/all_gte_large_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the thenlper/gte-base-zh index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="thenlper/gte-base-zh")
db = FAISS.load_local("embeddings/all_gte_base_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the thenlper/gte-small-zh index.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper — confirm it is
# the intended way to query a non-BGE model.
embeddings = HuggingFaceBgeEmbeddings(model_name="thenlper/gte-small-zh")
db = FAISS.load_local("embeddings/all_gte_small_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the BAAI/bge-small-zh-v1.5 index.
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-zh-v1.5")
db = FAISS.load_local("embeddings/all_bge_small_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the BAAI/bge-base-zh-v1.5 index.
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-zh-v1.5")
db = FAISS.load_local("embeddings/all_bge_base_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep over several retrieval depths for the BAAI/bge-large-zh-v1.5 index.
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-zh-v1.5")
db = FAISS.load_local("embeddings/all_bge_large_chatgpt", embeddings)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K)
    # Alternative metrics, currently disabled:
    # hyde_retriever_eval(retriever, K)
    # retriever_original(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep for the index whose name suggests chunk size 32 / overlap 8.
embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
db = FAISS.load_local('embeddings/faiss_bge_largev1.5_32_8', embeddings)

# Defined here so the cell no longer depends on `Ks` leaking from an earlier cell.
Ks = [1, 5, 10, 20, 50, 100]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K)
    # hyde_retriever_eval(retriever, K)
    # retriever_original(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep for the index whose name suggests chunk size 64 / overlap 8.
embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
db = FAISS.load_local('embeddings/faiss_bge_largev1.5_64_8', embeddings)

# Defined here so the cell no longer depends on `Ks` leaking from an earlier cell.
Ks = [1, 5, 10, 20, 50, 100]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K)
    # hyde_retriever_eval(retriever, K)
    # retriever_original(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep for the index whose name suggests chunk size 128 / overlap 16.
embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
db = FAISS.load_local('embeddings/faiss_bge_largev1.5_128_16', embeddings)

# Defined here so the cell no longer depends on `Ks` leaking from an earlier cell.
Ks = [1, 5, 10, 20, 50, 100]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K)
    # hyde_retriever_eval(retriever, K)
    # retriever_original(retriever, K)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

# Recall sweep for the index whose name suggests chunk size 256 / overlap 16.
embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
db = FAISS.load_local('embeddings/faiss_bge_largev1.5_256_16', embeddings)

# Defined here so the cell no longer depends on `Ks` leaking from an earlier cell.
Ks = [1, 5, 10, 20, 50, 100]

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K)
    # hyde_retriever_eval(retriever, K)
    # retriever_original(retriever, K)

## Multi Query x HyDE(5-shot)

In [None]:
import openai

import os

# Read the key from the environment so it is never committed with the notebook.
# Falls back to the previous empty-string value when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# NOTE(review): an identical `ask_chatgpt` is defined earlier in the notebook;
# this cell re-binds it with the same behaviour.
def ask_chatgpt(prompt):
    """Send ``prompt`` as a single user message to gpt-3.5-turbo and return the response.

    Args:
        prompt: Text of the user message.

    Returns:
        The raw completion object returned by ``openai.ChatCompletion.create``.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    return completion

In [None]:
import pandas as pd

# Load the FAQ file that carries the ChatGPT-generated column.
testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset.head(3)

In [None]:
# NOTE(review): `gpts_ans` is not built in this section — it is whatever the earlier
# "Generate ChatGPT Question" loop left in the kernel. Confirm it has one entry per
# row of this frame before assigning.
testset['ChatGPT'] = gpts_ans
testset.head(3)

In [None]:
# Overwrites the same file that was loaded two cells above.
testset.to_csv('document/FAQ_ChatGPT.csv', index=False)

In [None]:
import pandas as pd

# Reload the final corpus (rebinds `df`).
df = pd.read_csv('document/clean_content_4.csv')

In [None]:
from langchain.document_loaders import DataFrameLoader

# One document per corpus row, with 'content' as the page text.
loader = DataFrameLoader(df, page_content_column="content")
docs = loader.load()

In [None]:
# Sanity check: number of documents and the first one.
len(docs),docs[0]

## BM25: improves a little bit, but too slow

In [None]:
import pandas as pd

# Evaluation questions for the BM25 experiments below.
testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset.head(3)

In [None]:
# Only BM25
from langchain.retrievers import BM25Retriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
import pandas as pd

# Lexical baseline: BM25 over the pre-chunked corpus (one document per 'chunk' row).
df = pd.read_csv("document/ChatGPT_washed_data.csv")
loader = DataFrameLoader(df, page_content_column="chunk")
docs = loader.load()
bm25_retriever = BM25Retriever.from_documents(docs)

Ks = [5, 10, 20, 50, 100]
for K in Ks:
    # The retriever is reused; only its depth changes between runs.
    bm25_retriever.k = K
    retriever_eval(bm25_retriever, K)
    # hyde_retriever_eval(bm25_retriever, K)

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DataFrameLoader
import pandas as pd


# Imported here because this cell's own import list does not bring the class in.
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Hybrid retrieval: equal-weight ensemble of BM25 and the bge-large FAISS index.
embedding = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")
# Pass the model created above; previously the stale global `embeddings` from an
# earlier cell was used and `embedding` was never read.
faiss_vectorstore = FAISS.load_local('embeddings/all_bge_large_chatgpt', embedding)


df = pd.read_csv('document/ChatGPT_washed_data.csv')
loader = DataFrameLoader(df, page_content_column="chunk")
docs = loader.load()
bm25_retriever = BM25Retriever.from_documents(docs)

Ks = [1, 5, 10, 20, 50, 100]
for K in Ks:
    # Both retrievers are asked for K results each.
    bm25_retriever.k = K
    faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": K})

    # Combine the two rankings with equal weights.
    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
                                        weights=[0.5, 0.5])

    retriever_eval(ensemble_retriever, K)
    hyde_retriever_eval(ensemble_retriever, K)

In [None]:
import pandas as pd

# Reload the evaluation questions.
testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset.head(3)

In [None]:
# PARENT_SIZE = 512
# CHUNK_SIZE = 64
# CHUNK_OVERLAP = 8
# parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_SIZE, chunk_overlap=CHUNK_OVERLAP)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# vectorstore = Chroma(collection_name="split_parents", persist_directory=f'parent_child_vectorstore_{PARENT_SIZE}', embedding_function=embeddings)

# fs = LocalFileStore(f"kv_docstore_{PARENT_SIZE}")
# store = create_kv_docstore(fs)
# Ks = [10, 20, 50, 100]

# print(f'Parent Size: {PARENT_SIZE}')
# for K in Ks:
#     # retriever = db.as_retriever(search_kwargs={"k": K})
#     retriever = ParentDocumentRetriever(
#         vectorstore=vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#         parent_splitter=parent_splitter,
#         search_kwargs={"k": K}
#     )
#     print(f'Retrieve: {K} documents to reranking....')
#     retriever_eval(retriever, 5)
#     hyde_retriever_eval(retriever, 5)
#     # retriever_original(retriever, K)

In [None]:
# ## Great!
# PARENT_SIZE = 256
# CHUNK_SIZE = 64
# CHUNK_OVERLAP = 8
# parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_SIZE, chunk_overlap=CHUNK_OVERLAP)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# vectorstore = Chroma(collection_name="split_parents", persist_directory=f'parent_child_vectorstore_{PARENT_SIZE}', embedding_function=embeddings)

# fs = LocalFileStore(f"kv_docstore_{PARENT_SIZE}")
# store = create_kv_docstore(fs)
# Ks = [10, 20, 50, 100]

# print(f'Parent Size: {PARENT_SIZE}')
# for K in Ks:
#     # retriever = db.as_retriever(search_kwargs={"k": K})
#     retriever = ParentDocumentRetriever(
#         vectorstore=vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#         parent_splitter=parent_splitter,
#         search_kwargs={"k": K}
#     )
#     print(f'Retrieve: {K} documents to reranking....')
#     retriever_eval(retriever, 5)
#     hyde_retriever_eval(retriever, 5)
#     # retriever_original(retriever, K)

In [None]:
# PARENT_SIZE = 128
# CHUNK_SIZE = 64
# CHUNK_OVERLAP = 8
# parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_SIZE, chunk_overlap=CHUNK_OVERLAP)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# vectorstore = Chroma(collection_name="split_parents", persist_directory=f'parent_child_vectorstore_{PARENT_SIZE}', embedding_function=embeddings)

# fs = LocalFileStore(f"kv_docstore_{PARENT_SIZE}")
# store = create_kv_docstore(fs)
# Ks = [10, 20, 50, 100]
# #
# print(f'Parent Size: {PARENT_SIZE}')
# for K in Ks:
#     # retriever = db.as_retriever(search_kwargs={"k": K})
#     retriever = ParentDocumentRetriever(
#         vectorstore=vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#         parent_splitter=parent_splitter,
#         search_kwargs={"k": K}
#     )
#     print(f'Retrieve: {K} documents to reranking....')
#     retriever_eval(retriever, 5)
#     hyde_retriever_eval(retriever, 5)
#     # retriever_original(retriever, K)

## Rerank

In [None]:
import pandas as pd

# Evaluation set for reranking; the eval function below reads its
# 'Question_ChatGPT_generated' and 'URL' columns.
# testset = pd.read_csv('document/FAQ_ChatGPT.csv')
testset = pd.read_csv('document/FAQ_filter_merged.csv')

testset.head(1)

In [None]:
# Full page texts indexed by URL, so a retrieved chunk can be expanded to its context.
full_doc = pd.read_csv('document/clean_content_4.csv', index_col=['url'])
# full_doc['url'] = full_doc.index

full_doc.head(1)

In [None]:
from FlagEmbedding import FlagReranker
import numpy as np


def _window_around(full_passage, chunk, parent_size):
    """Return roughly ``parent_size`` characters of ``full_passage`` centred on ``chunk``.

    The whole passage is returned when ``chunk`` does not occur in it.
    """
    str_idx = full_passage.find(chunk)
    if str_idx == -1:
        return full_passage
    # Context added on each side of the chunk; never negative when the chunk is
    # already longer than the requested window.
    half_window = max(parent_size - len(chunk), 0) // 2
    start_idx = max(str_idx - half_window, 0)
    # The window must span the chunk itself plus the trailing context.
    end_idx = min(str_idx + len(chunk) + half_window, len(full_passage))
    return full_passage[start_idx:end_idx]


def retriever_eval(retriever, k, parent_size, reranker_K=5):
    """Evaluate retrieval hit rate, then hit rate after reranking expanded passages.

    Each 'Question_ChatGPT_generated' entry of the notebook-level ``testset``
    is sent to ``retriever``. Every returned chunk is expanded to a window of
    about ``parent_size`` characters taken from ``full_doc['content']`` (looked
    up by the chunk's ``metadata['url']``). When ``k`` exceeds ``reranker_K``
    the windows are scored against their own query and only the
    ``reranker_K`` best are kept for the second hit-rate figure.

    Args:
        retriever: Object exposing ``get_relevant_documents(query)``.
        k: Retrieval depth; labels the output and decides whether reranking runs.
        parent_size: Target length, in characters, of the expanded window.
        reranker_K: Number of passages kept after reranking.

    Returns:
        tuple: ``(accuracy, docs_list)``. ``accuracy`` is the plain hit rate
        when ``k <= reranker_K`` and the reranked hit rate otherwise.
        ``docs_list`` is always an empty list; it is kept only so the return
        shape stays unchanged.
    """
    queries = list(testset['Question_ChatGPT_generated'])
    expected_urls = list(testset['URL'])

    retrieve_urls = []
    docs_list = []
    docs_passage = []

    for query in queries:
        docs = retriever.get_relevant_documents(query)

        window_passage = []
        for doc in docs:
            try:
                full_passage = full_doc['content'][doc.metadata['url']]
                window_passage.append(_window_around(full_passage, doc.page_content, parent_size))
            except Exception:
                # URL missing from `full_doc` or non-text content: fall back to
                # the chunk text so the reranker still receives a string.
                window_passage.append(doc.page_content)
        docs_passage.append(window_passage)
        retrieve_urls.append([doc.metadata['url'] for doc in docs])

    correct_cnt = sum(
        1 for expected, urls in zip(expected_urls, retrieve_urls) if expected in urls
    )
    print(f'Recall@{k}, ChatGPT Generated Question Accuracy: {correct_cnt/len(testset)}')

    if k <= reranker_K:
        # Nothing to rerank: the retrieved set is already within the cut-off.
        return correct_cnt/len(testset), docs_list

    # NOTE(review): 'thenlper/gte-base-zh' is published as an embedding model; confirm
    # it is meant to be loaded as a FlagReranker cross-encoder.
    reranker = FlagReranker('thenlper/gte-base-zh', use_fp16=True)
    reranker_index = []
    for idx, (query, passages) in enumerate(zip(queries, docs_passage)):
        scores = []
        for passage_idx, passage in enumerate(passages):
            try:
                # Score against the query that produced this passage list (the old
                # code reused the last query of the retrieval loop for every row).
                raw_score = reranker.compute_score([query, passage])
                # Accept either a bare float or a one-element sequence.
                scores.append(float(np.ravel(raw_score)[0]))
            except Exception:
                scores.append(-np.inf)
                print(f'[ERROR] the {idx} document: {passage}')
                print(f'Check {retrieve_urls[idx][passage_idx]} in database.')
        # Indices of this query's passages, best score first.
        reranker_index.append(np.argsort(scores)[::-1])

    correct_cnt = 0
    for expected, urls, order in zip(expected_urls, retrieve_urls, reranker_index):
        parent_retrieve_urls = [urls[i] for i in order[:reranker_K]]
        if expected in parent_retrieve_urls:
            correct_cnt += 1
    print(f'[Rerank from {k}] Recall@{reranker_K}, ChatGPT Generated Question Accuracy: {correct_cnt/len(testset)}')
    return correct_cnt/len(testset), docs_list

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Rerank experiment on the all_bge_large_chatgpt index with a 128-character window.
Ks = [5, 10, 20, 50, 100, 200]
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-zh-v1.5")
db = FAISS.load_local("embeddings/all_bge_large_chatgpt", embeddings)

print('Parent passage chuck length: 128')
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K, 128)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Rerank experiment on the faiss_bge_largev1.5_64_8 index with a 64-character window.
Ks = [5, 10, 20, 50, 100, 200]
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-zh-v1.5")
db = FAISS.load_local("embeddings/faiss_bge_largev1.5_64_8", embeddings)

print('Parent passage chuck length: 64')
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K, 64)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Rerank experiment on the faiss_bge_largev1.5_64_8 index with a 128-character window.
Ks = [5, 10, 20, 50, 100, 200]
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-zh-v1.5")
db = FAISS.load_local("embeddings/faiss_bge_largev1.5_64_8", embeddings)

print('Parent passage chuck length: 128')
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K, 128)

In [None]:
# Same index as the previous cell (`db` is reused), 256-character window.
Ks = [5, 10, 20, 50, 100, 200]
print('Parent passage chuck length: 256')
for K in Ks:
    retriever = db.as_retriever(search_kwargs=dict(k=K))
    retriever_eval(retriever, K, 256)

In [None]:
# One-off run: retrieve 200 chunks, expand to 256-character windows, rerank.
retriever = db.as_retriever(search_kwargs={"k": 200})
retriever_eval(retriever, 200, 256)

In [None]:
# 512-character window; relies on `Ks` and `db` defined in earlier cells.
print('Parent passage chuck length: 512')

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K, 512)

In [None]:
# 1024-character window; relies on `Ks` and `db` defined in earlier cells.
print('Parent passage chuck length: 1024')

for K in Ks:
    retriever = db.as_retriever(search_kwargs={"k": K})
    retriever_eval(retriever, K, 1024)

In [None]:
# from langchain.vectorstores import Chroma
# from langchain.retrievers import ParentDocumentRetriever
# from langchain.embeddings import HuggingFaceBgeEmbeddings
# from langchain.storage._lc_store import create_kv_docstore
# from langchain.storage import LocalFileStore
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# embeddings = HuggingFaceBgeEmbeddings(model_name = "BAAI/bge-large-zh-v1.5")

# PARENT_SIZE = 1024
# CHUNK_SIZE = 64
# CHUNK_OVERLAP = 8
# parent_splitter = RecursiveCharacterTextSplitter(chunk_size=PARENT_SIZE, chunk_overlap=CHUNK_OVERLAP)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# vectorstore = Chroma(collection_name="split_parents", persist_directory=f'parent_child_vectorstore_{PARENT_SIZE}', embedding_function=embeddings)

# fs = LocalFileStore(f"kv_docstore_{PARENT_SIZE}")
# store = create_kv_docstore(fs)
# Ks = [5, 20, 100, 200]

# # db = FAISS.load_local('embeddings/faiss_bge_largev1.5_64_8', embeddings)

# print(f'Parent Size: {PARENT_SIZE}')
# for K in Ks:
#     # retriever = db.as_retriever(search_kwargs={"k": K})
#     retriever = ParentDocumentRetriever(
#         vectorstore=vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#         parent_splitter=parent_splitter,
#         search_kwargs={"k": K}
#     )
#     print(f'Retrieve {K} documents to reranking....')
#     retriever_eval(retriever, K)


In [None]:
# retriever1_urls = []
# retriever1_ans = []
# retriever5_urls = []
# retriever1 = db.as_retriever(search_kwargs={"k": 1})
# retriever5 = db.as_retriever(search_kwargs={"k": 5})

# for idx, query in enumerate(testset['Question']):
#     print('Answer:',testset['URL'][idx])
#     print('Question:',query)
#     docs1 = retriever1.get_relevant_documents(query)
#     print(sorted([(doc.metadata['url'])for doc in docs1]))
#     print(sorted([(doc.page_content)for doc in docs1]))
#     retriever1_urls.append(sorted([(doc.metadata['url'])for doc in docs1]))
#     retriever1_ans.append(sorted([(doc.page_content)for doc in docs1]))
#     print('======================================================================================')
#     docs5 = retriever5.get_relevant_documents(query)
#     print(sorted([(doc.metadata['url'])for doc in docs5]))
#     retriever5_urls.append(sorted([(doc.metadata['url'])for doc in docs5]))

# answer_df = testset
# answer_df['retreival doc'] = retriever1_ans
# answer_df['retreival url'] = retriever1_urls

# answer_df.to_csv('evaluation/retrieval_original.csv', index=False)


## Generate HyDE for Testset

In [None]:
import pandas as pd

# Questions to be answered by the local LLM below (its 'Question' column is used).
testset = pd.read_csv('document/testset.csv')
testset.head(1)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "yentinglin/Taiwan-LLM-7B-v2.1-chat"

# Load the model in bfloat16 with automatic device placement and switch it to eval mode.
# trust_remote_code=True allows code shipped with the checkpoint to run.
llm = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

def generate_text(prompt_text):
    """Generate up to 512 new tokens continuing ``prompt_text`` with the loaded LLM.

    Args:
        prompt_text: Raw prompt string; it is tokenized as-is.
            NOTE(review): no chat template is applied — confirm that is
            intended for a chat-tuned checkpoint.

    Returns:
        str: The decoded sequence (prompt followed by the continuation) with
        special tokens removed.
    """
    # Follow the model's own placement (device_map="auto") instead of assuming 'cuda'.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(llm.device)

    # Use this tokenizer's padding id; the previous hard-coded 50256 is not tied
    # to this tokenizer. Fall back to EOS when no pad token is defined.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = llm.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pad_token_id=pad_token_id,
        max_new_tokens=512,
    )
    response = tokenizer.decode(outputs[0].tolist(), skip_special_tokens=True)

    return response

In [None]:
from tqdm import tqdm
# Generate one continuation per test question, keeping only the text after the prompt.
response = []
for query in tqdm(testset['Question']):
    res = generate_text(query)
    # The decoded output starts with the prompt, so cut it off by length.
    generated_part = res[len(query):]
    print(query)
    print(generated_part)
    response.append(generated_part)


In [None]:
# Store the generated text next to its question and export.
# NOTE(review): the column and file names say "RAG", but the loop above sends the
# bare question to the model — confirm the naming.
testset['Answer_Taiwan_Llama_7B_RAG'] = response
# testset = testset.drop(columns=['Unnamed: 0'])
testset.to_csv('evaluation/testset_Taiwan_Llama_7B_v2.1_RAG.csv', index=False)