In [None]:
!pip install -q -U langchain_community sentence-transformers faiss-gpu==1.7.2

In [None]:
!pip install flash_attn

In [None]:
import json
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from typing import List, Union, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
class JSONLLoader(BaseLoader):
    """Load a JSON Lines file into a list of ``Document`` objects.

    Every non-blank line of the file must be a JSON object containing the
    keys ``claim_id``, ``type``, ``query``, ``url`` and ``url2text``.
    ``url2text`` is passed to ``' '.join`` and therefore has to be an
    iterable of strings; the joined text becomes the document's
    ``page_content``. The remaining fields are stored as metadata, with
    ``url`` kept under the ``source`` key.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        content_key: Optional[str] = None,
    ):
        """Remember where to read from.

        Args:
            file_path: Path of the JSON Lines file; resolved to an absolute
                path at construction time.
            content_key: Stored on the instance as ``_content_key``.
                NOTE(review): ``load`` does not consult it at present; the
                content is always taken from ``url2text``.
        """
        self.file_path = Path(file_path).resolve()
        self._content_key = content_key

    def load(self) -> List[Document]:
        """Parse the file and return one ``Document`` per JSON record.

        Blank lines (for example a trailing empty line at the end of the
        file) are skipped instead of being handed to the JSON parser.

        Returns:
            The documents in file order; an empty list if the file holds no
            records.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks one of the required keys.
        """
        docs = []
        with open(self.file_path, 'r', encoding="utf8") as file:
            for line in file:
                line = line.strip()
                if not line:
                    # json.loads('') raises, so a stray empty line must not
                    # be treated as a record.
                    continue

                data = json.loads(line)
                claim_id = data['claim_id']
                type_ = data['type']
                query = data['query']
                url = data['url']
                url2text = data['url2text']

                # Collapse the list of text pieces into a single string.
                text = ' '.join(url2text)

                metadata = dict(
                    claim_id=claim_id,
                    type=type_,
                    query=query,
                    source=url
                )
                docs.append(Document(page_content=text, metadata=metadata))

        return docs

In [None]:
# Embedding model shared by every FAISS index built in this notebook.
# trust_remote_code is forwarded through model_kwargs; presumably the model
# repository ships custom modelling code. Enabling it runs code downloaded
# from that repository, so only use it with sources you trust.
embedding_model = HuggingFaceEmbeddings(
    model_name='dunzhang/stella_en_1.5B_v5',
    model_kwargs={"trust_remote_code": True},
)

In [None]:
def document_to_dict(doc):
    """Flatten a document into a plain dict ready for JSON serialisation.

    Args:
        doc: Object exposing ``metadata`` (a mapping) and ``page_content``
            (a string).

    Returns:
        A dict with the keys ``claim_id``, ``type``, ``query``, ``url`` and
        ``url2text``. Missing metadata entries default to ``""``; ``url`` is
        read from the ``source`` metadata entry and ``url2text`` is
        ``page_content`` split on newlines.

    NOTE(review): ``JSONLLoader.load`` joins ``url2text`` with spaces, so
    splitting on ``"\\n"`` here does not restore the original list
    boundaries of documents that came from that loader.
    """
    meta = doc.metadata
    record = {field: meta.get(field, "") for field in ("claim_id", "type", "query")}
    record["url"] = meta.get("source", "")
    record["url2text"] = doc.page_content.split("\n")
    return record

def convert_to_jsonl(documents, filename):
    """Write ``documents`` to ``filename`` in JSON Lines format.

    Each document is converted with ``document_to_dict`` and serialised as
    one JSON object per line. The file is opened in write mode with UTF-8
    encoding, so existing content is replaced.

    Args:
        documents: Iterable of document objects accepted by
            ``document_to_dict``.
        filename: Path of the output file.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(document_to_dict(item)) + '\n' for item in documents
        )

In [None]:
# Table of claims; the retrieval loop below reads its `claim` column.
dev_data_df = pd.read_csv(filepath_or_buffer="dev_data_df.csv")

In [None]:
import os

# Output directory for the per-claim retrieval results.
# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the directory is already there.
os.makedirs('top3', exist_ok=True)

In [None]:
# For each claim: load that claim's candidate documents, index them with
# FAISS, and store the three documents most similar to the claim text.
# `i` is the claim's row position; it selects both the input file
# (test-data/{i}.json) and the output file (top3/{i}.json).
for i, claim in enumerate(tqdm(dev_data_df['claim'], desc="retrieving top-3 evidence")):
    json_path = f'test-data/{i}.json'
    output_path = f'top3/{i}.json'

    loader = JSONLLoader(file_path=json_path)
    data = loader.load()
    if not data:
        # Nothing to index: FAISS.from_documents is expected to fail on an
        # empty list, so write an empty result file and move on. This way
        # every claim still gets an output file.
        convert_to_jsonl([], output_path)
        continue

    db = FAISS.from_documents(documents=data, embedding=embedding_model)
    retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 3})

    # `invoke` is the supported retriever entry point; it replaces the
    # deprecated `get_relevant_documents`.
    evidences = retriever.invoke(claim)
    convert_to_jsonl(evidences, output_path)