# Создание датасета для AutoRAG 

In [13]:
import pandas as pd

from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [2]:
# Load every Markdown file found under the site dump (recursive glob) as UTF-8 text.
loader = DirectoryLoader(
    "../data/from_site/md",
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'},
)
docs = loader.load()
# Notebook display: number of loaded documents.
len(docs)

26

In [3]:
# Header markers ("#", "##", "###") paired with the metadata key each level is stored under.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [4]:
# Split each document on its Markdown headers and copy the document-level
# metadata onto every resulting chunk.
chunks = []

for doc in docs:
    for chunk in markdown_splitter.split_text(doc.page_content):
        # On a key collision the document-level value wins (it is merged last).
        chunk.metadata = {**chunk.metadata, **doc.metadata}
        chunks.append(chunk)

In [10]:
# One [metadata, text] pair per chunk, in chunk order.
result = [[chunk.metadata, chunk.page_content] for chunk in chunks]

In [15]:
# Tabulate the [metadata, text] pairs; columns get default positional labels here.
corpus = pd.DataFrame(data=result)

In [19]:
# Name the two columns: chunk metadata first, chunk text second.
initial_columns = ['metadata', 'contents']
corpus.columns = initial_columns

In [20]:
# Turn the row index into a regular column (it becomes the document id below).
corpus = corpus.reset_index(drop=False)

In [22]:
# Final corpus column names; the former index column is now 'doc_id'.
corpus_columns = ['doc_id', 'metadata', 'contents']
corpus.columns = corpus_columns

In [24]:
# Store document ids as strings.
corpus['doc_id'] = corpus['doc_id'].apply(str)

In [27]:
# Persist the Markdown-derived corpus.
# NOTE(review): a later cell writes data_for_db to this same path, replacing this file.
corpus_path = '../data/autorag/corpus.parquet'
corpus.to_parquet(corpus_path)

In [29]:
# corpus['']

In [30]:
# Load the knowledge-base table that the following cells reshape into a corpus.
data_for_db_path = '../data/actual/interim/data_for_db_corrected.parquet'
data_for_db = pd.read_parquet(data_for_db_path)

In [36]:
# Document text: the question and its answer joined into one string per row.
question_part = "Вопрос: " + data_for_db['kb_query']
answer_part = '\n\nОтвет: ' + data_for_db['kb_answer']
data_for_db['contents'] = question_part + answer_part

In [42]:
# Per-row metadata dict built from the two class columns.
# orient='records' yields one dict per row, in row order, so the resulting list is
# assigned positionally. This avoids relying on how pandas aligns a dict assigned to a
# column, and on the index being unique (which orient='index' requires).
data_for_db['metadata'] = data_for_db[['class_1', 'class_2']].to_dict(orient='records')

In [44]:
# Document id: the 'index' column rendered as a string.
data_for_db['doc_id'] = data_for_db['index'].apply(str)

In [47]:
# Write the knowledge-base corpus with only the three corpus columns.
# NOTE(review): this is the same path the Markdown-derived corpus was written to
# earlier in this notebook, so that file is overwritten here.
corpus_view = data_for_db[['doc_id', 'contents', 'metadata']]
corpus_view.to_parquet('../data/autorag/corpus.parquet')

In [48]:
# Load the evaluation set that is reshaped into the QA dataset below.
evaluation_path = '../data/actual/interim/evaluation_df.parquet'
qa = pd.read_parquet(evaluation_path)

In [50]:
# Move the current row index into a column so it can serve as the question id.
qa = qa.reset_index(drop=False)

In [53]:
# Keep only the four columns needed for the QA dataset, in their final order.
# NOTE(review): 'level_0' is presumably the column added by the preceding reset_index
# (named that way because an 'index' column already existed) — confirm.
qa_source_columns = ['level_0', 'input', 'index', 'expected_output']
qa = qa[qa_source_columns]

In [55]:
# Rename to the AutoRAG QA column names.
qa.columns = ['qid', 'query', 'retrieval_gt', 'generation_gt']


def _to_id_list(value):
    """Return ``value`` (one id or a flat collection of ids) as a list of string ids."""
    if hasattr(value, 'tolist'):  # numpy scalar or array -> plain Python object
        value = value.tolist()
    if not isinstance(value, (list, tuple)):
        value = [value]
    return [str(item) for item in value]


# Cast to the types the AutoRAG QA format expects: string qid, retrieval_gt as a
# 2D list of doc-id strings, generation_gt as a list of answers. The ids go through
# str(), the same conversion used for the corpus 'doc_id', so the two can match.
# NOTE(review): assumes the source 'index' values are the corpus ids — confirm.
qa = qa.assign(
    qid=qa['qid'].apply(str),
    retrieval_gt=qa['retrieval_gt'].apply(lambda ids: [_to_id_list(ids)]),
    generation_gt=qa['generation_gt'].apply(
        lambda answer: list(answer) if isinstance(answer, (list, tuple)) else [answer]
    ),
)

In [57]:
# Persist the QA dataset next to the corpus.
qa_path = '../data/autorag/qa.parquet'
qa.to_parquet(qa_path)