In [1]:
from langchain.document_loaders import DirectoryLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from elqm.utils.dataFinder import get_data_dir
import os
import json
from tqdm import tqdm
from bs4 import BeautifulSoup

import csv
import numpy as np

In [2]:
# Input directory with the raw EUR-Lex JSON files and output directory for the
# preprocessed (plain-text) versions.
DATA_DIR = get_data_dir("eur_lex_data")
PREPROCESSED_DATA_DIR = get_data_dir("preprocessed")

# Print the absolute locations so it is visible where data is read and written.
for directory in (DATA_DIR, PREPROCESSED_DATA_DIR):
    print(os.path.abspath(directory))

/home/psaegert/Projects/elqm-INLPT-WS2023/elqm-raw/eur_lex_data
/home/psaegert/Projects/elqm-INLPT-WS2023/elqm-raw/preprocessed


In [3]:
# Convert every scraped JSON document from HTML to plain text: the 'html' field
# is replaced by a 'text' field and the result is written under the same file
# name to PREPROCESSED_DATA_DIR.
for filename in tqdm(os.listdir(DATA_DIR)):
    if not filename.endswith(".json"):
        continue

    # Explicit encoding so that parsing does not depend on the locale default.
    with open(os.path.join(DATA_DIR, filename), 'r', encoding='utf-8') as f:
        # Named `record` (not `data`) to avoid confusion with the `data` list
        # of loaded documents that a later cell creates.
        record = json.load(f)

    # Remove the raw markup from the record and keep only its extracted text.
    record['text'] = BeautifulSoup(record.pop('html'), 'html.parser').get_text()

    with open(os.path.join(PREPROCESSED_DATA_DIR, filename), 'w', encoding='utf-8') as f:
        json.dump(record, f)

100%|██████████| 508/508 [00:08<00:00, 59.69it/s] 


In [4]:
# Keyword arguments forwarded to the JSON loader: the jq expression '.text'
# selects the 'text' field of each JSON file.
schema = dict(jq_schema='.text')

In [5]:
# Load all JSON files below PREPROCESSED_DATA_DIR (recursively) with JSONLoader,
# passing the jq schema defined in `schema` to each loader instance.
loader = DirectoryLoader(
    PREPROCESSED_DATA_DIR,
    glob='**/*.json',
    show_progress=True,
    loader_cls=JSONLoader,
    loader_kwargs=schema,
)
data = loader.load()

100%|██████████| 508/508 [00:00<00:00, 720.84it/s]


In [6]:
# Cut the loaded documents into overlapping chunks
# (chunk_size=3000, chunk_overlap=200) and report how many were produced.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200,
)
all_splits = text_splitter.split_documents(data)
print(f"Split into {len(all_splits)} chunks")

Split into 5832 chunks


In [7]:
from elqm.eval.oracle import QUSTION_TYPES, generate_question_answer_pairs

In [8]:
# Show the names of the available question types (iterating a mapping yields its keys).
list(QUSTION_TYPES)

['confirmation',
 'factoid',
 'list',
 'causal',
 'hypothetical',
 'complex',
 'default']

In [9]:
# Configuration: number of chunks to sample, and the `n` passed to the
# question-answer generator for each call.
N_DOCUMENTS, N_QUESTIONS = 100, 5

# NOTE: this rebinds DATA_DIR, which previously pointed at the raw EUR-Lex data;
# from here on it refers to the question-answer output directory.
DATA_DIR = get_data_dir("question_answer_pairs")
print(os.path.abspath(DATA_DIR))

/home/psaegert/Projects/elqm-INLPT-WS2023/elqm-raw/question_answer_pairs


In [10]:
# Seed NumPy's global RNG so the sampled subset is the same on every run.
random_state = 20240102
np.random.seed(random_state)

# Draw N_DOCUMENTS distinct chunks (sampling without replacement).
random_splits = np.random.choice(a=all_splits, size=N_DOCUMENTS, replace=False)

In [11]:
# Results go to a CSV with the columns source, type, question, answer.
# The file is created with its header row only if it does not exist yet, so an
# existing results file is never overwritten.
filename = os.path.join(DATA_DIR, f"random_{N_DOCUMENTS}_{random_state}.csv")

if not os.path.exists(filename):
    # newline='' is what the csv module expects for files it writes to (avoids
    # doubled line endings on some platforms). UTF-8 is set explicitly so the
    # file does not depend on the locale default. QUOTE_ALL matches the quoting
    # used for the data rows appended to this file.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(["source", "type", "question", "answer"])

In [12]:
# For every sampled chunk and every question type, generate question-answer
# pairs and append them to the CSV immediately after each generation call.
# NOTE: this cell only appends; running it again adds duplicate rows to `filename`.
for split in tqdm(random_splits):
    # Source id = file name of the originating document without its extension.
    # It is identical for all rows of this chunk, so compute it once.
    source_name = os.path.splitext(os.path.basename(split.metadata['source']))[0]

    for question_type in QUSTION_TYPES:
        qa_pairs = generate_question_answer_pairs(
            context=split.page_content,
            prompt=None,
            question_type=question_type,
            n=N_QUESTIONS)

        # Explicit UTF-8: generated text may contain non-ASCII characters, and
        # the locale default encoding is platform dependent.
        with open(filename, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL)
            writer.writerows(
                [source_name, question_type, qa_pair[0], qa_pair[1]]
                for qa_pair in qa_pairs
            )


100%|██████████| 100/100 [40:30<00:00, 24.30s/it]
