In [None]:
%pip install bs4 markdownify

In [3]:
# stage 1: prepare corpus

In [4]:
from pathlib import Path
import hashlib
import time
import copy
import re

import requests
from bs4 import BeautifulSoup, Tag
from tqdm.auto import tqdm

def persistent_hash(text: str):
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Unlike the built-in ``hash``, the result is the same in every process.
    """
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()

def get_page(url: str, cache_dir: Path, delay: float = 0, timeout: float = 30) -> BeautifulSoup:
    """Fetch ``url`` and parse it, using an on-disk cache keyed by the URL hash.

    Args:
        url: Page address to retrieve.
        cache_dir: Existing directory holding ``<sha256(url)>.html`` files.
        delay: Seconds to sleep before a network request; skipped on cache hits.
        timeout: Seconds ``requests`` waits for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The page parsed with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    print('Retrieving', url)
    cache_path = cache_dir / f'{persistent_hash(url)}.html'
    if cache_path.exists():
        # Explicit encoding: the locale default is not guaranteed to be UTF-8.
        html = cache_path.read_text(encoding='utf-8')
    else:
        if delay:
            time.sleep(delay)
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        html = response.text
        # Written only after a successful response, so errors are never cached.
        cache_path.write_text(html, encoding='utf-8')
    return BeautifulSoup(html, 'html.parser')

In [5]:
CACHE_DIR = Path('tmp/requests_cache')
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# Walk the paginated "All pages" index, following the last link of the
# navigation block for as long as it is a "Next page" link.

DOMAIN = 'https://the-betweenlands.fandom.com'
first_nav_url = DOMAIN + '/wiki/Special:AllPages'
nav_page_urls = [first_nav_url]
nav_pages = [get_page(first_nav_url, CACHE_DIR)]

while True:
    nav_block = nav_pages[-1].select_one('.mw-allpages-nav')
    assert nav_block
    last_link = nav_block.find_all('a')[-1]
    if 'Next page' not in last_link.text:
        # The most recently fetched index page has no successor.
        break
    next_url = DOMAIN + str(last_link['href'])
    nav_page_urls.append(next_url)
    nav_pages.append(get_page(next_url, CACHE_DIR))

Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Button+Bush+Flowers
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Filter
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Marsh
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Release+3.6.1
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Sushi+Green+Dye+%28fluid%29
Retrieving https://the-betweenlands.fandom.com/wiki/Special:AllPages?from=Winding+Walkways


In [6]:
# Collect the URL of every non-redirect entry listed on the index pages.
wiki_page_urls: list[str] = []
for page in nav_pages:
    content_block = page.select_one('.mw-allpages-body')
    assert content_block
    for el in content_block.find_all('li'):
        # Entries carrying the "allpagesredirect" class are skipped.
        if 'allpagesredirect' not in el.get('class', []): # type: ignore
            wiki_page_urls.append(DOMAIN + el.find('a')['href']) # type: ignore

# Deduplicate, then sort: a bare set has a different iteration order in every
# kernel (string hashing is randomised), which would reshuffle the download
# order and everything derived from it on each run.
wiki_page_urls = sorted(set(wiki_page_urls))

print(f'Found {len(wiki_page_urls)} pages')

Found 745 pages


In [None]:
# Fetch every wiki page; get_page only applies the delay when it actually
# has to hit the network, so cached pages load without waiting.
pages = []
for page_url in tqdm(wiki_page_urls):
    pages.append(get_page(page_url, CACHE_DIR, delay=0.5))

In [None]:
from markdownify import markdownify as md

# Summary line of every table dropped by remove_table (tallied at the end).
removed_tables: list[str] = []

# Output locations: converted markdown and the original HTML sources.
MD_DIR = Path('datasets/bl/docs')
HTML_DIR = Path('datasets/bl/sources')
for output_dir in (MD_DIR, HTML_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

def verbose_table(el: Tag) -> str:
    """Return a one-line label for a table: the first line of its text.

    The text is taken from the table's ``<tbody>`` when there is one. A table
    without a ``<tbody>`` (``find`` returns ``None``) falls back to the text of
    the table element itself instead of raising ``AttributeError``.

    Args:
        el: The table element to describe.

    Returns:
        The first line of the stripped text; empty string for an empty table.
    """
    tbody = el.find('tbody')
    container = el if tbody is None else tbody
    content = container.text.strip()
    # Only the first line is kept, so the label stays short.
    return content.split('\n', 1)[0]

def remove_table(el: Tag):
    """Log a table's summary line, record it in ``removed_tables``, and
    delete the table from its tree."""
    summary = verbose_table(el)
    # Appending mutates the module-level list in place; no rebinding happens.
    removed_tables.append(summary)
    print(f'Removing table "{summary}"')
    el.decompose()
    
def parse_page_for_rag(soup: BeautifulSoup) -> tuple[str, str]:
    """Turn one parsed wiki page into a ``(title, markdown_body)`` pair.

    All edits happen on a deep copy, so the ``soup`` passed in is not modified.
    The title is the stripped text of ``#firstHeading``; the body is the
    ``.mw-content-ltr`` element inside ``#mw-content-text``, converted by
    markdownify (called with ``strip=['img', 'a']``) after footer tables
    have been removed.
    """
    def is_footer_table(node: Tag) -> bool:
        # Tables to drop are recognised by this fragment of their inline style.
        return node.name == 'table' and 'margin: auto; width: 85%' in node.get('style', '')

    soup = copy.deepcopy(soup)
    title = soup.select_one('#firstHeading').text.strip()
    print(f'{title=}')
    body = soup.select_one('#mw-content-text').select_one('.mw-content-ltr')
    assert body

    # Mark headings that live inside <aside> blocks so they can be told
    # apart from regular section headings in the markdown output.
    for aside in body.find_all('aside'):
        for heading in aside.find_all('h2'):
            heading.string = f'[Aside block] {heading.text}'

    history_block = body.select_one('#History')
    if history_block is not None:
        # "History" anchor present: drop every matching table among the
        # body's children that come after the anchor's parent element.
        start = body.index(history_block.parent) + 1
        for child in list(body.children)[start:]:
            if isinstance(child, Tag) and is_footer_table(child):
                remove_table(child)
    else:
        # No "History" anchor (usually changelog pages): peel matching tables
        # off the end of the body, stopping at the first tag that is not one.
        tag_children = [c for c in body.children if isinstance(c, Tag)]
        for child in reversed(tag_children):
            if not is_footer_table(child):
                break
            remove_table(child)

    body_md = md(str(body), strip=['img', 'a'])

    # Applied in order: squeeze whitespace-only gaps down to one blank line,
    # then drop empty "[]" and "()" left dangling at line ends
    # (e.g. "Farming[]" or "Leggings: 2 ()  ").
    cleanup_rules = (
        (r'\n[\n\s]+\n', '\n\n'),
        (r'\[\]\s*\n', '\n'),
        (r'\(\)\s*\n', '\n'),
    )
    for pattern, replacement in cleanup_rules:
        body_md = re.sub(pattern, replacement, body_md)

    return title, body_md

# Convert every downloaded page and store both the markdown and the raw HTML,
# each named after the page title.
doc_titles: list[str] = []
seen_titles: set[str] = set()  # constant-time duplicate check; doc_titles keeps the order
for page in pages:
    doc_title, doc_content = parse_page_for_rag(page)

    assert doc_title not in seen_titles, f'duplicate title {doc_title}'
    # The title is used verbatim as a file name; a "/" would point into a
    # sub-directory that does not exist, so fail early with a clear message.
    assert '/' not in doc_title, f'title cannot be used as a file name: {doc_title}'
    seen_titles.add(doc_title)
    doc_titles.append(doc_title)

    (MD_DIR / f'{doc_title}.md').write_text(f'# {doc_title}\n\n' + doc_content)
    (HTML_DIR / f'{doc_title}.html').write_text(str(page))

# How often each kind of table (identified by its summary line) was removed.
print('Removed tables:')
from collections import Counter
print(Counter(removed_tables))

# Keep the index pages next to the page sources as well.
for nav_idx, page in enumerate(nav_pages):
    (HTML_DIR / f'AllPages_{nav_idx}.html').write_text(str(page))

In [10]:
# stage 2: validate QA

from pathlib import Path
import yaml
import re

MD_DIR = Path('datasets/bl/docs')

# Load every markdown document, keyed by file stem.
md_files = {}
for path in MD_DIR.glob('*.md'):
    md_files[path.stem] = path.read_text()

# Character count per document, ordered from shortest to longest.
lengths = dict(sorted(
    ((name, len(content)) for name, content in md_files.items()),
    key=lambda pair: pair[1],
))
ranked = list(lengths.items())

print('Total documents:', len(md_files))
print('Shortest documents:', ranked[:4])
print('Longest documents:', list(reversed(ranked))[:4])

# Whole-corpus size figures are computed on the concatenated documents.
joint_text = '\n\n'.join(md_files.values())
n_symbols = len(joint_text)

print('Symbols:', n_symbols)
print('Words:', len(re.findall(r'\w+', joint_text)))
print('Pages (assuming 1800 chars/page):', n_symbols // 1800)

import tiktoken

# Size of the corpus in tokens, using the encoding tiktoken associates
# with the given model name.
model_name = 'gpt-4o-mini'
encoding = tiktoken.encoding_for_model(model_name)
token_ids = encoding.encode(joint_text)
n_tokens = len(token_ids)
print(f'{model_name} tokens:', n_tokens)

Total documents: 745
Shortest documents: [('Snowfall', 279), ('Release 3.9.1', 280), ('Environment', 299), ('Release 3.9.6', 301)]
Longest documents: [('Loot Tables', 73440), ('Block IDs', 45060), ('Item IDs', 42362), ('Infusions', 25090)]
Symbols: 2104571
Words: 309186
Pages (assuming 1800 chars/page): 1169
gpt-4o-mini tokens: 571046


In [13]:
# QA data validation (TODO remove)
#
# Checks that every source reference in the QA file points at text that is
# really present in the markdown corpus, and that answered questions have at
# least one positively scored eval rule. Samples without 'sources' are
# skipped entirely.

with open('datasets/bl/qa_v2.yaml', 'r') as file:
    qa = yaml.safe_load(file)

for sample in qa:
    if 'sources' not in sample:
        continue
    question = sample.get('question')  # only used in failure messages

    for src in sample['sources']:
        doc_name = src['doc']
        doc = md_files[doc_name]
        # Emphasis markers are dropped before matching quoted text.
        doc = doc.replace('*', '')
        # Document lines with heading marks and the aside prefix removed,
        # so a section name can be matched against a whole line.
        doc_lines_nohashmark = {
            l.replace('[Aside block]', '').replace('#', '').strip()
            for l in doc.split('\n')
        }

        if 'loc' in src:
            locs = src['loc']
            if isinstance(locs, str):
                locs = [locs]
            for loc in locs:
                # A trailing '...' marks a truncated quote; match its prefix.
                loc = loc.removesuffix('...')
                assert loc in doc, f'{doc_name}: quoted text not found: {loc!r}'

        if 'sec' in src:
            secs = src['sec']
            if isinstance(secs, str):
                secs = [secs]
            for sec in secs:
                assert sec in doc_lines_nohashmark, f'{doc_name}: section not found: {sec!r}'

    if 'answer' in sample:
        # Named eval_rules rather than eval so the builtin stays usable in
        # later cells of the same kernel session.
        eval_rules = sample['eval']
        scores = [rule['score'] for rule in eval_rules]
        assert any(score > 0 for score in scores), f'no positive eval score for: {question!r}'

In [None]:
# def text_to_sentences(text: str) -> list[str]:
#     """Represents text as a list of sentences, keeping all the
#     characters, including trailing spaces and linebreaks. A sentence
#     is either a line, or a sentence found by sent_tokenize within a
#     line. 
#     """
#     ...

# import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')

# from nltk.tokenize import sent_tokenize
# text = "Hello, world! How are you today? The U.S. government is in session."

# sentences = sent_tokenize(text)
# sentences

# from pathlib import Path

# MD_DIR = Path('datasets/bl/docs')

# docs: dict[str, list[str]] = []

# for path in MD_DIR.glob('*.md'):
#     sentences_or_lines: list[str] = []
    
#     for line in path.read_text().splitlines(keepends=True):
#         sentences_or_lines.append(line)
        
#     docs[path.stem] = sentences_or_lines

In [None]:
# converting to pydantic

import yaml

# Parse the QA yaml file into plain Python objects.
with open('datasets/bl/qa_v2.yaml') as qa_file:
    qa = yaml.safe_load(qa_file)

In [9]:
from typing import Any
from natural_rag.data import Question

def _as_list(value):
    """Wrap a lone string in a list; return any other value unchanged."""
    return [value] if isinstance(value, str) else value


# Translate each raw QA sample into the dict layout validated by Question.
pydantic_questions: list[Question] = []

for sample in qa:
    question_as_dict: dict[str, Any] = {'text': sample['question']}
    if 'answer' in sample:
        question_as_dict['reference_answer'] = sample['answer']

    metadata = {'reasoning_type': sample['reasoning type']}
    if 'comment' in sample:
        metadata['comment'] = sample['comment']
    question_as_dict['metadata'] = metadata

    if 'eval' in sample:
        eval_rules = []
        for raw_rule in sample['eval']:
            # Work on a copy so the loaded yaml data is left as it was.
            rule = raw_rule.copy()
            if 'comment' in rule:
                # A rule-level comment moves under the rule's metadata.
                rule['metadata'] = {'comment': rule.pop('comment')}
            eval_rules.append(rule)
        question_as_dict['eval_rules'] = eval_rules

    if 'sources' in sample:
        relevant = []
        for source in sample['sources']:
            entry: dict[str, Any] = {'doc_id': source['doc']}
            locs: list[str] = []
            if 'loc' in source:
                # Quoted locations lose their trailing '...' marker.
                locs.extend(text.removesuffix('...') for text in _as_list(source['loc']))
            if 'sec' in source:
                # Section references are wrapped in angle brackets.
                locs.extend(f'<{name}>' for name in _as_list(source['sec']))
            if locs:
                entry['loc'] = locs
            relevant.append(entry)
        question_as_dict['relevant'] = relevant

    pydantic_questions.append(Question.model_validate(question_as_dict))

In [10]:
from pathlib import Path

from natural_rag.data import Document

HTML_DIR = Path('datasets/bl/sources')

# One Document per saved HTML file, in sorted path order; the file stem
# serves as both id and title.
# NOTE(review): stage 1 also writes AllPages_<n>.html index pages into this
# directory, so they are registered as documents here too - confirm intended.
pydantic_documents: list[Document] = [
    Document.model_validate({
        'id': html_path.stem,
        'title': html_path.stem,
        'source_ext': '.html',
    })
    for html_path in sorted(HTML_DIR.glob('*.html'))
]

In [11]:
from natural_rag.data import RAGDataset
# Documents are indexed by id; questions are passed as the list built earlier.
documents_by_id = {document.id: document for document in pydantic_documents}
datset = RAGDataset(documents=documents_by_id, questions=pydantic_questions)

In [None]:
# Write the assembled dataset out via RAGDataset.dump_to_dir, passing 'bl'.
# NOTE(review): presumably 'bl' is a target directory relative to the working
# directory - confirm against natural_rag.data.
datset.dump_to_dir('bl')