# Crawl, load, and split Pearl of Great Price from The Church of Jesus Christ of Latter-day Saints

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os

from models.load_pearl_of_great_price import load_pogp
from models.load_utils import Loader, save_docs_to_jsonl
from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from models.crawl_utils import get_page, save_page

## Crawl

In [None]:
# config
# Category listing pages that seed the crawl.
hosts = [
    'https://pearlofgreatpricecentral.org/category/book-of-abraham/',
    'https://pearlofgreatpricecentral.org/category/joseph-smith-history/',
]
# Directory that receives one saved file per crawled page.
base_dir = '../data/raw/pearl_of_great_price'
# BeautifulSoup parser name.
bs_parser = 'html.parser'
# Delay handed to get_page when downloading the individual pages.
delay_seconds = 15
# exist_ok=True avoids the check-then-create race and makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(base_dir, exist_ok=True)
    

In [None]:
def extract_next_sibling_href(soup, base_url):
    """Return the absolute URL of the link that follows the current-page marker.

    Args:
        soup: Parsed pagination page; must support ``select_one``.
        base_url: URL against which a relative ``href`` is resolved.

    Returns:
        The absolute URL of the next sibling anchor that has an ``href``, or
        ``None`` when there is no current-page span or no such anchor.
    """
    # Select on both CSS classes instead of the exact attribute string
    # 'page-numbers current', so the span is still found when the classes are
    # reordered or an extra class is present.
    current_marker = soup.select_one('span.page-numbers.current')
    if current_marker is None:
        return None
    # The first following sibling anchor carrying an href is the next page.
    next_link = current_marker.find_next_sibling('a', href=True)
    if next_link is None:
        return None
    return urljoin(base_url, next_link['href'])

In [None]:
def fetch_and_extract_hrefs(start_url, base_url, max_pages=10):
    """Walk a paginated listing and collect the links found on each page.

    Starting at ``start_url``, every page is fetched with ``get_page``, parsed
    with BeautifulSoup, and passed to ``extract_hrefs_from_elementor_div``.
    The walk then continues at the URL returned by
    ``extract_next_sibling_href``.

    NOTE(review): ``extract_hrefs_from_elementor_div`` is neither defined nor
    imported in the visible part of this notebook -- confirm it is in scope
    before running this cell.

    Args:
        start_url: URL of the first listing page.
        base_url: Base URL used to resolve relative links.
        max_pages: Upper bound on the number of pages fetched.

    Returns:
        A list with the hrefs gathered from all fetched pages, in fetch order.
        Whatever was gathered so far is returned when a fetch fails.
    """
    all_hrefs = []
    # Pages already requested; guards against pagination that links back to an
    # earlier page, which would refetch it and duplicate its links.
    visited = set()
    current_url = start_url
    for _ in range(max_pages):
        visited.add(current_url)
        status_code, html = get_page(current_url)
        if status_code != 200:
            print(f"Failed to fetch page: {current_url}")
            break
        soup = BeautifulSoup(html, 'html.parser')
        all_hrefs.extend(extract_hrefs_from_elementor_div(soup, base_url))
        next_page_href = extract_next_sibling_href(soup, base_url)
        if not next_page_href:
            print("No next page found. Exiting loop.")
            break
        if next_page_href in visited:
            print(f"Next page was already visited: {next_page_href}. Exiting loop.")
            break
        current_url = next_page_href
    return all_hrefs

In [None]:
# Crawl each listing in `hosts` and pool the links into one list.
all_hrefs = []
for start_url in hosts:
    # The listing URL doubles as the base for resolving relative links.
    hrefs = fetch_and_extract_hrefs(start_url, base_url=start_url)
    all_hrefs += hrefs
print(all_hrefs)

In [None]:
# Number of links collected above (displayed as the cell output).
len(all_hrefs)

In [None]:
def get_path(url):
    """Map a page URL to the ``.json`` file path used for it under ``base_dir``.

    The file name is the last non-empty segment of the URL path, so the result
    is the same whether or not the URL ends with a slash. Query strings and
    fragments are ignored.

    Args:
        url: Absolute URL of the page.

    Returns:
        ``<base_dir>/<last path segment>.json``. A URL without any path
        segment yields ``<base_dir>/.json``.
    """
    # Strip trailing slashes first; indexing a fixed position from the end
    # would pick the parent segment for URLs that lack the trailing slash.
    slug = urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]
    return os.path.join(base_dir, f"{slug}.json")

In [None]:
# Download every collected link that has no saved file yet.
for url in all_hrefs:
    path_file = get_path(url)
    print(path_file)
    # A file already on disk means the page was fetched by an earlier run.
    if not os.path.exists(path_file):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(path_file, url, html)
        else:
            # Report the failure and move on to the next link.
            print("Error!", status_code, url)
print("End")

## Load

In [None]:
# config
# Directory holding the pages saved by the crawl step.
input_dir = '../data/raw/pearl_of_great_price'
# Directory that receives the loaded documents.
output_dir = '../data/load/pearl_of_great_price'

# exist_ok=True avoids the check-then-create race and makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Date stamp (YYYY-MM-DD) used to name this run's output file.
today = datetime.today().strftime('%Y-%m-%d')

In [None]:
# Build a Loader from the load_pogp function and the raw-page directory,
# then load the documents with verbose output enabled.
loader = Loader(load_pogp, input_dir)
docs = loader.load(verbose=True)
# Number of loaded documents (displayed as the cell output).
len(docs)

In [None]:
# Spot-check the first loaded document: its metadata, then its content.
print(docs[0].metadata)
print(docs[0].page_content)

In [None]:
# Name the output file after today's date stamp.
output_filename = os.path.join(output_dir, f"{today}.jsonl")

# Write the loaded documents to that JSONL file.
save_docs_to_jsonl(docs, output_filename)

## Split

In [None]:
# configure
# NOTE(review): pinned to one dated load file -- presumably this should be
# updated whenever a newer load output is to be split; confirm.
input_path = '../data/load/pearl_of_great_price/2023-11-27.jsonl'
# Directory that receives the split documents.
output_dir = '../data/split/pearl_of_great_price/'
# exist_ok=True avoids the check-then-create race and makes re-running this
# cell a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)
# Date stamp (YYYY-MM-DD) used to name this run's output file.
today = datetime.today().strftime('%Y-%m-%d')

In [None]:
# Read the previously saved documents back from the configured JSONL file.
docs = load_docs_from_jsonl(input_path)
# Number of documents read (displayed as the cell output).
len(docs)

In [None]:
# Create the splitter with its default settings.
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the documents with verbose output enabled.
splits = text_splitter.split_documents(docs, verbose=True)
# Number of splits produced (displayed as the cell output).
len(splits)

In [None]:
# Preview the first ten splits, printing a separator line after each one.
for ix, split in enumerate(splits[:10]):
    print(ix, split.metadata)
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

In [None]:
# Save the splits to a JSONL file named after today's date stamp.
filename = os.path.join(output_dir, f"{today}.jsonl")
save_docs_to_jsonl(splits, filename)

In [None]:
# Print the document count next to the number of splits produced from them.
print(len(docs), len(splits))