# Crawl, Load and Split Redeemer of Israel from the Church of Jesus Christ of Latter-day Saints

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import requests
import time

from models.crawl_utils import get_page, save_page

from datetime import datetime

from models.load_redeemer import load_redeemer

from models.load_utils import Loader, load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter

In [None]:
# Crawl configuration
host = 'https://www.redeemerofisrael.org/sitemap.xml'  # full URL of the sitemap, not just a host name
source = 'redeemer_of_israel'  # use this name in the directories
crawl_dir = f'../data/raw/{source}'  # directory the crawled pages are saved in
bs_parser = 'html.parser'  # presumably a BeautifulSoup parser name; not referenced by the cells shown here
delay_seconds = 5  # passed to get_page for every page request

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(crawl_dir, exist_ok=True)

In [None]:
def extract_links(xml_content):
    """Return the text of every ``<loc>`` element found in a sitemap.

    Args:
        xml_content: The sitemap XML, in any form ``BeautifulSoup`` accepts.

    Returns:
        A list holding the text content of each ``<loc>`` tag.
    """
    # The 'xml' parser is used because a sitemap is XML, not HTML.
    sitemap = BeautifulSoup(xml_content, 'xml')
    return [tag.get_text() for tag in sitemap.find_all('loc')]

In [None]:
def get_path(url, base_dir=None):
    """Map a page URL to the JSON file its crawled copy is stored in.

    The file name is the last non-empty segment of the URL path, with a
    trailing ``.html`` removed and ``.json`` appended.

    Args:
        url: URL of the page.
        base_dir: Directory the file lives in. Defaults to the notebook-level
            ``crawl_dir``.

    Returns:
        The path ``<base_dir>/<name>.json``.
    """
    if base_dir is None:
        base_dir = crawl_dir
    # Ignore empty segments so that "/about/" yields "about". Taking the raw
    # last segment would give "" for every URL ending in a slash, making them
    # all collide on ".json" and be skipped as "already crawled".
    segments = [part for part in urlparse(url).path.split('/') if part]
    # The site root has no segment at all; give it a stable name.
    name = segments[-1] if segments else 'index'
    if name.endswith('.html'):
        name = name[:-5]
    return os.path.join(base_dir, f"{name}.json")

In [None]:
def get_sitemap(host, timeout=30):
    """Download the sitemap and return the raw ``requests`` response.

    Args:
        host: Full URL of the sitemap to fetch.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout unless one is passed, so without
            it a stalled server would block the notebook indefinitely.

    Returns:
        The ``requests.Response``. The status is not checked here; the caller
        is expected to inspect ``status_code``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # NOTE(review): Content-Type describes a request body, which this GET does
    # not have; 'Accept' was probably intended. Left as-is so the request sent
    # to the server is unchanged.
    headers = {
        'Content-Type': 'text/xml',
    }
    return requests.get(host, headers=headers, timeout=timeout)

In [None]:
# Fetch the sitemap. Stop on a non-200 answer: printing and carrying on would
# hand the error body to the XML parser as if it were the sitemap.
response = get_sitemap(host)
if response.status_code != 200:
    raise RuntimeError(f"ERROR {response.status_code} while fetching {host}")
xml_content = response.content

In [None]:
# Pull every <loc> URL out of the sitemap, then print how many were found and
# the third entry as a sample (raises IndexError with fewer than three links).
xml_links = extract_links(xml_content)
print(len(xml_links))
print(xml_links[2])

In [None]:
# Download every page listed in the sitemap that has not been saved yet.
for url in xml_links:
    path_file = get_path(url)
    print(path_file)
    # Pages already on disk are not fetched again, so the crawl can be resumed.
    if not os.path.exists(path_file):
        status_code, html = get_page(url, delay_seconds)
        if status_code == 200:
            save_page(path_file, url, html)
        else:
            # Report the failure and move on to the next URL.
            print("Error!", status_code, url)

print("End")

## Load

In [None]:
# Load configuration: the load step reads from crawl_dir and writes a JSONL
# file named after today's date into load_dir.
load_dir = f'../data/load/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Build a Loader from load_redeemer and the crawl directory, load the
# documents, and display how many were produced.
loader = Loader(load_redeemer, crawl_dir)
docs = loader.load(verbose=True)
len(docs)

In [None]:
# Spot-check the first loaded document: its metadata, then its text.
print(docs[0].metadata)
print(docs[0].page_content)

In [None]:
# Write the loaded documents to the dated JSONL file (load_filename).
save_docs_to_jsonl(docs, load_filename)

# Split loaded documents from Redeemer of Israel

In [None]:
# Split configuration: the split step reads load_filename and writes a JSONL
# file named after today's date into split_dir.
split_dir = f'../data/split/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(split_dir, exist_ok=True)

## Load documents

In [None]:
# Read the documents back from the JSONL file at load_filename and display the count.
docs = load_docs_from_jsonl(load_filename)
len(docs)

## Create splits

In [None]:
# Instantiate the splitter with its default arguments.
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the documents and display how many splits were produced.
splits = text_splitter.split_documents(docs, verbose=True)
len(splits)

In [None]:
# Preview up to the first ten splits: index, URL and title from the metadata,
# then the text, followed by a visible separator between splits.
for ix, split in enumerate(splits[:10]):
    print(ix, split.metadata["url"], split.metadata["title"])
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

## Save splits 

In [None]:
# Write the splits to the dated JSONL file (split_filename).
save_docs_to_jsonl(splits, split_filename)

In [None]:
# Summary: number of documents loaded versus number of splits produced.
print(len(docs), len(splits))