# Crawl General Conference talks from the Church of Jesus Christ of Latter-day Saints

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from models.crawl_utils import get_page, save_page
from datetime import datetime

from models.load_conf import ConferenceTalkLoader

from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import ModelTextSplitter, MarkdownSyntacticEmbeddingSplitter
from models.split_utils import get_openai_embedder

## Crawl

In [None]:
# Crawl configuration.
years = range(1980, 2024)  # 1980 through 2023 (range end is exclusive)
months = ['04', '10']  # month path segments used in the conference index URLs
host = 'https://www.churchofjesuschrist.org'
source = 'conference'  # use this name in the directories
crawl_dir = f'../data/raw/{source}'  # crawled talk pages are stored here
bs_parser = 'html.parser'  # parser name handed to BeautifulSoup
delay_seconds = 5  # passed to get_page on every request

# exist_ok=True keeps this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(crawl_dir, exist_ok=True)
    

In [None]:
def _is_talk_url(url):
    path_components = urlparse(url).path.split('/')
    # must be 6 components (first component is empty) and last component must not end in -session
    return len(path_components) == 6 and not path_components[-1].endswith('-session')


def get_talk_urls(base_url, html):
    """Collect the absolute URLs of the talk links found in *html*.

    Every ``<a>`` element that has an ``href`` is resolved against *base_url*;
    the resolved URL is kept when ``_is_talk_url`` accepts it. Results follow
    the order returned by ``find_all`` and repeated links are not removed.
    """
    talk_urls = []
    for anchor in BeautifulSoup(html, bs_parser).find_all('a', href=True):
        absolute_url = urljoin(base_url, anchor['href'])
        if _is_talk_url(absolute_url):
            talk_urls.append(absolute_url)
    return talk_urls


def get_talk_path(url):
    """Return the JSON file path under ``crawl_dir`` used for the talk at *url*.

    Pieces 3, 4 and 5 of the URL path (split on '/') are read as year, month
    and title, and the file is named ``{year}-{month}-{title}.json``.
    """
    parts = urlparse(url).path.split('/')
    year, month, title = parts[3:6]
    filename = f"{year}-{month}-{title}.json"
    return os.path.join(crawl_dir, filename)

In [None]:
# Visit the index page of every conference (each year/month pair) and fetch
# every talk linked from it whose output file does not exist yet.
for year in years:
    for month in months:
        dir_url = f"{host}/study/general-conference/{year}/{month}?lang=eng"
        status_code, dir_html = get_page(dir_url, delay_seconds)
        if status_code != 200:
            # report the failed index page and move on to the next conference
            print(f"Status code={status_code} url={dir_url}")
            continue
        talk_urls = get_talk_urls(dir_url, dir_html)
        print(dir_url, len(talk_urls))
        for talk_url in talk_urls:
            path = get_talk_path(talk_url)
            # a file already exists at this path, so the talk is not fetched again
            if os.path.exists(path):
                continue
            print("    ", path)
            status_code, talk_html = get_page(talk_url, delay_seconds)
            if status_code != 200:
                # report the failure and skip; save_page is not called for this talk
                print(f"Status code={status_code} url={talk_url}")
                continue
            save_page(path, talk_url, talk_html)

## Load

In [None]:
# Load configuration: the loaded documents are written to a JSONL file named
# after today's date inside load_dir.
load_dir = f'../data/load/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# exist_ok=True keeps this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Load the crawled talks found under crawl_dir into `docs`.
loader = ConferenceTalkLoader(crawl_dir)
docs = loader.load(verbose=True)

In [None]:
# Display how many documents were loaded.
len(docs)

In [None]:
# Write the loaded documents to load_filename.
save_docs_to_jsonl(docs, load_filename)

In [None]:
# Display the path of the load output file.
load_filename

# Split loaded documents using trained splitter model

In [None]:
# Split configuration: documents are read from load_filename and the splits are
# written to a JSONL file named after today's date inside split_dir.
split_dir = f'../data/split/{source}/'
today = datetime.today().strftime('%Y-%m-%d')
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# exist_ok=True keeps this idempotent and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(split_dir, exist_ok=True)

## Load documents

In [None]:
# Read the documents back from load_filename and display how many there are.
docs = load_docs_from_jsonl(load_filename)
len(docs)

## Create splits

In [None]:
%%time
# Split all documents with a MarkdownSyntacticEmbeddingSplitter constructed
# with no arguments, then display the number of resulting splits.
mse_splitter = MarkdownSyntacticEmbeddingSplitter()
splits = mse_splitter.split_documents(docs, verbose=True)
len(splits)

## Save splits

In [None]:
# Write the splits to split_filename.
save_docs_to_jsonl(splits, split_filename)

In [None]:
# Print the document count followed by the split count.
print(len(docs), len(splits))