# Setting up the environment
Make sure to select the `Python 3.8 AzureML` kernel before running this notebook.

## Libraries

In [5]:
!pip install beautifulsoup4



In [100]:
!pip install markdownify

Collecting markdownify
  Downloading markdownify-0.11.6-py3-none-any.whl (16 kB)
Installing collected packages: markdownify
Successfully installed markdownify-0.11.6


# Build Datasets

## Catalog 

A dataframe listing every page (id, title and URL) whose content will be scraped and indexed.

In [30]:
import requests
from bs4 import BeautifulSoup

# Root URL: the index page from which the catalog of links is built.
url = "https://www.azstlucas.be/onderzoek-en-behandelingen"

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx; otherwise an error page would be parsed into an
# empty catalog without any warning.
response.raise_for_status()

catalog_soup = BeautifulSoup(response.content, 'html.parser')

In [26]:
import pandas as pd 

# One record per catalog tile: running index, visible text and the target of
# the first link that carries an href.
catalog_tiles = catalog_soup.find_all('div', class_='col md:w-1/2')
catalog = [
    {
        'id': position,
        'title': tile.get_text(strip=True),
        'url': tile.find('a', href=True)['href'],
    }
    for position, tile in enumerate(catalog_tiles)
]

catalog_df = pd.DataFrame.from_records(catalog)
catalog_df.head(20)

Unnamed: 0,id,title,url
0,0,24 uur observatie van de pasgeborene,https://www.azstlucas.be/onderzoek-en-behandel...
1,1,24 uur ph-metrie meting,https://www.azstlucas.be/onderzoek-en-behandel...
2,2,24 uur stoelgang sparen: richtlijnen,https://www.azstlucas.be/onderzoek-en-behandel...
3,3,24 uur urine sparen,https://www.azstlucas.be/onderzoek-en-behandel...
4,4,24-uurs bloeddrukmeting,https://www.azstlucas.be/onderzoek-en-behandel...
5,5,Ablatie,https://www.azstlucas.be/onderzoek-en-behandel...
6,6,Ablatieve CO2 laserbehandeling,https://www.azstlucas.be/onderzoek-en-behandel...
7,7,ACNES (buikwand pijnsyndroom),https://www.azstlucas.be/onderzoek-en-behandel...
8,8,Algemene verdoving,https://www.azstlucas.be/onderzoek-en-behandel...
9,9,Ambulante algemene revalidatie,https://www.azstlucas.be/onderzoek-en-behandel...


### Serialize Catalog Dataframe

In [160]:
import os

# Persist the catalog as a pickle under ./data for later retrieval.
# NOTE(review): the previous comment mentioned Blob Storage, but this is a
# plain relative path -- confirm where ./data is actually mounted.
# to_pickle does not create missing parent directories, so make sure it exists.
os.makedirs('./data', exist_ok=True)
catalog_df.to_pickle(path='./data/catalog_raw.pkl')

In [161]:
# Read the catalog back from the pickle file written above (presumably as a
# sanity check of the saved file).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that this notebook produced itself.
catalog_test_df = pd.read_pickle('./data/catalog_raw.pkl')

# Preview the first rows of the reloaded frame.
catalog_test_df.head()

Unnamed: 0,id,title,url
0,0,24 uur observatie van de pasgeborene,https://www.azstlucas.be/onderzoek-en-behandel...
1,1,24 uur ph-metrie meting,https://www.azstlucas.be/onderzoek-en-behandel...
2,2,24 uur stoelgang sparen: richtlijnen,https://www.azstlucas.be/onderzoek-en-behandel...
3,3,24 uur urine sparen,https://www.azstlucas.be/onderzoek-en-behandel...
4,4,24-uurs bloeddrukmeting,https://www.azstlucas.be/onderzoek-en-behandel...


## Page Content 

A dataframe that holds the scraped content of every page in the catalog.

In [31]:
# Fetch the first catalog page as a sample to develop the extractors against.
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(catalog_df.iloc[0]['url'], timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
page_soup = BeautifulSoup(response.content, 'html.parser')

In [146]:
from typing import List, Dict
from markdownify import MarkdownConverter

# TODO: CONVERT TO A STATEFUL CLASS 

def extract_banner_title(s: BeautifulSoup) -> str:
    """Return the stripped text of the first ``<h1>`` with class ``t-alpha`` in ``s``.

    Returns an empty string when the lookup raises ``AttributeError``, e.g.
    because no matching heading exists (``find`` returned ``None``).
    """
    try:
        # A narrower lookup via class 'header-detail__smaller-height' was tried
        # previously and is intentionally not applied.
        heading = s.find('h1', class_='t-alpha')
        title = heading.get_text(strip=True)
    except AttributeError:
        return ''
    return title

def extract_banner_divisions(s: BeautifulSoup) -> List[Dict[str, str]]:
    """List the links in ``s`` that carry the ``text-link`` class and an href.

    Each entry is a dict with ``division_url`` (the anchor's href) and
    ``division_name`` (its stripped text). Returns an empty list when the
    lookup raises ``AttributeError``.
    """
    try:
        return [
            {
                'division_url': anchor['href'],
                'division_name': anchor.get_text(strip=True),
            }
            for anchor in s.find_all('a', class_='text-link', href=True)
        ]
    except AttributeError:
        return []

def extract_page_intro(s: BeautifulSoup) -> str:
    """Return the stripped text of the first ``<p>`` with class ``t-intro`` in ``s``.

    Returns an empty string when the lookup raises ``AttributeError``, e.g.
    because no matching paragraph exists (``find`` returned ``None``).
    """
    try:
        intro_paragraph = s.find('p', class_='t-intro')
        intro_text = intro_paragraph.get_text(strip=True)
    except AttributeError:
        return ''
    return intro_text

def extract_page_toc(s: BeautifulSoup) -> List[Dict[str, str]]:
    """Collect the table-of-contents links found in ``s``.

    Looks up the first ``<ul>`` whose class attribute is
    ``flex flex-wrap mt-4`` and, inside it, every ``<li>`` whose class
    attribute is ``w-full sm:w-1/2 md:w-1/3 mt-2``. For each item, the first
    anchor that has an href contributes one dict with ``link_url`` (the href)
    and ``link_title`` (the anchor's stripped text).

    Items without such an anchor are skipped. Previously they raised an
    uncaught ``TypeError`` (subscripting ``None``), which aborted the scrape.
    The title is now read from the same anchor as the URL, so the two cannot
    come from different links.

    Returns an empty list when the list is missing or the lookup otherwise
    raises ``AttributeError``.
    """
    try:
        toc_li = []
        toc_items = (
            s
            .find('ul', class_='flex flex-wrap mt-4')
            .find_all('li', class_='w-full sm:w-1/2 md:w-1/3 mt-2')
        )
        for item in toc_items:
            anchor = item.find('a', href=True)
            if anchor is None:
                # No usable link in this entry; nothing to record.
                continue
            toc_li.append({
                'link_url': anchor['href'],
                'link_title': anchor.get_text(strip=True)
            })
        return toc_li
    except AttributeError:
        return []

def soup_to_md(soup, **options):
    """
        Convert a parsed soup to Markdown.

        Builds a MarkdownConverter configured with the given keyword options
        and returns the result of its convert_soup call on `soup`.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)

def extract_html_text(s: BeautifulSoup) -> str:
    """Render ``s`` as Markdown with ATX-style headings via ``soup_to_md``.

    Returns an empty string if the conversion raises ``AttributeError``.
    """
    try:
        markdown_text = soup_to_md(s, heading_style='ATX')
    except AttributeError:
        return ''
    return markdown_text

def mine_page_content(s: BeautifulSoup) -> Dict[str, object]:
    """Extract the structured content of one page.

    Walks every element with class ``container`` in document order and
    dispatches on its position:

    * index 1: banner, which fills ``banner_title`` and ``banner_divisions``;
    * index 2: overview, which fills ``intro`` and ``toc``. When both are
      empty, the container is treated as regular content and its Markdown
      rendering is added to ``content``;
    * any other index: the Markdown of every ``richtext`` element inside the
      container is added to ``content``.

    Returns a dict that always has ``content`` (possibly ``''``). The other
    keys are present only if the corresponding container exists on the page.
    """
    page_data: Dict[str, object] = {
        'content': ''
    }
    # Gather Markdown fragments and join once at the end.
    content_parts: List[str] = []

    for i, c in enumerate(s.find_all(class_='container')):
        # 0 is nav container, 1 is the banner container
        if i == 1:
            page_data['banner_title'] = extract_banner_title(c)
            page_data['banner_divisions'] = extract_banner_divisions(c)
        # 2 can be an overview container, or actual content
        elif i == 2:
            page_data['intro'] = extract_page_intro(c)
            page_data['toc'] = extract_page_toc(c)

            # No Banner Edge Case.
            # extract_page_toc returns a list, so the former `toc == ''`
            # comparison was never true and this branch never ran. Test for
            # emptiness instead.
            if not page_data['intro'] and not page_data['toc']:
                content_parts.append(extract_html_text(c))
        else:
            # Get richtext elements
            content_parts.extend(
                extract_html_text(paragraph)
                for paragraph in c.find_all(class_='richtext')
            )

    page_data['content'] = ''.join(content_parts)
    return page_data

In [151]:
from datetime import datetime

def scrape_page(url: str, timeout: float = 30.0) -> Dict[str, object]:
    """Download one page and return its mined content plus scrape metadata.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout, a single stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The dict produced by ``mine_page_content`` with two extra keys:
        ``url`` and ``scrape_date`` (``dd/mm/YYYY HH:MM:SS``, taken when the
        scrape of this page started).

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out. The HTTP status code is not checked, so an error page is
            mined like any other page.
    """
    # dd/mm/YY H:M:S format; formatted once and reused for the log and record.
    scraped_at = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    print(f'scraping {url} at {scraped_at}')

    page_response = requests.get(url, timeout=timeout)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    # Progress feedback: show which page title was found.
    print(extract_banner_title(page_soup))

    page_content: Dict[str, object] = mine_page_content(page_soup)
    page_content['url'] = url
    page_content['scrape_date'] = scraped_at
    return page_content

# Scrape every catalog URL in order, then build one row per scraped page.
page_records = catalog_df['url'].apply(scrape_page).tolist()
webpages_df = pd.DataFrame.from_records(page_records)

scraping https://www.azstlucas.be/onderzoek-en-behandelingen/24-uur-observatie-van-de-pasgeborene at 12/11/2023 15:26:50
24 uur observatie van de pasgeborene
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/24-uur-ph-metrie-meting at 12/11/2023 15:26:50
24 uur ph-metrie meting
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/24-uur-stoelgang-sparen-richtlijnen at 12/11/2023 15:26:51
24 uur stoelgang sparen: richtlijnen
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/24-uur-urine-sparen at 12/11/2023 15:26:52
24 uur urine sparen
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/24-uurs-bloeddrukmeting at 12/11/2023 15:26:52
24-uurs bloeddrukmeting
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/ablatie at 12/11/2023 15:26:53
Ablatie
scraping https://www.azstlucas.be/onderzoek-en-behandelingen/ablatieve-co2-laserbehandeling at 12/11/2023 15:26:54
Ablatieve CO2 laserbehandeling
scraping https://www.azstlucas.be/onderzoek-en-beha

### Serialize Scraped Webpages

In [158]:
import os

# Persist the scraped pages as a pickle under ./data for later retrieval.
# NOTE(review): the previous comment mentioned Blob Storage, but this is a
# plain relative path -- confirm where ./data is actually mounted.
# to_pickle does not create missing parent directories, so make sure it exists.
os.makedirs('./data', exist_ok=True)
webpages_df.to_pickle(path='./data/webpages_raw.pkl')

In [159]:
# Read the scraped pages back from the pickle file written above (presumably
# as a sanity check of the saved file).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that this notebook produced itself.
webpages_test_df = pd.read_pickle('./data/webpages_raw.pkl')

# Preview the first rows of the reloaded frame.
webpages_test_df.head()

Unnamed: 0,content,banner_title,banner_divisions,intro,toc,url,scrape_date
0,\n## Waarom willen de kinderartsen dat je baby...,24 uur observatie van de pasgeborene,[{'division_url': 'https://www.azstlucas.be/sp...,Elke pasgeborene wordt - bij overnachting - bi...,[{'link_url': '#waarom-willen-we-dat-je-baby-m...,https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50
1,\nWe weten dat reflux en opboeren voor een aan...,24 uur ph-metrie meting,[{'division_url': 'https://www.azstlucas.be/sp...,Een ph-metrie meting met impedantiemeting is e...,"[{'link_url': '#hoe-gebeurt-de-meting', 'link_...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:50
2,\n## Algemene richtlijnen\n\n\nIn opdracht van...,24 uur stoelgang sparen: richtlijnen,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#algemene-richtlijnen', 'link_t...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:51
3,\n## Algemene richtlijnen\n\n\nIn opdracht van...,24 uur urine sparen,[{'division_url': 'https://www.azstlucas.be/sp...,,"[{'link_url': '#algemene-richtlijnen', 'link_t...",https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:52
4,\nBij een 24-uurs bloeddrukmeting wordt de blo...,24-uurs bloeddrukmeting,[{'division_url': 'https://www.azstlucas.be/sp...,Als je last hebt van hoge bloeddruk of hyperte...,[],https://www.azstlucas.be/onderzoek-en-behandel...,12/11/2023 15:26:52
