In [None]:
pip install beautifulsoup4 requests lxml

Web scraping with Beautiful Soup

Get all laws and statues of Israel's law. 

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://he.wikisource.org/wiki/%D7%A1%D7%A4%D7%A8_%D7%94%D7%97%D7%95%D7%A7%D7%99%D7%9D_%D7%94%D7%A4%D7%AA%D7%95%D7%97"
headers = {"User-Agent": "Mozilla/5.0 (compatible; Scraper/1.0)"}
response = requests.get(url, headers=headers)

# Parse HTML
soup = BeautifulSoup(response.text, "lxml")

# Get the page title
print(soup.title.string)

#Get law names and links
root_url = 'https://he.wikisource.org'
law_html_elements = soup.select("dd a")
law_items = []
for l in law_html_elements:
    url = root_url + l['href']
    law_name = l.text
    law_items.append({'law_name': law_name, 'url': url})

Lets go through one of the items:
Building the URL for edit page.
It's more LLM-friendly version of the content

In [None]:
url_for_edit = 'https://he.wikisource.org/w/index.php?title=מקור:{law_name}&action=edit'
name = law_items[24]['law_name']
formatted_url_for_edit = url_for_edit.format(law_name=name).replace(' ', '_')
print(formatted_url_for_edit)

Fetching the data from the textarea

In [None]:
headers = {"User-Agent": "Mozilla/5.0 (compatible; Scraper/1.0)"}
response = requests.get(formatted_url_for_edit, headers=headers)

# Parse HTML
soup = BeautifulSoup(response.text, "lxml")

law_content = soup.find(id='wpTextbox1')
if law_content:
    law_content = law_content.get_text(strip=True)

print(law_content)

Convert to json

In [None]:
import re
import json

def parse_text(text):
    result = []

    # extract <name> as law name
    law_match = re.search(r"<שם>\s*(.+)", text)
    law_name = law_match.group(1).strip() if law_match else None

    # Normalize lines
    lines = text.splitlines()

    current_part = None  # חלק
    current_chapter = None   # פרק
    current_sign = None   # סימן

    # pattern for headings like === ((פרק ראשון)) ===  or == פרק א' ==
    part_regex = re.compile(r"={1,}\s*(?:\(\()?([^=()]*חלק[^=()]*?)\)?\s*={1,}")
    chapter_regex = re.compile(r"={1,}\s*(?:\(\()?([^=()]*פרק[^=()]*?)\)?\s*={1,}")
    sign_regex = re.compile(r"={1,}\s*(?:\(\()?([^=()]*סימן[^=()]*?)\)?\s*={1,}")
    # simpler: lines starting with '=' indicate hierarchical headings:
    heading_regex = re.compile(r"^(=+)\s*(.+?)\s*(=+)?$")

    # pattern for section: start with @ number. optionally a title after dot
    section_regex = re.compile(r"^\s*@\s*([\d\w־\-\.]+)\.\s*(?::\s*)?(.*)$")

    # We'll collect multiline section text until next @ or heading
    i = 0
    current_section = None
    section_text_lines = []

    while i < len(lines):
        line = lines[i]

        # heading?
        m_head = heading_regex.match(line)
        if m_head:
            title = m_head.group(2).strip()

            m_part = part_regex.match(line)
            m_chapter = chapter_regex.match(line)
            m_sign = sign_regex.match(line)

            if m_part:
                current_part = m_part.group(1).strip()
            elif m_chapter:
                current_chapter = m_chapter.group(1).strip()
            elif m_sign:
                current_sign = m_sign.group(1).strip()
            i += 1
            continue

        # section start?
        m = section_regex.match(line)
        if m:
            # finish previous section
            if current_section:
                current_section['סעיף_טקסט'] = "\n".join(section_text_lines).strip()
                result.append(current_section)
                section_text_lines = []
            sec_num = m.group(1).strip()
            sec_rest = m.group(2).strip()
            current_section = {
                "חוק": law_name,
                "חלק": current_part,
                "פרק": current_chapter,
                "סימן": current_sign,
                "סעיף_מספר": sec_num,
                "סעיף_כותרת": None,
                "סעיף_טקסט": ""
            }
            # if the section line included immediate text, use it as start of text
            if sec_rest:
                section_text_lines.append(sec_rest)
            i += 1
            # collect following indented/continued lines
            while i < len(lines):
                nxt = lines[i]
                if section_regex.match(nxt) or heading_regex.match(nxt):
                    break
                section_text_lines.append(nxt)
                i += 1
            continue

        i += 1

    # append last
    if current_section:
        current_section['סעיף_טקסט'] = "\n".join(section_text_lines).strip()
        result.append(current_section)

    return result


# דוגמה: שימוש על הטקסט שסיפקת (כאן יש להדביק את הטקסט המלא בתוך `example_text`)
example_text = law_content

parsed = parse_text(example_text)
print(json.dumps(parsed, ensure_ascii=False, indent=2))


Improved Version:

In [45]:
import re
import json
from enum import Enum

#TODO: create class of Parser, and parse function, and helper functions
class LineType(Enum):
    ADDENDUM = 'תוספת'
    PART = 'חלק'
    CHAPTER = 'פרק'
    SIGN = 'סימן'
    SECTION = 'סעיף'
    REGULAR = 'רגיל'
    #TODO: if we chose to use that, update stop_tags
    # LONG_SECTION = 'סעיף',
    # SHORT_SECTION = 'סעיף מקוצר',
    # NO_NUM_SECTION = 'סעיף אל ממוספר',
    LAW_NAME = 'שם החוק'
    METADATA='metadata' #מבוא, מקור, שם קודם, חתימות, פרסום

stop_tags = {
    LineType.ADDENDUM : [LineType.ADDENDUM, LineType.PART, LineType.METADATA],
    LineType.PART : [LineType.PART, LineType.ADDENDUM, LineType.METADATA],
    LineType.CHAPTER : [LineType.CHAPTER, LineType.PART, LineType.ADDENDUM, LineType.METADATA],
    LineType.SIGN : [LineType.SIGN, LineType.CHAPTER, LineType.PART, LineType.ADDENDUM, LineType.METADATA],
    LineType.SECTION: [LineType.SECTION, LineType.SIGN, LineType.CHAPTER, LineType.PART, LineType.ADDENDUM, LineType.METADATA],
    LineType.METADATA: [LineType.SECTION, LineType.SIGN, LineType.CHAPTER, LineType.PART, LineType.ADDENDUM]
}
def get_line_type(line:str):


    # is heading? (= ** =)
    heading_regex = re.compile(r"^(=+)\s*(.+?)\s*(=+)?$")
    m_head = heading_regex.match(line)
    
    if m_head:
        #determine the heading
        title = m_head.group(2).strip(' =()\{\}')
        headings = [LineType.ADDENDUM, LineType.PART, LineType.CHAPTER, LineType.SIGN]
        for t in headings:
            if t.value in title:
                return t
    
    #is metadata?
    if line.strip().startswith(('<שם>', '<מקור>', '<מבוא>', '<חתימות>', '<פרסום>', '<שם קודם>', '<מאגר')):
        return LineType.METADATA
    
    #is section?
    section_regex = re.compile(r"^\s*@\s*([\d\w־\-\.]+)\.\s*(.*)$")
    m_section = section_regex.match(line)
    if m_section:
        return LineType.SECTION
    #Default
    return LineType.REGULAR

#TODO: check and Doc
def get_section_properties(line: str):
    section_regex = re.compile(r"^\s*@\s*([\d\w־\-\.]+)\.\s*(.*)$")
    m_section = section_regex.match(line)
    if not m_section:
        raise ValueError('This is not a section line')
    sec_num = m_section.group(1).strip()
    sec_rest = m_section.group(2).strip()
    return {'section_num': sec_num, 'section_text': sec_rest}

def parse_text3(text):
    result = []

    # extract <name> as law name
    law_match = re.search(r"<שם>\s*(.+)", text)
    law_name = law_match.group(1).strip() if law_match else None

    # Normalize lines
    lines = text.splitlines()

    current_part = None  # חלק
    current_chapter = None   # פרק
    current_sign = None   # סימן
    current_section = None

    #Iterating the lines and divide + concat to chunks
    lineIdx = 0
    while lineIdx < len(lines):
        chunk = ''
        line = lines[lineIdx]
        line_type = get_line_type(line)

        #Metadata
        if(line_type == LineType.METADATA):
            #Pack all current metadata
            current_part = 'metadata'
            current_sign = current_chapter = current_section =  None
            #building the chunk
            chunk = line
            #Next line
            lineIdx+=1
            while(lineIdx < len(lines) and get_line_type(lines[lineIdx]) not in stop_tags[LineType.METADATA]):
                chunk+=lines[lineIdx]

                lineIdx+=1
        #ADDENDUM
        elif(line_type == LineType.ADDENDUM):
            #Pack all current ADDENDUM
            current_part = line.strip(' =()\{\}')
            current_sign = current_chapter = current_section = None
            #building the chunk - first line
            chunk = line
            #Next line
            lineIdx+=1
            while(lineIdx < len(lines) and get_line_type(lines[lineIdx]) not in stop_tags[LineType.ADDENDUM]):
                chunk+=lines[lineIdx]
                #Next line
                lineIdx+=1
        #PART, CHAPTER, SIGN
        elif(line_type == LineType.PART):
            current_part = line.strip(' =()\{\}')
            current_chapter = current_sign = current_section = None
            lineIdx+=1
        elif(line_type == LineType.CHAPTER):
            current_chapter = line.strip(' =()\{\}')
            current_sign = current_section = None
            if current_part == 'metadata': current_part = None
            lineIdx+=1
        elif(line_type == LineType.SIGN):
            current_sign = line.strip(' =()\{\}')
            current_section = None
            if current_part == 'metadata': current_part = None
            lineIdx+=1
        #SECTION
        elif(line_type == LineType.SECTION):
            #Determine section type
            prop = get_section_properties(line)
            if current_part == 'metadata': current_part = None
            current_section = prop['section_num']
            chunk = prop['section_text']
            #Next line
            lineIdx+=1
            while(lineIdx < len(lines) and get_line_type(lines[lineIdx]) not in stop_tags[LineType.SECTION]):
                chunk+=lines[lineIdx]
                lineIdx+=1
        else: #regular line
            chunk = line
            lineIdx+=1
        #Pack
        block = {
            'law_name': law_name,
            'part' : current_part,
            'chapter' : current_chapter,
            'sign' : current_sign,
            'section' : current_section,
            'text': chunk
        }
        #Add to results(with filtering of empty blocks)
        if chunk:
            result.append(block)
    #return
    return result


# דוגמה: שימוש על הטקסט שסיפקת (כאן יש להדביק את הטקסט המלא בתוך `example_text`)
example_text = law_content

parsed = parse_text3(example_text)
print(json.dumps(parsed, ensure_ascii=False, indent=2))


[
  {
    "law_name": "חוק אוויר נקי, התשס\"ח-2008",
    "part": "metadata",
    "chapter": null,
    "sign": null,
    "section": null,
    "text": "<שם> חוק אוויר נקי, התשס\"ח-2008<מאגר 2000055 תיקון 2204149 תקן 98122 קוד a163Y00000DuC5mQAF><מקור> ((ס\"ח תשס\"ח, 752|חוק אוויר נקי|17:299971)); ((תשס\"ט, 119|ת\"ט|2192)), ((307|ת\"ט|2207)); ((תשע\"א, 200|ת\"ט|2272)), ((749|חוק הגנת הסביבה (סמכויות פיקוח ואכיפה)|18:301171)); ((תשע\"ב, 450|תיקון מס' 2 והוראת שעה|18:301482)); ((תשפ\"ג, 293|חוק התוכנית הכלכלית (תיקוני חקיקה ליישום המדיניות הכלכלית לשנות התקציב 2023 ו־2024)|25:2620741)); ((תשפ\"ד, 178|תיקון מס' 9 לחוק פסיקת ריבית והצמדה|25:3568695)), ((798|חוק הגנת הסביבה (ייעול הליכי רישוי סביבתי) (תיקוני חקיקה)|25:4328606)), ((1070|ת\"ט|25:4690532)).''עדכון סכומים:'' ((י\"פ תשע\"ב, 6129|הודעה בדבר עדכון סכום עיצום כספי)); ((תשע\"ג, 3144|הודעה בדבר עדכון סכום עיצום כספי)); ((תשע\"ד, 3614|הודעה בדבר עדכון סכום עיצום כספי)); ((תשע\"ו, 3380|הודעה בדבר עדכון סכום עיצום כספי)); ((תשע\"ז, 3903|הו