In [None]:
from dataclasses import asdict

import pandas as pd

from juddges.settings import NSA_DATA_PATH
import polars as pl
import pandas as pd

# Lazily scan the first "pages" parquet chunk under NSA_DATA_PATH; the data is
# only materialized when `.collect()` is called on the resulting LazyFrame.
lf = pl.scan_parquet(NSA_DATA_PATH / "pages" / "pages_chunk_0.parquet")

In [None]:
# Materialize the lazy scan into an in-memory polars DataFrame.
df = lf.collect()

In [None]:
# df[2]["page"]

In [None]:
# Pull the scalar "page" value of the row at index 2 to have one sample
# document at hand (the same column is later parsed as HTML by extract_data).
page = df[2]["page"].item()

In [None]:
import re
from itertools import groupby
from bs4 import BeautifulSoup, Tag


def extract_data_with_br_tags(value):
    """Split an HTML fragment on literal ``<br/>`` markers.

    Args:
        value: String holding the inner HTML of a cell.

    Returns:
        The pieces between ``<br/>`` markers, each stripped of surrounding
        whitespace, with pieces that end up empty left out.
    """
    stripped_pieces = (piece.strip() for piece in value.split('<br/>'))
    return [piece for piece in stripped_pieces if piece]

def extract_text_preserve_paragraphs(html_element):
    """Join the text of an element's ``<p>``/``<br>`` descendants with blank lines.

    Args:
        html_element: Object exposing ``find_all``; it is queried for the tag
            names ``['p', 'br']`` and each match must expose ``get_text``.

    Returns:
        The stripped text of every match that has any text, joined by two
        newline characters. Matches whose stripped text is empty are skipped.
    """
    stripped_texts = (
        node.get_text(strip=True) for node in html_element.find_all(['p', 'br'])
    )
    return '\n\n'.join(text for text in stripped_texts if text)

def extract_przepisy(value):
    """Parse the "Powołane przepisy" cell into four parallel lists.

    The direct children of ``value`` are grouped into lines, a new line
    starting at every child whose markup contains ``<br/>``. Lines are then
    read in pairs: a *reference* line (first node: journal entry, optional
    second node: articles) followed by an *act* line (first node: act title).

    Args:
        value: Element exposing ``.contents``; every child must support
            ``str()`` and ``get_text(strip=True)``.

    Returns:
        Dict with the keys ``"Dziennik ustaw"``, ``"Dziennik ustaw link"``,
        ``"Artykuły"`` and ``"Ustawa"``, each mapping to a list with one entry
        per reference/act pair. Link and article entries are ``None`` when
        the reference line does not provide them.

    Raises:
        AssertionError: If the lines cannot be paired (odd number of lines).
        IndexError: If a line in the middle of the cell is empty.
    """
    # Children that carry no text are dropped; a child containing <br/> only
    # acts as a separator and is never stored itself.
    lines = [[]]
    for node in value.contents:
        if '<br/>' in str(node):
            lines.append([])
        elif node.get_text(strip=True) != '':
            lines[-1].append(node)

    # A trailing <br/> (or an entirely empty cell) leaves empty lines at the
    # end; without this they would be indexed below and raise IndexError.
    while lines and not lines[-1]:
        lines.pop()

    references = lines[0::2]
    acts = lines[1::2]
    assert len(references) == len(acts), (
        f"unpaired reference/act lines: {len(references)} vs {len(acts)}"
    )

    dziennik_ustaw = [ref[0].get_text(strip=True) for ref in references]
    art = [ref[1].get_text(strip=True) if len(ref) > 1 else None for ref in references]
    # .get() instead of ["href"]: a leading tag without an href yields None
    # rather than raising KeyError.
    links = [ref[0].get("href") if isinstance(ref[0], Tag) else None for ref in references]
    ustawa = [act[0].get_text(strip=True) for act in acts]

    return {
        "Dziennik ustaw": dziennik_ustaw,
        "Dziennik ustaw link": links,
        "Artykuły": art,
        "Ustawa": ustawa
    }

def _split_judge_roles(entries):
    """Separate judge names from the ``/role/`` annotations written next to them.

    Returns ``(names, roles)``: ``names`` are the entries with every ``/.../``
    annotation removed and stripped; ``roles`` maps the first annotation of an
    entry to that entry's cleaned name (entries without annotation are skipped).
    """
    names = [re.sub(r'/[^/]*/', '', entry).strip() for entry in entries]
    roles = {}
    for entry, name in zip(entries, names):
        annotations = re.findall(r'/([^/]*)/', entry)
        if annotations:
            roles[annotations[0]] = name
    return names, roles


def extract_data(page):
    """Extract label/value metadata and text sections from one HTML page.

    Metadata comes from ``<tr class="niezaznaczona">`` rows that contain both a
    ``td.lista-label`` and a ``td.info-list-value`` cell. Depending on the
    label, a value is stored as:

    * "Powołane przepisy": the four lists returned by ``extract_przepisy``;
    * "Sygn. powiązane": anchor texts, plus anchor hrefs under ``<label> link``;
    * values containing ``<br/>`` and the labels "Hasła tematyczne",
      "Symbol z opisem", "Sędziowie", "Treść wyniku": a list of pieces; for
      "Sędziowie" the ``/role/`` annotations are removed and, when available,
      the keys "przewodniczący" and "sprawozdawca" are filled in;
    * "Data orzeczenia": text of the first nested ``<td>``, with the second
      one stored under "Rodzaj orzeczenia" (``None`` when empty or absent);
    * anything else: the cell's inner HTML, stripped.

    Each ``div.lista-label`` that is followed by a
    ``span.info-list-value-uzasadnienie`` adds one more entry keyed by the
    header text, holding that span's paragraphs.

    Args:
        page: HTML markup of the page.

    Returns:
        Dict mapping labels / section headers to the extracted values.
    """
    soup = BeautifulSoup(page, 'html.parser')
    extracted_data = {}

    for row in soup.find_all('tr', class_='niezaznaczona'):
        label = row.find('td', class_='lista-label')
        value = row.find('td', class_='info-list-value')
        if not (label and value):
            continue

        label_text = label.get_text(strip=True)
        # Inner HTML rather than text, so <br/> separators stay visible.
        value_text = value.decode_contents().strip()

        if 'Powołane przepisy' in label_text:
            extracted_data.update(extract_przepisy(value))
        elif 'Sygn. powiązane' in label_text:
            anchors = value.find_all('a')
            extracted_data[label_text] = [a.get_text(strip=True) for a in anchors]
            # .get() keeps an anchor without href from raising KeyError.
            extracted_data[label_text + " link"] = [a.get('href') for a in anchors]
        elif '<br/>' in value_text or label_text in ("Hasła tematyczne", "Symbol z opisem", "Sędziowie", "Treść wyniku"):
            entries = extract_data_with_br_tags(value_text)
            if label_text == "Sędziowie":
                entries, roles = _split_judge_roles(entries)
                extracted_data[label_text] = entries
                if "przewodniczący" in roles and "sprawozdawca" in roles:
                    extracted_data["przewodniczący"] = roles["przewodniczący"]
                    extracted_data["sprawozdawca"] = roles["sprawozdawca"]
                elif "przewodniczący sprawozdawca" in roles:
                    # One judge annotated with both roles fills both keys.
                    extracted_data["przewodniczący"] = roles["przewodniczący sprawozdawca"]
                    extracted_data["sprawozdawca"] = roles["przewodniczący sprawozdawca"]
            else:
                extracted_data[label_text] = entries
        elif 'Data orzeczenia' in label_text:
            cells = value.find_all('td')
            # Tolerate a cell without the nested two-column layout instead of
            # failing with IndexError.
            date_value = cells[0].get_text(strip=True) if cells else value.get_text(strip=True)
            judgement_type = cells[1].get_text(strip=True) if len(cells) > 1 else ''
            extracted_data['Data orzeczenia'] = date_value
            extracted_data['Rodzaj orzeczenia'] = judgement_type or None
        else:
            extracted_data[label_text] = value_text

    # Text sections are keyed by whatever the header says, so no section
    # names need to be hard-coded here.
    for header in soup.find_all('div', class_='lista-label'):
        next_section = header.find_next('span', class_='info-list-value-uzasadnienie')
        if next_section:
            header_text = header.get_text(strip=True)
            extracted_data[header_text] = extract_text_preserve_paragraphs(next_section)

    return extracted_data

from tqdm import tqdm

# Only a sample is parsed while the extraction rules are being developed.
MAX_ROWS = 500
sample = df[:MAX_ROWS]  # `df` must still be the polars frame here (iter_rows)

# One dict per page; the name `l` is kept because later cells read it.
l = []

# iter_rows() yields a generator without a length, so the total is passed
# explicitly to let tqdm show a percentage and an ETA.
for item in tqdm(sample.iter_rows(named=True), total=len(sample)):
    data = extract_data(item["page"])
    # Keep the identifier and the raw page next to the parsed fields.
    data["doc_id"] = item["doc_id"]
    data["page"] = item["page"]
    l.append(data)

In [None]:
# Inspect the first parsed record.
l[0]

In [None]:
# Build a pandas DataFrame from the parsed records.
# NOTE(review): this rebinds `df`, which until now held the polars frame from
# `lf.collect()`; re-running the extraction loop after this cell will fail
# because a pandas DataFrame has no `iter_rows`.
df = pd.DataFrame(l)

In [None]:
# Display the parsed records.
df

In [None]:
# Sanity check: the four lists produced for "Powołane przepisy" must have the
# same length within a row. Print the first offending row and stop.
przepisy_columns = ("Dziennik ustaw", "Dziennik ustaw link", "Artykuły", "Ustawa")

for row_idx, entry in df.iterrows():
    # Rows without the section do not hold a list in these columns; skip them.
    if isinstance(entry["Dziennik ustaw"], list):
        distinct_lengths = {len(entry[column]) for column in przepisy_columns}
        if len(distinct_lengths) != 1:
            print(entry)
            for column in przepisy_columns:
                print(entry[column])
            break