# 🏛️ Sejm Process Downloader - Pobieranie druku nr 471

Ten notebook pobiera dane z Sejmu dla konkretnego druku legislacyjnego wraz z załącznikami i tworzy drzewo chronologiczne oraz powiązaniowe.

## Użycie:
1. Uruchom wszystkie komórki po kolei
2. Wyniki zostaną zapisane w folderze `druk_471_dokumentacja`

## Kompatybilność:
- Vast.ai
- Google Colab
- Jupyter Notebook (lokalnie)

In [None]:
# Install the required packages (IPython shell magic; -q keeps pip quiet)
!pip install requests beautifulsoup4 -q

In [None]:
# Importy
import os
import re
import json
import requests
from datetime import datetime
from typing import Dict, List, Optional, Any
from urllib.parse import urljoin, urlparse, unquote

# BeautifulSoup is optional: HAS_BS4 records whether the import succeeded so
# the scraping code can check it before using the parser.
try:
    from bs4 import BeautifulSoup
except ImportError:
    HAS_BS4 = False
    print("‚ö†Ô∏è BeautifulSoup nie zainstalowany")
else:
    HAS_BS4 = True
    print("‚úÖ BeautifulSoup za≈Çadowany")

print("‚úÖ Importy za≈Çadowane pomy≈õlnie!")

In [None]:
# Configuration - change the values here

# Base addresses of the Sejm API and of the Sejm web site.
API_URL = "https://api.sejm.gov.pl/sejm"
SEJM_WEB_URL = "https://www.sejm.gov.pl"

TERM = 10  # Sejm term (kadencja) X
PROCESS_NUMBER = 471  # number of the print to download
OUTPUT_DIR = f"druk_{PROCESS_NUMBER}_dokumentacja"
DOWNLOAD_ATTACHMENTS = True  # whether attachment files are downloaded

# Echo the effective configuration, one setting per line.
print("\n".join([
    "üìã Konfiguracja:",
    f"   - Kadencja: {TERM}",
    f"   - Numer druku: {PROCESS_NUMBER}",
    f"   - Folder wyj≈õciowy: {OUTPUT_DIR}",
    f"   - Pobieranie za≈ÇƒÖcznik√≥w: {'Tak' if DOWNLOAD_ATTACHMENTS else 'Nie'}",
]))

In [None]:
# Klasa główna

class SejmProcessDownloader:
    """Pobiera i analizuje proces legislacyjny z Sejmu."""
    
    def __init__(self, term: int, process_number: int, output_dir: str):
        self.term = term
        self.process_number = process_number
        self.output_dir = output_dir
        self.process_data: Dict[str, Any] = {}
        self.attachments: List[Dict[str, Any]] = []
        self.tree_structure: List[Dict[str, Any]] = []
        self.all_prints: List[int] = []
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    
    def _make_request(self, url: str, timeout: int = 60) -> Optional[requests.Response]:
        """Issue a GET request and hand back the response only for HTTP 200.

        Args:
            url: Address to fetch.
            timeout: Timeout value passed to ``requests.get``.

        Returns:
            The response when its status code is 200. ``None`` for any other
            status code or when ``requests`` raises a ``RequestException``;
            both cases are reported on stdout.
        """
        browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

        try:
            response = requests.get(url, timeout=timeout, headers=browser_headers)
        except requests.exceptions.RequestException as err:
            print(f"‚ùå B≈ÇƒÖd po≈ÇƒÖczenia: {err}")
            return None

        if response.status_code != 200:
            print(f"‚ö†Ô∏è  HTTP {response.status_code}: {url}")
            return None
        return response
    
    def fetch_print_from_api(self, print_number: int) -> Optional[Dict[str, Any]]:
        url = f"{API_URL}/term{self.term}/prints/{print_number}"
        resp = self._make_request(url)
        if resp:
            try:
                return resp.json()
            except Exception:
                return None
        return None
    
    def scrape_process_page(self) -> bool:
        """Scrape the legislative-process web page for its title and documents.

        Used as a fallback when the API returns no data. The page title is
        stored in ``process_data['title']``, the discovered document links in
        ``process_data['scraped_documents']`` (one entry per distinct URL), and
        numbers found in "druk" links are appended to ``all_prints``.

        Returns:
            ``True`` when the page was fetched and parsed, ``False`` when
            BeautifulSoup is unavailable or the request failed.
        """
        if not HAS_BS4:
            print("‚ùå BeautifulSoup wymagany do scrapowania strony")
            return False

        page_url = f"{SEJM_WEB_URL}/Sejm{self.term}.nsf/PrzebiegProc.xsp?nr={self.process_number}"
        print(f"\nüåê Pobieram stronƒô: {page_url}")

        resp = self._make_request(page_url)
        if not resp:
            return False

        soup = BeautifulSoup(resp.text, 'html.parser')

        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            self.process_data['title'] = title_elem.get_text(strip=True)
        else:
            self.process_data['title'] = f"Druk nr {self.process_number}"

        print(f"‚úÖ Tytu≈Ç: {self.process_data['title'][:100]}...")

        doc_extensions = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.rtf')
        doc_links = []
        seen_urls = set()

        # find_all() walks the whole document, so anchors nested inside tables
        # are visited here too; a separate pass over tables is not needed.
        for link in soup.find_all('a', href=True):
            href = link['href']
            link_text = link.get_text(strip=True)
            href_lower = href.lower()

            if any(ext in href_lower for ext in doc_extensions):
                doc_url = urljoin(page_url, href)
            elif 'api.sejm.gov.pl' in href:
                doc_url = href
            else:
                doc_url = None

            if doc_url is not None:
                # Record each URL once so a document linked several times on
                # the page is not downloaded repeatedly.
                if doc_url not in seen_urls:
                    seen_urls.add(doc_url)
                    doc_links.append({
                        'url': doc_url,
                        'text': link_text,
                        'filename': self._extract_filename(href)
                    })
                continue

            if '/druk' in href_lower or 'druk' in link_text.lower():
                # The first run of digits in the link text is taken as a
                # print number.
                match = re.search(r'(\d+)', link_text)
                if match:
                    druk_num = int(match.group(1))
                    if druk_num not in self.all_prints:
                        self.all_prints.append(druk_num)

        self.process_data['scraped_documents'] = doc_links
        print(f"üìé Znaleziono {len(doc_links)} link√≥w do dokument√≥w na stronie")

        # The requested print always comes first in the list.
        if self.process_number not in self.all_prints:
            self.all_prints.insert(0, self.process_number)

        return True
    
    def _extract_filename(self, url: str) -> str:
        parsed = urlparse(url)
        path = unquote(parsed.path)
        filename = os.path.basename(path)
        if not filename or '.' not in filename:
            filename = f"dokument_{datetime.now().strftime('%H%M%S')}.pdf"
        return filename
    
    def fetch_process_info(self) -> bool:
        print(f"\nüì• Pobieram informacje o druku nr {self.process_number}...")
        
        print(f"üîç Pr√≥bujƒô API: {API_URL}/term{self.term}/prints/{self.process_number}")
        print_data = self.fetch_print_from_api(self.process_number)
        
        if print_data:
            self.process_data = {
                'title': print_data.get('title', f'Druk nr {self.process_number}'),
                'documentDate': print_data.get('documentDate', ''),
                'deliveryDate': print_data.get('deliveryDate', ''),
                'documentType': print_data.get('documentType', ''),
                'prints': [self.process_number],
                'attachments': print_data.get('attachments', []),
                'print_data': print_data
            }
            self.all_prints = [self.process_number]
            
            additional_prints = print_data.get('additionalPrints', [])
            if additional_prints:
                self.all_prints.extend(additional_prints)
            
            print(f"‚úÖ Znaleziono druk: {self.process_data['title'][:80]}...")
            print(f"   üìé Za≈ÇƒÖczniki z API: {len(self.process_data['attachments'])}")
            return True
        
        print("‚ö†Ô∏è  API nie zwr√≥ci≈Ço danych, pr√≥bujƒô scrapowania strony...")
        if HAS_BS4:
            if self.scrape_process_page():
                return True
        
        print(f"‚ùå Nie znaleziono druku nr {self.process_number}")
        return False
    
    def download_attachment(self, url: str, filename: str, subfolder: str = "") -> Optional[str]:
        resp = self._make_request(url)
        
        if resp:
            if subfolder:
                target_dir = os.path.join(self.output_dir, subfolder)
            else:
                target_dir = self.output_dir
            
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            
            safe_filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
            filepath = os.path.join(target_dir, safe_filename)
            
            with open(filepath, 'wb') as f:
                f.write(resp.content)
            
            return filepath
        return None
    
    def download_api_attachment(self, print_number: int, filename: str) -> Optional[str]:
        url = f"{API_URL}/term{self.term}/prints/{print_number}/{filename}"
        return self.download_attachment(url, filename, f"druk_{print_number}")
    
    def build_tree(self) -> List[Dict[str, Any]]:
        tree = []
        
        if not self.process_data:
            return tree
        
        process_node = {
            "level": 0,
            "type": "PROCES",
            "id": self.process_number,
            "title": self.process_data.get('title', 'Brak tytu≈Çu'),
            "description": self.process_data.get('description', ''),
            "document_type": self.process_data.get('documentType', ''),
            "document_date": self.process_data.get('documentDate', ''),
            "term": self.term,
            "children": []
        }
        
        print(f"\nüìã Przetwarzam {len(self.all_prints)} druk√≥w...")
        
        for idx, print_num in enumerate(self.all_prints):
            print(f"\nüìÑ [{idx+1}/{len(self.all_prints)}] Druk nr {print_num}...")
            
            print_data = self.fetch_print_from_api(print_num)
            
            if print_data:
                print_node = {
                    "level": 1,
                    "type": "DRUK",
                    "number": print_num,
                    "title": print_data.get('title', ''),
                    "document_date": print_data.get('documentDate', ''),
                    "delivery_date": print_data.get('deliveryDate', ''),
                    "attachments": []
                }
                
                attachments = print_data.get('attachments', [])
                print(f"   üìé Za≈ÇƒÖczniki: {len(attachments)}")
                
                for att_idx, att in enumerate(attachments):
                    att_node = {
                        "level": 2,
                        "type": "ZA≈ÅƒÑCZNIK",
                        "filename": att,
                        "download_url": f"{API_URL}/term{self.term}/prints/{print_num}/{att}",
                        "local_path": None
                    }
                    
                    if DOWNLOAD_ATTACHMENTS:
                        print(f"      ‚¨áÔ∏è  [{att_idx+1}/{len(attachments)}] {att}")
                        local_path = self.download_api_attachment(print_num, att)
                        if local_path:
                            att_node["local_path"] = local_path
                            print(f"      ‚úÖ Zapisano")
                        else:
                            print(f"      ‚ùå B≈ÇƒÖd")
                    
                    print_node["attachments"].append(att_node)
                    self.attachments.append(att_node)
                
                process_node["children"].append(print_node)
            else:
                print(f"   ‚ö†Ô∏è  Brak danych w API")
        
        scraped_docs = self.process_data.get('scraped_documents', [])
        if scraped_docs and DOWNLOAD_ATTACHMENTS:
            print(f"\nüì• Pobieranie {len(scraped_docs)} dokument√≥w ze strony Sejmu...")
            
            scraped_node = {
                "level": 1,
                "type": "STRONA_WWW",
                "title": "Dokumenty ze strony Sejmu",
                "attachments": []
            }
            
            for doc_idx, doc in enumerate(scraped_docs):
                print(f"   ‚¨áÔ∏è  [{doc_idx+1}/{len(scraped_docs)}] {doc['filename']}")
                
                att_node = {
                    "level": 2,
                    "type": "ZA≈ÅƒÑCZNIK_WWW",
                    "filename": doc['filename'],
                    "text": doc['text'],
                    "download_url": doc['url'],
                    "local_path": None
                }
                
                local_path = self.download_attachment(doc['url'], doc['filename'], "strona_www")
                if local_path:
                    att_node["local_path"] = local_path
                    print(f"      ‚úÖ Zapisano")
                else:
                    print(f"      ‚ùå B≈ÇƒÖd")
                
                scraped_node["attachments"].append(att_node)
                self.attachments.append(att_node)
            
            if scraped_node["attachments"]:
                process_node["children"].append(scraped_node)
        
        tree.append(process_node)
        self.tree_structure = tree
        return tree
    
    def print_tree_ascii(self) -> str:
        output_lines = []
        
        def add_node(node, prefix="", is_last=True):
            connector = "‚îî‚îÄ‚îÄ " if is_last else "‚îú‚îÄ‚îÄ "
            node_type = node.get("type", "")
            
            if node_type == "PROCES":
                title = node.get('title', 'Brak tytu≈Çu')
                output_lines.append(f"üìÇ DRUK NR {self.process_number}: {title[:80]}...")
                doc_date = node.get('document_date', '')
                if doc_date:
                    output_lines.append(f"   Data dokumentu: {doc_date}")
                output_lines.append(f"   Typ dokumentu: {node.get('document_type', 'N/A')}")
                output_lines.append("")
                
                children = node.get("children", [])
                for idx, child in enumerate(children):
                    is_last_child = (idx == len(children) - 1)
                    add_node(child, "", is_last_child)
                    
            elif node_type == "DRUK":
                output_lines.append(f"{prefix}{connector}üìÑ DRUK NR {node.get('number', '?')}")
                title = node.get('title', '')
                if title:
                    output_lines.append(f"{prefix}{'    ' if is_last else '‚îÇ   '}   Tytu≈Ç: {title[:60]}...")
                output_lines.append(f"{prefix}{'    ' if is_last else '‚îÇ   '}   Data dokumentu: {node.get('document_date', 'N/A')}")
                output_lines.append(f"{prefix}{'    ' if is_last else '‚îÇ   '}   Data dostarczenia: {node.get('delivery_date', 'N/A')}")
                
                attachments = node.get("attachments", [])
                for att_idx, att in enumerate(attachments):
                    is_last_att = (att_idx == len(attachments) - 1)
                    att_prefix = prefix + ("    " if is_last else "‚îÇ   ")
                    att_connector = "‚îî‚îÄ‚îÄ " if is_last_att else "‚îú‚îÄ‚îÄ "
                    
                    status = "‚úÖ" if att.get("local_path") else "üîó"
                    output_lines.append(f"{att_prefix}{att_connector}{status} {att.get('filename', '?')}")
                
                output_lines.append("")
            
            elif node_type == "STRONA_WWW":
                output_lines.append(f"{prefix}{connector}üåê DOKUMENTY ZE STRONY WWW")
                
                attachments = node.get("attachments", [])
                for att_idx, att in enumerate(attachments):
                    is_last_att = (att_idx == len(attachments) - 1)
                    att_prefix = prefix + ("    " if is_last else "‚îÇ   ")
                    att_connector = "‚îî‚îÄ‚îÄ " if is_last_att else "‚îú‚îÄ‚îÄ "
                    
                    status = "‚úÖ" if att.get("local_path") else "üîó"
                    output_lines.append(f"{att_prefix}{att_connector}{status} {att.get('filename', '?')}")
                
                output_lines.append("")
        
        for node in self.tree_structure:
            add_node(node)
        
        return "\n".join(output_lines)
    
    def generate_chronological_tree(self) -> str:
        output_lines = []
        output_lines.append("=" * 80)
        output_lines.append("üìÖ DRZEWO CHRONOLOGICZNE")
        output_lines.append("=" * 80)
        output_lines.append("")
        
        events = []
        
        for node in self.tree_structure:
            if node["type"] == "PROCES":
                for child in node.get("children", []):
                    if child.get("type") == "DRUK":
                        doc_date = child.get("document_date", "")
                        delivery_date = child.get("delivery_date", "")
                        
                        if doc_date:
                            events.append({
                                "date": doc_date,
                                "type": "Dokument",
                                "description": f"Druk nr {child.get('number', '?')}: {child.get('title', '')[:50]}...",
                                "attachments": len(child.get("attachments", []))
                            })
                        
                        if delivery_date and delivery_date != doc_date:
                            events.append({
                                "date": delivery_date,
                                "type": "Dostarczenie",
                                "description": f"Dostarczenie druku nr {child.get('number', '?')}",
                                "attachments": 0
                            })
        
        events.sort(key=lambda x: x.get("date", ""))
        
        for event in events:
            output_lines.append(f"üìÜ {event['date']}")
            output_lines.append(f"   [{event['type']}] {event['description']}")
            if event['attachments'] > 0:
                output_lines.append(f"   üìé Za≈ÇƒÖczniki: {event['attachments']}")
            output_lines.append("")
        
        return "\n".join(output_lines)
    
    def save_results(self):
        print("\nüíæ Zapisywanie wynik√≥w...")
        
        json_path = os.path.join(self.output_dir, "process_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump({
                "process": self.process_data,
                "tree": self.tree_structure,
                "attachments": self.attachments,
                "generated_at": datetime.now().isoformat()
            }, f, ensure_ascii=False, indent=2)
        print(f"   ‚úÖ Dane JSON: {json_path}")
        
        tree_path = os.path.join(self.output_dir, "drzewo_struktury.txt")
        with open(tree_path, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("üå≥ DRZEWO STRUKTURY PROCESU LEGISLACYJNEGO\n")
            f.write(f"   Numer druku: {self.process_number}\n")
            f.write(f"   Kadencja: {self.term}\n")
            f.write(f"   Data wygenerowania: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("=" * 80 + "\n\n")
            f.write(self.print_tree_ascii())
        print(f"   ‚úÖ Drzewo struktury: {tree_path}")
        
        chrono_path = os.path.join(self.output_dir, "drzewo_chronologiczne.txt")
        with open(chrono_path, 'w', encoding='utf-8') as f:
            f.write(self.generate_chronological_tree())
        print(f"   ‚úÖ Drzewo chronologiczne: {chrono_path}")
        
        summary_path = os.path.join(self.output_dir, "raport_podsumowujacy.txt")
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("üìä RAPORT PODSUMOWUJƒÑCY\n")
            f.write("=" * 80 + "\n\n")
            f.write(f"Numer druku: {self.process_number}\n")
            f.write(f"Kadencja: {self.term}\n")
            f.write(f"Tytu≈Ç: {self.process_data.get('title', 'N/A')}\n")
            f.write(f"Typ dokumentu: {self.process_data.get('documentType', 'N/A')}\n\n")
            f.write(f"Liczba powiƒÖzanych druk√≥w: {len(self.all_prints)}\n")
            f.write(f"Liczba pobranych za≈ÇƒÖcznik√≥w: {len(self.attachments)}\n\n")
            
            f.write("LINK DO STRONY SEJMU:\n")
            f.write(f"https://www.sejm.gov.pl/Sejm{self.term}.nsf/PrzebiegProc.xsp?nr={self.process_number}\n\n")
            
            f.write("POBRANE ZA≈ÅƒÑCZNIKI:\n")
            f.write("-" * 40 + "\n")
            for att in self.attachments:
                status = "‚úÖ Pobrano" if att.get("local_path") else "‚ùå Nie pobrano"
                f.write(f"  {status}: {att.get('filename', '?')}\n")
                if att.get("local_path"):
                    f.write(f"     Lokalna ≈õcie≈ºka: {att['local_path']}\n")
        
        print(f"   ‚úÖ Raport: {summary_path}")
    
    def run(self):
        print("=" * 80)
        print("üèõÔ∏è  SEJM PROCESS DOWNLOADER")
        print(f"   Pobieranie druku nr {self.process_number} z kadencji {self.term}")
        print("=" * 80)
        
        if not self.fetch_process_info():
            print("\n‚ùå Nie uda≈Ço siƒô pobraƒá informacji o druku.")
            return False
        
        self.build_tree()
        
        print("\n" + "=" * 80)
        print("üå≥ DRZEWO STRUKTURY:")
        print("=" * 80)
        print(self.print_tree_ascii())
        
        print(self.generate_chronological_tree())
        
        self.save_results()
        
        downloaded_count = len([a for a in self.attachments if a.get('local_path')])
        
        print("\n" + "=" * 80)
        print("‚úÖ ZAKO≈ÉCZONO POMY≈öLNIE!")
        print(f"   üìÇ Folder: {os.path.abspath(self.output_dir)}")
        print(f"   üìÑ Pobrano dokument√≥w: {downloaded_count}")
        print("=" * 80)
        
        return True

# Confirmation message printed once the class definition cell has executed.
print("‚úÖ Klasa SejmProcessDownloader za≈Çadowana!")

In [None]:
# Start the download using the values from the configuration cell

downloader = SejmProcessDownloader(TERM, PROCESS_NUMBER, OUTPUT_DIR)

downloader.run()

In [None]:
# Show a summary of the downloaded data

json_path = os.path.join(OUTPUT_DIR, "process_data.json")
if not os.path.exists(json_path):
    print("‚ùå Brak danych - uruchom najpierw kom√≥rkƒô pobierania powy≈ºej.")
else:
    # Read back the JSON file written by save_results().
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("üìä PODSUMOWANIE POBRANYCH DANYCH:")
    print("=" * 50)
    print(f"Tytu≈Ç: {data['process'].get('title', 'N/A')}")
    print(f"Liczba za≈ÇƒÖcznik√≥w: {len(data['attachments'])}")
    print(f"\nData wygenerowania: {data['generated_at']}")

In [None]:
# List the downloaded files

if not os.path.exists(OUTPUT_DIR):
    print(f"‚ùå Folder {OUTPUT_DIR} nie istnieje.")
else:
    print(f"üìÅ Zawarto≈õƒá folderu {OUTPUT_DIR}:")
    print("=" * 50)

    for root, dirs, files in os.walk(OUTPUT_DIR):
        # Nesting depth = number of separators left after removing the base
        # folder from the path; two spaces of indent per level.
        level = root.replace(OUTPUT_DIR, '').count(os.sep)
        indent = '  ' * level
        sub_indent = '  ' * (level + 1)
        print(f'{indent}üìÇ {os.path.basename(root)}/')
        for file in files:
            print(f'{sub_indent}üìÑ {file}')