In [None]:
import os
from itertools import islice

from datasets import load_dataset, Dataset
from huggingface_hub import login

# Sampling configuration: tune these instead of editing the loop below.
SAMPLE_SIZE = 50_000          # number of streamed documents to materialise
SHUFFLE_SEED = 42             # seed for the streaming shuffle
SHUFFLE_BUFFER_SIZE = 10_000  # size of the shuffle buffer
PROGRESS_EVERY = 10_000       # print a progress line every N loaded documents

# The token is read from the environment so it is never hard-coded in the
# notebook; a missing variable raises KeyError here.
login(token=os.environ['HUGGINGFACE_HUB_TOKEN'])
ds_stream = load_dataset("uonlp/CulturaX", "pl", split="train", streaming=True)
ds_shuffled = ds_stream.shuffle(seed=SHUFFLE_SEED, buffer_size=SHUFFLE_BUFFER_SIZE)

# islice stops after exactly SAMPLE_SIZE items, so no extra document is pulled
# from the stream; counting from 1 makes the progress line report how many
# documents have actually been loaded.
data_list = []
for i, item in enumerate(islice(ds_shuffled, SAMPLE_SIZE), start=1):
    data_list.append(item)
    if i % PROGRESS_EVERY == 0:
        print(f"Loaded {i:,}/{SAMPLE_SIZE:,}...")

ds_small = Dataset.from_list(data_list)

print(f"Created Dataset with {len(ds_small)} samples")
print(f"Type: {type(ds_small)}")

print(ds_small[0])
print(ds_small.features)

In [None]:
def clean_text_function(example):
    """Remove HTML tags and URLs from ``example['text']`` and normalise whitespace.

    The substitutions run in a fixed order: tags, then links, then whitespace
    runs; the result is stripped at both ends. The example dict is updated in
    place and the same object is returned.
    """
    # Local import kept from the original (presumably so the function stays
    # self-contained when it is shipped to worker processes).
    import re

    substitutions = (
        (r'<[^>]+>', ' '),       # HTML tags
        (r'https?://\S+', ' '),  # links
        (r'\s+', ' '),           # runs of whitespace
    )
    cleaned = example['text']
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    example['text'] = cleaned.strip()
    return example

# --- 2. NOWA, BARDZIEJ AGRESYWNA KLASA QualityFilter (Poziom 4) ---
class QualityFilter:
    """Callable predicate that decides whether a text sample is kept.

    An instance is called with an example dict holding a ``'text'`` string and
    returns ``True`` to keep the sample or ``False`` to drop it. A sample is
    dropped when its length is outside the configured bounds, when any of the
    compiled "junk" patterns (e-commerce, lists/SEO, forum/error output,
    garbled text) is found in it, or when the stopword-based language
    heuristic judges it too English or not Polish enough.

    NOTE(review): several alternatives in ``list_seo_pattern`` are line
    anchored (``^...$`` with ``re.MULTILINE``). If the text was previously
    passed through a cleaner that collapses all whitespace to single spaces,
    those alternatives can hardly match any more -- confirm they are still
    intended.
    """

    def __init__(self):
        import re 
        
        # Length bounds in characters; both bounds are exclusive.
        self.MIN_TEXT_LENGTH = 100
        self.MAX_TEXT_LENGTH = 50_000

        # Language heuristics: small stopword lists (unchanged).
        self.COMMON_POLISH = set(
            "w i z na siƒô ≈ºe do nie o po to co ma ale jest jak od "
            "do w tym za o siƒô ale z to i na nie ma po co jak "
            "jest od ≈ºe ten by≈Ç mi lub tak".split()
        )
        self.COMMON_ENGLISH = set(
            "the be to of and a in that have i it for not on with "
            "he as you do at this but his by from they we say her "
            "she or an will my one all would there their what so "
            "up out if about who get which go me".split()
        )

        # --- Aggressive regex filters ---
        
        # E-commerce, classified ads, prices.
        self.ecommerce_pattern = re.compile(
            r'(\b(z≈Ç|PLN|cenie|okazja|SALE|promocja|rabat(u|em)?)\b.*){2,}|' + # threshold lowered to {2,}
            r'(\b(Kod produktu|Kod EAN|Cena netto:|Rozmiar:|Waga:|z≈Ç / m¬≤|Szafa\.pl|Gandalf\.com|Gomez\.pl|Mivo\.pl|KRN\.pl|Morizon\.pl|Gumtree)\b)|' +
            r'(zobacz oferty z \d+ ksiƒôgarni)|' +
            r'\b(OSZCZƒòDZASZ \d+ Z≈Å)\b',
            re.IGNORECASE
        )

        # Lists, aggregators, SEO boilerplate, tags.
        self.list_seo_pattern = re.compile(
            r'(\s*\|.*){5,}|' + 
            r'(^[\s‚Ä¢*-].*$){5,}|' + 
            r'(^((\d+\.)|\w\))\s.*$){5,}|' + 
            r'(üì¢.*){3,}|' +
            r'(\b(TAGI|Tags|Kategoria|Portal|Polecane wpisy|Najnowsze wpisy)\b:?)|' +
            r'(\[podobne:)|(Czytaj wiƒôcej ¬ª)|(Program na dzi≈õ)|(Strona g≈Ç√≥wna ¬ª)',
            re.MULTILINE | re.IGNORECASE
        )

        # Forums, comments, leaked code/runtime errors.
        self.forum_errors_pattern = re.compile(
            r'(\b(Posted|napisa≈Ç|dnia|Autor:|dodany przez|komentarze:|kupi≈Ç/a|Zg≈Ço≈õ komentarz)\b.*){2,}|' + # threshold lowered to {2,}
            r'((\d+ lat temu)|\d{2}:\d{2}:\d{2})|' + 
            r'(\b(Re: |Zobacz wƒÖtek)\b)|' +
            r'\b(Warning: |failed to open stream|No such file or directory)\b|' +
            r'\w{3} \w{3} \d{2}, \d{4}', # e.g. Thu Jun 01, 2017
            re.IGNORECASE
        )
        
        # Gibberish and bad machine translations (catches specific nonsense phrases).
        self.garbled_pattern = re.compile(
            r'\b[bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ]{7,}\b|' +
            r'\b(Bulls Snap Up Improads|trasie Ro≈õlinƒô|≈õmierci pracy|stawaƒá w ogniach|za≈Çadowali w≈Çadzy)\b',
             re.IGNORECASE
        )

    def __call__(self, example):
        """Return ``True`` when ``example['text']`` passes every filter.

        Args:
            example: mapping with a ``'text'`` string.

        Returns:
            bool: ``False`` as soon as one check rejects the text, else ``True``.
        """
        text = example['text']

        # Length gate (exclusive bounds on both sides).
        if not (self.MIN_TEXT_LENGTH < len(text) < self.MAX_TEXT_LENGTH):
            return False

        # Any hit of a junk pattern rejects the sample.
        junk_patterns = (
            self.ecommerce_pattern,
            self.list_seo_pattern,
            self.forum_errors_pattern,
            self.garbled_pattern,
        )
        if any(pattern.search(text) for pattern in junk_patterns):
            return False

        # Language heuristic on lower-cased, whitespace-separated tokens.
        words = text.lower().split()
        if not words:
            return False

        polish_words = sum(1 for word in words if word in self.COMMON_POLISH)
        # Tokens present in BOTH stopword lists ("i", "to", "do") are ordinary
        # Polish words; counting them as English made typical Polish prose
        # exceed the English ratio below. Only tokens exclusive to the English
        # list are treated as English evidence.
        # NOTE(review): other Polish homographs that appear only in the English
        # list (e.g. "a", "on", "my", "by") still count as English.
        english_words = sum(
            1 for word in words
            if word in self.COMMON_ENGLISH and word not in self.COMMON_POLISH
        )
        total_common_words = polish_words + english_words

        # The ratios are only trusted once enough stopwords were seen; this
        # guard also keeps the divisions below away from zero.
        if total_common_words > 10:
            if english_words / total_common_words > 0.15:
                return False
            if polish_words / len(words) < 0.05:
                return False

        return True  # sample is "good"

def add_hash(example):
    """Attach the SHA-256 hex digest of the UTF-8 encoded text under ``'hash'``.

    The example dict is modified in place and returned.
    """
    import hashlib

    digest = hashlib.sha256()
    digest.update(example['text'].encode('utf-8'))
    example['hash'] = digest.hexdigest()
    return example

In [None]:
# Number of worker processes used by the parallel map/filter steps.
NUM_CORES_TO_USE = 12 

print(f"Rozmiar oryginalny (CulturaX sample): {len(ds_small)}")
print(f"Uruchamiam przetwarzanie na {NUM_CORES_TO_USE} rdzeniach.")

# Step 1: text cleaning.
print("Krok 1: Czyszczenie .map()...")
cleaned_ds = ds_small.map(clean_text_function, num_proc=NUM_CORES_TO_USE)

# Step 2: quality filtering with a single filter instance.
print("Krok 2: Zaawansowane filtrowanie (Poziom 4) .filter()...")
quality_filter_instance_v4 = QualityFilter()
filtered_ds = cleaned_ds.filter(quality_filter_instance_v4, num_proc=NUM_CORES_TO_USE)

print(f"Rozmiar po filtrowaniu jako≈õciowym: {len(filtered_ds)} (Usuniƒôto {len(cleaned_ds) - len(filtered_ds)})")

# Step 3: content hashes used as deduplication keys.
print("Krok 3: Obliczanie hashy do deduplikacji...")
ds_with_hashes = filtered_ds.map(add_hash, num_proc=NUM_CORES_TO_USE)

hashes_seen = set()


def _is_first_occurrence(example):
    """Return True the first time a hash is seen and remember it in ``hashes_seen``."""
    h = example['hash']
    if h in hashes_seen:
        return False
    hashes_seen.add(h)
    return True


# Step 4: exact deduplication. This filter is called without num_proc because
# the predicate relies on the single shared ``hashes_seen`` set.
print("Krok 4: Deduplikacja .filter()...")
ds_deduplicated = ds_with_hashes.filter(_is_first_occurrence)
final_ds = ds_deduplicated.remove_columns("hash")

print("\n--- WYNIK FINALNY (Poziom 4, Ultra-agresywny) ---")
print(f"Rozmiar oryginalny: {len(ds_small)}")
print(f"Rozmiar po czyszczeniu, filtrowaniu I deduplikacji: {len(final_ds)}")
print(f"Ca≈Çkowita liczba usuniƒôtych pr√≥bek: {len(ds_small) - len(final_ds)}")

In [None]:
print("\n--- PR√ìBKI 100 PO ZAAWANSOWANYM CZYSZCZENIU ---")

# Show at most the first 100 cleaned samples, numbered from 1.
separator = "-" * 20
for sample_no, sample in enumerate(final_ds, start=1):
    if sample_no > 100:  # stop after sample 100
        break
    print(f"--- PR√ìBKA {sample_no} ---")
    print(sample['text'])
    print(separator)

# Export functions

In [None]:
import json
from datetime import datetime

def export_samples_for_gemini(dataset: "Dataset", num_samples: int = 1000,
                              output_file: str = "samples_for_gemini_full.json"):
    """Write the first ``num_samples`` texts of ``dataset`` to a JSON file.

    The file holds an ``info`` string, a ``metadata`` dict (sample count,
    index range, export timestamp, preprocessing notes) and a ``samples``
    list with ``id`` (1-based), ``text``, ``length`` (characters) and
    ``word_count`` (whitespace-separated tokens) per sample. Summary
    statistics are printed afterwards.

    Args:
        dataset: object supporting ``len()``, ``select(indices)`` and
            iteration over dicts with a ``'text'`` key.
        num_samples: maximum number of leading rows to export. It is clamped
            to the dataset size, so shorter datasets no longer raise.
        output_file: path of the JSON file to write.

    Returns:
        None.
    """
    # Local import: the cell defining this function does not import os itself.
    import os

    print("\n--- EKSPORT PR√ìBEK DLA GEMINI ---")

    sample_count = max(0, min(num_samples, len(dataset)))
    sample_range = dataset.select(range(sample_count))

    samples = [
        {
            "id": position,
            "text": item['text'],
            "length": len(item['text']),
            "word_count": len(item['text'].split()),
        }
        for position, item in enumerate(sample_range, start=1)
    ]

    gemini_data = {
        "info": "Polish text samples from CulturaX dataset for regex analysis",
        "metadata": {
            "total_samples": len(samples),
            # Derived from the actual selection instead of being hard-coded.
            "sample_range": f"0-{len(samples) - 1}" if samples else "empty",
            "export_date": datetime.now().isoformat(),
            "preprocessing": ["HTML tag removal", "whitespace normalization"]
        },
        "samples": samples
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(gemini_data, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ Exported {len(samples)} samples to {output_file}")
    print(f"üìä File size: {os.path.getsize(output_file) / 1024:.1f} KB")

    # Statistics are only defined for a non-empty export (avoids division by
    # zero and min()/max() on an empty list).
    lengths = [s['length'] for s in samples]
    if lengths:
        print(f"üìù Average text length: {sum(lengths) / len(lengths):.0f} chars")

        print(f"\n--- STATYSTYKI PR√ìBEK ---")
        print(f"Najkr√≥tszy tekst: {min(lengths)} znak√≥w")
        print(f"Najd≈Çu≈ºszy tekst: {max(lengths)} znak√≥w")
        print(f"≈örednia d≈Çugo≈õƒá: {sum(lengths) / len(lengths):.0f} znak√≥w")

    print(f"\nüìÅ Plik {output_file} gotowy do wys≈Çania do Gemini!")
    print(f"üí° U≈ºyj prompt: 'Przeanalizuj te polskie teksty i zaproponuj wyra≈ºenia regularne do ekstrakcji danych'")

LLM extraction with Ollama (gemma2 vs. llama3.1)

In [None]:
import json
import ollama
import re
import time
from pydantic import BaseModel, ValidationError
from datasets import Dataset
from tqdm.auto import tqdm

    
# Maps an extraction task key to the instruction text sent to the LLM for that task.
# The keys are also used to validate `task_key` and to build result column names
# in run_llm_on_dataset_sequential.
PROMPT_TASKS = {
    "datetimes": "Wypisz wszystkie daty i godziny (formaty cyfrowe, np. 2024-10-25, 25.10.2024, 14:30, 08:30:15).",
    "verbal_dates": "Wypisz wszystkie daty zapisane s≈Çownie (np. 10 grudnia 2021, 9 sierpnia, marzec 2020). Zwr√≥ƒá ca≈Çe dopasowanie.",
    "emails": "Wypisz wszystkie adresy e-mail (np. ktos@domena.pl).",
    "phones": "Wypisz wszystkie 9-cyfrowe numery telefon√≥w (polskie formaty, np. 123 456 789, 12-345-67-89, (12) 345 67 89).",
    "urls": "Wypisz wszystkie adresy URL i linki (zaczynajƒÖce siƒô od http, https lub www).",
    "pln_amounts": "Wypisz wszystkie kwoty pieniƒô≈ºne podane w polskiej walucie (np. 18.94 z≈Ç, 1 000 PLN, 500 z≈Çotych). Zwr√≥ƒá kwotƒô wraz z walutƒÖ.",
    "iban": "Wypisz wszystkie numery kont bankowych (polski format NRB, 26 cyfr, np. PL 12 3456... lub 123456...).",
    "flexion": "Wypisz wszystkie formy fleksyjne s≈Ç√≥w 'cz≈Çowiek' i 'ludzie' (np. cz≈Çowiek, cz≈Çowieka, ludziom, lud≈∫mi)."
}

def run_llm_extraction(model_name: str, user_prompt: str) -> ExtractionResponse:
    """Ask one Ollama model for matches and return them as a validated response.

    The model is called with a fixed system prompt, JSON output format and
    temperature 0. Its reply is parsed as JSON and validated with
    ``ExtractionResponse.model_validate``.

    This function never raises for request, parsing or validation problems:
    any such failure yields ``ExtractionResponse(matches=[])``. Failures are
    now logged instead of being dropped silently, so an unreachable server or
    a missing model is distinguishable from "no matches found".

    No client timeout is configured here; ``ollama.chat`` is called with its
    defaults.

    NOTE(review): ``ExtractionResponse`` is not defined in any cell visible in
    this notebook; it is presumably a pydantic model with a ``matches`` list
    field. Confirm it is defined before this cell runs, otherwise this ``def``
    itself fails with NameError (the return annotation is evaluated eagerly).

    Args:
        model_name: Ollama model tag, e.g. ``"gemma2:2b"``.
        user_prompt: full user message (task instruction plus the text).

    Returns:
        ExtractionResponse: validated model output, or an empty response on
        any failure.
    """
    # Local import: this cell's import list does not include logging.
    import logging

    SYSTEM_PROMPT = 'Jeste≈õ precyzyjnym, automatycznym ekstraktorem wzorc√≥w. Zwracasz TYLKO i wy≈ÇƒÖcznie poprawny obiekt JSON w formacie {"matches": [...]}. Nie dodawaj ≈ºadnych wyja≈õnie≈Ñ ani markdowna. Bezwzglƒôdnie przestrzegaj REGU≈Å i OGRANICZE≈É podanych przez u≈ºytkownika. Je≈õli w tek≈õcie ABSOLUTNIE nie ma ≈ºadnych dopasowa≈Ñ, zwr√≥ƒá pustƒÖ listƒô: {"matches": []}.'

    logger = logging.getLogger(__name__)

    try:
        response = ollama.chat(
            model=model_name,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt}
            ],
            format="json",
            options={
                'temperature': 0 
            }
        )
        data = json.loads(response['message']['content'])
        return ExtractionResponse.model_validate(data)
    except (json.JSONDecodeError, ValidationError) as e:
        # The model answered, but not with the expected JSON shape. This is an
        # expected best-effort miss, so it is only logged at debug level.
        logger.debug("Unusable output from model %s, returning no matches: %s", model_name, e)
    except Exception as e:
        # Anything else (ollama.ResponseError, connection problems, ...) is an
        # infrastructure failure and is surfaced as a warning. The call still
        # degrades to an empty result, as before.
        logger.warning("LLM call failed for model %s, returning no matches: %r", model_name, e)

    return ExtractionResponse(matches=[])

def run_llm_on_dataset_sequential(dataset: Dataset, task_key: str, num_samples: int = 100) -> Dataset:
    """Run both LLMs over the first rows of ``dataset`` for one extraction task.

    For each selected row the task instruction from ``PROMPT_TASKS`` and the
    row's ``'text'`` (wrapped in triple double quotes) form the user prompt.
    ``gemma2:2b`` is queried first, then ``llama3.1:8b``, one sample at a time.
    The match lists are stored in the columns ``gemma2_<task_key>`` and
    ``llama3_<task_key>``; existing columns with those names are replaced.

    Args:
        dataset: dataset with a ``'text'`` column.
        task_key: key into ``PROMPT_TASKS``.
        num_samples: maximum number of leading rows to process. It is clamped
            to the dataset size so a short dataset no longer makes
            ``select`` fail.

    Returns:
        Dataset: the selected subset with the two result columns added.

    Raises:
        ValueError: if ``task_key`` is not a key of ``PROMPT_TASKS``.
    """
    if task_key not in PROMPT_TASKS:
        raise ValueError(f"Nieprawid≈Çowy klucz zadania: {task_key}.")

    print(f"\n--- Rozpoczynam SEKWENCYJNƒÑ ekstrakcjƒô LLM (Struktura XML, Temp=0) dla zadania: '{task_key}' ---")

    # Clamp the request to what the dataset actually holds.
    sample_count = max(0, min(num_samples, len(dataset)))
    subset = dataset.select(range(sample_count))
    task_rules_and_constraints = PROMPT_TASKS[task_key]

    col_gemma = f"gemma2_{task_key}"
    col_llama = f"llama3_{task_key}"

    results_gemma = []
    results_llama = []

    print(f"Przetwarzanie {len(subset)} pr√≥bek (sekwencyjnie)...")
    for sample in tqdm(subset):
        text = sample['text']

        # Instruction first, then the text delimited by triple double quotes.
        user_prompt = (
            f"{task_rules_and_constraints}\n"
            f"\"\"\"\n"
            f"{text}\n"
            f"\"\"\""
        )

        gemma_result = run_llm_extraction("gemma2:2b", user_prompt)
        llama_result = run_llm_extraction("llama3.1:8b", user_prompt)

        results_gemma.append(gemma_result.matches)
        results_llama.append(llama_result.matches)

    # Drop stale result columns from an earlier run so add_column can succeed.
    stale_columns = [name for name in (col_gemma, col_llama) if name in subset.column_names]
    if stale_columns:
        subset = subset.remove_columns(stale_columns)

    final_subset = subset.add_column(col_gemma, results_gemma)
    final_subset = final_subset.add_column(col_llama, results_llama)

    print(f"Przetwarzanie '{task_key}' zako≈Ñczone.")
    return final_subset

Helpers

In [None]:
import os
import json

def export_text_batches(dataset, batch_size=100, out_dir="batches", prefix="batch"):
    """Write the ``'text'`` field of ``dataset`` to numbered JSON batch files.

    Rows are split into consecutive chunks of ``batch_size``; chunk ``k``
    (1-based) is written to ``<out_dir>/<prefix>_<k>.json`` as a list of
    ``{"text": ...}`` records. ``out_dir`` is created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    total = len(dataset)

    for batch_no, start in enumerate(range(0, total, batch_size), start=1):
        row_ids = range(start, min(start + batch_size, total))

        # Try the column-wise ``select`` path first; on any failure fall back
        # to plain per-row indexing.
        try:
            texts = dataset.select(row_ids)['text']
        except Exception:
            texts = [dataset[row]['text'] for row in row_ids]

        records = [{"text": value} for value in texts]
        target = os.path.join(out_dir, f"{prefix}_{batch_no}.json")
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(records, handle, ensure_ascii=False, indent=2)
        print(f"Wrote {len(records)} records -> {target}")

# Export the texts in batches of 100 into the "batches" directory.
# NOTE(review): `ds_final_analysis` is not defined in any cell visible in this
# notebook, so this line fails on a fresh kernel unless it is created elsewhere
# -- confirm which dataset is meant here.
export_text_batches(ds_final_analysis, batch_size=100, out_dir="batches")

In [None]:
import os
import json
import re

def merge_batches_with_golden(batch_dir="batches",
                              golden_dir="golden_batches",
                              merged_dir="merged",
                              batch_prefix="batch",
                              golden_prefix="golden_batch"):
    """Pair every batch file with its golden file and write the merged records.

    Files named ``<batch_prefix>_<n>.json`` in ``batch_dir`` are processed in
    ascending numeric order. For each one, ``<golden_prefix>_<n>.json`` is read
    from ``golden_dir`` when it exists. Records are paired by position: the
    merged record is a copy of the batch record (or an empty dict past the end
    of the batch) with a ``"golden"`` key added when the golden record at that
    position is truthy. The result is written to
    ``<merged_dir>/merged_<batch_prefix>_<n>.json``.
    """
    os.makedirs(merged_dir, exist_ok=True)

    name_pattern = re.compile(rf"{re.escape(batch_prefix)}_(\d+)\.json$")

    # Collect (numeric index, index as written, file name) for matching files.
    numbered = []
    for candidate in os.listdir(batch_dir):
        hit = name_pattern.match(candidate)
        if hit:
            numbered.append((int(hit.group(1)), hit.group(1), candidate))

    if not numbered:
        print("Nie znaleziono plik√≥w batch w:", batch_dir)
        return

    numbered.sort(key=lambda entry: entry[0])

    for _, idx, batch_name in numbered:
        with open(os.path.join(batch_dir, batch_name), "r", encoding="utf-8") as handle:
            batch_records = json.load(handle)

        # A missing golden file is treated as "no golden records".
        golden_records = []
        golden_path = os.path.join(golden_dir, f"{golden_prefix}_{idx}.json")
        if os.path.exists(golden_path):
            with open(golden_path, "r", encoding="utf-8") as handle:
                golden_records = json.load(handle)

        n_batch, n_golden = len(batch_records), len(golden_records)
        merged_records = []
        for pos in range(max(n_batch, n_golden)):
            combined = dict(batch_records[pos]) if pos < n_batch else {}
            golden_item = golden_records[pos] if pos < n_golden else {}
            if golden_item:
                combined["golden"] = golden_item
            merged_records.append(combined)

        out_fname = os.path.join(merged_dir, f"merged_{batch_prefix}_{idx}.json")
        with open(out_fname, "w", encoding="utf-8") as handle:
            json.dump(merged_records, handle, ensure_ascii=False, indent=2)

        print(f"Merged {len(merged_records)} records -> {out_fname}")

# Merge the exported batches with their golden counterparts into "merged".
merge_batches_with_golden(
    batch_dir="batches",
    golden_dir="golden_batches",
    merged_dir="merged",
)