In [1]:
import spacy
from spacy.tokens import DocBin

# adjust paths and language
nlp = spacy.blank("uk")

# load each DocBin
db1 = DocBin().from_disk("../data/test.spacy")
db2 = DocBin().from_disk("../data/train.spacy")

# create a new one and add all docs
combined = DocBin()
for db in (db1, db2):
    for doc in db.get_docs(nlp.vocab):
        combined.add(doc)

combined.to_disk("full_dataset.spacy")

In [3]:
import spacy
from spacy.tokens import DocBin
from pathlib import Path

nlp = spacy.blank("uk")
INPUT_TXT = Path("refined.txt")
OUT_DIR   = Path("sent_spacy_bins")
BIN_SIZE  = 150_000  # adjust to taste

# make output dir
OUT_DIR.mkdir(exist_ok=True)

# stream‐in sentences, flush every BIN_SIZE docs
doc_bin = DocBin()
count, file_idx = 0, 0
with INPUT_TXT.open(encoding="utf8") as f:
    for line in f:
        text = line.strip()
        if not text:
            continue
        doc_bin.add(nlp.make_doc(text))
        count += 1
        if count >= BIN_SIZE:
            doc_bin.to_disk(OUT_DIR / f"sentences_{file_idx}.spacy")
            file_idx += 1
            doc_bin = DocBin()
            count = 0

# write any leftovers
if count:
    doc_bin.to_disk(OUT_DIR / f"sentences_{file_idx}.spacy")

print(f"Wrote {file_idx+1} bins to {OUT_DIR}/")

Wrote 17 bins to sent_spacy_bins/


In [4]:
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("uk")

doc_bin1 = DocBin().from_disk("silver_spacy/UberNER1.spacy")
docs1 = list(doc_bin1.get_docs(nlp.vocab))

doc_bin2 = DocBin().from_disk("silver_spacy/UberNER2.spacy")
docs2 = list(doc_bin2.get_docs(nlp.vocab))

docs = docs1 + docs2

with open("UberText-NER_Silver.iob", "w", encoding="utf-8") as f:
    for doc in docs:
        for token in doc:
            if token.ent_type_:
                tag = f"{token.ent_iob_}-{token.ent_type_}"
            else:
                tag = token.ent_iob_
            f.write(f"{token.text} {tag}\n")
        f.write("\n")

In [2]:
import random
import spacy
from spacy.tokens import DocBin
from collections import defaultdict

def build_filtered_dataset(spacy_paths, output_path, lang="uk", no_ent_limit=80000):
    # Load blank model (to get Vocab) and read all docs
    nlp = spacy.blank(lang)
    all_docs = []
    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        all_docs.extend(docbin.get_docs(nlp.vocab))
    
    # 1) Pick one example per unique entity text
    seen = set()
    unique_entity_docs = []
    for doc in all_docs:
        for ent in doc.ents:
            text = ent.text
            if text not in seen:
                seen.add(text)
                unique_entity_docs.append(doc)
                break  # move to next doc after first new entity
    
    # 2) Collect docs with no entities, sample up to no_ent_limit
    no_ent_docs = [doc for doc in all_docs if not doc.ents]
    sampled_no_ent = random.sample(no_ent_docs, min(no_ent_limit, len(no_ent_docs)))
    
    # 3) Write to new DocBin, preserving entity annotations
    new_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for doc in unique_entity_docs + sampled_no_ent:
        new_bin.add(doc)
    new_bin.to_disk(output_path)
    print(f"Wrote {len(unique_entity_docs)} docs with unique entities "
          f"+ {len(sampled_no_ent)} docs without entities to {output_path}")


files = ["silver_spacy/UberNER1.spacy", "silver_spacy/UberNER2.spacy"]
build_filtered_dataset(files, "silver_spacy/trimmed_UberNER.spacy")

Wrote 449742 docs with unique entities + 80000 docs without entities to silver_spacy/trimmed_UberNER.spacy


In [2]:
from datasets import load_dataset

# 1) Load your IOB as plain text
ds = load_dataset(
    "text",
    data_files={"train": "UberText-NER_Silver.iob"},
    split="train",
    keep_linebreaks=True  # keeps blank lines between sentences
)

# 2) Push to Hub, auto-sharded at ~100 MB
ds.push_to_hub(
    "lang-uk/UberText-NER-Silver",
    max_shard_size="100MB"
)

Uploading the dataset shards:   0%|          | 0/9 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/5332 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lang-uk/UberText-NER_Silver/commit/2cbc1a5c5464bd92946b13d147cb5aa0c5e1abf0', commit_message='Upload dataset', commit_description='', oid='2cbc1a5c5464bd92946b13d147cb5aa0c5e1abf0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lang-uk/UberText-NER_Silver', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lang-uk/UberText-NER_Silver'), pr_revision=None, pr_num=None)