In [3]:
import spacy
from spacy.tokens import DocBin

def dataset_stats(spacy_paths, lang="uk"):
    # load blank model and add a sentencizer
    nlp = spacy.blank(lang)
    nlp.add_pipe("sentencizer")
    
    total_words = 0
    total_sentences = 0

    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            # count tokens in the original Doc
            total_words += len(doc)
            # run sentencizer on its text to count sentences
            doc_with_sents = nlp(doc.text)
            total_sentences += len(list(doc_with_sents.sents))

    return total_words, total_sentences

if __name__ == "__main__":
    files = ["silver_spacy/UberNER1.spacy", "silver_spacy/UberNER2.spacy"]
    words, sentences = dataset_stats(files, lang="uk")
    print(f"Total words: {words}")
    print(f"Total sentences: {sentences}")

Total words: 45489533
Total sentences: 2573205


In [2]:
import spacy
from spacy.tokens import DocBin
from collections import Counter, defaultdict

def entity_distribution(spacy_paths, lang="uk"):
    nlp = spacy.blank(lang)
    total_counts = Counter()
    unique_entities = defaultdict(set)

    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                total_counts[ent.label_] += 1
                unique_entities[ent.label_].add(ent.text)

    return total_counts, unique_entities

if __name__ == "__main__":
    files = ["silver_spacy/UberNER1.spacy", "silver_spacy/UberNER2.spacy"]
    counts, uniques = entity_distribution(files, lang="uk")

    print("Label     Total Occurrences   Unique Texts")
    print("-----     -----------------   ------------")
    for label, freq in counts.most_common():
        uniq_count = len(uniques[label])
        print(f"{label:10} {freq:18}   {uniq_count}")

    overall_unique = sum(len(texts) for texts in uniques.values())
    print(f"\nOverall distinct entity texts: {overall_unique}")


Label     Total Occurrences   Unique Texts
-----     -----------------   ------------
LOC                   1655906   90676
ORG                    866186   153999
PERS                   572179   127690
JOB                    542881   46418
DATE                   210526   22679
MISC                   136276   36232
ART                    129478   31578
PERIOD                 126979   23839
MON                     87102   34095
QUANT                   49024   18100
PCT                     46694   6180
TIME                    39236   5307
DOC                     30849   17272

Overall distinct entity texts: 614065


In [1]:
import spacy
from spacy.tokens import DocBin
from collections import Counter, defaultdict

def entity_distribution(spacy_paths, lang="uk"):
    nlp = spacy.blank(lang)
    total_counts = Counter()
    unique_entities = defaultdict(set)

    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                total_counts[ent.label_] += 1
                unique_entities[ent.label_].add(ent.text)

    return total_counts, unique_entities

if __name__ == "__main__":
    files = ["silver_spacy/trimmed_UberNER.spacy"]
    counts, uniques = entity_distribution(files, lang="uk")

    print("Label     Total Occurrences   Unique Texts")
    print("-----     -----------------   ------------")
    for label, freq in counts.most_common():
        uniq_count = len(uniques[label])
        print(f"{label:10} {freq:18}   {uniq_count}")

    overall_unique = sum(len(texts) for texts in uniques.values())
    print(f"\nOverall distinct entity texts: {overall_unique}")

Label     Total Occurrences   Unique Texts
-----     -----------------   ------------
LOC                    477252   89334
ORG                    353458   151260
PERS                   246840   126325
JOB                    208195   46194
DATE                    78427   22380
MISC                    58477   33352
MON                     56402   34016
ART                     55929   29538
PERIOD                  48957   23269
QUANT                   29848   18016
DOC                     21806   17046
PCT                     18390   6112
TIME                    16250   5148

Overall distinct entity texts: 601990


In [2]:
import spacy
from spacy.tokens import DocBin
from collections import Counter

def entity_distribution(spacy_paths, lang="uk"):
    nlp = spacy.blank(lang)
    total_counts = Counter()
    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                total_counts[ent.label_] += 1
    return total_counts

if __name__ == "__main__":
    files = ["silver_spacy/UberNER1.spacy", "silver_spacy/UberNER2.spacy"]
    dist = entity_distribution(files, lang="uk")
    print("Combined entity distribution:")
    for label, freq in dist.most_common():
        print(f"{label:10} {freq}")

Combined entity distribution:
LOC        1655906
ORG        866186
PERS       572179
JOB        542881
DATE       210526
MISC       136276
ART        129478
PERIOD     126979
MON        87102
QUANT      49024
PCT        46694
TIME       39236
DOC        30849


In [3]:
import spacy
from spacy.tokens import DocBin
from collections import Counter

def entity_distribution(spacy_paths, lang="uk"):
    nlp = spacy.blank(lang)
    total_counts = Counter()
    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                total_counts[ent.label_] += 1
    return total_counts

if __name__ == "__main__":
    files = ["silver_spacy/UberNER1.spacy"]
    dist = entity_distribution(files, lang="uk")
    print("Combined entity distribution:")
    for label, freq in dist.most_common():
        print(f"{label:10} {freq}")

Combined entity distribution:
LOC        901057
ORG        467809
PERS       308641
JOB        292915
DATE       114050
MISC       73866
ART        70118
PERIOD     68921
MON        46533
QUANT      26368
PCT        24953
TIME       21449
DOC        16629


In [4]:
import spacy
from spacy.tokens import DocBin
from collections import Counter

def entity_distribution(spacy_paths, lang="uk"):
    nlp = spacy.blank(lang)
    total_counts = Counter()
    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                total_counts[ent.label_] += 1
    return total_counts

if __name__ == "__main__":
    files = ["silver_spacy/UberNER2.spacy"]
    dist = entity_distribution(files, lang="uk")
    print("Combined entity distribution:")
    for label, freq in dist.most_common():
        print(f"{label:10} {freq}")

Combined entity distribution:
LOC        754849
ORG        398377
PERS       263538
JOB        249966
DATE       96476
MISC       62410
ART        59360
PERIOD     58058
MON        40569
QUANT      22656
PCT        21741
TIME       17787
DOC        14220


In [2]:
import spacy
from spacy.tokens import DocBin
from collections import Counter, defaultdict

def top_entities(spacy_paths, lang="uk", top_n=5):
    # blank model just for vocab
    nlp = spacy.blank(lang)
    freqs = defaultdict(Counter)

    # count each entity text per label
    for path in spacy_paths:
        docbin = DocBin().from_disk(path)
        for doc in docbin.get_docs(nlp.vocab):
            for ent in doc.ents:
                freqs[ent.label_][ent.text] += 1

    # select top N for each label
    top_per_label = {
        label: counter.most_common(top_n)
        for label, counter in freqs.items()
    }
    return top_per_label

if __name__ == "__main__":
    files = ["silver_spacy/UberNER1.spacy", "silver_spacy/UberNER2.spacy"]
    top5 = top_entities(files, lang="uk", top_n=5)

    for label, items in top5.items():
        print(f"{label}:")
        for text, count in items:
            print(f"  {text} — {count}")

ORG:
  ЗСУ — 34039
  ЄС — 16887
  НАТО — 14430
  СБУ — 11718
  ДСНС — 9327
PERIOD:
  добу — 10296
  минулу добу — 5218
  місяць — 2534
  тиждень — 1837
  рік — 1777
JOB:
  голова — 14621
  військові — 9928
  президент — 9184
  президента — 8832
  Президент — 6981
MISC:
  COVID-19 — 14177
  АТО — 1661
  ООС — 1201
  Другої світової війни — 1187
  БПЛА — 1011
PCT:
  50 % — 1586
  100 % — 1487
  80 % — 1374
  20 % — 1329
  30 % — 1275
LOC:
  України — 129760
  Україні — 89447
  США — 34284
  Україна — 32719
  Україну — 31277
PERS:
  Володимир Зеленський — 11081
  Зеленський — 9817
  Зеленського — 7478
  Путіна — 6240
  Путін — 4708
DATE:
  24 лютого — 3262
  сьогодні — 2609
  2014 році — 1858
  2014 року — 1609
  2021 році — 1596
QUANT:
  120 мм — 358
  15 - 20 м / с — 296
  155 мм — 283
  барель — 272
  га — 249
TIME:
  18:00 — 952
  12:00 — 886
  9:00 — 858
  18.00 — 819
  06.00 — 808
DOC:
  Кримінального кодексу України — 1005
  КК України — 379
  Конституції — 350
  Конституції Україн