#### Group by document

In [1]:
import os
import pandas as pd
import json

# Directory that holds the dataset files; the empty string resolves to the
# current working directory.
base_data_path = ""
train_data_path = os.path.join(base_data_path, "subtask_a_train.json")
dev_data_path = os.path.join(base_data_path, "subtask_a_dev.json")

# The *_data_path variables already include base_data_path, so they are opened
# directly: joining the base a second time duplicates the directory as soon as
# base_data_path is a non-empty relative path (e.g. "data" -> "data/data/...").
# The encoding is passed explicitly so the result does not depend on the
# platform's locale default.
with open(train_data_path, encoding="utf-8") as f:
    train_data = json.load(f)

with open(dev_data_path, encoding="utf-8") as f:
    dev_data = json.load(f)

# document_id -> list of training entries, in file order.
group_by_doc = {}
for entry in train_data['data']:
    group_by_doc.setdefault(entry['document_id'], []).append(entry)

# Number of entries per document.
for k, v in group_by_doc.items():
    print(k, len(v))

doc_agropoli_09 50
doc_agropoli_13 15
doc_auletta_01 37
doc_auletta_04 32
doc_auletta_13 41
doc_battipaglia_02 15
doc_battipaglia_13 35
doc_capaccio_06 54
doc_capaccio_10 91
doc_capaccio_15 53
doc_capaccio_21 65
doc_capaccio_27 26
doc_capaccio_28 35
doc_caserta_01 12
doc_caserta_02 116
doc_caserta_06 52
doc_eboli_02 63
doc_fisciano_02 58
doc_francavillais_02 26
doc_francavillais_09 46
doc_gragnano_02 16
doc_gragnano_03 25
doc_marigliano_01 32
doc_nocerainferiore_06 55
doc_nocerainferiore_15 12
doc_nola_01 24
doc_nola_02 49
doc_nola_05 9
doc_pellezzano_01 51
doc_pellezzano_02 27
doc_poggiomarino_01 58
doc_poggiomarino_02 41
doc_poggiomarino_12 70
doc_praiano_02 47
doc_praiano_03 15
doc_praiano_04 27
doc_praiano_05 50
doc_praiano_07 44
doc_praiano_13 22
doc_prataprincipatodiultra_01 13
doc_prataprincipatodiultra_02 14
doc_salerno_02 13
doc_salerno_03 21
doc_salerno_05 102
doc_salerno_06 42
doc_salerno_12 80
doc_santagnello_16 35
doc_santagnello_19 13
doc_santegidiodelmontealbino_03 81
do

#### Count words per document

In [2]:
def group_entries_by_document(data):
    """Bucket the records in ``data['data']`` by their ``document_id``.

    Args:
        data: Mapping whose ``'data'`` value is an iterable of records; each
            record is indexed by ``'document_id'``, ``'paragraph_id'`` and
            ``'sentence_id'``.

    Returns:
        dict mapping every document id (in first-seen order) to the list of
        its records, ordered by ``paragraph_id`` and then ``sentence_id``.
    """
    by_document = {}
    for record in data['data']:
        by_document.setdefault(record['document_id'], []).append(record)

    def reading_order(record):
        # Paragraph first, sentence second.
        return record['paragraph_id'], record['sentence_id']

    for records in by_document.values():
        records.sort(key=reading_order)
    return by_document

def aggregate_per_document(data):
    """Return the sentence texts of every document.

    Args:
        data: Dataset mapping accepted by ``group_entries_by_document``.

    Returns:
        dict mapping each document id to the list of ``sentence_text`` values
        of its entries, in the order produced by
        ``group_entries_by_document``.
    """
    return {
        doc_id: [record['sentence_text'] for record in records]
        for doc_id, records in group_entries_by_document(data).items()
    }

# document id -> list of sentence texts.
train_grouped = aggregate_per_document(train_data)

# Whitespace-separated token count of every document, in train_grouped order.
word_counts_per_doc = []
for doc_id, sentences in train_grouped.items():
    word_count = 0
    for sentence in sentences:
        word_count += len(sentence.split())
    word_counts_per_doc.append(word_count)
    print(doc_id, word_count)

# Overall dataset statistics
total_words = sum(word_counts_per_doc)
print("\n=== Dataset Statistics ===")
print(f"Total documents: {len(word_counts_per_doc)}")
print(f"Average words per document: {total_words / len(word_counts_per_doc):.2f}")
print(f"Max words per document: {max(word_counts_per_doc)}")
print(f"Min words per document: {min(word_counts_per_doc)}")
print(f"Total words in dataset: {total_words}")

# Distribution statistics (the list is sorted in place from here on)
word_counts_per_doc.sort()
n = len(word_counts_per_doc)
middle = n // 2
if n % 2 == 1:
    median = word_counts_per_doc[middle]
else:
    # Even number of documents: average the two central values.
    median = (word_counts_per_doc[middle - 1] + word_counts_per_doc[middle]) / 2
# Quartiles are taken as the elements at index n//4 and 3n//4 of the sorted
# list (no interpolation).
q1 = word_counts_per_doc[n // 4]
q3 = word_counts_per_doc[3 * n // 4]

print(f"Median words per document: {median}")
print(f"25th percentile (Q1): {q1}")
print(f"75th percentile (Q3): {q3}")
print(f"Interquartile range (IQR): {q3 - q1}")

# Distribution bins: each bin is closed on its upper edge.
bins = [0, 50, 100, 200, 500, 1000, float('inf')]
bin_labels = ['0-50', '51-100', '101-200', '201-500', '501-1000', '1000+']
bin_counts = [0] * len(bin_labels)

upper_edges = bins[1:]
for count in word_counts_per_doc:
    # First bin whose upper edge is not exceeded; the final edge is infinite,
    # so a match always exists.
    slot = next(i for i, edge in enumerate(upper_edges) if count <= edge)
    bin_counts[slot] += 1

print("\n=== Word Count Distribution ===")
for label, docs_in_bin in zip(bin_labels, bin_counts):
    percentage = (docs_in_bin / n) * 100
    print(f"{label} words: {docs_in_bin} documents ({percentage:.1f}%)")

doc_agropoli_09 783
doc_agropoli_13 350
doc_auletta_01 390
doc_auletta_04 312
doc_auletta_13 880
doc_battipaglia_02 698
doc_battipaglia_13 830
doc_capaccio_06 741
doc_capaccio_10 922
doc_capaccio_15 501
doc_capaccio_21 524
doc_capaccio_27 502
doc_capaccio_28 772
doc_caserta_01 181
doc_caserta_02 887
doc_caserta_06 742
doc_eboli_02 910
doc_fisciano_02 870
doc_francavillais_02 792
doc_francavillais_09 781
doc_gragnano_02 82
doc_gragnano_03 641
doc_marigliano_01 912
doc_nocerainferiore_06 575
doc_nocerainferiore_15 108
doc_nola_01 122
doc_nola_02 792
doc_nola_05 131
doc_pellezzano_01 918
doc_pellezzano_02 660
doc_poggiomarino_01 732
doc_poggiomarino_02 667
doc_poggiomarino_12 733
doc_praiano_02 594
doc_praiano_03 60
doc_praiano_04 317
doc_praiano_05 806
doc_praiano_07 842
doc_praiano_13 351
doc_prataprincipatodiultra_01 118
doc_prataprincipatodiultra_02 513
doc_salerno_02 382
doc_salerno_03 591
doc_salerno_05 877
doc_salerno_06 814
doc_salerno_12 306
doc_santagnello_16 908
doc_santagnello

#### Paragraph analysis

In [3]:
def aggregate_per_paragraph(data):
    """Return every document's sentence texts, split into paragraphs.

    Entries come from ``group_entries_by_document``, which sorts each
    document's entries by ``paragraph_id`` and then ``sentence_id``; a new
    paragraph is therefore started whenever ``paragraph_id`` changes between
    consecutive entries.

    Args:
        data: Dataset mapping accepted by ``group_entries_by_document``.

    Returns:
        dict mapping each document id to a list of paragraphs, each paragraph
        being the list of ``sentence_text`` values of its entries.
    """
    # Unique "no paragraph opened yet" marker. Unlike None it can never equal
    # a real paragraph_id, so the first entry always opens a paragraph even
    # when its paragraph_id is None (previously that case hit [-1] on an empty
    # list and raised IndexError).
    no_paragraph_yet = object()

    grouped_data = group_entries_by_document(data)
    aggregated_data_per_paragraph = {}
    for doc_id, entries in grouped_data.items():
        paragraphs = []
        current_paragraph_id = no_paragraph_yet
        for entry in entries:
            if entry['paragraph_id'] != current_paragraph_id:
                current_paragraph_id = entry['paragraph_id']
                paragraphs.append([])  # Start a new paragraph
            paragraphs[-1].append(entry['sentence_text'])
        aggregated_data_per_paragraph[doc_id] = paragraphs

    return aggregated_data_per_paragraph

# NOTE: this rebinds train_grouped. It now maps document id -> list of
# paragraphs -> list of sentence texts (it held document id -> sentence texts
# after the word-count cell).
train_grouped = aggregate_per_paragraph(train_data)
for doc_id, paragraphs in train_grouped.items():
    print(doc_id)
    for sentences in paragraphs:
        # Paragraph size in characters (summed len() of each sentence text),
        # not in words.
        paragraph_chars = sum(map(len, sentences))
        print(" - ", paragraph_chars)

doc_agropoli_09
 -  307
 -  284
 -  969
 -  31
 -  747
 -  489
 -  193
 -  62
 -  170
 -  179
 -  312
 -  62
 -  80
 -  333
 -  198
 -  67
 -  202
 -  702
 -  218
doc_agropoli_13
 -  2225
 -  20
doc_auletta_01
 -  38
 -  260
 -  12
 -  55
 -  1219
 -  50
 -  220
 -  150
 -  808
doc_auletta_04
 -  88
 -  87
 -  98
 -  149
 -  442
 -  269
 -  344
 -  258
 -  347
 -  70
doc_auletta_13
 -  46
 -  1232
 -  1335
 -  1495
 -  948
 -  604
 -  217
doc_battipaglia_02
 -  65
 -  268
 -  620
 -  270
 -  418
 -  489
 -  774
 -  544
 -  299
 -  24
 -  974
doc_battipaglia_13
 -  2198
 -  263
 -  251
 -  1395
 -  363
 -  835
 -  248
 -  542
doc_capaccio_06
 -  80
 -  103
 -  1713
 -  79
 -  271
 -  450
 -  512
 -  1124
 -  248
 -  89
 -  328
doc_capaccio_10
 -  432
 -  411
 -  470
 -  441
 -  510
 -  341
 -  240
 -  79
 -  776
 -  767
 -  685
 -  455
 -  167
 -  751
 -  314
doc_capaccio_15
 -  30
 -  71
 -  82
 -  185
 -  556
 -  441
 -  181
 -  577
 -  517
 -  98
 -  70
 -  145
 -  482
doc_capaccio_2

### Check gold terms 

#### 1. Term Occurrences
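→ Counts every annotated term occurrence (token level), bucketed by its length in words; this shows how often long, multi-word terms appear in the annotated corpus.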

In [16]:
from collections import Counter

# Occurrence-level tally: every entry of every row's "term_list" is counted,
# keyed by its length bucket ("1", "2", "3" or "4+" whitespace-separated words).
term_len_counter = Counter()

for row in train_data["data"]:
    terms = row.get("term_list", [])
    if not isinstance(terms, list):
        # Rows whose term_list is not a list are skipped.
        continue

    for term in terms:
        term = str(term).strip()
        if not term:
            continue
        n_words = len(term.split())
        bucket = "4+" if n_words >= 4 else str(n_words)  # "1", "2", "3"
        term_len_counter[bucket] += 1

total_terms = sum(term_len_counter.values())

print("\n=== Distribution of term length (in words) ===")
if total_terms == 0:
    print("No terms found in train_data['data']. Check the JSON or filtering.")
else:
    # total_terms is non-zero in this branch, so the division needs no
    # further guard.
    for bucket in ["1", "2", "3", "4+"]:
        count = term_len_counter.get(bucket, 0)
        perc = count / total_terms * 100
        print(f"{bucket} word(s): {count} terms ({perc:.1f}%)")





=== Distribution of term length (in words) ===
1 word(s): 984 terms (44.4%)
2 word(s): 480 terms (21.6%)
3 word(s): 466 terms (21.0%)
4+ word(s): 288 terms (13.0%)


#### 2. Distribution of unique term types (vocabulary-level)

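→ Counts each distinct term only once (type level), bucketed by its length in words; this shows how long a typical term in the vocabulary is.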

In [17]:
from collections import Counter

# Distinct stripped term strings found in any row's "term_list".
unique_terms = set()

for row in train_data["data"]:
    terms = row.get("term_list", [])
    if not isinstance(terms, list):
        # Rows whose term_list is not a list are skipped.
        continue
    for term in terms:
        term = str(term).strip()
        if term:
            unique_terms.add(term)

# Type-level tally: each distinct term is counted once, keyed by its length
# bucket ("1", "2", "3" or "4+" whitespace-separated words).
type_len_counter = Counter()
for t in unique_terms:
    n_words = len(t.split())
    bucket = "4+" if n_words >= 4 else str(n_words)
    type_len_counter[bucket] += 1

total_types = sum(type_len_counter.values())

print("\n=== Distribution of UNIQUE term types (in words) ===")
if total_types == 0:
    print("No unique terms found.")
else:
    # total_types is non-zero in this branch, so the division needs no
    # further guard.
    for bucket in ["1", "2", "3", "4+"]:
        count = type_len_counter.get(bucket, 0)
        perc = count / total_types * 100
        print(f"{bucket} word(s): {count} types ({perc:.1f}%)")



=== Distribution of UNIQUE term types (in words) ===
1 word(s): 165 types (23.1%)
2 word(s): 175 types (24.5%)
3 word(s): 151 types (21.2%)
4+ word(s): 222 types (31.1%)
