In [3]:
import sys

# Put the parent directory at the front of the import path so that the
# `src` package imported in the next cell can be resolved.
# NOTE(review): ".." is relative to the kernel's working directory —
# presumably the notebook's own folder; confirm before running from elsewhere.
sys.path.insert(0, "..")

In [4]:
import pandas as pd
import numpy as np
from collections import Counter

from src.scrapper import parse_conllu_file
from src.visualization import plot_frequency_of_

In [5]:
# Parse the training split of the CoNLL-U corpus.
# NOTE(review): despite the `df_` prefix, this value is passed to helpers
# annotated as list[list[tuple]] (sentences of (word, tag) pairs), so it is
# presumably not a pandas DataFrame — confirm against parse_conllu_file.
df_train = parse_conllu_file("../data/en_eslspok-ud-train.conllu")

In [6]:
def get_stats(dataset: list[list[tuple]]) -> None:
    """Print summary statistics about the sentence lengths in ``dataset``.

    The report contains the number of sentences, the rounded mean length,
    the minimum and maximum length, and the 25th/50th/75th percentiles.

    Args:
        dataset: A list of sentences, each sentence being a list of tuples
            (one tuple per token). Only the number of tuples per sentence
            is used here.

    Returns:
        None. The statistics are written to standard output.
    """
    sentence_lengths = [len(sentence) for sentence in dataset]

    print(f"Total sentences: {len(dataset)}")

    if not sentence_lengths:
        # With no sentences, min()/max() raise ValueError and np.mean is NaN,
        # so there is nothing meaningful left to report.
        return

    print(f"Average sentence length: {round(np.mean(sentence_lengths))}")
    print(f"Minimum sentence length: {min(sentence_lengths)}")
    print(f"Maximum sentence length: {max(sentence_lengths)}")
    # The label spelling is kept exactly as before so that re-running the
    # notebook reproduces the previously recorded output.
    for q in (25, 50, 75):
        print(f"Percentile {q}, lenght: {np.percentile(sentence_lengths, q)}")

In [7]:
# Print sentence-count and sentence-length statistics for the training data.
get_stats(df_train)

Total sentences: 1856
Average sentence length: 9
Minimum sentence length: 2
Maximum sentence length: 48
Percentile 25, lenght: 5.0
Percentile 50, lenght: 7.0
Percentile 75, lenght: 11.0


In [8]:
def build_counts(dataset: list[list[tuple]]) -> tuple[Counter, Counter, Counter]:
    """Tally words, tags, and word/tag pairs across every sentence.

    Args:
        dataset: A list of sentences, each a list of ``(word, tag)`` tuples.

    Returns:
        A ``(word_counts, tag_counts, pair_counts)`` tuple of ``Counter``
        objects. ``pair_counts`` is keyed by the string ``"(word,tag)"``.
    """
    # Flatten once so all three counters see the tokens in the same order.
    tokens = [(token, label) for sent in dataset for token, label in sent]

    words = Counter(token for token, _ in tokens)
    tags = Counter(label for _, label in tokens)
    pairs = Counter(f"({token},{label})" for token, label in tokens)

    return words, tags, pairs

In [9]:
# Count occurrences of each word, each tag, and each "(word,tag)" pair in the
# training data.
train_word_counts, train_tag_counts, train_pair_counts = build_counts(df_train)

In [10]:
# Plot the word counts of the training data.
# NOTE(review): plot_frequency_of_ comes from src.visualization; the first
# argument is presumably a label used in the figure — confirm.
plot_frequency_of_("words", train_word_counts)

In [11]:
# Plot the tag counts of the training data.
plot_frequency_of_("tags", train_tag_counts)

In [12]:
# Plot the counts of "(word,tag)" pairs in the training data.
plot_frequency_of_("word pair tag", train_pair_counts)

More ideas:
* Plot the most frequent words for each tag
* Expand the EDA once all datasets have been chosen