In [None]:
from datasets import load_from_disk
import string
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the saved dataset, relative to this notebook.
dataset_dir = "../data/datasets/pl/text/"
ds = load_from_disk(dataset_path=dataset_dir)

In [None]:
def tagger(item):
    """Attach simple length statistics to a single record.

    Reads ``item["content"]`` and stores three new keys on the same mapping:

    * ``chars`` - number of characters in the text,
    * ``num_dummy_tokens`` - number of whitespace-separated chunks,
    * ``num_non_ws_tokens`` - number of those chunks that contain at least one
      character outside ``string.punctuation`` (ASCII punctuation only).

    Returns the same ``item`` object, mutated in place.
    """
    content = item["content"]
    chunks = content.split()

    # A chunk is skipped only when every one of its characters is ASCII punctuation.
    meaningful_chunks = [
        chunk
        for chunk in chunks
        if not all(symbol in string.punctuation for symbol in chunk)
    ]

    item["chars"] = len(content)
    item["num_dummy_tokens"] = len(chunks)
    item["num_non_ws_tokens"] = len(meaningful_chunks)
    return item


# Annotate every record with the statistics computed by `tagger`,
# spreading the work over 20 worker processes.
ds = ds.map(function=tagger, num_proc=20)
# Drop cache files in the dataset's cache directory that are no longer in use.
ds.cleanup_cache_files()

In [None]:
# Pull only the columns needed for the length statistics into pandas,
# switch to pyarrow-backed dtypes, and store `type` as a categorical.
stat_columns = ["_id", "type", "chars", "num_dummy_tokens", "num_non_ws_tokens"]
stats = ds.select_columns(stat_columns).to_pandas()
stats = stats.convert_dtypes(dtype_backend="pyarrow")
stats = stats.astype({"type": "category"})
stats.head()

In [None]:
# Histogram of the per-record `num_non_ws_tokens` counts on a logarithmic x-axis.
# NOTE(review): records with a count of 0 cannot be placed on a log axis —
# confirm the dataset has none, or filter them out before plotting.
non_punct_counts = stats["num_non_ws_tokens"]
hist_options = {"log_scale": True, "bins": 50}
ax = sns.histplot(x=non_punct_counts, **hist_options)
ax.set(title="#tokens distribution")

In [None]:
# Count records per `type` once; the resulting order (most frequent first)
# is kept in `card_order`, and the counts are drawn as a horizontal bar chart
# with a logarithmic count axis.
type_counts = stats["type"].value_counts()
card_order = type_counts.index.tolist()
data = type_counts.plot.barh(logx=True, title="Types cardinality")

In [None]:
# One boxen plot of `num_non_ws_tokens` per `type`, on a logarithmic x-axis,
# with the types listed in the order given by `card_order`.
# A faceted `sns.displot(..., col="type", kind="hist")` is an alternative view
# of the same data.
_, ax = plt.subplots(figsize=(8, 12))
ax.set(title="Per type text length distribution")
# Pass `ax` explicitly so the plot is bound to the figure created above
# rather than to whatever the implicit current axes happen to be.
sns.boxenplot(
    data=stats,
    y="type",
    x="num_non_ws_tokens",
    order=card_order,
    log_scale=True,
    ax=ax,
)

# Tokenize with the model tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large")


def tokenize_batch(examples):
    """Tokenize a batch of ``content`` strings without padding or truncation."""
    return tokenizer(examples["content"], padding=False, truncation=False)


# Batched tokenization spread over 20 worker processes; the tokenizer output
# is added to the dataset as new columns.
ds = ds.map(tokenize_batch, batched=True, num_proc=20)

In [None]:
# One {"num_tokens": ...} record per dataset row, holding the length of that
# row's `input_ids` sequence.
tokenized = [{"num_tokens": len(row["input_ids"])} for row in ds]

In [None]:
# Distribution of tokenizer-level lengths.
# `tokenized` is a list of {"num_tokens": int} records. Handing that list to
# seaborn as-is makes it interpret the input as wide-form data, with each
# record treated as a separate variable, so flatten it into a single vector
# of counts and pass it explicitly as `x`.
token_counts = [record["num_tokens"] for record in tokenized]
ax = sns.histplot(x=token_counts, bins=50)
ax.set(title="#tokens distribution (model tokenizer)", xlabel="num_tokens")