In [11]:
import os
from pathlib import Path

# Anchor for every path used in this notebook: the directory one level above
# the kernel's current working directory.
# NOTE(review): this presumably resolves to the project root only when the
# notebook is started from a direct sub-folder of it — confirm.
base_dir = Path.cwd().parent

In [12]:
from transformers import AutoTokenizer
from src.utils.config_loader import load_config
from src.data.preprocessing import create_df
import pandas as pd

# Model settings are read from the YAML file sitting directly under base_dir.
config_path = base_dir / 'model_params.yaml'
config = load_config(config_path)

# The tokenizer checkpoint is the one named under model -> model_name in the config.
model_name = config['model']['model_name']
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the working DataFrame from the annotated CoNLL-U file; the rendered
# output below shows one row per sentence with `words` and `labels` list columns.
conllu_path = base_dir / 'data/my_data/all_regplans.conllu'
data_df = create_df(conllu_path)

data_df

Unnamed: 0,words,labels
0,"[Detaljregulering, ,, Solstad, boligfelt, ,, E...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[Reguleringsbestemmelser, for, Støperitomta, P...","[O, O, O, O, O, O]"
2,"[PLANBESTEMMELSER, Detaljregulering, for, Kjet...","[O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,"[Reguleringsbestemmelser, for, Krossen, KROSSE...","[O, O, O, O, O, O, O, O, O]"
4,"[REGULERINGSBESTEMMELSER, ,, DETALJREGULERING,...","[O, O, O, O, O, O, O]"
...,...,...
2369,"[Det, skal, takast, kontakt, med, Universitets...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2370,"[Utgifter, i, samband, med, den, arkeologiske,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2371,"[10, ., FØRESEGNER]","[O, O, O]"
2372,"[TIL, DETALJREGULERING, HITTUN]","[O, O, O]"


In [13]:
def iter_entity_lengths(labels):
    """Yield the token length of every entity span in one BIO-tagged sentence.

    A span opens on a ``B-<type>`` tag and is extended by each directly
    following ``I-<type>`` tag of the *same* type. Any other tag closes the
    open span: another ``B-`` tag (which immediately opens a new span), an
    ``I-`` tag of a different type, or a non-entity tag such as ``O``.

    An ``I-`` tag that does not continue an open span of its own type is
    treated as an orphan and is not counted as an entity, so a malformed
    ``B-X I-Y`` sequence yields a single span of length 1 instead of being
    merged into one span of length 2.

    Parameters
    ----------
    labels : iterable of str
        Tag sequence for one sentence.

    Yields
    ------
    int
        Number of tokens in each entity span, in order of appearance.
    """
    open_type = None  # entity type of the span being measured, None when outside a span
    span_length = 0
    for label in labels:
        if label.startswith('B-'):
            # A new entity begins; flush the previous one if two entities are adjacent.
            if span_length > 0:
                yield span_length
            open_type, span_length = label[2:], 1
        elif label.startswith('I-') and span_length > 0 and label[2:] == open_type:
            span_length += 1
        else:
            # 'O', an orphan 'I-', or an 'I-' of another type: the open span ends here.
            if span_length > 0:
                yield span_length
            open_type, span_length = None, 0
    # A sentence may end while a span is still open.
    if span_length > 0:
        yield span_length


# Flatten the per-sentence span lengths into one list covering the whole corpus.
entity_lengths = [
    length
    for labels in data_df['labels']
    for length in iter_entity_lengths(labels)
]

entities_df = pd.DataFrame(entity_lengths, columns=['length'])
entities_df.describe()

Unnamed: 0,length
count,739.0
mean,1.14885
std,0.525278
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,7.0


In [14]:
# Keep only spans whose length lies in the inclusive range 1..7, then tabulate
# how many spans there are of each length, ordered by length.
length_in_range = entities_df['length'].between(1, 7)
filtered_lengths = entities_df.loc[length_in_range]
distribution = filtered_lengths['length'].value_counts().sort_index()

# Turn the counts Series into a two-column frame: span length and its frequency.
distribution_df = distribution.rename_axis('length').reset_index(name='freq')

distribution_df

Unnamed: 0,length,freq
0,1,668
1,2,40
2,3,28
3,5,2
4,7,1
