# Preparation of the train, val, and test datasets

## Importando pacotes

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd


## Carregando dados

In [None]:
# Base directory used for the input JSON file and for every CSV written by this notebook
folder = './'

In [None]:
# Read the annotation export as JSON Lines (one JSON record per line)
df = pd.read_json(f"{folder}/doccano.json", lines=True)
df

## Preparando dados

In [None]:
# Rebuild the entities from the spans: every [start, end, label] span is
# expanded to [start, end, covered text, label]
sents = []
for text, labels in zip(df['text'], df['label']):
    ner = [[span[0], span[1], text[span[0]:span[1]], span[2]] for span in labels]
    # Rows without any span keep '' as a placeholder in the 'ner' field
    sents.append({'text': text, 'ner': ner if len(labels) > 0 else ''})

df_sents = pd.DataFrame(sents)
df_sents

In [None]:
# Keep only the rows whose 'ner' field is not the '' placeholder, renumbered from zero
df_sents_anot = df_sents.loc[df_sents['ner'] != ''].reset_index(drop=True)
df_sents_anot

In [None]:
# Inspect the sentences without labels (their 'ner' field is the '' placeholder)
df_sents_sem_entidade = df_sents.loc[df_sents['ner'] == ''].reset_index(drop=True)
df_sents_sem_entidade


In [None]:
# Write the sentences without labels to a ';'-separated CSV, leaving the index column out
df_sents_sem_entidade.to_csv(f"{folder}/sents_sem_anotacao.csv", index=False, sep=';')

In [None]:
# Collect every entity label and count how often each one occurs
def count_labels(df):
    """Count how many times each entity label appears in ``df['ner']``.

    Args:
        df: DataFrame with a ``ner`` column. Each cell holds a list of
            ``[start, end, token, label]`` entries, or an empty value
            (``''`` or ``[]``) for a row without entities.

    Returns:
        The result of ``DataFrame.value_counts()`` on a single ``label``
        column: a Series of occurrence counts indexed by label, most
        frequent first.
    """
    # Iterate over the column values instead of looking rows up with
    # ``df.loc[idx, ...]``, so the frame does not need a 0..n-1 index.
    # Empty cells simply contribute no items.
    labels = [item[3] for ner in df['ner'] for item in ner]

    df_counts = pd.DataFrame()
    df_counts['label'] = labels
    return df_counts.value_counts()


In [None]:
# Label frequencies over all annotated sentences
count_labels(df_sents_anot)

## Criando train, val, e test datasets

In [None]:
# First hold out 10% of the annotated sentences for test, then take 20% of
# what is left for validation (same seed for both splits)
train, test = train_test_split(df_sents_anot, random_state=42, test_size=0.1)
train, val = train_test_split(train, random_state=42, test_size=0.2)

# Renumber every split from zero; code that looks rows up with
# ``df.loc[idx, ...]`` for idx in 0..n-1 depends on this
train, val, test = (part.reset_index(drop=True) for part in (train, val, test))


In [None]:
# Number of rows in each split, in the order (train, val, test)
len(train), len(val), len(test)

In [None]:
# Count the labels of each split and place the three counts side by side
train_counts = count_labels(train)
val_counts   = count_labels(val)
test_counts  = count_labels(test)

df_counts = pd.concat([train_counts, val_counts, test_counts], axis=1)
df_counts.columns = ['train', 'val', 'test']

# The column-wise concat aligns on the label index, so a label that is absent
# from one split comes out as NaN there. Treat it as zero occurrences and go
# back to integers; otherwise the total of that label would be NaN as well.
df_counts = df_counts.fillna(0).astype(int)

df_counts['total'] = df_counts['train'] + df_counts['val'] + df_counts['test']
df_counts

In [None]:
# Write the per-split label counts to a ';'-separated CSV. The label names are
# the index of df_counts, so the index has to be written; with index=False the
# file would contain only the numbers, with no way to tell which label each
# row belongs to.
df_counts.to_csv(f"{folder}/counts.csv", index=True, sep=';')

In [None]:
# Write each split to its own ';'-separated CSV, leaving the index column out.
# NOTE(review): the 'ner' column holds Python lists; presumably they are written
# as their string representation and must be parsed back when the CSV is read
# - confirm against the code that loads these files.
train.to_csv(f"{folder}/train.csv", index=False, sep=';')
val.to_csv(f"{folder}/val.csv", index=False, sep=';')
test.to_csv(f"{folder}/test.csv", index=False, sep=';')