# Dataset preparation

In [27]:
from gc import set_debug

import pandas as pd
import itertools

# Load the pre-filtered records; the 'Text' column is the only one read below.
ds = pd.read_json('filtered_data.json')


def _to_tokens(text):
    """Split *text* on single space characters."""
    return text.split(' ')


def _blank_labels(tokens):
    """Return one 'O' label per entry of *tokens*."""
    return ['O'] * len(tokens)


# Attach a token list to every record, plus a same-length list of 'O'
# placeholder labels in the 'ne' column.
ds['tokens'] = ds['Text'].apply(_to_tokens)
ds['ne'] = ds['tokens'].apply(_blank_labels)

# Write the result out as a JSON array with one object per record.
ds.to_json('./output.json', orient="records")

# Training

In [28]:
from sklearn.model_selection import train_test_split

# Load the labelled data; every entry of the 'ne' column is iterated below as
# a sequence of labels.
dataset = pd.read_json("ner_data.json")

ne_label_col = dataset['ne']
# Collect the unique labels and SORT them.  Enumerating a bare set of strings
# follows hash order, which can change from one interpreter run to the next,
# so the label <-> id mapping would not be reproducible across sessions.
label_list = sorted(set(itertools.chain.from_iterable(ne_label_col)))

# Bidirectional lookup tables between label strings and integer ids.
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}
num_labels = len(label_list)

# Split the dataset into train and temporary datasets (80% train, 20% temporary)
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# Split the temporary dataset in half: validation and test each receive 50% of
# it, i.e. roughly 10% of the full dataset each.
val_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# Save the datasets to JSON files (one JSON object per record)
train_dataset.to_json('train_data.json', orient="records")
val_dataset.to_json('val_data.json', orient="records")
test_dataset.to_json('test_data.json', orient="records")

In [29]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and build a label id for every produced token.

    Args:
        examples: Table-like object whose ``'tokens'`` column supports
            ``.tolist()`` and yields one word list per row, and whose ``'ne'``
            column yields one label sequence per row, indexable by word
            position.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: one list of
        label ids per row, aligned with that row's tokens.

    Relies on the module-level ``tokenizer``, ``label_to_id`` and
    ``label_all_tokens``.
    """
    encoded = tokenizer(examples['tokens'].tolist(), truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples['ne']):
        row_label_ids = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=row):
            if current_word is None:
                # Token belongs to no source word, so it gets the -100
                # placeholder (presumably the ignore index of the training
                # loss; confirm against the training code).
                label_id = -100
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of the previous word, and only the first
                # piece of each word is supposed to carry the label.
                label_id = -100
            else:
                # First piece of a word, or a continuation piece when every
                # piece is labelled: reuse the label of the source word.
                label_id = label_to_id[word_labels[current_word]]
            row_label_ids.append(label_id)
            last_word = current_word
        aligned.append(row_label_ids)

    encoded['labels'] = aligned
    return encoded


# tokenize_and_align_labels reads this flag as a global, so it has to be set
# before the calls below.  True means every token of a word receives the
# word's label instead of only the first one.
label_all_tokens = True

# Replace each split with its tokenized, label-aligned encoding, processing
# train, validation and test in that order.
train_dataset, val_dataset, test_dataset = [
    tokenize_and_align_labels(split)
    for split in (train_dataset, val_dataset, test_dataset)
]
