In [21]:
from core_pro.ultilities import make_sync_folder
from datasets import Dataset
import polars as pl
from sklearn.model_selection import train_test_split
from rich import print
from pattern import pattern
from tqdm.auto import tqdm
from transformers import Trainer
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import EvalPrediction

path = make_sync_folder('nlp/ner')

In [2]:
file = path / 'tag_ner_bio.parquet'
df = pl.read_parquet(file).head(200_000)
df.head()

item_id,item_name,attribute,bio_label
i64,str,list[struct[2]],list[list[str]]
18163442447,"""Nắp lưng Huawei Mate 10 - Cao …","[{""Huawei"",""Brand""}]","[[""Nắp"", ""O""], [""lưng"", ""O""], … [""cấp"", ""O""]]"
13660698908,"""Áo len tay sọc (ảnh thật)""","[{""Len"",""Material""}]","[[""Áo"", ""O""], [""len"", ""B-Material""], … [""thật)"", ""O""]]"
8251901489,"""(Tặng Quần tây 420k) Sơ mi nam…","[{""Cotton"",""Material""}]","[[""(Tặng"", ""O""], [""Quần"", ""O""], … [""nhăn"", ""O""]]"
10813260047,"""HÓt Giày thể thao nữ đế cao ch…","[{""Da"",""Material""}, {""Thể thao"",""Style""}]","[[""HÓt"", ""O""], [""Giày"", ""O""], … [""A195"", ""O""]]"
3724340829,"""ÁO NỈ CAO CẤP FREESHIP_ÁO HOOD…","[{""Nỉ"",""Material""}]","[[""ÁO"", ""O""], [""NỈ"", ""B-Material""], … [""Thật)"", ""O""]]"


In [3]:
label_list = set([list(i.values())[0] for i in pattern])
label_list = [f'B-{i}' for i in label_list] + [f'I-{i}' for i in label_list] + ['O']
label_map = {label: i for i, label in enumerate(label_list)}
label_map

{'B-Flavour': 0,
 'B-Feature': 1,
 'B-People': 2,
 'B-Colour': 3,
 'B-Type': 4,
 'B-Style': 5,
 'B-SKU': 6,
 'B-Material': 7,
 'B-Brand': 8,
 'B-Region': 9,
 'B-Size': 10,
 'I-Flavour': 11,
 'I-Feature': 12,
 'I-People': 13,
 'I-Colour': 14,
 'I-Type': 15,
 'I-Style': 16,
 'I-SKU': 17,
 'I-Material': 18,
 'I-Brand': 19,
 'I-Region': 20,
 'I-Size': 21,
 'O': 22}

In [4]:
col = ['item_id', 'item_name', 'bio_label']
train, test = train_test_split(df.select(col), test_size=.2, random_state=42)
del df

In [5]:
def clean_label(example):
    return {
        'tokens': [i[0] for i in example['bio_label']],
        'labels': [i[1] for i in example['bio_label']],
    }

In [6]:
ds_train = Dataset.from_polars(train)
ds_train = ds_train.map(clean_label, remove_columns=['bio_label'])

ds_valid = Dataset.from_polars(test)
ds_valid = ds_valid.map(clean_label, remove_columns=['bio_label'])

Map:   0%|          | 0/160000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [7]:
print(ds_train[0])

In [31]:
import copy
import json

# class InputFeatures(object):
#     """A single set of features of data."""
# 
#     def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
#         self.input_ids = input_ids
#         self.attention_mask = attention_mask
#         self.token_type_ids = token_type_ids
#         self.slot_labels_ids = slot_labels_ids
# 
#     def __repr__(self):
#         return str(self.to_json_string())
# 
#     def to_dict(self):
#         """Serializes this instance to a Python dictionary."""
#         output = copy.deepcopy(self.__dict__)
#         return output
# 
#     def to_json_string(self):
#         """Serializes this instance to a JSON string."""
#         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
    

def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_segment_id=0,
    mask_padding_with_zero=True,
):
    # Get special tokens from the tokenizer
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # List to hold the converted features
    features = []

    for example in tqdm(examples):
        # Tokenize each word and align its corresponding label
        tokens = []
        label_ids = []

        for word, label in zip(example['tokens'], example['labels']):
            word_tokens = tokenizer.tokenize(word)

            # If the word cannot be tokenized, use [UNK] token
            if not word_tokens:
                word_tokens = [unk_token]

            tokens.extend(word_tokens)

            # Map string label to integer ID, apply pad_label_id for subword tokens
            label_id = label_map[label]
            label_ids.extend([label_id] + [pad_label_id] * (len(word_tokens) - 1))

        # Handle sequence truncation for [CLS] and [SEP] tokens
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:max_seq_len - special_tokens_count]
            label_ids = label_ids[:max_seq_len - special_tokens_count]

        # Add [SEP] token at the end of the sentence
        tokens.append(sep_token)
        label_ids.append(pad_label_id)
        token_type_ids = [sequence_segment_id] * len(tokens)

        # Add [CLS] token at the start of the sentence
        tokens = [cls_token] + tokens
        label_ids = [pad_label_id] + label_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # Convert tokens to input IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Create attention masks (1 for real tokens, 0 for padding tokens)
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)


        # Pad sequences to the maximum sequence length
        padding_length = max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        attention_mask += [0 if mask_padding_with_zero else 1] * padding_length
        token_type_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_label_id] * padding_length

        # Create InputFeatures object and append it to the list of features
        # features.append(
        #     InputFeatures(
        #         input_ids=input_ids,
        #         attention_mask=attention_mask,
        #         token_type_ids=token_type_ids,
        #         slot_labels_ids=label_ids,
        #     )
        # )
        features.append(
            dict(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                slot_labels_ids=label_ids,
            )
        )

    return features

In [32]:
from transformers import RobertaTokenizerFast

# Initialize the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

# Set the maximum sequence length
max_seq_len = 128  # You can adjust this based on your model/input

# Convert examples to features
train_features = convert_examples_to_features(ds_train, max_seq_len, tokenizer)
valid_features = convert_examples_to_features(ds_valid, max_seq_len, tokenizer)

  0%|          | 0/160000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

In [34]:
from rich.pretty import Pretty
print(train_features[0])

In [10]:
import torch
from torch.utils.data import Dataset

# Define a Dataset class to wrap the tokenized features for training
class NERDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        return {
            'input_ids': torch.tensor(feature.input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(feature.attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(feature.token_type_ids, dtype=torch.long),
            'labels': torch.tensor(feature.slot_labels_ids, dtype=torch.long),
        }

# Convert tokenized features into PyTorch datasets
train_dataset = NERDataset(train_features)
valid_dataset = NERDataset(valid_features)

In [11]:
train_dataset[0]

{'input_ids': tensor([    0,   646, 25974,  3070,  7258, 28812,  8640,  1376,  3070, 11582,
           565, 27779,   952, 10172,   139,  2218,   139,  1526,   438,   229,
          7387,   234,  1376,  2023, 10965, 11065,  5563,   257,  4236, 16948,
          1376,  3070,  9253,   642,   234,   298,  8188,  7487,   289,  3849,
         11582,   282,   298,   732,  1376,  3070,  8210,    90,   226,  8188,
          7487,  1376,  2023,  2469,  2590,  4236, 16948,  1376,  3070,  2469,
           119,   163,  1376,  3070,  2469,   139,   646,  1577,  1376,  3070,
         10470,   791, 31528,  3849,  7471,   487, 29079,  1376,  2023,  7471,
           487, 27779,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [12]:
from transformers import RobertaForTokenClassification

# Define the number of unique labels (ensure this matches your dataset's label set)
num_labels = len(label_list)  # e.g., the number of unique labels such as O, B-ORG, etc.

# Load the RoBERTa model for token classification
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import TrainingArguments

log_step = 500
folder = path / 'model/roberta'
# Define training arguments
training_args = TrainingArguments(
    output_dir=folder,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.001,
    learning_rate=1e-4,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=16,
    fp16=True,
    logging_strategy='steps',
    save_strategy='steps',
    eval_strategy='steps',
    save_steps=log_step,
    eval_steps=log_step,
    logging_steps=log_step,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    report_to="none",
    num_train_epochs=3,
    optim='adafactor',
)

In [14]:
def compute_metrics(p: EvalPrediction):
    predictions = p.predictions.argmax(axis=2)  # Get predicted label indices
    labels = p.label_ids  # True label IDs

    # Debugging: Print shapes of predictions and labels
    # print(f"Shape of predictions: {predictions.shape}")
    # print(f"Shape of labels: {labels.shape}")

    # Debugging: Log first few predictions and labels for inspection
    # print(f"First few predictions: {predictions[:2]}")
    # print(f"First few labels: {labels[:2]}")

    pred_labels = []
    true_labels = []

    # Iterate through predictions and labels
    for i, (pred_seq, true_seq) in enumerate(zip(predictions, labels)):
        pred_label_seq = []
        true_label_seq = []

        # Iterate through each token in the sequence
        for pred_idx, true_idx in zip(pred_seq, true_seq):
            if true_idx == -100:
                # Debugging: Log any padding tokens encountered
                # print(f"Padding token encountered at position {i}")
                continue

            # Check if the indices are within the valid range
            if pred_idx < len(label_list) and true_idx < len(label_list):
                pred_label_seq.append(label_list[pred_idx])
                true_label_seq.append(label_list[true_idx])
            else:
                # Debugging: Log when out-of-bound indices are encountered
                print(f"Index out of range: pred_idx={pred_idx}, true_idx={true_idx} at position {i}")

        pred_labels.append(pred_label_seq)
        true_labels.append(true_label_seq)

    # Debugging: Log final processed predictions and labels
    # print(f"Processed pred_labels: {pred_labels[:2]}")
    # print(f"Processed true_labels: {true_labels[:2]}")

    # Compute token-level F1, Precision, and Recall
    precision = precision_score(true_labels, pred_labels)
    # Trong 10 lần dự đoán nhãn PER: thì chúng ta đoán đúng 6 lần -> 6/10 = 60%

    recall = recall_score(true_labels, pred_labels)
    # Trong 8 nhãn PER thật: thì chúng ta đoán đúng 6 lần -> 6/8 = 75%

    f1 = f1_score(true_labels, pred_labels)

    # Debugging: Print classification report
    print("Classification Report:")
    print(classification_report(true_labels, pred_labels))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [15]:
# Initialize the Trainer with the modified compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics  # Updated function
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.3154,0.081204,0.825771,0.896083,0.859491


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.3154,0.081204,0.825771,0.896083,0.859491
1000,0.0639,0.055882,0.889601,0.921057,0.905056


  _warn_prf(average, modifier, msg_start, len(result))


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.3154,0.081204,0.825771,0.896083,0.859491
1000,0.0639,0.055882,0.889601,0.921057,0.905056
1500,0.0477,0.046218,0.913395,0.923217,0.91828


TrainOutput(global_step=1875, training_loss=0.12190034993489583, metrics={'train_runtime': 543.4374, 'train_samples_per_second': 883.266, 'train_steps_per_second': 3.45, 'total_flos': 3.136156397568e+16, 'train_loss': 0.12190034993489583, 'epoch': 3.0})

In [16]:
valid_result = trainer.predict(valid_dataset)
# valid_report = MultiLabels().classification_report_html(
#     result=valid_result.predictions, 
#     labels=valid_result.label_ids, 
#     target_names=labels, 
#     show=True
# )

  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
valid_result.predictions.argmax(axis=2)

array([[22, 22, 22, ...,  7,  7,  7],
       [22, 22, 22, ..., 22, 22, 22],
       [22, 22, 22, ..., 22, 22, 22],
       ...,
       [22, 22, 22, ...,  7,  7,  7],
       [22, 22, 22, ...,  7,  7,  7],
       [22, 22, 22, ...,  7,  7,  7]])

In [18]:
predictions = valid_result.predictions.argmax(axis=2)  # Get predicted label indices
labels = valid_result.label_ids  # True label IDs

pred_labels = []
true_labels = []

# Iterate through predictions and labels
for i, (pred_seq, true_seq) in enumerate(zip(predictions, labels)):
    pred_label_seq = []
    true_label_seq = []

    # Iterate through each token in the sequence
    for pred_idx, true_idx in zip(pred_seq, true_seq):
        if true_idx == -100:
            # Debugging: Log any padding tokens encountered
            # print(f"Padding token encountered at position {i}")
            continue

        # Check if the indices are within the valid range
        if pred_idx < len(label_list) and true_idx < len(label_list):
            pred_label_seq.append(label_list[pred_idx])
            true_label_seq.append(label_list[true_idx])
        else:
            # Debugging: Log when out-of-bound indices are encountered
            print(f"Index out of range: pred_idx={pred_idx}, true_idx={true_idx} at position {i}")

    pred_labels.append(pred_label_seq)
    true_labels.append(true_label_seq)

# Debugging: Log final processed predictions and labels
# print(f"Processed pred_labels: {pred_labels[:2]}")
# print(f"Processed true_labels: {true_labels[:2]}")

# Compute token-level F1, Precision, and Recall
precision = precision_score(true_labels, pred_labels)
# Trong 10 lần dự đoán nhãn PER: thì chúng ta đoán đúng 6 lần -> 6/10 = 60%

recall = recall_score(true_labels, pred_labels)
# Trong 8 nhãn PER thật: thì chúng ta đoán đúng 6 lần -> 6/8 = 75%

f1 = f1_score(true_labels, pred_labels)

# Debugging: Print classification report
print("Classification Report:")
print(classification_report(true_labels, pred_labels))

  _warn_prf(average, modifier, msg_start, len(result))
