In [1]:
# !pip install datasets
# !pip install seqeval
# !pip install torch

In [2]:
# !pip install accelerate -U



In [3]:
# !pip install transformers



In [4]:
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
import itertools


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
label_list = ['0', 'FOOD_ITEM', 'MEASUREMENT']  # Update with your label classes
label_encoding_dict = {'0': 0, 'FOOD_ITEM': 1, 'MEASUREMENT': 2}

In [6]:
task = "ner"
model_checkpoint = "bert-base-uncased"
batch_size = 16

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [8]:
def get_dataset(file_path):
    df = pd.read_csv(file_path)
    df['Tag'] = df['Tag'].replace(label_encoding_dict)  # Encode 'Tag' column according to label_encoding_dict
    split_list = [list(y) for x, y in itertools.groupby(df.values.tolist(), lambda z: pd.isna(z[0])) if not x]
    tokens = [[x[1] for x in y] for y in split_list]
    entities = [[x[2] for x in y] for y in split_list]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})


In [12]:
train_df = get_dataset('~/datasets/grocery_training_data.csv')


In [13]:
test_df = get_dataset('~/datasets/grocery_testing_data.csv')

In [14]:
train_df.head()

Unnamed: 0,tokens,ner_tags
0,"[I, need, 2KG, of, samba, rice, and, 2, ltr, c...","[0, 0, 2, 0, 1, 1, 0, 2, 2, 1, 1, 0, 0, 2, 2, ..."


In [15]:
train_dataset = Dataset.from_pandas(train_df)


In [16]:
test_dataset = Dataset.from_pandas(test_df)


In [17]:
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1
})

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, padding="max_length", is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == 0:
                label_ids.append(0)
            else:
                label_ids.append(label[word_idx])
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [19]:
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 96.72 examples/s]


In [20]:
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 259.24 examples/s]


In [21]:
train_tokenized_datasets

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

In [22]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

In [24]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [25]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

In [26]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
     eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [27]:
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.597177,0.785714,0.785714,0.785714,0.912281
2,No log,0.473856,0.857143,0.857143,0.857143,0.929825
3,No log,0.425844,0.857143,0.857143,0.857143,0.929825


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3, training_loss=0.6779602368672689, metrics={'train_runtime': 5.6408, 'train_samples_per_second': 0.532, 'train_steps_per_second': 0.532, 'total_flos': 783897357312.0, 'train_loss': 0.6779602368672689, 'epoch': 3.0})

In [28]:
from transformers import pipeline


In [29]:
ner_model = pipeline("ner", model=trainer.model, tokenizer=tokenizer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [30]:
input_sentence = "I need 2KG of samba rice and 2 ltr coconut oil."


In [31]:
reverse_label_encoding_dict = {v: k for k, v in label_encoding_dict.items()}


In [32]:
ner_results = ner_model(input_sentence)


In [33]:
for result in ner_results:
    token = result["word"]
    label = result["entity"]
    if label == "LABEL_0":
      label= "0"
    elif label == "LABEL_1":
      label = "FOOD_ITEM"
    else:
      label = "MEASUREMENT"
    print(f"Token: {token}\tLabel: {label}")


Token: i	Label: 0
Token: need	Label: 0
Token: 2	Label: 0
Token: ##k	Label: FOOD_ITEM
Token: ##g	Label: 0
Token: of	Label: 0
Token: samba	Label: FOOD_ITEM
Token: rice	Label: FOOD_ITEM
Token: and	Label: 0
Token: 2	Label: 0
Token: lt	Label: 0
Token: ##r	Label: 0
Token: coconut	Label: FOOD_ITEM
Token: oil	Label: FOOD_ITEM
Token: .	Label: FOOD_ITEM
