<a href="https://colab.research.google.com/github/saba-ramezani/Fine_Tuning_DistilBERT_For_Restaurant_Search_NER/blob/main/Fine_Tuning_DistilBERT_For_Restaurant_Search_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tuning DistilBERT For Restaurant Search NER

In [1]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# !pip install -U transformers
# !pip install -U accelerate
# !pip install -U datasets

## 1. Load & Preprocess the MIT Restaurant Dataset

In [15]:
import requests

response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio")
response = response.text

In [16]:
response = response.splitlines()
response[:10]

['B-Rating\t2',
 'I-Rating\tstart',
 'O\trestaurants',
 'O\twith',
 'B-Amenity\tinside',
 'I-Amenity\tdining',
 '',
 'O\t34',
 '',
 'B-Rating\t5']

In [17]:
train_tokens = []
train_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        train_tokens.append(temp_tokens)
        train_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

In [18]:
train_tokens[:10], train_tags[:10]

([['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
  ['34'],
  ['5', 'star', 'resturants', 'in', 'my', 'town'],
  ['98', 'hong', 'kong', 'restaurant', 'reasonable', 'prices'],
  ['a',
   'great',
   'lunch',
   'spot',
   'but',
   'open',
   'till',
   '2',
   'a',
   'm',
   'passims',
   'kitchen'],
  ['a', 'place', 'that', 'serves', 'soft', 'serve', 'ice', 'cream'],
  ['a', 'restaurant', 'that', 'is', 'good', 'for', 'groups'],
  ['a', 'salad', 'would', 'make', 'my', 'day'],
  ['a', 'smoothie', 'would', 'hit', 'the', 'spot'],
  ['a', 'steak', 'would', 'be', 'nice']],
 [['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity'],
  ['O'],
  ['B-Rating', 'I-Rating', 'O', 'B-Location', 'I-Location', 'I-Location'],
  ['O', 'B-Restaurant_Name', 'I-Restaurant_Name', 'O', 'B-Price', 'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'B-Hours',
   'I-Hours',
   'I-Hours',
   'I-Hours',
   'I-Hours',
   'B-Restaurant_Name',
   'I-Restaurant_Name'],
  ['O', 'O', 'O', 'O', 'B-Dish', '

In [19]:
len(train_tokens), len(train_tags)

(7659, 7659)

In [20]:
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/test.bio")
response = response.text
response = response.splitlines()

test_tokens = []
test_tags = []

temp_tokens = []
temp_tags = []
for line in response:
    if line != "":
        tag, token = line.strip().split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    else:
        test_tokens.append(temp_tokens)
        test_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

len(test_tokens), len(test_tags)

(1520, 1520)

In [21]:
from datasets import Dataset, DatasetDict
import pandas as pd

train_df = pd.DataFrame({'tokens': train_tokens, 'ner_tags_str': train_tags})

test_df = pd.DataFrame({'tokens': test_tokens, 'ner_tags_str': test_tags})

df = pd.concat([train_df, test_df], ignore_index=True)

train_df.shape, test_df.shape, df.shape

((7659, 2), (1520, 2), (9179, 2))

In [22]:
df

Unnamed: 0,tokens,ner_tags_str
0,"[2, start, restaurants, with, inside, dining]","[B-Rating, I-Rating, O, O, B-Amenity, I-Amenity]"
1,[34],[O]
2,"[5, star, resturants, in, my, town]","[B-Rating, I-Rating, O, B-Location, I-Location..."
3,"[98, hong, kong, restaurant, reasonable, prices]","[O, B-Restaurant_Name, I-Restaurant_Name, O, B..."
4,"[a, great, lunch, spot, but, open, till, 2, a,...","[O, O, O, O, O, B-Hours, I-Hours, I-Hours, I-H..."
...,...,...
9174,"[will, i, be, able, to, find, a, romantic, res...","[O, O, O, O, O, O, O, B-Amenity, O, O, O, O, B..."
9175,"[will, waffle, house, accept, a, prepaid, visa...","[O, B-Restaurant_Name, I-Restaurant_Name, O, O..."
9176,"[yes, please, get, me, mcdonalds, phone, numbe...","[O, O, O, O, B-Restaurant_Name, O, O, O, B-Loc..."
9177,"[yes, the, new, diner, on, south, street, please]","[O, O, O, B-Cuisine, O, B-Location, I-Location..."


In [23]:
from sklearn.model_selection import train_test_split

# 70% for training, 20% test, 10% validation
train, test = train_test_split(df, test_size=0.3)
test, validation = train_test_split(test, test_size=1/3)

train.shape, test.shape, validation.shape, df.shape

((6425, 2), (1836, 2), (918, 2), (9179, 2))

In [24]:
from datasets import Dataset, DatasetDict

dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(train, preserve_index=False),
        "test": Dataset.from_pandas(test, preserve_index=False),
        "validation": Dataset.from_pandas(validation, preserve_index=False)
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 6425
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1836
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 918
    })
})

In [25]:
dataset['train'][0]

{'tokens': ['i',
  'am',
  'looking',
  'for',
  'a',
  'joes',
  'crab',
  'shack',
  'where',
  'is',
  'the',
  'nearest',
  'one'],
 'ner_tags_str': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-Restaurant_Name',
  'I-Restaurant_Name',
  'I-Restaurant_Name',
  'O',
  'O',
  'O',
  'B-Location',
  'I-Location']}

In [27]:
unique_tags = set()
for tag in dataset['train']['ner_tags_str']:
    unique_tags.update(tag)

unique_tags = list(set([x[2:] for x in list(unique_tags) if x!='O']))

tag2index = {"O": 0}
for i, tag in enumerate(unique_tags):
    tag2index[f'B-{tag}'] = len(tag2index)
    tag2index[f'I-{tag}'] = len(tag2index)

index2tag = {v:k for k,v in tag2index.items()}

tag2index, index2tag

({'O': 0,
  'B-Location': 1,
  'I-Location': 2,
  'B-Hours': 3,
  'I-Hours': 4,
  'B-Restaurant_Name': 5,
  'I-Restaurant_Name': 6,
  'B-Cuisine': 7,
  'I-Cuisine': 8,
  'B-Amenity': 9,
  'I-Amenity': 10,
  'B-Rating': 11,
  'I-Rating': 12,
  'B-Dish': 13,
  'I-Dish': 14,
  'B-Price': 15,
  'I-Price': 16},
 {0: 'O',
  1: 'B-Location',
  2: 'I-Location',
  3: 'B-Hours',
  4: 'I-Hours',
  5: 'B-Restaurant_Name',
  6: 'I-Restaurant_Name',
  7: 'B-Cuisine',
  8: 'I-Cuisine',
  9: 'B-Amenity',
  10: 'I-Amenity',
  11: 'B-Rating',
  12: 'I-Rating',
  13: 'B-Dish',
  14: 'I-Dish',
  15: 'B-Price',
  16: 'I-Price'})

In [None]:
dataset = dataset.map(lambda example: {"ner_tags": [tag2index[tag] for tag in example['ner_tags_str']]})

In [29]:
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 6425
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 1836
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str', 'ner_tags'],
        num_rows: 918
    })
})

In [32]:
dataset['train'][5]

{'tokens': ['what', 'is', 'pizza', 'hut', 'phone', 'number'],
 'ner_tags_str': ['O',
  'O',
  'B-Restaurant_Name',
  'I-Restaurant_Name',
  'O',
  'O'],
 'ner_tags': [0, 0, 5, 6, 0, 0]}

## 2. Tokenize the data

In [33]:
from transformers import AutoTokenizer

In [None]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [41]:
dataset['train'][1]['tokens']

['find',
 'the',
 'read',
 'hot',
 'and',
 'blue',
 'restaurant',
 'with',
 'a',
 'date',
 'spot',
 'amenity']

In [40]:
input = dataset['train'][1]['tokens']
output = tokenizer(input, is_split_into_words=True)
tokenizer.convert_ids_to_tokens(output.input_ids)

['[CLS]',
 'find',
 'the',
 'read',
 'hot',
 'and',
 'blue',
 'restaurant',
 'with',
 'a',
 'date',
 'spot',
 'am',
 '##enity',
 '[SEP]']

In [42]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            # if id=-100 then loss is not calculated
            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])

            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels

    return tokenized_inputs



In [None]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [44]:
tokenized_dataset['train'][1]

{'tokens': ['find',
  'the',
  'read',
  'hot',
  'and',
  'blue',
  'restaurant',
  'with',
  'a',
  'date',
  'spot',
  'amenity'],
 'ner_tags_str': ['O',
  'O',
  'B-Restaurant_Name',
  'I-Restaurant_Name',
  'I-Restaurant_Name',
  'I-Restaurant_Name',
  'O',
  'O',
  'O',
  'B-Amenity',
  'I-Amenity',
  'O'],
 'ner_tags': [0, 0, 5, 6, 6, 6, 0, 0, 0, 9, 10, 0],
 'input_ids': [101,
  2424,
  1996,
  3191,
  2980,
  1998,
  2630,
  4825,
  2007,
  1037,
  3058,
  3962,
  2572,
  20693,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 5, 6, 6, 6, 0, 0, 0, 9, 10, 0, -100, -100]}

## 3. Data Collation & Metrics

In [None]:
# !pip install seqeval
# !pip install evaluate

In [46]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

metric = evaluate.load('seqeval')
label_names = list(tag2index)

def compute_metrics(eval_preds):
    logits, labels = eval_preds

    predictions = np.argmax(logits, axis=-1)
    true_labels = [[label_names[l] for l in label if l!=-100] for label in labels]

    true_predictions = [[label_names[p] for p, l in zip(prediction, label) if l != -100]
                        for prediction, label in zip(predictions, labels)]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": all_metrics['overall_precision'],
        'recall': all_metrics['overall_recall'],
        'f1': all_metrics['overall_f1'],
        'accuracy': all_metrics['overall_accuracy'],
    }

## 4. Train the TokenClassification model

In [56]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(model_ckpt, id2label=index2tag, label2id=tag2index)

In [58]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments("finetuned-ner", eval_strategy='epoch',
                         save_strategy='epoch',
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01,
                         report_to="none"
                         )


In [59]:
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset=tokenized_dataset['validation'],
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  tokenizer=tokenizer)

In [60]:
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6372,0.330997,0.693707,0.754142,0.722663,0.899035
2,0.2547,0.315599,0.75649,0.778728,0.767448,0.908224
3,0.2063,0.318171,0.755934,0.783004,0.769231,0.910783


TrainOutput(global_step=2412, training_loss=0.3246846491622292, metrics={'train_runtime': 2597.0506, 'train_samples_per_second': 7.422, 'train_steps_per_second': 0.929, 'total_flos': 88224971539440.0, 'train_loss': 0.3246846491622292, 'epoch': 3.0})

## 5. Store the model

In [61]:
trainer.save_model("/content/drive/MyDrive/llm_finetuning_transformers/Restaurant_Search_NER/distilbert-base-uncased-ner-model")

## 6. Evaluate the model

In [63]:
preds_output = trainer.predict(tokenized_dataset['test'])

In [64]:
preds_output.metrics

{'test_loss': 0.28334367275238037,
 'test_precision': 0.7796478508544795,
 'test_recall': 0.8065898740959014,
 'test_f1': 0.7928900592495063,
 'test_accuracy': 0.9201449783701625,
 'test_runtime': 130.7185,
 'test_samples_per_second': 14.045,
 'test_steps_per_second': 1.76}