# Fine-Tuning DistilBERT for Restaurant Search NER

In [1]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets

## 1. Load the MIT Restaurant Dataset

In [15]:
import requests

# Raw BIO-tagged training split, one "<tag>\t<token>" pair per line.
TRAIN_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/train.bio"

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(TRAIN_URL, timeout=30)
response.raise_for_status()

# Keep only the decoded body; later cells expect `response` to be text.
response = response.text

In [16]:
# Split the downloaded text into a list of lines and preview the first ten.
# NOTE(review): this rebinds `response` from str to list, so re-running this
# cell without first re-running the download cell raises AttributeError.
response = response.splitlines()
response[:10]

['B-Rating\t2',
 'I-Rating\tstart',
 'O\trestaurants',
 'O\twith',
 'B-Amenity\tinside',
 'I-Amenity\tdining',
 '',
 'O\t34',
 '',
 'B-Rating\t5']

In [17]:
# Parse the BIO lines into parallel lists: train_tokens[i] holds the words of
# sentence i and train_tags[i] holds the matching tag for each word.
train_tokens = []
train_tags = []

# Accumulators for the sentence currently being read.
temp_tokens = []
temp_tags = []
for line in response:
    line = line.strip()
    if line:
        # Each data line is "<tag>\t<token>".
        tag, token = line.split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    elif temp_tokens:
        # A blank line ends the current sentence. Skipping empty accumulators
        # avoids emitting empty sentences for consecutive blank lines.
        train_tokens.append(temp_tokens)
        train_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

# Flush the final sentence: splitlines() yields no trailing empty entry, so a
# file that does not end with a blank line would otherwise lose its last one.
if temp_tokens:
    train_tokens.append(temp_tokens)
    train_tags.append(temp_tags)

    temp_tokens, temp_tags = [], []

In [18]:
# Preview the first ten parsed sentences alongside their tag sequences.
train_tokens[:10], train_tags[:10]

([['2', 'start', 'restaurants', 'with', 'inside', 'dining'],
  ['34'],
  ['5', 'star', 'resturants', 'in', 'my', 'town'],
  ['98', 'hong', 'kong', 'restaurant', 'reasonable', 'prices'],
  ['a',
   'great',
   'lunch',
   'spot',
   'but',
   'open',
   'till',
   '2',
   'a',
   'm',
   'passims',
   'kitchen'],
  ['a', 'place', 'that', 'serves', 'soft', 'serve', 'ice', 'cream'],
  ['a', 'restaurant', 'that', 'is', 'good', 'for', 'groups'],
  ['a', 'salad', 'would', 'make', 'my', 'day'],
  ['a', 'smoothie', 'would', 'hit', 'the', 'spot'],
  ['a', 'steak', 'would', 'be', 'nice']],
 [['B-Rating', 'I-Rating', 'O', 'O', 'B-Amenity', 'I-Amenity'],
  ['O'],
  ['B-Rating', 'I-Rating', 'O', 'B-Location', 'I-Location', 'I-Location'],
  ['O', 'B-Restaurant_Name', 'I-Restaurant_Name', 'O', 'B-Price', 'O'],
  ['O',
   'O',
   'O',
   'O',
   'O',
   'B-Hours',
   'I-Hours',
   'I-Hours',
   'I-Hours',
   'I-Hours',
   'B-Restaurant_Name',
   'I-Restaurant_Name'],
  ['O', 'O', 'O', 'O', 'B-Dish', '

In [19]:
# Number of parsed sentences and number of tag sequences (built in lockstep).
len(train_tokens), len(train_tags)

(7659, 7659)

In [20]:
# Download the test split. The timeout prevents an indefinite hang and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/mit_restaurant_search_ner/test.bio", timeout=30)
response.raise_for_status()
response = response.text
response = response.splitlines()

# Parallel lists: test_tokens[i] holds the words of sentence i and
# test_tags[i] holds the matching tag for each word.
test_tokens = []
test_tags = []

# Accumulators for the sentence currently being read.
temp_tokens = []
temp_tags = []
for line in response:
    line = line.strip()
    if line:
        # Each data line is "<tag>\t<token>".
        tag, token = line.split("\t")
        temp_tags.append(tag)
        temp_tokens.append(token)
    elif temp_tokens:
        # A blank line ends the current sentence. Skipping empty accumulators
        # avoids emitting empty sentences for consecutive blank lines.
        test_tokens.append(temp_tokens)
        test_tags.append(temp_tags)

        temp_tokens, temp_tags = [], []

# Flush the final sentence: splitlines() yields no trailing empty entry, so a
# file that does not end with a blank line would otherwise lose its last one.
if temp_tokens:
    test_tokens.append(temp_tokens)
    test_tags.append(temp_tags)

    temp_tokens, temp_tags = [], []

len(test_tokens), len(test_tags)

(1520, 1520)

In [21]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Wrap each split's parallel token/tag lists in a two-column DataFrame.
train_df, test_df = (
    pd.DataFrame({'tokens': sentence_tokens, 'ner_tags_str': sentence_tags})
    for sentence_tokens, sentence_tags in (
        (train_tokens, train_tags),
        (test_tokens, test_tags),
    )
)

# Stack both splits into one frame with a fresh 0..n-1 index.
df = pd.concat((train_df, test_df), ignore_index=True)

# Show the (rows, columns) shape of each frame.
tuple(frame.shape for frame in (train_df, test_df, df))

((7659, 2), (1520, 2), (9179, 2))

In [22]:
# Display the combined frame; the rendered output below is abbreviated to its
# leading and trailing rows.
df

Unnamed: 0,tokens,ner_tags_str
0,"[2, start, restaurants, with, inside, dining]","[B-Rating, I-Rating, O, O, B-Amenity, I-Amenity]"
1,[34],[O]
2,"[5, star, resturants, in, my, town]","[B-Rating, I-Rating, O, B-Location, I-Location..."
3,"[98, hong, kong, restaurant, reasonable, prices]","[O, B-Restaurant_Name, I-Restaurant_Name, O, B..."
4,"[a, great, lunch, spot, but, open, till, 2, a,...","[O, O, O, O, O, B-Hours, I-Hours, I-Hours, I-H..."
...,...,...
9174,"[will, i, be, able, to, find, a, romantic, res...","[O, O, O, O, O, O, O, B-Amenity, O, O, O, O, B..."
9175,"[will, waffle, house, accept, a, prepaid, visa...","[O, B-Restaurant_Name, I-Restaurant_Name, O, O..."
9176,"[yes, please, get, me, mcdonalds, phone, numbe...","[O, O, O, O, B-Restaurant_Name, O, O, O, B-Loc..."
9177,"[yes, the, new, diner, on, south, street, please]","[O, O, O, B-Cuisine, O, B-Location, I-Location..."


In [23]:
from sklearn.model_selection import train_test_split

# 70% for training, 20% test, 10% validation.
# The first call holds out 30% of the rows; the second carves one third of
# that hold-out (10% of the total) off as the validation set.
# A fixed random_state makes both shuffles reproducible, so the splits and
# everything derived from them stay the same across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)
test, validation = train_test_split(test, test_size=1/3, random_state=42)

train.shape, test.shape, validation.shape, df.shape

((6425, 2), (1836, 2), (918, 2), (9179, 2))

In [24]:
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Dataset, dropping the shuffled pandas index
# (preserve_index=False), and bundle the three under their split names.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame, preserve_index=False)
        for split_name, split_frame in {
            "train": train,
            "test": test,
            "validation": validation,
        }.items()
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 6425
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 1836
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags_str'],
        num_rows: 918
    })
})

In [25]:
# Inspect the first training example: its tokens and their string NER tags.
dataset['train'][0]

{'tokens': ['i',
  'am',
  'looking',
  'for',
  'a',
  'joes',
  'crab',
  'shack',
  'where',
  'is',
  'the',
  'nearest',
  'one'],
 'ner_tags_str': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-Restaurant_Name',
  'I-Restaurant_Name',
  'I-Restaurant_Name',
  'O',
  'O',
  'O',
  'B-Location',
  'I-Location']}