In [1]:
import re

file_path = 'restauranttrain.bio'

with open(file_path, 'r') as f:
    content = f.read()

# Split content by double newline as each block represents a data point.
blocks = content.strip().split('\n\n')

train_data = []

for block in blocks:
    lines = block.split('\n')
    sentence = []
    entities = []
    start_idx = 0
    
    for line in lines:
        if line.startswith('O\t'):
            # "O" represents tokens that are not part of any entity.
            token = line[2:]
            sentence.append(token)
            start_idx += len(token) + 1  # 1 is for the space between tokens.
        else:
            # Extract entity information.
            tag, token = re.match(r'(B|I)-(\w+)\t(.+)', line).groups()[1:]
            sentence.append(token)
            end_idx = start_idx + len(token)
            entities.append((start_idx, end_idx, tag))
            start_idx = end_idx + 1  # 1 is for the space between tokens.

    train_data.append((' '.join(sentence), {'entities': entities}))

# Print some examples to make sure.
for i in range(3):
    print(train_data[i])

('2 start restaurants with inside dining', {'entities': [(0, 1, 'Rating'), (2, 7, 'Rating'), (25, 31, 'Amenity'), (32, 38, 'Amenity')]})
('34', {'entities': []})
('5 star resturants in my town', {'entities': [(0, 1, 'Rating'), (2, 6, 'Rating'), (18, 20, 'Location'), (21, 23, 'Location'), (24, 28, 'Location')]})


In [2]:
import spacy
from spacy.training.example import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe('ner')

In [3]:
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])


In [4]:
from tqdm import tqdm

optimizer = nlp.begin_training()

for i in tqdm(range(5), desc='Epochs'):  # Number of training iterations
    total_loss = 0
    for text, annotations in tqdm(train_data, desc='Training', leave=False):
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        losses = {}
        nlp.update([example], drop=0.2, losses=losses)
        total_loss += losses['ner']
    print(f"Epoch {i}, Loss: {total_loss}")


Epochs:  20%|██        | 1/5 [01:25<05:42, 85.52s/it]

Epoch 0, Loss: 17384.022361525844


Epochs:  40%|████      | 2/5 [02:51<04:17, 85.69s/it]

Epoch 1, Loss: 12887.014031802471


Epochs:  60%|██████    | 3/5 [04:16<02:51, 85.67s/it]

Epoch 2, Loss: 11462.276149228494


Epochs:  80%|████████  | 4/5 [05:42<01:25, 85.69s/it]

Epoch 3, Loss: 10489.521462399167


Epochs: 100%|██████████| 5/5 [07:09<00:00, 85.85s/it]

Epoch 4, Loss: 9887.95163881269





In [5]:
nlp.to_disk("path_to_save_model")

In [9]:
nlp = spacy.load("path_to_save_model")
doc = nlp("5 star resturants in my town")

# Extract entities
entities = {ent.label_: ent.text for ent in doc.ents}
print(entities)

{'Rating': 'star', 'Location': 'town'}
