In [65]:
import random

# Units for each quantity
units = {
    "width": ["cm", "ft", "in", "m", "mm", "yd"],
    "depth": ["cm", "ft", "in", "m", "mm", "yd"],
    "height": ["cm", "ft", "in", "m", "mm", "yd"],
    "item_weight": ["g", "kg", "µg", "mg", "oz", "lb", "t"],
    "maximum_weight_recommendation": ["g", "kg", "µg", "mg", "oz", "lb", "t"],
    "voltage": ["kV", "mV", "V"],
    "wattage": ["kW", "W"],
    "item_volume": ["cL", "cu ft", "cu in", "cup", "dL", "fl oz", "gal", "imp gal", "L", "µL", "mL", "pint", "qt"]
}

def generate_training_data(num_sentences=5000):
    data = []
    for _ in range(num_sentences):
        quantity = random.choice(list(units.keys()))
        unit = random.choice(units[quantity])
        number = random.randint(1, 1000)
        sentence = f"The {quantity} is {number} {unit}."

        # Find the start and end indices of the number and unit using regex
        import re
        match = re.search(rf"({number})\s+({unit})", sentence)
        if match:
            number_start, number_end = match.start(1), match.end(1)
            unit_start, unit_end = match.start(2), match.end(2)

            # Create the entity annotation
            entity = [(number_start, unit_end, "QUANTITY")]
            data.append({"text": sentence, "entities": entity})
        else:
            print(f"Skipping sentence due to no match: {sentence}")
    return data

# Generate training data and save it to a JSON file
training_data = generate_training_data(num_sentences=5000)
print(training_data)

[{'text': 'The item_weight is 419 g.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The depth is 590 m.', 'entities': [(13, 18, 'QUANTITY')]}, {'text': 'The width is 661 cm.', 'entities': [(13, 19, 'QUANTITY')]}, {'text': 'The depth is 410 in.', 'entities': [(13, 19, 'QUANTITY')]}, {'text': 'The item_volume is 539 cL.', 'entities': [(19, 25, 'QUANTITY')]}, {'text': 'The item_weight is 548 g.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The voltage is 334 kV.', 'entities': [(15, 21, 'QUANTITY')]}, {'text': 'The item_weight is 903 t.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The item_volume is 678 L.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The item_weight is 611 g.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The width is 107 cm.', 'entities': [(13, 19, 'QUANTITY')]}, {'text': 'The item_weight is 571 g.', 'entities': [(19, 24, 'QUANTITY')]}, {'text': 'The item_volume is 958 cu in.', 'entities': [(19, 28, 'QUANTITY')]}, {'text': 'The voltage is 838 kV.', 'entitie

In [66]:
import pandas as pd
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")

db = DocBin() # create a DocBin object

for item in tqdm(training_data):  # item is a dictionary
    text = item["text"]  # get the text
    annot = item["entities"]  # get the entities
    doc = nlp.make_doc(text)  # create doc object from text
    ents = []
    for start, end, label in annot:  # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity in text: {text}")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy") # save the docbin object

100%|██████████| 5000/5000 [00:00<00:00, 21167.06it/s]


In [67]:
nlp = spacy.load("./output/model-best")

In [78]:
doc = nlp("I love pancake")
for entity in doc.ents:
    print(entity.text)