# Creating training data

In [1]:
import json

In [2]:
from spacy.matcher import Matcher

In [3]:
from spacy.lang.en import English

In [4]:
TEXTS = [
  "How to preorder the iPhone X",
  "iPhone X is coming",
  "Should I pay $1,000 for the iPhone X?",
  "The iPhone 8 reviews are here",
  "Your iPhone goes up to 11 today",
  "I need a new phone! Any tips?"
]

In [5]:
nlp = English()

In [6]:
matcher = Matcher(nlp.vocab)

In [7]:
# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

In [8]:
# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]

In [9]:
# Add patterns to the matcher
matcher.add("GADGET", None, pattern1, pattern2)

In [10]:
TRAINING_DATA = []

In [11]:
# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

In [12]:
print(*TRAINING_DATA, sep="\n")

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 10, 'GADGET'), (4, 12, 'GADGET')]})
('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})
