In [2]:
import spacy
from spacy.tokens import Doc, Span,Token
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{'LOWER': 'iphone'}, {'LOWER': 'x'}]

# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{'LOWER': 'iphone'}, {'IS_DIGIT': True, 'OP': '?'}]

# Add patterns to the matcher
matcher.add('GADGET', None, pattern1, pattern2)

In [4]:
TEXTS=['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', 'Your iPhone goes up to 11 today', 'I need a new phone! Any tips?']

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Find the matches in the doc
    matches = matcher(doc)
    
    # Get a list of (start, end, label) tuples of matches in the text
    entities = [(start, end, 'GADGET') for match_id, start, end in matches]
    print(doc.text, entities)    

How to preorder the iPhone X [(4, 6, 'GADGET'), (4, 5, 'GADGET')]
iPhone X is coming [(0, 2, 'GADGET'), (0, 1, 'GADGET')]
Should I pay $1,000 for the iPhone X? [(7, 9, 'GADGET'), (7, 8, 'GADGET')]
The iPhone 8 reviews are here [(1, 3, 'GADGET')]
Your iPhone goes up to 11 today [(1, 2, 'GADGET')]
I need a new phone! Any tips? []


In [5]:
TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, 'GADGET') for span in spans]
    
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {'entities': entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)
    
print(*TRAINING_DATA, sep='\n')   

('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


In [6]:
# Create a blank 'en' model
nlp = spacy.blank('en')

# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Add the label 'GADGET' to the entity recognizer
ner.add_label('GADGET')

In [8]:
# Start the training
nlp.begin_training()
import random 
# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}
    
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]
        
        # Update the model
        nlp.update(texts, annotations, losses=losses)
        print(losses)

{'ner': 3.433677762746811}
{'ner': 5.936666294932365}
{'ner': 7.112199451876222}
{'ner': 0.6542383352061755}
{'ner': 0.6542383706891249}
{'ner': 0.6550147984360253}
{'ner': 1.877040375803071e-12}
{'ner': 0.0020535622601711994}
{'ner': 0.007795572455185397}
{'ner': 2.1864920008888236e-13}
{'ner': 3.92556733717065e-13}
{'ner': 5.62158963606562e-08}
{'ner': 3.533059136383577e-10}
{'ner': 1.2803596326004853e-07}
{'ner': 1.2819330285215678e-07}
{'ner': 6.59359803741132e-10}
{'ner': 1.458360137738461e-05}
{'ner': 0.015062897416354675}
{'ner': 5.832131998056795e-10}
{'ner': 9.109583810941555e-10}
{'ner': 9.109583813258981e-10}
{'ner': 2.9623015801658015e-16}
{'ner': 2.9625896394810323e-16}
{'ner': 6.806061345150098e-10}
{'ner': 6.323084563242539e-10}
{'ner': 6.32586891940723e-10}
{'ner': 6.327781508151673e-10}
{'ner': 4.518053306539394e-12}
{'ner': 4.5180539830787156e-12}
{'ner': 1.9842889902471378e-10}


In [9]:
TEST_DATA=['Apple is slowing down the iPhone 8 and iPhone X - how to stop it', "I finally understand what the iPhone X 'notch' is for", 'Everything you need to know about the Samsung Galaxy S9', 'Looking to compare iPad models? Here’s how the 2018 lineup stacks up', 'The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple', 'what is the cheapest ipad, especially ipad pro???', 'Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics']

In [10]:
# Process each text in TEST_DATA
for doc in nlp.pipe(TEST_DATA):
    # Print the document text and entitites
    print(doc.text)
    print(doc.ents, '\n\n')

Apple is slowing down the iPhone 8 and iPhone X - how to stop it
(iPhone, iPhone) 


I finally understand what the iPhone X 'notch' is for
(iPhone,) 


Everything you need to know about the Samsung Galaxy S9
(you,) 


Looking to compare iPad models? Here’s how the 2018 lineup stacks up
(iPad,) 


The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple
(iPhone 8, iPhone 8, and) 


what is the cheapest ipad, especially ipad pro???
(,,) 


Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics
(and,) 


