In [1]:
# https://spacy.io/models
# https://course.spacy.io/en/chapter4
# https://ljvmiranda921.github.io/notebook/2021/11/20/spacy-v3/
# https://ner.pythonhumanities.com/03_02_train_spacy_ner_model.html

import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

#with open("exercises/en/iphone.json", encoding="utf8") as f:
#    TEXTS = json.loads(f.read())

TEXTS = ['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and create docs with matched entities
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    print(spans)
    doc.ents = spans
    docs.append(doc)
print(TEXTS)

[iPhone X]
[iPhone X]
[iPhone X]
[iPhone 8]
[iPhone 11, iPhone 8]
[]
['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']


In [2]:
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

#with open("exercises/en/iphone.json", encoding="utf8") as f:
#    TEXTS = json.loads(f.read())
TEXTS = ['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']


nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = ([{"LOWER": "iphone"}, {"LOWER": "x"}])
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")

print(TEXTS)

['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']


In [None]:
#### NOVEL APPROACH
#### NOVEL APPROACH 

import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

#with open("exercises/en/iphone.json", encoding="utf8") as f:
#    TEXTS = json.loads(f.read())
TEXTS = ['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']


nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = ([{"LOWER": "iphone"}, {"LOWER": "x"}])
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")

print(TEXTS)

In [3]:
!python -m spacy init config --force ./config.cfg  --lang en --pipeline ner

[i] Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
#!python -m spacy train config.cfg --gpu-id 0 --paths.train train.spacy --paths.dev train.spacy --output output_folder

[i] Saving to output directory: output_folder

[2022-11-14 16:36:36,404] [INFO] Set up nlp object from config
[2022-11-14 16:36:36,408] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-11-14 16:36:36,416] [INFO] Created vocabulary
[2022-11-14 16:36:36,418] [INFO] Finished initializing nlp object
[2022-11-14 16:36:37,529] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     24.33    0.00    0.00    0.00    0.00
200     200          2.79    340.12  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline to output directory
output_folder\model-last

In [5]:
#!python -m spacy train config.cfg --output ./output          --paths.train ./exercises/en/train_gadget.spacy --paths.dev ./exercises/en/dev_gadget.spacy

!python -m spacy train config.cfg --gpu-id 0 --output ./output_folder   --paths.train ./train.spacy                     --paths.dev ./train.spacy

[i] Saving to output directory: output_folder
[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     24.33    0.00    0.00    0.00    0.00
200     200          2.79    340.12  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline

[2022-11-15 00:21:20,834] [INFO] Set up nlp object from config
[2022-11-15 00:21:20,845] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-11-15 00:21:20,849] [INFO] Created vocabulary
[2022-11-15 00:21:20,849] [INFO] Finished initializing nlp object
[2022-11-15 00:21:21,955] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [6]:
# TEST TRAINED NETWORK
doc = nlp('Apple is slowing down the iPhone 8 and iPhone X - how to stop it')
doc = nlp('How to preorder the iPhone X')
for ent in doc.ents:
    print(doc.ents)
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [7]:
doc = nlp('Apple is slowing down the iPhone 8 and iPhone X - how to stop it')
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities []


In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


In [9]:
!python -m spacy validate


[2K[+] Loaded compatibility table
[1m
[i] spaCy installation:
C:\Users\rob\anaconda3\envs\spacy-GPU-env\lib\site-packages\spacy

NAME              SPACY            VERSION      
en_core_web_lg    >=3.4.0,<3.5.0   3.4.0     [+]
en_core_web_md    >=3.4.0,<3.5.0   3.4.0     [+]
en_core_web_sm    >=3.4.0,<3.5.0   3.4.0     [+]
en_core_web_trf   >=3.4.0,<3.5.0   3.4.0     [+]
nl_core_news_lg   >=3.4.0,<3.5.0   3.4.0     [+]
nl_core_news_md   >=3.4.0,<3.5.0   3.4.0     [+]
nl_core_news_sm   >=3.4.0,<3.5.0   3.4.0     [+]

