<a href="https://colab.research.google.com/github/raymondwcs/learning_spacy/blob/main/spaCy_NER_training%2C_Entity_Ruler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This example demonstrates how to combine rule-based matching (spaCy's EntityRuler) with a model-based NER to locate stock codes.

In [1]:
!pip install --quiet -U spacy
!python -m spacy download zh_core_web_lg
!git clone http://github.com/raymondwcs/learning_spacy

[K     |████████████████████████████████| 5.9 MB 29.4 MB/s 
[K     |████████████████████████████████| 623 kB 47.8 MB/s 
[K     |████████████████████████████████| 10.1 MB 44.3 MB/s 
[K     |████████████████████████████████| 42 kB 1.6 MB/s 
[K     |████████████████████████████████| 456 kB 58.3 MB/s 
[?25hCollecting zh-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.1.0/zh_core_web_lg-3.1.0-py3-none-any.whl (603.8 MB)
[K     |████████████████████████████████| 603.8 MB 8.4 kB/s 
Collecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp37-cp37m-manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 19.9 MB/s 
Installing collected packages: spacy-pkuseg, zh-core-web-lg
Successfully installed spacy-pkuseg-0.0.28 zh-core-web-lg-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_lg')
Cloning into 'learning_spacy'...
r

In [2]:
import spacy
import random
from spacy.training.example import Example
from spacy import displacy


In [3]:
# Load the pretrained Chinese pipeline downloaded in the setup cell.
nlp = spacy.load('zh_core_web_lg')
# Point the tokenizer at the pkuseg model directory shipped in the cloned
# learning_spacy repo (presumably a custom segmentation model; verify against the repo).
nlp.tokenizer.initialize(pkuseg_model="./learning_spacy/spacy_pkuseg/models")

# Data for training the model-based NER
Training can also be done via the spaCy CLI; see the link below for details.

https://github.com/raymondwcs/learning_spacy/tree/main/NER_Training_CLI

In [4]:
# Training examples for the NER component.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]});
# offsets are character indices into the text, end-exclusive.
TRAIN_DATA = [
    (
        "一齊係國企成份股調整期間 大家一齊不問價掃貨 尤其是386 857 成份股佔指數比重只係5%左右 冇咩受新計法影響 挾死班歐美鬼佬 港股10月29000 遠必挾之",
        {"entities": [(26, 29, "STOCK"), (30, 33, "STOCK")]},  # "386", "857"
    ),
    (
        # NOTE(review): "騰訊" appears twice more later in this text without an
        # annotation; confirm that leaving those mentions unlabelled is intended.
        "高盛：維持對騰訊(0700)買入評級 目標價705港元高盛發表報告指,與騰訊管理層於路演活動溝通後,重申對騰訊嘅積極正面睇法。",
        {"entities": [(6, 8, "STOCK")]},  # first "騰訊"
    ),
    (
        "中電、匯控、恒大8月暴升8.5%。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 8, "STOCK")]},
    ),
    (
        "匯控(00005)將於下周一（2日）公布2021年度中期業績。",
        {"entities": [(0, 2, "STOCK")]},
    ),
    (
        "中電，匯控，港交所齊齊跌8.5%金融海嘯後最差。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 9, "STOCK")]},
    ),
    (
        "港燈將恢復派息",
        {"entities": [(0, 2, "STOCK")]},
    ),
]

In [5]:
# Fetch the NER component and register every label used in the training
# annotations so the model can predict it.
ner = nlp.get_pipe("ner")

for _text, annotation in TRAIN_DATA:
    for _start, _end, label in annotation.get("entities"):
        ner.add_label(label)

# Start training

In [6]:
epoch = 50  # number of full passes over the training examples
optimizer = nlp.resume_training()

# Only the NER component should be updated, so every other pipe is disabled
# for the duration of the loop.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

# Shuffle a working copy so TRAIN_DATA keeps its original order and re-running
# this cell does not depend on how often it was shuffled before.
shuffled_data = list(TRAIN_DATA)

# select_pipes replaces disable_pipes, which is deprecated in spaCy v3.
with nlp.select_pipes(disable=other_pipes):
    for _ in range(epoch):
        random.shuffle(shuffled_data)
        losses = {}
        # Batch up the examples using spaCy's minibatch helper.
        for batch in spacy.util.minibatch(shuffled_data, size=2):
            examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]
            nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
        print("Losses", losses)

Losses {'ner': 27.174205583445975}
Losses {'ner': 20.602176457318798}
Losses {'ner': 17.820162218805145}
Losses {'ner': 15.422787848095565}
Losses {'ner': 14.271302128220004}
Losses {'ner': 12.36997590793824}
Losses {'ner': 11.812037936693876}
Losses {'ner': 7.068529113365521}
Losses {'ner': 3.0906280035020757}
Losses {'ner': 2.343405767949662}
Losses {'ner': 0.47858492012588627}
Losses {'ner': 0.08122197914558081}
Losses {'ner': 0.18780374568056063}
Losses {'ner': 0.21515904354306098}
Losses {'ner': 0.25429279631817703}
Losses {'ner': 0.0004915435000139884}
Losses {'ner': 1.796596378614676e-05}
Losses {'ner': 0.8312920596870434}
Losses {'ner': 7.568776269697458e-05}
Losses {'ner': 2.750554704010387e-06}
Losses {'ner': 6.112969720922181e-07}
Losses {'ner': 1.629736456224959e-05}
Losses {'ner': 6.067612626545516e-07}
Losses {'ner': 2.900375534370305e-05}
Losses {'ner': 4.359029867693783e-08}
Losses {'ner': 3.781668739592886e-08}
Losses {'ner': 6.021872108463889e-06}
Losses {'ner': 1.577

# Debug

In [7]:
# Sanity check: report every annotated character span that does not line up
# with token boundaries (Doc.char_span returns None in that case).
for text, annotations in TRAIN_DATA:
    # Tokenise once per text instead of once per entity. Only the tokenisation
    # matters for the alignment check, so make_doc is used rather than the full
    # pipeline; it is also what the training loop passes to Example.from_dict.
    doc = nlp.make_doc(text)
    for ent in annotations.get("entities"):
        if doc.char_span(ent[0], ent[1]) is None:  # start and end don't map to tokens
            print("Misaligned tokens", text, ent)

# Save trained model to disk

In [8]:
# Write the pipeline, including the NER updated above, to ./ner_model;
# the next cell reloads it from that directory with spacy.load.
nlp.to_disk('./ner_model')

# Load trained model from disk

In [22]:
# Reload the pipeline that was saved to ./ner_model.
nlp = spacy.load('./ner_model')
# Initialise the tokenizer with the same pkuseg model directory that was used
# before training, so segmentation matches what the NER was trained on.
# NOTE(review): the saved pipeline may already carry this model - confirm whether this call is still needed.
nlp.tokenizer.initialize(pkuseg_model="./learning_spacy/spacy_pkuseg/models")

# Define custom rules for EntityRuler

In [23]:
# Token patterns for the EntityRuler.
patterns = [
    # Code split into three tokens: digits, ".", "HK".
    {"label": "STOCK", "pattern": [{"IS_DIGIT": True}, {"ORTH": "."}, {"ORTH": "HK"}]},
    # Same code kept as a single token. Raw string so that \d and \. reach the
    # regex engine without relying on Python's invalid-escape fallback.
    {"label": "STOCK", "pattern": [{"TEXT": {"REGEX": r"\d+\.HK"}}]},
    # A number followed by a noun gets the provisional label STOCK2; the
    # remove_stockno_suffix component later trims the noun and relabels it STOCK.
    {"label": "STOCK2", "pattern": [{"POS": "NUM"}, {"POS": "NOUN"}]},
]

# Drop any ruler left over from a previous run of this cell, then insert a
# fresh one ahead of the statistical NER and load the patterns into it.
if nlp.has_pipe("entity_ruler"):
    nlp.remove_pipe("entity_ruler")

entity_ruler = nlp.add_pipe("entity_ruler", before="ner")
entity_ruler.add_patterns(patterns)

# Remove the trailing noun from entities labelled 'STOCK2'

In [24]:
from spacy.language import Language
from spacy.tokens import Span

@Language.component("remove_stockno_suffix")
def remove_stockno_suffix(doc):
    """Relabel ``STOCK2`` entities as ``STOCK`` after trimming their trailing noun.

    Entities with any other label are passed through unchanged. A ``STOCK2``
    entity that cannot be trimmed (it has a single token, or its last token is
    not tagged ``NOUN``) is kept as it is instead of being silently dropped.

    Args:
        doc: The ``Doc`` being processed; its ``ents`` are replaced in place.

    Returns:
        The same ``doc`` with the updated entity list.
    """
    new_ents = []
    for ent in doc.ents:
        if ent.label_ != "STOCK2":
            new_ents.append(ent)
            continue

        # Look at the entity's own last token: that is the token being removed,
        # and indexing it can never run past the end of the doc.
        last_token = doc[ent.end - 1]
        # Require at least two tokens so the trimmed span is never empty.
        if len(ent) > 1 and last_token.pos_ == "NOUN":
            new_ents.append(Span(doc, ent.start, ent.end - 1, label="STOCK"))
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc

# Place the clean-up component directly after the named entity recognizer.
# Any copy registered by an earlier run of this cell is removed first.
if nlp.has_pipe("remove_stockno_suffix"):
    nlp.remove_pipe("remove_stockno_suffix")

nlp.add_pipe("remove_stockno_suffix", after="ner")

<function __main__.remove_stockno_suffix>

# Test the trained NER model and EntityRuler

In [25]:
# Sample texts for eyeballing the combined pipeline. In order they contain:
# a ".HK" code in ASCII parentheses, a bare numeric code in ASCII parentheses,
# a ".HK" code in full-width parentheses plus a bare number, and several
# number + company-name pairs.
sentences = [
  "睇好港鐡(0066.HK)強烈買入 😎",
  "內地疫情受控,旅遊相關股同程藝龍(780)最近區間上落橫行,大戶收集似近完成,中線支持位$13.83,可考慮作中長線投資。",
  "睿見教育（6068.HK）：凈利與現金流大增超四成，估值吸引力凸顯，獲大行睇多	隻1765都賺多幾成 升左少少就跌凸 支股即係支股  🤷 ",
  "密切留意鋼鐵股，581中國東方，323馬鋼，347鞍鋼，1053重鋼，2600中鋁！",
]

# def replace_zh_punctuation(sentence):
#   str = sentence
#   str = str.replace("（", "(")
#   str = str.replace("）", ")")
#   return(str)

# Run each sample through the pipeline and show the recognised entities inline.
for sentence in sentences:
    # sentence = replace_zh_punctuation(sentence)  # optional normalisation, currently disabled
    doc = nlp(sentence)
    displacy.render(doc, style="ent", jupyter=True)