<a href="https://colab.research.google.com/github/raymondwcs/learning_spacy/blob/main/spaCy_NER_training%2C_Entity_Ruler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet -U spacy
!python -m spacy download zh_core_web_lg
!git clone http://github.com/raymondwcs/learning_spacy

Collecting zh-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.1.0/zh_core_web_lg-3.1.0-py3-none-any.whl (603.8 MB)
[K     |████████████████████████████████| 603.8 MB 8.1 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_lg')
fatal: destination path 'learning_spacy' already exists and is not an empty directory.


In [2]:
import spacy
import random
from spacy.training.example import Example
from spacy import displacy


In [3]:
# Load the zh_core_web_lg pipeline, then initialise its tokenizer with the
# pkuseg model directory from the cloned learning_spacy checkout.
PKUSEG_MODEL_DIR = "./learning_spacy/spacy_pkuseg/models"
nlp = spacy.load("zh_core_web_lg")
nlp.tokenizer.initialize(pkuseg_model=PKUSEG_MODEL_DIR)

# Data for training the model-based NER
Training can also be done via the spaCy CLI; see the repository folder linked below for details.

https://github.com/raymondwcs/learning_spacy/tree/main/NER_Training_CLI

In [4]:
# Hand-labelled training examples.  Each item is
#   (text, {"entities": [(start, end, label), ...]})
# where start/end are character offsets into text (end exclusive).  The
# substring each span selects is noted beside it.
TRAIN_DATA = [
    (
        "一齊係國企成份股調整期間 大家一齊不問價掃貨 尤其是386 857 成份股佔指數比重只係5%左右 冇咩受新計法影響 挾死班歐美鬼佬 港股10月29000 遠必挾之",
        {"entities": [(26, 29, "STOCK"), (30, 33, "STOCK")]},  # "386", "857"
    ),
    (
        "高盛：維持對騰訊(0700)買入評級 目標價705港元高盛發表報告指,與騰訊管理層於路演活動溝通後,重申對騰訊嘅積極正面睇法。",
        {"entities": [(6, 8, "STOCK")]},  # "騰訊"
    ),
    (
        "中電、匯控、恒大8月暴升8.5%。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 8, "STOCK")]},  # "中電", "匯控", "恒大"
    ),
    (
        "匯控(00005)將於下周一（2日）公布2021年度中期業績。",
        {"entities": [(0, 2, "STOCK")]},  # "匯控"
    ),
    (
        "中電，匯控，港交所齊齊跌8.5%金融海嘯後最差。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 9, "STOCK")]},  # "中電", "匯控", "港交所"
    ),
    (
        "港燈將恢復派息",
        {"entities": [(0, 2, "STOCK")]},  # "港燈"
    ),
]

In [5]:
# Fetch the pipeline's "ner" component and register with it every entity
# label that occurs in TRAIN_DATA.
ner = nlp.get_pipe("ner")

# The throwaway name is `_text` rather than `_`: binding `_` at notebook top
# level overrides IPython's last-output variable.  Defaulting to [] lets an
# example without an "entities" key pass through instead of raising a
# TypeError on None.
for _text, annotations in TRAIN_DATA:
  for _start, _end, label in annotations.get("entities", []):
    ner.add_label(label)

# Start training

In [6]:
# Update only the "ner" component of the loaded pipeline on TRAIN_DATA.
epoch = 50  # number of full passes over the training examples
optimizer = nlp.resume_training()

# Shuffle a private copy with a dedicated, seeded RNG: the example order is
# the same on every run and TRAIN_DATA itself is left untouched for other
# cells.  Only the shuffle is seeded here; any randomness inside nlp.update
# (e.g. dropout) is not controlled by this.
shuffle_rng = random.Random(0)
train_examples = list(TRAIN_DATA)

# select_pipes replaces the deprecated disable_pipes: every component other
# than "ner" is disabled inside the block and restored on exit.
with nlp.select_pipes(enable=["ner"]):  # only train NER
  for itn in range(epoch):
    shuffle_rng.shuffle(train_examples)
    losses = {}  # passed to nlp.update, which records this epoch's losses in it
    # batch up the examples using spaCy's minibatch
    for batch in spacy.util.minibatch(train_examples, size=2):
      # Pair a freshly tokenised Doc with its gold annotations for each
      # (text, annotations) item in the batch.
      examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in batch
      ]
      nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
    print("Losses", losses)

Losses {'ner': 27.748337944976083}
Losses {'ner': 21.57304497384083}
Losses {'ner': 18.85354080900602}
Losses {'ner': 17.847891764059362}
Losses {'ner': 14.112840334841913}
Losses {'ner': 12.714001215883556}
Losses {'ner': 9.864464898056795}
Losses {'ner': 6.230198501315115}
Losses {'ner': 2.9674113108699984}
Losses {'ner': 2.264431454244175}
Losses {'ner': 1.1841055445805164}
Losses {'ner': 1.2169862022426081}
Losses {'ner': 0.15350530105653903}
Losses {'ner': 0.0009526065804629353}
Losses {'ner': 0.05458916711631747}
Losses {'ner': 1.3985732670141895e-05}
Losses {'ner': 0.002368482427535186}
Losses {'ner': 0.022687221133385282}
Losses {'ner': 2.535232420665878e-05}
Losses {'ner': 7.209452713331168e-05}
Losses {'ner': 4.1145673390392424e-05}
Losses {'ner': 1.257213718643376}
Losses {'ner': 6.462689302200198e-07}
Losses {'ner': 0.012546615980283168}
Losses {'ner': 2.466913827214344e-07}
Losses {'ner': 0.00011301531452379647}
Losses {'ner': 7.644904712889862e-06}
Losses {'ner': 6.507325

# Debug: check that entity offsets align with token boundaries

In [7]:
# Sanity check: report every annotated span whose character offsets do not
# line up with token boundaries (Doc.char_span returns None for those).
for text, annotations in TRAIN_DATA:
  # Tokenise once per text instead of once per entity.  make_doc runs only
  # the tokenizer, which is also how the training loop builds its Docs.
  doc = nlp.make_doc(text)
  for ent in annotations.get("entities", []):
    start, end = ent[0], ent[1]
    if doc.char_span(start, end) is None:  # start and end don't map to tokens
      print("Misaligned tokens", text, ent)

# Save trained model to disk

In [8]:
# Write the pipeline held in `nlp` to the ./ner_model directory.
nlp.to_disk('./ner_model')

# Load trained model from disk, define custom rules for EntityRuler

In [18]:
# Remove any entity_ruler left over from a previous run of this cell *before*
# restoring from disk.  In a top-to-bottom run ./ner_model is written before
# the ruler is added, so the saved directory holds no data for it and the
# pipeline being restored should not contain one.
if "entity_ruler" in nlp.pipe_names:
  nlp.remove_pipe("entity_ruler")

# Restore the saved pipeline state into `nlp` and re-initialise the tokenizer
# with the pkuseg model directory.
nlp.from_disk('./ner_model')
nlp.tokenizer.initialize(pkuseg_model="./learning_spacy/spacy_pkuseg/models")

# Token patterns for the rule-based entity ruler; every match is labelled STOCK.
# Whether a given string matches depends on how the tokenizer splits it.
patterns = [
    # a digit-only token, then ".", then "HK"
    {"label": "STOCK", "pattern": [{"IS_DIGIT": True},{"ORTH":"."},{"ORTH": "HK"}]},
    # a digit-only token wrapped in ASCII parentheses
    {"label": "STOCK", "pattern": [{"TEXT": "("},{"IS_DIGIT": True},{"TEXT": ")"}]},
    # a digit-only token wrapped in full-width parentheses
    {"label": "STOCK", "pattern": [{"TEXT": "（"},{"IS_DIGIT": True},{"TEXT": "）"}]},
    # Disabled alternative: match a fixed list of bare numeric codes.
    # {"label": "STOCK", "pattern": [{'lower': {'IN': ['581','323','347','1053','2600']}}]},
    # a token tagged NUM immediately followed by a token tagged NOUN
    {"label": "STOCK", "pattern": [{"POS": "NUM"},{"POS": "NOUN"}]},
]

# Insert the ruler ahead of "ner" in the pipeline and load the patterns into it.
entity_ruler = nlp.add_pipe("entity_ruler", before='ner')
entity_ruler.add_patterns(patterns)

# Test the trained NER model with EntityRuler

In [20]:
# Sample sentences fed through the pipeline by the rendering loop in this
# cell; none of them appears in TRAIN_DATA.
sentences = [
  "睇好港鐡(0066.HK)強烈買入 😎",
  "內地疫情受控,旅遊相關股同程藝龍(780)最近區間上落橫行,大戶收集似近完成,中線支持位$13.83,可考慮作中長線投資。",
  "睿見教育（6068.HK）：凈利與現金流大增超四成，估值吸引力凸顯，獲大行睇多	隻1765都賺多幾成 升左少少就跌凸 支股即係支股  🤷 ",
  "密切留意鋼鐵股，581中國東方，323馬鋼，347鞍鋼，1053重鋼，2600中鋁！",
]

def replace_zh_punctuation(sentence):
  """Return `sentence` with full-width parentheses replaced by ASCII ones.

  "（" becomes "(" and "）" becomes ")"; every other character is unchanged.
  """
  # Chained directly on the argument so that no local named `str` is needed,
  # which would shadow the built-in type.
  return sentence.replace("（", "(").replace("）", ")")

# Normalise each sample sentence, run it through the pipeline and render the
# resulting entities inline in the notebook.
for sentence in map(replace_zh_punctuation, sentences):
  doc = nlp(sentence)
  displacy.render(doc, style='ent', jupyter=True)