<a href="https://colab.research.google.com/github/raymondwcs/learning_spacy/blob/main/spaCy_NER_training%2C_Entity_Ruler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This example demonstrates how to combine a model-based NER with rule-based matching (spaCy's EntityRuler) to locate stock codes in text.

In [1]:
!pip install --quiet -U spacy
!python -m spacy download zh_core_web_lg
!git clone http://github.com/raymondwcs/learning_spacy

Collecting zh-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.1.0/zh_core_web_lg-3.1.0-py3-none-any.whl (603.8 MB)
[K     |████████████████████████████████| 603.8 MB 8.9 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_lg')
fatal: destination path 'learning_spacy' already exists and is not an empty directory.


In [2]:
import spacy
import random
from spacy.training.example import Example
from spacy import displacy
import re


In [3]:
# Load the pretrained Chinese pipeline downloaded in the first cell, then
# initialize its tokenizer with the pkuseg model found under the cloned
# learning_spacy directory.
pkuseg_model_dir = "./learning_spacy/spacy_pkuseg/models"
nlp = spacy.load('zh_core_web_lg')
nlp.tokenizer.initialize(pkuseg_model=pkuseg_model_dir)

# Part 1 - Train model-based NER

## Data for training the model-based NER
Training can also be done with the spaCy command-line interface (CLI); see the link below for details.

https://github.com/raymondwcs/learning_spacy/tree/main/NER_Training_CLI

In [4]:
def _stock_spans(*char_spans):
    """Build a spaCy annotation dict labelling each (start, end) character span as STOCK."""
    return {"entities": [(start, end, "STOCK") for start, end in char_spans]}


# Each item is (text, annotations); offsets are character positions into the
# text, end-exclusive.
TRAIN_DATA = [
    (
        "一齊係國企成份股調整期間 大家一齊不問價掃貨 尤其是386 857 成份股佔指數比重只係5%左右 冇咩受新計法影響 挾死班歐美鬼佬 港股10月29000 遠必挾之",
        _stock_spans((26, 29), (30, 33)),
    ),
    (
        "高盛：維持對騰訊(0700)買入評級 目標價705港元高盛發表報告指,與騰訊管理層於路演活動溝通後,重申對騰訊嘅積極正面睇法。",
        _stock_spans((6, 8)),
    ),
    (
        "中電、匯控、恒大8月暴升8.5%。",
        _stock_spans((0, 2), (3, 5), (6, 8)),
    ),
    (
        "匯控(00005)將於下周一（2日）公布2021年度中期業績。",
        _stock_spans((0, 2)),
    ),
    (
        "中電，匯控，港交所齊齊跌8.5%金融海嘯後最差。",
        _stock_spans((0, 2), (3, 5), (6, 9)),
    ),
    (
        "港燈將恢復派息",
        _stock_spans((0, 2)),
    ),
]

In [5]:
# Fetch the NER component and register every label used in TRAIN_DATA with it.
ner = nlp.get_pipe("ner")

for _text, annotation in TRAIN_DATA:
  for _start, _end, label in annotation.get("entities"):
    ner.add_label(label)

## Start training

In [6]:
epoch = 50  # number of passes over TRAIN_DATA
optimizer = nlp.resume_training()

# Names of every component except NER, so they can be switched off while the
# NER component is updated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# select_pipes() is the spaCy v3 replacement for the deprecated disable_pipes().
with nlp.select_pipes(disable=other_pipes):  # only train NER
  for itn in range(epoch):
    # Shuffle a copy so TRAIN_DATA itself keeps its order for later cells.
    shuffled = random.sample(TRAIN_DATA, k=len(TRAIN_DATA))
    losses = {}
    # batch up the examples using spaCy's minibatch
    for batch in spacy.util.minibatch(shuffled, size=2):
      examples = [
          Example.from_dict(nlp.make_doc(text), annotations)
          for text, annotations in batch
      ]
      nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
    print("Losses", losses)

Losses {'ner': 30.66365236725184}
Losses {'ner': 20.535725836958818}
Losses {'ner': 18.337547548141224}
Losses {'ner': 16.068195706656272}
Losses {'ner': 13.498557920853894}
Losses {'ner': 12.493912998414066}
Losses {'ner': 9.760448115241282}
Losses {'ner': 6.151994594813331}
Losses {'ner': 4.536754130217901}
Losses {'ner': 3.279793493113438}
Losses {'ner': 0.2823935921201741}
Losses {'ner': 0.574943195942957}
Losses {'ner': 0.6645287696830977}
Losses {'ner': 0.04638215958796692}
Losses {'ner': 0.0012096859049819531}
Losses {'ner': 0.004151983546153217}
Losses {'ner': 0.0332430002578767}
Losses {'ner': 3.0275319816197946e-05}
Losses {'ner': 0.00024175256444800838}
Losses {'ner': 1.1689136720023962e-06}
Losses {'ner': 1.1129212454450931e-07}
Losses {'ner': 2.325519552073216e-06}
Losses {'ner': 3.2701564812786366e-07}
Losses {'ner': 1.8495704051915652e-08}
Losses {'ner': 2.1419984237256308e-05}
Losses {'ner': 3.146855669003286e-05}
Losses {'ner': 4.188323901551126e-06}
Losses {'ner': 2.0

## Debug

In [7]:
# Report every annotated character span that does not line up with token
# boundaries (Doc.char_span returns None in that case).
for text, annotations in TRAIN_DATA:
  # Tokenize once per text instead of running the whole pipeline once per
  # entity; make_doc is the same tokenization the training loop uses.
  doc = nlp.make_doc(text)
  for ent in annotations.get("entities"):
    char_span = doc.char_span(ent[0], ent[1])
    if char_span is None:  # start and end don't map to tokens
        print("Misaligned tokens", text, ent)

## Save trained model to disk

In [8]:
# Write the pipeline to ./ner_model; Part 2 loads it back from the same path.
trained_model_dir = './ner_model'
nlp.to_disk(trained_model_dir)

# Part 2 - Augment the trained NER with EntityRuler rules

## Load trained model from disk

In [9]:
# Load the pipeline saved to ./ner_model and initialize its tokenizer with the
# pkuseg model under the cloned learning_spacy directory.
pkuseg_model_dir = "./learning_spacy/spacy_pkuseg/models"
nlp = spacy.load('./ner_model')
nlp.tokenizer.initialize(pkuseg_model=pkuseg_model_dir)

## Define custom rules for EntityRuler

In [10]:
# Explicit import instead of `from pandas import *`, so it is clear where
# read_csv comes from and the notebook namespace is not flooded.
import pandas as pd

# Spreadsheet linked by the original author (presumably the source of stockcodes.csv):
# https://docs.google.com/spreadsheets/d/1-btKGkOo_ywPeH8qTlJL0IKF1bxeaPEKTVhAOPeYAU8/edit?usp=sharing
# dtype=str keeps every column as text, so the codes can be matched against token text.
data = pd.read_csv("stockcodes.csv", dtype=str)

stockcodes = data.iloc[:, 1].to_list()   # STOCK
orgs = data.iloc[:, 2].to_list()         # ORG-S

org_alias = ['5號仔','大笨象']     # ORG-S-A

# Peek at the first few values only.
print(stockcodes[:10])
print(orgs[:10])

['2', '3', '4', '5', '6', '8', '10', '11', '12', '14']
['中電控股', '香港中華煤氣', '九龍倉集團', '匯豐控股', '電能實業', '電訊盈科', '恒隆集團', '恒生銀行', '恒基地產', '希慎興業']


In [11]:
# Token patterns for the EntityRuler. STOCK2 / STOCK3 are provisional labels
# that the custom components defined in later cells either promote to STOCK or drop.
patterns = [
    # digits "." "HK" as three separate tokens
    {"label": "STOCK", "pattern": [{"IS_DIGIT": True},{"ORTH":"."},{"ORTH": "HK"}]},
    # the same shape kept as a single token; raw string so that \d and \. reach
    # the regex engine without triggering invalid-escape warnings
    {"label": "STOCK", "pattern": [{"TEXT": {"REGEX": r"\d+\.HK"}}]},
    # digits wrapped in ASCII parentheses
    {"label": "STOCK", "pattern": [{"ORTH": "("},{"IS_DIGIT": True},{"ORTH": ")"}]},
    # digits followed by a token already tagged ORG
    # NOTE(review): the ruler is added before 'ner' below, so ENT_TYPE is
    # probably not set yet when this pattern is evaluated - confirm it ever matches.
    {"label": "STOCK", "pattern": [{"IS_DIGIT": True},{"ENT_TYPE": "ORG"}]},
    # a number followed by a noun
    {"label": "STOCK2", "pattern": [{"POS": "NUM"},{"POS": "NOUN"}]},
    # a token equal to one of the values loaded into `stockcodes`
    {"label": "STOCK3", "pattern": [{'TEXT': {'IN': stockcodes}}]},
    # a token equal to one of the values loaded into `orgs`, or to a listed alias
    {"label": "ORG-S", "pattern": [{'ORTH': {'IN': orgs}}]},
    {"label": "ORG-S-A", "pattern": [{'ORTH': {'IN': org_alias}}]},
]

# Drop a ruler left over from a previous run of this cell so it can be re-run.
if "entity_ruler" in nlp.pipe_names:
  nlp.remove_pipe("entity_ruler")

entity_ruler = nlp.add_pipe("entity_ruler", before='ner')
entity_ruler.add_patterns(patterns)

## Check stock suffix ('STOCK', 'STOCK3')

In [12]:
from spacy.language import Language
from spacy.tokens import Span

def print_ent_details(ents):
    """Print the text and label of every entity in ``ents``.

    Args:
        ents: iterable of objects exposing ``text`` and ``label_``
            (for example ``doc.ents``).
    """
    # Iterate over the argument; the previous version ignored it and read a
    # global ``doc`` instead.
    ent_details = [[ent.text, ent.label_] for ent in ents]
    print('doc.ents: {}'.format(ent_details))

# NOTE: the Python name `remove_stockno_suffix` is defined again in the next
# cell for a different component. The pipeline refers to this one by its
# registered name, "check_stockno_suffix".
@Language.component("check_stockno_suffix")
def remove_stockno_suffix(doc):
    """Filter STOCK / STOCK3 entities that look like amounts rather than stock codes.

    Entities with any other label are kept unchanged. A STOCK or STOCK3 entity
    is kept (relabelled as STOCK) only when:
      * its text contains none of the Chinese numeral characters listed below,
      * its text contains neither 蚊 nor 號, and
      * the token at ``ent.start + 1`` contains none of 元 / 文 / 蚊.
    Otherwise the entity is dropped.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same Doc with ``doc.ents`` replaced by the filtered entities.
    """
    # print_ent_details(doc.ents)
    new_ents = []
    # Read doc.ents once instead of re-reading it on every index access.
    for ent in doc.ents:
        if ent.label_ != "STOCK3" and ent.label_ != "STOCK":
            new_ents.append(ent)
            continue
        # NOTE(review): for multi-token entities start + 1 is still inside the
        # entity; `ent.end` may have been intended - confirm.
        next_index = ent.start + 1
        # An entity ending the doc has no following token; indexing past the
        # end used to raise IndexError, so treat it as "no suffix".
        next_text = doc[next_index].text if next_index < len(doc) else ""
        if  not re.search(r'[一二三四十五六七八九十零百千萬億壹貳參叄肆伍陸柒捌玖拾佰仟]+',ent.text) and \
            not re.search(r'[蚊號]',ent.text) and \
            not re.search(r'[元文蚊]',next_text):
            new_ents.append(Span(doc, ent.start, ent.end, label="STOCK"))
    doc.ents = new_ents
    # print_ent_details(doc.ents)
    return doc

# Keep the cell re-runnable: drop any copy added by an earlier run, then
# insert the component right after the named entity recognizer.
if nlp.has_pipe("check_stockno_suffix"):
  nlp.remove_pipe("check_stockno_suffix")

nlp.add_pipe("check_stockno_suffix", after="ner")

<function __main__.remove_stockno_suffix>

## Remove stock noun suffix ('STOCK2')

In [13]:
from spacy.language import Language
from spacy.tokens import Span

@Language.component("remove_stockno_suffix")
def remove_stockno_suffix(doc):
    """Turn STOCK2 entities into STOCK entities without their last token.

    Entities with any other label pass through untouched. A STOCK2 entity is
    replaced by a STOCK span covering ``ent.start`` to ``ent.end - 1`` when the
    token at ``ent.start + 1`` is tagged NOUN; otherwise it is dropped.
    """
    kept = []
    for span in doc.ents:
        if span.label_ != "STOCK2":
            kept.append(span)
            continue
        follower = doc[span.start + 1]
        if follower.pos_ == "NOUN":
            kept.append(Span(doc, span.start, span.end - 1, label="STOCK"))
    doc.ents = kept
    return doc

# Keep the cell re-runnable: drop any copy added by an earlier run, then
# insert the component right after the named entity recognizer.
if nlp.has_pipe("remove_stockno_suffix"):
  nlp.remove_pipe("remove_stockno_suffix")

nlp.add_pipe("remove_stockno_suffix", after="ner")

<function __main__.remove_stockno_suffix>

# Test the trained NER model and EntityRuler

In [14]:
# Sample posts used to exercise the combined NER + rule pipeline.
# The embedded tab characters and trailing spaces are part of the test input.
sentences = [
    "睇好港鐡(0066.HK)強烈建議買入 😎",
    "內地疫情受控,旅遊相關股同程藝龍(780)最近區間上落橫行,大戶收集似近完成,中線支持位$13.83,可考慮作中長線投資。",
    "睿見教育（6068.HK）：凈利與現金流大增超四成，估值吸引力凸顯，獲大行睇多	隻1765都賺多幾成 升左少少就跌凸 支股即係支股  🤷 ",
    "密切留意鋼鐵股，581中國東方，323馬鋼，347鞍鋼，1053重鋼，2600中鋁！",
    "入左中女 (308)  升四成，放定加馬	all in pk 左,起唔翻身  加一注 算, 3元走人!  贏咗購買 tesla 電車  輸咗就pk",
    "各位愛國人仕請支持883號中海油	有你地支持 相信中海油股價今日可以超英趕美  節節上升 多謝各位支持  883 號中海油 油中矛台 😎 ",
    "1727 終於止跌。回升中。	正常今日會反彈既  問題係彈完會企得穩收市嗎?  見佢買貨多過出貨  但都係俾人㩒住升5上去",
    "5號仔大笨象有冇機會企番起身	放心,我信佢會升上去 Ps: 33文入咗貨",
    "981中芯國際聽日爆升！淨利潤超預期，第二季度淨利潤1.38億美元，市場預期盈利9519萬美元，舊年同期盈利1854萬美元，同比增長6.44倍	35應該無問題 今月上返40蚊 GOGO",
]

# def replace_zh_punctuation(sentence):
#   str = sentence
#   str = str.replace("（", "(")
#   str = str.replace("）", ")")
#   return(str)

# Run every sample through the full pipeline and render its entities inline,
# with a blank line between samples.
for text in sentences:
  doc = nlp(text)
  displacy.render(doc, style="ent", jupyter=True)
  print()


























