<a href="https://colab.research.google.com/github/raymondwcs/learning_spacy/blob/main/spaCy_NER_training%2C_Entity_Ruler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This example demonstrates how to use rule-augmented, model-based NER to locate stock codes.

In [1]:
!pip install --quiet -U spacy
!python -m spacy download zh_core_web_lg
!git clone http://github.com/raymondwcs/learning_spacy

[K     |████████████████████████████████| 5.9 MB 5.2 MB/s 
[K     |████████████████████████████████| 456 kB 47.3 MB/s 
[K     |████████████████████████████████| 623 kB 39.2 MB/s 
[K     |████████████████████████████████| 42 kB 683 kB/s 
[K     |████████████████████████████████| 10.1 MB 55.8 MB/s 
[?25hCollecting zh-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.1.0/zh_core_web_lg-3.1.0-py3-none-any.whl (603.8 MB)
[K     |████████████████████████████████| 603.8 MB 8.4 kB/s 
[?25hCollecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp37-cp37m-manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 4.8 MB/s 
Installing collected packages: spacy-pkuseg, zh-core-web-lg
Successfully installed spacy-pkuseg-0.0.28 zh-core-web-lg-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_lg')
Cloning into 'learning_spacy'.

In [2]:
import spacy
import random
from spacy.training.example import Example
from spacy import displacy
import re


In [10]:
# Load the pretrained Chinese pipeline by package name.
nlp = spacy.load("zh_core_web_lg")

# Initialize its tokenizer with the pkuseg model files from the cloned learning_spacy repo.
nlp.tokenizer.initialize(
    pkuseg_model="./learning_spacy/spacy_pkuseg/models",
)

# Part 1 - Train model-based NER

## Data for training the model-based NER
Training can also be done via the spaCy CLI; see the link below for details.

https://github.com/raymondwcs/learning_spacy/tree/main/NER_Training_CLI

In [4]:
# Hand-labelled examples used to fine-tune the NER component.
# Each item is (text, {"entities": [(start_char, end_char, label), ...]}); the offsets are
# character positions into `text`, with an exclusive end. The trailing comment on each
# annotation shows the substring(s) the offsets select.
TRAIN_DATA = [
    (
        "一齊係國企成份股調整期間 大家一齊不問價掃貨 尤其是386 857 成份股佔指數比重只係5%左右 冇咩受新計法影響 挾死班歐美鬼佬 港股10月29000 遠必挾之",
        {"entities": [(26, 29, "STOCK"), (30, 33, "STOCK")]},  # "386", "857"
    ),
    (
        "高盛：維持對騰訊(0700)買入評級 目標價705港元高盛發表報告指,與騰訊管理層於路演活動溝通後,重申對騰訊嘅積極正面睇法。",
        {"entities": [(6, 8, "STOCK")]},  # "騰訊"
    ),
    (
        "中電、匯控、恒大8月暴升8.5%。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 8, "STOCK")]},  # "中電", "匯控", "恒大"
    ),
    (
        "匯控(00005)將於下周一（2日）公布2021年度中期業績。",
        {"entities": [(0, 2, "STOCK")]},  # "匯控"
    ),
    (
        "中電，匯控，港交所齊齊跌8.5%金融海嘯後最差。",
        {"entities": [(0, 2, "STOCK"), (3, 5, "STOCK"), (6, 9, "STOCK")]},  # "中電", "匯控", "港交所"
    ),
    (
        "港燈將恢復派息",
        {"entities": [(0, 2, "STOCK")]},  # "港燈"
    ),
]

In [11]:
ner = nlp.get_pipe("ner")

# Register every label that appears in the training annotations with the NER pipe.
for _text, annotation in TRAIN_DATA:
    for _start, _end, label in annotation.get("entities"):
        ner.add_label(label)

## Start training

In [12]:
epoch = 50  # number of passes over TRAIN_DATA
optimizer = nlp.resume_training()

# Update only the NER component. `select_pipes` is the spaCy v3 replacement for the
# deprecated `disable_pipes`; used as a context manager it restores the other
# components when the block exits.
with nlp.select_pipes(enable="ner"):
    for itn in range(epoch):
        # Shuffle a copy so the module-level TRAIN_DATA keeps its original order.
        shuffled = random.sample(TRAIN_DATA, k=len(TRAIN_DATA))
        losses = {}
        # Batch up the examples using spaCy's minibatch helper.
        for batch in spacy.util.minibatch(shuffled, size=2):
            # nlp.update expects Example objects built from an unannotated Doc
            # plus the gold-standard annotation dict.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in batch
            ]
            nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
        print("Losses", losses)

Losses {'ner': 26.348347192643548}
Losses {'ner': 18.208515847694656}
Losses {'ner': 18.274711830580735}
Losses {'ner': 16.577816989467287}
Losses {'ner': 12.833680604567826}
Losses {'ner': 13.161493444192011}
Losses {'ner': 11.789627826497199}
Losses {'ner': 7.014239586074808}
Losses {'ner': 3.5087354394732184}
Losses {'ner': 2.213450825725637}
Losses {'ner': 2.327670356527099}
Losses {'ner': 0.5413397776200213}
Losses {'ner': 0.6226636060465666}
Losses {'ner': 0.03476697086379524}
Losses {'ner': 0.11581376217310267}
Losses {'ner': 0.013039198338745301}
Losses {'ner': 9.404973220487454e-05}
Losses {'ner': 0.0002882803263944925}
Losses {'ner': 1.110779170992249}
Losses {'ner': 3.812469084365352e-05}
Losses {'ner': 6.5898111146762e-06}
Losses {'ner': 3.6876273298927655e-05}
Losses {'ner': 2.785644535020505e-07}
Losses {'ner': 0.000633551674627712}
Losses {'ner': 5.646302526820085e-07}
Losses {'ner': 3.1954594609599857e-06}
Losses {'ner': 0.002813887832679257}
Losses {'ner': 2.5330005978

## Debug

In [13]:
# Sanity check for the training annotations: Doc.char_span returns None when the
# (start, end) character offsets do not fall on token boundaries, so any line printed
# here points at an annotation that does not line up with the tokenizer's output.
for text, annotations in TRAIN_DATA:
    # Only tokenization is needed, and it is the same for every entity of a text,
    # so build the Doc once per text instead of running the full pipeline per entity.
    doc = nlp.make_doc(text)
    for ent in annotations.get("entities"):
        char_span = doc.char_span(ent[0], ent[1])
        if char_span is None:  # start and end don't map to tokens
            print("Misaligned tokens", text, ent)

## Save trained model to disk

In [14]:
# Write the pipeline to ./ner_model so that Part 2 can load it back with spacy.load.
nlp.to_disk("./ner_model")

# Part 2

## Load trained model from disk

In [29]:
# Load the pipeline that was written to ./ner_model in Part 1.
# (Alternative kept for comparison: start from the stock package instead.)
# nlp = spacy.load('zh_core_web_lg')
nlp = spacy.load("./ner_model")

# Initialize the tokenizer with the pkuseg model files from the cloned repo.
nlp.tokenizer.initialize(
    pkuseg_model="./learning_spacy/spacy_pkuseg/models",
)

# Add domain terms to the pkuseg user dictionary
# (presumably so the segmenter keeps each of them as a single token - TODO confirm).
nlp.tokenizer.pkuseg_update_user_dict(
    [
        "中長線投資",
        "派息",
        "低開",
        "高開",
        "股份回購",
        "每股盈利",
        "恒生科技指數",
    ]
)

## Define custom rules for EntityRuler

In [30]:
!gdown --id 1eo5O6h69_cK09oeYoy9nHp3X8OJ0hKuF

Downloading...
From: https://drive.google.com/uc?id=1eo5O6h69_cK09oeYoy9nHp3X8OJ0hKuF
To: /content/stockcodes.tsv
  0% 0.00/46.7k [00:00<?, ?B/s]100% 46.7k/46.7k [00:00<00:00, 17.3MB/s]


In [31]:
import pandas as pd  # explicit namespace instead of `from pandas import *`

# Source spreadsheet:
# https://docs.google.com/spreadsheets/d/1-btKGkOo_ywPeH8qTlJL0IKF1bxeaPEKTVhAOPeYAU8/edit?usp=sharing
# Alternative left disabled by the author:
# data = pd.read_csv("stockcodes.tsv", delimiter="\t", dtype=str)
# The table is parsed with pandas' default dtypes and then every value is cast to str.
data = pd.read_csv("stockcodes.tsv", delimiter="\t").astype(str)

stockcodes = data.iloc[:, 0].to_list()   # first column  -> STOCK codes
orgs = data.iloc[:, 1].to_list()         # second column -> ORG-S (organisation names)

# Hand-maintained aliases, labelled ORG-S-A by the EntityRuler patterns.
org_alias = ['5號仔','大笨象','港鐡','中海油']     # ORG-S-A

# Preview the first few entries of each list.
print(stockcodes[:10])
print(orgs[:10])
print(org_alias[:10])

['2', '3', '4', '5', '6', '7', '8', '9', '10', '11']
['中電控股', '中華煤氣', '九龍倉集團', '滙豐控股', '電能實業', '凱富能源', '電訊盈科', '九號通運', '恒隆集團', '恒生銀行']
['5號仔', '大笨象', '港鐡', '中海油']


In [32]:
# Phrases matched by the MKT-DIR EntityRuler pattern.
mkt_directions = [
    "爆升",
    "會升",
    "上升",
    "升上去",
    "升5上去",
    "超預期",
    "上返",
    "增長",
    "回升",
    "止跌",
    "反彈",
    "企得穩",
]

# Phrases matched by the OPN-ADV EntityRuler pattern.
opinions_advices = [
    "買入",
    "中長線投資",
]

In [33]:
# Token patterns for the rule-based EntityRuler.
#   ORG-S / ORG-S-A : a single token equal to a listed organisation name / alias
#   STOCK           : digits followed by ".HK", either split into three tokens or as one token
#   STOCK2          : any all-digit token; a provisional label that the
#                     check_stockno_suffix component later turns into STOCK or drops
#   MKT-DIR/OPN-ADV : a single token equal to one of the listed phrases
patterns = [
    {"label": "ORG-S",  "pattern": [{'ORTH': {'IN': orgs}}]},
    {"label": "ORG-S-A","pattern": [{'ORTH': {'IN': org_alias}}]},
    {"label": "STOCK",  "pattern": [{"IS_DIGIT": True},{"ORTH":"."},{"ORTH": "HK"}]},
    # Raw string: "\d" and "\." are invalid escape sequences in a normal string literal.
    {"label": "STOCK",  "pattern": [{"TEXT": {"REGEX": r"\d+\.HK"}}]},
    # {"label": "STOCK2", "pattern": [{"ORTH": "(", "OP": "?"},{"IS_DIGIT": True},{"ORTH": ")","OP": "?"}]},
    # {"label": "STOCK2", "pattern": [{"ORTH": " (", "OP": "?"},{"IS_DIGIT": True},{"ORTH": "）","OP": "?"}]},
    # {"label": "STOCK2", "pattern": [{"IS_DIGIT": True},{"ENT_TYPE": "ORG-S"}]},
    # {"label": "STOCK2", "pattern": [{"IS_DIGIT": True},{"POS": "NOUN"}]},
    # {"label": "STOCK2", "pattern": [{"IS_DIGIT": True},{"POS": "PROPN"}]},
    {"label": "STOCK2", "pattern": [{"IS_DIGIT": True}]},
    # {"label": "STOCK3", "pattern": [{'TEXT': {'IN': stockcodes}}]},
    {"label": "MKT-DIR", "pattern": [{'TEXT': {'IN': mkt_directions}}]},
    {"label": "OPN-ADV", "pattern": [{'TEXT': {'IN': opinions_advices}}]},
    # The ORG-S / ORG-S-A entries used to be repeated here; the duplicates were removed.
]

# Drop a ruler left over from a previous run so the cell can be re-executed.
if "entity_ruler" in nlp.pipe_names:
  nlp.remove_pipe("entity_ruler")

# Place the ruler before the statistical NER component in the pipeline.
entity_ruler = nlp.add_pipe("entity_ruler", before='ner')
entity_ruler.add_patterns(patterns)

## Check stock code suffix

In [34]:
from spacy.language import Language
from spacy.tokens import Span

def print_ent_details(ents):
    """Print the given entities as a list of ``[text, label]`` pairs (debug helper)."""
    pairs = [[span.text, span.label_] for span in ents]
    print('doc.ents: {}'.format(pairs))

@Language.component("check_stockno_suffix")
def remove_stockno_suffix(doc):
    """Resolve provisional stock-number entities into STOCK entities.

    Entities labelled ``STOCK2`` or ``STOCK3`` are re-labelled ``STOCK`` unless one of
    these holds, in which case the entity is dropped:

    * the entity text contains a Chinese numeral character,
    * the entity text contains 蚊 or 號,
    * the token right after the entity contains 美, 元, 文 or 蚊
      (apparently currency markers, as in "3元" or "40蚊").

    Entities with any other label are kept unchanged.

    Args:
        doc: the spaCy Doc being processed; its ``ents`` are replaced in place.

    Returns:
        The same Doc, with ``doc.ents`` rewritten.
    """
    new_ents = []
    for ent in doc.ents:
        if ent.label_ not in ("STOCK2", "STOCK3"):
            new_ents.append(ent)
            continue

        # Text of the token that follows the candidate. A candidate that ends the
        # document has no following token; indexing past the end would raise
        # IndexError, so treat that case as "no suffix".
        next_text = doc[ent.end].text if ent.end < len(doc) else ""

        if  not re.search(r'[一二三四十五六七八九十零百千萬億壹貳參叄肆伍陸柒捌玖拾佰仟]+', ent.text) and \
            not re.search(r'[蚊號]', ent.text) and \
            not re.search(r'[美元文蚊]', next_text):
            new_ents.append(Span(doc, ent.start, ent.end, label="STOCK"))
        # Otherwise the candidate is omitted from the new entity list.

    doc.ents = new_ents
    return doc

# Make the cell re-runnable: remove a copy registered by an earlier run, then
# insert the component directly after the named entity recognizer.
if nlp.has_pipe("check_stockno_suffix"):
    nlp.remove_pipe("check_stockno_suffix")

nlp.add_pipe("check_stockno_suffix", after="ner")

<function __main__.remove_stockno_suffix>

# Test the trained NER model and EntityRuler

In [35]:
# Sample sentences for a visual check of the combined pipeline (EntityRuler + NER +
# check_stockno_suffix). They contain stock numbers in several surface forms: with an
# ".HK" suffix, inside half-width or full-width parentheses, as bare numbers, and
# followed by 號, next to other numbers such as prices ("$13.83", "3元", "33文", "40蚊").
# The strings contain tabs, emoji and trailing spaces, which are kept as they are.
sentences = [
  "睇好港鐡(0066.HK)強烈建議買入 😎",
  "內地疫情受控,旅遊相關股同程藝龍(780)最近區間上落橫行,大戶收集似近完成,中線支持位$13.83,可考慮作中長線投資。",
  "睿見教育（6068.HK）：凈利與現金流大增超四成，估值吸引力凸顯，獲大行睇多	隻1765都賺多幾成 升左少少就跌凸 支股即係支股  🤷 ",
  "密切留意鋼鐵股，581中國東方，323馬鋼，347鞍鋼，1053重鋼，2600中鋁！",
  '入左中女 (308)  升四成，放定加馬	all in pk 左,起唔翻身  加一注 算, 3元走人!  贏咗購買 tesla 電車  輸咗就pk',
  '各位愛國人仕請支持883號中海油	有你地支持 相信中海油股價今日可以超英趕美  節節上升 多謝各位支持  883 號中海油 油中矛台 😎 ',
  '1727 終於止跌。回升中。	正常今日會反彈既  問題係彈完會企得穩收市嗎?  見佢買貨多過出貨  但都係俾人㩒住升5上去',
  '5號仔大笨象有冇機會企番起身	放心,我信佢會升上去 Ps: 33文入咗貨',
  '981中芯國際聽日爆升！淨利潤超預期，第二季度淨利潤1.38億美元，市場預期盈利9519萬美元，舊年同期盈利1854萬美元，同比增長6.44倍	35應該無問題 今月上返40蚊 GOGO'
]

# def replace_zh_punctuation(sentence):
#   str = sentence
#   str = str.replace("（", "(")
#   str = str.replace("）", ")")
#   return(str)

# Run the full pipeline on each sample sentence and render its entities inline.
# (The disabled replace_zh_punctuation helper above could be applied to each
# sentence first to normalise full-width parentheses.)
for doc in map(nlp, sentences):
    displacy.render(doc, style="ent", jupyter=True)
    print()  # blank line between rendered sentences


























