<a href="https://colab.research.google.com/github/raymondwcs/learning_spacy/blob/main/spaCy_NER_training%2C_Matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This example demonstrates how to use rule-augmented, model-based NER to locate stock codes, market direction indicators, and market opinions/advice.

In [None]:
# Install spaCy, download the large Chinese pipeline, and clone the companion
# repository (it provides the pkuseg model directory loaded later on).
!pip install --quiet -U spacy
!python -m spacy download zh_core_web_lg
# Clone over HTTPS instead of plain HTTP so the transfer is not sent in clear text.
!git clone https://github.com/raymondwcs/learning_spacy

[K     |████████████████████████████████| 5.9 MB 5.1 MB/s 
[K     |████████████████████████████████| 456 kB 47.8 MB/s 
[K     |████████████████████████████████| 42 kB 1.3 MB/s 
[K     |████████████████████████████████| 623 kB 59.9 MB/s 
[K     |████████████████████████████████| 10.1 MB 49.5 MB/s 
[?25hCollecting zh-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_lg-3.1.0/zh_core_web_lg-3.1.0-py3-none-any.whl (603.8 MB)
[K     |████████████████████████████████| 603.8 MB 8.6 kB/s 
Collecting spacy-pkuseg<0.1.0,>=0.0.27
  Downloading spacy_pkuseg-0.0.28-cp37-cp37m-manylinux2014_x86_64.whl (2.4 MB)
[K     |████████████████████████████████| 2.4 MB 4.9 MB/s 
Installing collected packages: spacy-pkuseg, zh-core-web-lg
Successfully installed spacy-pkuseg-0.0.28 zh-core-web-lg-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_lg')
Cloning into 'learning_spacy'...
rem

In [None]:
import spacy
import random
from spacy.training.example import Example
from spacy import displacy
import re


In [None]:
# Load the pretrained Chinese pipeline, then initialise its tokenizer with the
# pkuseg model directory that comes from the cloned learning_spacy repository.
nlp = spacy.load('zh_core_web_lg')
nlp.tokenizer.initialize(pkuseg_model="./learning_spacy/spacy_pkuseg/models")

# Part 1 - Train model-based NER

## Data for training the model-based NER
Training can also be done via CLI.  Details below.

https://github.com/raymondwcs/learning_spacy/tree/main/NER_Training_CLI

In [None]:
# Training examples as (text, annotations) pairs. Every entity is a
# (start_char, end_char, label) triple over the raw text with an exclusive
# end offset, e.g. text[26:29] == "386" in the first example.
TRAIN_DATA = [
    ("一齊係國企成份股調整期間 大家一齊不問價掃貨 尤其是386 857 成份股佔指數比重只係5%左右 冇咩受新計法影響 挾死班歐美鬼佬 港股10月29000 遠必挾之", 
        {"entities": [(26,29,"STOCK"),(30,33,"STOCK")]}),
    ("高盛：維持對騰訊(0700)買入評級 目標價705港元高盛發表報告指,與騰訊管理層於路演活動溝通後,重申對騰訊嘅積極正面睇法。",
        {"entities": [(6,8,"STOCK")]}),
    ("中電、匯控、恒大8月暴升8.5%。",
        {'entities': [(0,2,"STOCK"),(3,5,"STOCK"),(6,8,"STOCK")]}),
    ("匯控(00005)將於下周一（2日）公布2021年度中期業績。",
        {'entities': [(0,2,"STOCK")]}),
    ("中電，匯控，港交所齊齊跌8.5%金融海嘯後最差。",
        {"entities": [(0,2,"STOCK"),(3,5,"STOCK"),(6,9,"STOCK")]}),    
    ("港燈第二季將恢復派息",
        {"entities": [(0,2,"STOCK")]})
]

In [None]:
ner = nlp.get_pipe("ner")

# Register every entity label used in the training annotations with the
# NER component before training starts.
for _text, annots in TRAIN_DATA:
    for span_annotation in annots.get("entities"):
        label = span_annotation[2]
        ner.add_label(label)

## Start training

In [None]:
epoch = 50  # number of passes over TRAIN_DATA
optimizer = nlp.resume_training()

# Names of every pipe except the NER, so that they can be switched off and
# only the NER component is updated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# `nlp.disable_pipes` is deprecated in spaCy v3; `select_pipes` is the
# supported replacement and restores the pipes when the block exits.
with nlp.select_pipes(disable=other_pipes):
    for itn in range(epoch):
        # Shuffles TRAIN_DATA in place so each pass sees a different order.
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch up the examples using spaCy's minibatch helper.
        for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
            # Tokenise only (make_doc) and attach the gold annotations.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
        print("Losses", losses)

Losses {'ner': 29.475952283754637}
Losses {'ner': 20.49263261895309}
Losses {'ner': 18.066602039087297}
Losses {'ner': 15.300575005851854}
Losses {'ner': 13.200043526762379}
Losses {'ner': 11.501417354820413}
Losses {'ner': 10.186244458202992}
Losses {'ner': 5.4004701430655455}
Losses {'ner': 3.329468723480346}
Losses {'ner': 1.6900578168594693}
Losses {'ner': 1.4545811325998617}
Losses {'ner': 0.6591259214736938}
Losses {'ner': 0.3143478770694165}
Losses {'ner': 0.0019418074176596302}
Losses {'ner': 0.006290662456834585}
Losses {'ner': 0.8575361679093183}
Losses {'ner': 0.0024653552502066065}
Losses {'ner': 0.0005322046474905367}
Losses {'ner': 3.8895482163023655e-06}
Losses {'ner': 0.0013032431039422853}
Losses {'ner': 1.254021405667432e-05}
Losses {'ner': 0.003687029802378146}
Losses {'ner': 1.3898287639669315e-08}
Losses {'ner': 1.1475893236894077e-08}
Losses {'ner': 5.319469170986467e-06}
Losses {'ner': 1.3855095536756493e-08}
Losses {'ner': 8.459197387575875e-08}
Losses {'ner': 3

## Debug

In [None]:
# Report every annotated character span that does not line up with token
# boundaries. Each text is tokenised once with `make_doc` (the same call the
# training loop uses to build its docs) instead of running the whole pipeline
# again for every single entity.
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    for ent in annotations.get("entities"):
        # char_span returns None when start/end do not map onto token boundaries.
        if doc.char_span(ent[0], ent[1]) is None:
            print("Misaligned tokens", text, ent)

## Save trained model to disk

In [None]:
# Serialise the current pipeline (including the NER updated above) to ./ner_model.
nlp.to_disk('./ner_model')

# Part 2

## Load trained model from disk

In [30]:
# nlp = spacy.load('./ner_model')
# NOTE(review): loading the model saved in Part 1 is disabled above, so the
# rest of the notebook runs on the stock zh_core_web_lg pipeline; STOCK
# entities shown later therefore appear to come from the Matcher rules rather
# than from the fine-tuned NER - confirm this is intended.
nlp = spacy.load('zh_core_web_lg')
nlp.tokenizer.initialize(pkuseg_model="./learning_spacy/spacy_pkuseg/models")
# Add domain terms to the pkuseg user dictionary - presumably so the segmenter
# keeps each of them as one token, which the single-token Matcher patterns
# below rely on; TODO confirm.
nlp.tokenizer.pkuseg_update_user_dict(['中長線投資','派息','低開','高開','股份回購','每股盈利','恒生科技指數','入咗貨'])

## Define Matcher Rules

In [31]:
# Fetch the stock-code table from Google Drive; per the recorded cell output
# it is saved as stockcodes.tsv in the working directory.
!gdown --id 1eo5O6h69_cK09oeYoy9nHp3X8OJ0hKuF

Downloading...
From: https://drive.google.com/uc?id=1eo5O6h69_cK09oeYoy9nHp3X8OJ0hKuF
To: /content/stockcodes.tsv
  0% 0.00/46.7k [00:00<?, ?B/s]100% 46.7k/46.7k [00:00<00:00, 64.5MB/s]


In [32]:
# Namespaced import instead of `from pandas import *`, which would dump every
# pandas name into the notebook namespace.
import pandas as pd

# https://docs.google.com/spreadsheets/d/1-btKGkOo_ywPeH8qTlJL0IKF1bxeaPEKTVhAOPeYAU8/edit?usp=sharing
# Parse with pandas' type inference first and cast to str afterwards.
# NOTE(review): reading with dtype=str was tried and disabled by the author,
# presumably so that numeric codes lose any zero padding ('2', '3', ...) -
# confirm against the TSV.
data = pd.read_csv("stockcodes.tsv", delimiter="\t")
data = data.astype(str)

stockcodes = data.iloc[:, 0].to_list()   # STOCK  (first column)
orgs = data.iloc[:, 1].to_list()         # ORG-S  (second column)

org_alias = ['5號仔','大笨象','港鐡','中海油', '中女']     # ORG-S-A

print(stockcodes[:10])
print(orgs[:10])
print(org_alias[:10])

['2', '3', '4', '5', '6', '7', '8', '9', '10', '11']
['中電控股', '中華煤氣', '九龍倉集團', '滙豐控股', '電能實業', '凱富能源', '電訊盈科', '九號通運', '恒隆集團', '恒生銀行']
['5號仔', '大笨象', '港鐡', '中海油', '中女']


In [33]:
# Single-token phrases treated as market-direction indicators. Each entry is
# listed once; the original list repeated '跌近' and '止跌'.
mkt_directions = [
  # falling / downward
  '跌','跌到','大跌','狂跌','跌近','累跌','跌過','跌番','點跌','先跌','急跌','向下',
  # rising / upward
  '爆升','狂升','勁升','彈升','倒升','驟升','倍升','竄升','暴升','大升','即升','趨升','快升','續升','累升','跳升',
  # rising, recovery or holding up
  '會升','上升','超預期','上返','增長','回升','止跌','反彈','企得穩']

# Single-token phrases treated as an opinion or piece of advice.
opinions_advices = ['買入','入咗貨','中長線投資']

pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍","😎"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒","🤷"]  # Negative emoji

In [35]:
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

# Custom Doc attributes filled in by the Matcher callbacks. force=True lets
# this cell be re-run without an "extension already exists" error.
Doc.set_extension("is_mkt_direction_bearing", default=False, force=True)    # ultimately ['up','down','none']
Doc.set_extension("is_opinion_advice_bearing", default=False, force=True)   # ultimately ['buy','sell','hold']
Doc.set_extension("sentiment", default="neu", force=True)
matcher = Matcher(nlp.vocab)

# One single-token pattern per emoji.
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

# Bare stock codes from the table, and any digits wrapped in ASCII parentheses.
# Raw strings keep "\d" a regex escape instead of an invalid string escape.
stockcode_patterns_01 = [[{"ORTH": stockcode}] for stockcode in stockcodes]
stockcode_patterns_02 = [[{"ORTH": "("}, {"TEXT": {"REGEX": r"\d+"}}, {"ORTH": ")"}]]

# "<digits>.HK", either split into three tokens or kept as a single token.
official_stockcode_patterns_01 = [[{"IS_DIGIT": True}, {"ORTH": "."}, {"ORTH": "HK"}]]
official_stockcode_patterns_02 = [[{"TEXT": {"REGEX": r"\d+\.HK"}}]]

mkt_direction_patterns_01 = [[{"TEXT": mkt_dir}] for mkt_dir in mkt_directions]

opinion_advice_patterns_01 = [[{"ORTH": opn_adv}] for opn_adv in opinions_advices]
# Case-insensitive English slang; attribute keys written uniformly in upper case.
opinion_advice_patterns_02 = [[{"LOWER": "all"}, {"LOWER": "in"}], [{"LOWER": "gogo"}], [{"LOWER": "gogogo"}]]

org_patterns   = [[{"ORTH": org}] for org in orgs]
org_a_patterns = [[{"ORTH": org}] for org in org_alias]

# Function to label doc._.sentiment
def label_sentiment(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: derive the doc sentiment from emoji matches.

    The score is recomputed from the complete ``matches`` list (+1 for each
    "sentiment-pos" match, -1 for each "sentiment-neg" match). The previous
    version reset the score to 0 on every call, so only the last emoji match
    decided the outcome. Recomputing from ``matches`` also makes the callback
    idempotent, no matter how many times it fires for one doc.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.sentiment`` receives the
            numeric score and ``doc._.sentiment`` one of "pos", "neg", "neu".
        i: Index of the current match in ``matches`` (unused, since the whole
            list is tallied).
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    score = 0
    for match_id, _start, _end in matches:
        rule_name = doc.vocab.strings[match_id]
        if rule_name == "sentiment-pos":
            score += 1
        elif rule_name == "sentiment-neg":
            score -= 1

    doc.sentiment = score
    if score > 0:
        doc._.sentiment = "pos"
    elif score < 0:
        doc._.sentiment = "neg"
    else:
        doc._.sentiment = "neu"

# Function to label stockcode
def label_stockcode(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag a stock-code candidate as ``STOCK``.

    A candidate is rejected (``doc.ents`` left untouched) when its own text
    contains a Chinese numeral character or one of '蚊' / '號', or when the
    token right after it contains one of '美', '元', '文', '蚊'.

    Entities are replaced by token position. The previous substring test on
    entity text also discarded entities elsewhere in the doc that merely had
    related text (for example an earlier mention of the same code), and it
    could keep an entity that really overlapped the match.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    candidate = doc[start:end]
    following = doc[end:end + 1]

    looks_like_amount = (
        re.search(r'[一二三四十五六七八九十零百千萬億壹貳參叄肆伍陸柒捌玖拾佰仟]+', candidate.text)
        or re.search(r'[蚊號]', candidate.text)
        or re.search(r'[美元文蚊]', following.text)
    )
    if looks_like_amount:
        return

    # Keep only entities whose token range does not intersect [start, end).
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="STOCK")]

def label_official_stockcode(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag a "<digits>.HK" match as ``STOCK``.

    Entities whose token range intersects the match are dropped before the
    new span is added. The previous version appended the span unconditionally;
    spaCy rejects overlapping spans in ``doc.ents``, so that assignment would
    fail whenever the match overlapped an entity already present.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="STOCK")]

def label_org(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag a listed company name as ``ORG-S``.

    Entities whose token range intersects the match are replaced by the new
    span. The previous version compared entity *text* instead, so a second
    mention of the same name removed the entity created for the first one.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="ORG-S")]

def label_org_a(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag a company alias as ``ORG-A``.

    Entities whose token range intersects the match are replaced by the new
    span. The previous version compared entity *text* instead, so a second
    mention of the same alias removed the entity created for the first one.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="ORG-A")]

def label_mkt_direction(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag a market-direction word as ``MKT-DIR``.

    Entities whose token range intersects the match are replaced by the new
    span, and ``doc._.is_mkt_direction_bearing`` is set to True. The previous
    version compared entity *text* instead, so a repeated direction word
    removed the entity created for its earlier occurrence.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="MKT-DIR")]
    # if context is STOCK or MARKET then:
    doc._.is_mkt_direction_bearing = True

def label_opinion_advice(matcher, doc, i, matches):
    """Matcher ``on_match`` callback: tag an opinion/advice phrase as ``OPN-ADV``.

    Entities whose token range intersects the match are replaced by the new
    span, and ``doc._.is_opinion_advice_bearing`` is set to True. The previous
    version compared entity *text* instead, so a repeated phrase removed the
    entity created for its earlier occurrence, and an entity that overlapped
    only part of a multi-token match (such as "all in") was kept.

    Args:
        matcher: The matcher that fired (unused).
        doc: The document being annotated; ``doc.ents`` is rewritten.
        i: Index of the current match in ``matches``.
        matches: All ``(match_id, start, end)`` triples found for ``doc``.
    """
    match_id, start, end = matches[i]
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [Span(doc, start, end, label="OPN-ADV")]
    # if context is STOCK or MARKET then:
    doc._.is_opinion_advice_bearing = True


In [36]:
# Rule table: (rule key, token patterns, on_match callback), registered in
# this order. A key that appears twice contributes two pattern groups.
_MATCHER_RULES = (
    ("sentiment-pos", pos_patterns, label_sentiment),
    ("sentiment-neg", neg_patterns, label_sentiment),
    ("stockcode", stockcode_patterns_01, label_stockcode),
    ("stockcode", stockcode_patterns_02, label_stockcode),
    ("official-stockcode", official_stockcode_patterns_01, label_official_stockcode),
    ("official-stockcode", official_stockcode_patterns_02, label_official_stockcode),
    ("mkt-direction", mkt_direction_patterns_01, label_mkt_direction),
    ("opinion-advice", opinion_advice_patterns_01, label_opinion_advice),
    ("opinion-advice", opinion_advice_patterns_02, label_opinion_advice),
    ("org", org_patterns, label_org),
    ("org-a", org_a_patterns, label_org_a),
)

for rule_key, rule_patterns, rule_callback in _MATCHER_RULES:
    matcher.add(rule_key, rule_patterns, on_match=rule_callback)

# Test the trained NER model and Matcher

In [38]:
# Sample forum-style posts used to exercise the pipeline and the Matcher
# rules. Several strings contain a literal tab character and emoji.
sentences = [
  "睇好港鐡(0066.HK)強烈建議買入 😎",
  "內地疫情受控,旅遊相關股同程藝龍(780)最近區間上落橫行,大戶收集似近完成,中線支持位$13.83,可考慮作中長線投資。",
  "睿見教育（6068.HK）：凈利與現金流大增超四成，估值吸引力凸顯，獲大行睇多	隻1765都賺多幾成 升左少少就跌凸 支股即係支股  🤷 ",
  "密切留意鋼鐵股，581中國東方，323馬鋼，347鞍鋼，1053重鋼，2600中鋁！",
  '入左中女 (308)  升四成，放定加馬	all in pk 左,起唔翻身  加一注 算, 3元走人!  贏咗購買 tesla 電車  輸咗就pk',
  '各位愛國人仕請支持883號中海油	有你地支持 相信中海油股價今日可以超英趕美  節節上升 多謝各位支持  883 號中海油 油中矛台 😎 ',
  '1727 終於止跌。回升中。	正常今日會反彈既  問題係彈完會企得穩收市嗎?  見佢買貨多過出貨  但都係俾人㩒住升5上去',
  '5號仔大笨象有冇機會企番起身	放心,我信佢會升上去 Ps: 33文入咗貨',
  '981中芯國際聽日爆升！淨利潤超預期，第二季度淨利潤1.38億美元，市場預期盈利9519萬美元，舊年同期盈利1854萬美元，同比增長6.44倍	35應該無問題 今月上返40蚊 GOGO'
]

# def replace_zh_punctuation(sentence):
#   str = sentence
#   str = str.replace("（", "(")
#   str = str.replace("）", ")")
#   return(str)

# Run each sample through the pipeline, apply the Matcher (its callbacks
# rewrite doc.ents and the custom attributes), then show the entities and flags.
for sentence in sentences:
    # sentence = replace_zh_punctuation(sentence)
    doc = nlp(sentence)
    matches = matcher(doc)
    displacy.render(doc, style='ent', jupyter=True)
    flags = doc._
    print(f'is_mkt_direction_bearing: {flags.is_mkt_direction_bearing}')
    print(f'is_opinion_advice_bearing: {flags.is_opinion_advice_bearing}')
    print(f'sentiment: {flags.sentiment}')


is_mkt_direction_bearing: False
is_opinion_advice_bearing: True
sentiment: pos


is_mkt_direction_bearing: False
is_opinion_advice_bearing: True
sentiment: neu


is_mkt_direction_bearing: False
is_opinion_advice_bearing: False
sentiment: neg


is_mkt_direction_bearing: False
is_opinion_advice_bearing: False
sentiment: neu


is_mkt_direction_bearing: False
is_opinion_advice_bearing: True
sentiment: neu


is_mkt_direction_bearing: False
is_opinion_advice_bearing: False
sentiment: pos


is_mkt_direction_bearing: True
is_opinion_advice_bearing: False
sentiment: neu


is_mkt_direction_bearing: False
is_opinion_advice_bearing: True
sentiment: neu


is_mkt_direction_bearing: True
is_opinion_advice_bearing: True
sentiment: neu
