<a href="https://colab.research.google.com/github/ssm951/chinese-genealogy/blob/main/Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Experimenting

## SpaCy (as baseline)

In [1]:
!pip install -q spacy
!python -m spacy download zh_core_web_trf


Collecting zh-core-web-trf==3.7.2
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_trf-3.7.2/zh_core_web_trf-3.7.2-py3-none-any.whl (415.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m415.1/415.1 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from zh-core-web-trf==3.7.2)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting spacy-pkuseg<0.1.0,>=0.0.27 (from zh-core-web-trf==3.7.2)
  Downloading spacy_pkuseg-0.0.33-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->zh-core-web-trf==3.7.2)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->zh-core-web-trf==3.7.2)
  Downloading curated_tokenizers-0

In [2]:

import spacy
spacy.require_cpu()
print(spacy.__version__)
nlp = spacy.load("zh_core_web_trf")


3.7.5


In [3]:
doc = nlp("我叫孟庆延。你叫什么名字？")
for token in doc:
    print(token.text,token.ent_iob_, token.ent_type_)

我 O 
叫 O 
孟庆延 B PERSON
。 O 
你 O 
叫 O 
什么 O 
名字 O 
？ O 


## Load Chinese-Literature-NER-RE-Dataset

In [4]:
!git clone https://github.com/lancopku/Chinese-Literature-NER-RE-Dataset.git

Cloning into 'Chinese-Literature-NER-RE-Dataset'...
remote: Enumerating objects: 1733, done.[K
remote: Total 1733 (delta 0), reused 0 (delta 0), pack-reused 1733[K
Receiving objects: 100% (1733/1733), 7.30 MiB | 9.52 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [80]:
from spacy.training import Example
from spacy.tokens import Span

def re_label_to_spacy(label):
  split = label.split('_')
  if len(split) == 1:
    return 'O'
  split[1] = split[1].replace('\n','')
  if split[1] == 'Thing':
    return split[0] + '-PRODUCT'
  if split[1] == 'Person':
    return split[0] + '-PERSON'
  if split[1] == 'Location':
    return split[0] + '-LOC'
  if split[1] == 'Time':
    return split[0] + '-TIME'
  if split[1] == 'Metric':
    return split[0] + '-QUANTITY'
  if split[1] == 'Organization':
    return split[0] + '-ORG'
  if split[1] == 'Abstract':
    return split[0] + '-WORK_OF_ART'
  print('Failed to parse', label)
  return 'O'

dataset_path = "/content/Chinese-Literature-NER-RE-Dataset/ner/test.txt"
re_dataset = []
example_dataset = []
text = ""
entities = []
count = 0
with open(dataset_path) as my_file:
  # Read line by line
  for line in my_file:
    split = line.split(' ')
    if len(split) == 1: # sentence complete, creating an example from it
      reference = nlp.make_doc(text)
      converted_ents = []
      start_index = 0
      curr_ent = ''
      for i, token in enumerate(reference):
        start_i = token.idx
        if entities[start_i] == 'O' and curr_ent != "":
          converted_ents.append(Span(reference, start_index, i, label=curr_ent))
          curr_ent = ""
        if entities[start_i].startswith('B'):
          if (curr_ent != ""):
            converted_ents.append(Span(reference, start_index, i, label=curr_ent))
          curr_ent = entities[start_i].split('-')[1]
          start_index = i
      if (curr_ent != ""):
        converted_ents.append(Span(reference, start_index, i, label=curr_ent))
      print(converted_ents)
      reference.ents = converted_ents
      predicted = nlp(text)
      re_dataset.append((text, {"entities": entities}))
      example_dataset.append(Example(predicted,reference))

      text = ""
      entities = []
      count += 1
      if count % 1000 == 0:
        print(count)
    else:
      text += split[0]
      entities.append(re_label_to_spacy(split[1]))



[清明, 人们, 先人, 日子]
[宋代, 诗人高翥, 南北山头, 墓田, 清明]
[纸灰飞作, 白蝴蝶, 泪血, 红杜鹃]
[清明之时, 母亲]
[母亲, 孙名讳秋兰, 我, 陈毅, 幽兰, 山谷, 人识]
[母亲, 一枝, 幽兰]
[穷人, 孩子, 母亲, 11岁时, 姥姥, 16岁出嫁后, 几年, 奶奶]
[母亲]
[母亲, 老人, 晚辈, 村里, 口碑]
[我, 小的时候, 家里, 每天晚上, 母亲, 油灯, 破衣服, 我, 母亲, 母亲, 我, 时候, 母亲, 山里, 我们, 早饭]
[人民公社时期, 农村, 口粮, 我家兄妹]
[有一年, 生产队分, 小麦, 母亲, 人, 家, 公社, 父亲, 小麦, 晚上, 小麦款]
[母亲, 明天, 钱]
[老共产党员, 大队干部, 我, 叔叔, 小麦, 生产队]
[母亲, 她, 叔叔, 我们, 叔叔]
[每到年终, 生产队, 我家]
[母亲, 别人, 钱, 几个, 有钱的]
[我家, 邻居, 地主, 我父亲, 叔叔, 党员, 干部, 年代, 我家, 家庭]
[后来, 我, 父亲, 母亲, 他们, 钱]
[我服兵役, 西北金昌, 父母]
[父亲, 他, 我, 泪水]
[母亲, 共产党, 领导, 那里, 黄土, 我, 母亲, 西北, 戈壁大漠, 金昌成, 我, 第二故乡]
[母亲, 子女们, 自己, 母亲]
[我, 母亲, 女防老难]
[五虎两凤, 我兄弟5人姐妹2人, 母亲, 1996年, 我们]
[母亲, 三弟, 我他, 母亲, 头发, 我, 母亲, 我, 青丝白发胜]
[万千, 胸中栖]
[帝迎佛骨招人, 棒捧花, 母亲]
[瑶池]
[母亲]
[两年, 父亲, 母亲, 我们]
[父亲, 次年清明, 我, 父母, 金昌, 时, 火车, 车上, 音响, 我]
[两眼泪花滴]
[双亲]
[面窗, 清泪]
[黄土埋白骨]
[爹娘, 家, 孩崽娇]
[兄弟姐妹情, 父母老]
[瑶池]
[癸巳清明, 我, 父母]
[我, 父母, 儿女]
[朋友, 家乡, 晚霞湖, 晚霞湖, 女子, 我]
[晚霞湖, 环湖路, 格桑花, 晚霞湖, 东、南, 丘陵, 湖面, 黄土高原上]
[湖, 海]
[水坝, 水面, 湖面, 游舟, 水鸟]
[晚霞湖象, 宝镜, 湖边, 谢庄, 青沟, 姬尧, 阳坡, 麻庄, 新农村, 葵花,, 水坝, 宝镜

In [81]:
from spacy.scorer import Scorer


scorer = Scorer()
scorer.score(example_dataset)

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': 1.0,
 'sents_r': 1.0,
 'sents_f': 1.0,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': 0.17616261164151525,
 'ents_r': 0.03826342899190582,
 'ents_f': 0.06287096065069246,
 'ents_per_type': {'TIME': {'p': 0.7014925373134329,
   'r': 0.038180341186027617,
   'f': 0.07241910631741139},
  'PERSON': {'p': 0.41625615763546797,
   'r': 0.036149732620320854,
   'f': 0.06652233812241685},
  'DATE': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'LOC': {'p': 0.5154394299287411,
   'r': 0.0822592873388931,
   'f': 0.14187643020594964},
  'PRODUCT': {'p': 0.6666666666666666,
   'r': 0.0011372251705837756,
   'f': 0.002270577105014191},
  'QUANTITY': {'p': 0.6542056074766355,
   'r': 0.11608623548922056,
   'f': 0.1971830985915493},
  'ORG': {'p': 0.2479

## Load custom dataset

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

In [None]:
filepath = "/content/drive/MyDrive/Meng_FamilySearch/cleaned/"
dirs = os.listdir(filepath)
words = []
labels = []
for dir in dirs:
  with open(filepath + dir) as my_file:
    data_array = my_file.readlines()
    words += list(data_array[0])[:-1] # Remove new line
    labels += list(data_array[1]) # second line are labels

sample_data = "".join(words)

In [None]:
sample_data

In [None]:
doc = nlp(sample_data)

In [None]:
prev_ent = False
for token in doc:
  if (token.ent_type_):
    if not prev_ent:
      print()
    prev_ent = True
    print(token.text,f'{token.ent_iob_}-{token.ent_type_}', spacy.explain(token.ent_type_))
  else:
    prev_ent = False
    print(token.text, end='')

## Evaluate Spacy model on custom dataset


In [None]:
from spacy.scorer import Scorer

scorer = Scorer()

examples = []
scorer = Scorer()
for example in re_dataset:
    example.predicted = nlp(str(example.predicted))
    examples.append(example)
scores = scorer.score(examples)

In [None]:
str(re_dataset[0].predicted)

'清明是人们祭扫先人，怀念追思的日子。'

In [None]:
x = nlp(str(re_dataset[0].predicted))

In [None]:

for example in re_dataset:
  doc = nlp(str(re_dataset[0].predicted))
  for token in doc:
    if (token.ent_type_):
      print(doc)
      print(token.text, token.ent_iob_, token.ent_type_)
      continue

In [None]:
example = Example.from_dict(re_dataset[0][0], {"entities": re_dataset[0][1]})


In [7]:
def debug_doc(doc):
  for token in doc:
    print(token.text, token.ent_iob_, token.ent_type_)