<a href="https://colab.research.google.com/github/ssm951/chinese-genealogy/blob/main/Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Experiments

## spaCy (as baseline)

In [None]:
!pip install -q spacy
!python -m spacy download zh_core_web_trf


In [1]:

import spacy
# Run spaCy on the CPU.
spacy.require_cpu()
print(spacy.__version__)
# Chinese transformer pipeline; every later cell uses this `nlp` object.
nlp = spacy.load("zh_core_web_trf")


3.7.5


In [65]:
# Smoke test: parse a short two-sentence input and print, for each token,
# its text, IOB entity flag and entity type.
doc = nlp("我叫孟庆延。你叫什么名字？")
for token in doc:
    print(token.text,token.ent_iob_, token.ent_type_)

我 2 
叫 2 
孟庆延 3 PERSON
。 2 
你 2 
叫 2 
什么 2 
名字 2 
？ 2 


## Load Chinese-Literature-NER-RE-Dataset

In [4]:
!git clone https://github.com/lancopku/Chinese-Literature-NER-RE-Dataset.git

fatal: destination path 'Chinese-Literature-NER-RE-Dataset' already exists and is not an empty directory.


In [32]:
from spacy.training import Example

# Dataset entity type -> entity label used for the spaCy-style tags.
_RE_TYPE_TO_SPACY = {
  'Thing': 'PRODUCT',
  'Person': 'PERSON',
  'Location': 'LOC',
  'Time': 'TIME',
  'Metric': 'QUANTITY',
  'Organization': 'ORG',
  'Abstract': 'WORK_OF_ART',
}

def re_label_to_spacy(label):
  """Convert a dataset tag such as 'B_Person' into a tag such as 'B-PERSON'.

  Args:
    label: Raw tag text as read from the dataset file. It may still carry
      its line terminator or other surrounding whitespace.

  Returns:
    '<prefix>-<LABEL>' for a known entity type, where <prefix> is whatever
    precedes the first underscore. 'O' when the tag contains no underscore
    or when the entity type is unknown (a message is printed in the latter
    case).
  """
  # Strip all surrounding whitespace, not just '\n', so that files with
  # '\r\n' line endings or trailing spaces still map correctly.
  parts = label.strip().split('_')
  if len(parts) == 1:
    return 'O'
  spacy_type = _RE_TYPE_TO_SPACY.get(parts[1])
  if spacy_type is None:
    print('Failed to parse', label)
    return 'O'
  return parts[0] + '-' + spacy_type

dataset_path = "/content/Chinese-Literature-NER-RE-Dataset/ner/test.txt"
# Stop after this many sentence separators have been read.
MAX_SENTENCES = 100


def _collect_entity_spans(text, entities):
  """Group the characters of `text` into entity spans.

  Args:
    text: The sentence, one character per entry of `entities`.
    entities: One tag per character, as produced by `re_label_to_spacy`
      ('O', 'B-<LABEL>' or 'I-<LABEL>').

  Returns:
    `(words, tags)`: the text of every entity span and the tag of the
    character that opened it. Characters tagged 'O' are left out.
  """
  words = []
  tags = []
  span = ""
  span_tag = None
  for i, char in enumerate(text):
    tag = entities[i]
    prefix = tag.split('-')[0]
    if prefix == 'I' and span:
      # Continuation of the span that is currently open.
      span += char
      continue
    if span:
      # A 'B' or an 'O' closes the open span. Flushing here keeps an entity
      # that is immediately followed by another entity.
      words.append(span)
      tags.append(span_tag)
      span = ""
      span_tag = None
    if prefix in ('B', 'I'):
      # An 'I' with no open span starts a span under its own tag instead of
      # inheriting the tag of an earlier, unrelated entity.
      span = char
      span_tag = tag
  if span:
    # Flush a span that runs up to the end of the sentence.
    words.append(span)
    tags.append(span_tag)
  return words, tags


re_dataset = []
text = ""
entities = []
count = 0
# Decode explicitly instead of relying on the platform default encoding.
with open(dataset_path, encoding="utf-8") as my_file:
  # Read line by line
  for line in my_file:
    split = line.split(' ')
    if len(split) == 1:
      # A line without a space ends the current sentence.
      doc = nlp(text)
      # Keep the sentence only if the model predicted at least one entity.
      if any(token.ent_type_ for token in doc):
        tokenized, entity_of_tokenized = _collect_entity_spans(text, entities)
        re_dataset.append((doc, {"words": tokenized, "entities": entity_of_tokenized}))
      text = ""
      entities = []
      count += 1
      if count == MAX_SENTENCES:
        break
    else:
      # "<char> <tag>" line: extend the sentence and record the converted tag.
      text += split[0]
      entities.append(re_label_to_spacy(split[1]))



## Load custom dataset

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the custom dataset path used in the
# next cell lives under this mount point.
drive.mount('/content/drive')

In [None]:
filepath = "/content/drive/MyDrive/Meng_FamilySearch/cleaned/"
# os.listdir returns entries in arbitrary order; sort them so the
# concatenated text is the same on every run.
dirs = sorted(os.listdir(filepath))
words = []
labels = []
for filename in dirs:
  path = os.path.join(filepath, filename)
  if not os.path.isfile(path):
    # Skip sub-directories and other non-file entries.
    continue
  with open(path, encoding="utf-8") as my_file:
    data_array = my_file.readlines()
    # Drop only the line terminator, and do it for both lines, so `labels`
    # does not pick up a stray '\n' that `words` lacks.
    words += list(data_array[0].rstrip('\n'))
    labels += list(data_array[1].rstrip('\n')) # second line are labels

sample_data = "".join(words)

In [None]:
# Display the text obtained by joining the characters from all loaded files.
sample_data

In [None]:
# Parse the joined text with the pipeline; the next cell iterates over this `doc`.
doc = nlp(sample_data)

In [None]:
# Walk the parsed text: plain tokens are written inline, while every entity
# token goes on a line of its own with its tag and spaCy's label explanation.
prev_ent = False
for token in doc:
  label = token.ent_type_
  if not label:
    # Plain token: keep writing on the current line.
    prev_ent = False
    print(token.text, end='')
    continue
  if not prev_ent:
    # First entity token after plain text: end the running line.
    print()
  prev_ent = True
  tag = f'{token.ent_iob_}-{label}'
  print(token.text, tag, spacy.explain(label))

## Evaluate spaCy model

In [29]:
from spacy.scorer import Scorer

# Build one Example per dataset entry, refresh its prediction with the
# pipeline, and score the whole list.
scorer = Scorer()
examples = []
for item in re_dataset:
    if isinstance(item, Example):
        example = item
    else:
        # The loader cell stores (doc, annotations) tuples; wrap them so that
        # `.predicted` is available.
        example = Example.from_dict(item[0], item[1])
    example.predicted = nlp(str(example.predicted))
    examples.append(example)
# NOTE(review): the loader's "words" hold only the entity spans, so the
# reference text may differ from the predicted text — confirm that the
# scorer can align the two before trusting these numbers.
scores = scorer.score(examples)

In [66]:
# NOTE(review): `.predicted` needs re_dataset[0] to be an Example, but the
# loader cell appends (doc, dict) tuples — confirm which shape is intended.
str(re_dataset[0].predicted)

'清明是人们祭扫先人，怀念追思的日子。'

In [61]:
# Re-run the pipeline on the text of the first entry and keep the result in
# `x` for interactive inspection.
x = nlp(str(re_dataset[0].predicted))

In [72]:

# Print every predicted entity token, entry by entry.
for example in re_dataset:
  # Entries may be Example objects or the loader's (doc, annotations) tuples.
  predicted = example.predicted if isinstance(example, Example) else example[0]
  # Parse the current entry; the loop used to re-parse re_dataset[0] on
  # every iteration.
  doc = nlp(str(predicted))
  for token in doc:
    if (token.ent_type_):
      print(doc)
      print(token.text, token.ent_iob_, token.ent_type_)

In [12]:
# NOTE(review): this wraps re_dataset[0][1] as the "entities" value, which
# fits the tag list displayed by the next cell. The loader cell as written
# stores a {"words", "entities"} dict in that position — confirm which is meant.
example = Example.from_dict(re_dataset[0][0], {"entities": re_dataset[0][1]})


In [16]:
# Display the annotations stored alongside the first dataset entry.
re_dataset[0][1]

['O',
 'O',
 'B-TIME',
 'I-TIME',
 'B-PERSON',
 'I-PERSON',
 'I-PERSON',
 'I-PERSON',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'O',
 'B-PRODUCT',
 'I-PRODUCT',
 'O',
 'B-TIME',
 'I-TIME',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [38]:
# NOTE(review): `debug_doc` is defined in a later cell, so this call raises
# NameError on a fresh top-to-bottom run — move the definition above this cell.
debug_doc(example.reference)

诗人高翥 O 
南北山头 O 
墓田 O 
清明 O 


In [7]:
def debug_doc(doc):
  """Print one line per token of `doc`: text, IOB entity flag, entity type."""
  for tok in doc:
    print(f"{tok.text} {tok.ent_iob_} {tok.ent_type_}")

In [8]:
# Parse a single sentence for ad-hoc inspection; this rebinds the notebook-level `doc`.
doc = nlp("清明是人们祭扫先人，怀念追思的日子。")

In [11]:
# Print the tokens and predicted entities of the doc stored in the first dataset entry.
debug_doc(re_dataset[0][0])

正如 O 
宋代 B DATE
诗人 O 
高翥 B PERSON
所 O 
云 O 
“ O 
南北 B LOC
山头 I LOC
多 O 
墓田 O 
， O 
清明 O 
祭扫 O 
各 O 
纷然 O 
。 O 


In [36]:
# Build an Example from the first entry: its parsed doc plus the annotation
# dict stored next to it. (A bare `re_dataset[0][1]` expression used to sit
# above this line; it was evaluated and discarded, so it has been removed.)
example = Example.from_dict(re_dataset[0][0], re_dataset[0][1])