# Ref
- https://spacy.io/usage/training

# Import

In [1]:
import spacy
import json
import random
from tqdm.auto import tqdm

from spacy.tokens import DocBin
from spacy.util import filter_spans

# データの読み込み

In [2]:
# Load the annotated NER dataset. The encoding is pinned to UTF-8 so that the
# Japanese text is decoded the same way on every platform instead of relying
# on the locale's default encoding (e.g. cp932 / cp1252 on Windows).
with open("../ner-wikipedia-dataset/ner.json", encoding="utf-8") as f:
    stockmark_data = json.load(f)

# データをspaCyで読み込める形式に変換

In [3]:
# Type aliases describing the annotated dataset.
# Character offsets of one entity; read as span[0] (start) and span[1] (end).
Span = list[int]
# Entity annotations of one example; each dict is read through its "span"
# (a Span) and "type" (label string) keys.
EntitiesData = list[dict[str, str | Span]]
# One example; read through its "text" (str) and "entities" (EntitiesData) keys.
OneData = dict[str, str | EntitiesData]
# The whole dataset: a list of examples.
StockmarkData = list[OneData]

In [4]:
def random_split_to_train_dev_test(
    data: StockmarkData,
    seed: int | None = None,
) -> tuple[StockmarkData, StockmarkData, StockmarkData]:
    """Randomly partition *data* into train / dev / test subsets.

    The train subset receives ``int(len(data) * 0.6)`` examples, the dev
    subset ``int(len(data) * 0.2)`` examples, and the test subset whatever
    remains.

    Args:
        data: Examples to split. The list itself is left untouched; a
            shuffled copy is sliced instead.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call by the caller is still honoured.

    Returns:
        A ``(train, dev, test)`` tuple of new lists.
    """
    all_len = len(data)

    train_len = int(all_len * 0.6)
    dev_end = train_len + int(all_len * 0.2)

    # Shuffle a copy: shuffling in place would silently reorder the caller's list.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train = shuffled[:train_len]
    dev = shuffled[train_len:dev_end]
    test = shuffled[dev_end:]

    return train, dev, test

# Split the loaded dataset into train / dev / test subsets.
train, dev, test = random_split_to_train_dev_test(stockmark_data)

In [5]:
# Report the number of examples in each split, one count per line.
print(len(train), len(dev), len(test), sep="\n")

3205
1068
1070


In [6]:
def make_spacy(data: StockmarkData, name: str) -> None:
    """Convert annotated examples to a spaCy ``DocBin`` saved as ``../data/{name}.spacy``.

    Each example's ``text`` is tokenized with a blank Japanese pipeline and
    every entry of its ``entities`` list (character offsets under ``span``,
    label under ``type``) is attached to the resulting ``Doc`` as an entity.

    Entities whose character offsets do not map onto token boundaries cannot
    be turned into a ``Span`` (``Doc.char_span`` returns ``None``); they are
    reported and left out.

    Args:
        data: Examples to convert.
        name: File stem of the output file inside ``../data/``.
    """
    nlp = spacy.blank("ja")
    db = DocBin()
    for training_example in tqdm(data):
        text = training_example["text"]
        doc = nlp(text)
        ents = []
        for annotation in training_example["entities"]:
            start, end = annotation["span"][0], annotation["span"][1]
            label = annotation["type"]

            span = doc.char_span(start, end, label=label)

            if span is None:
                # Say which entity was dropped so the loss can be inspected.
                print(
                    f"Skipping entity: {text[start:end]!r} [{start}:{end}] ({label}) "
                    "does not map to token boundaries"
                )
            else:
                ents.append(span)
        # doc.ents rejects overlapping spans with a ValueError; filter_spans
        # drops duplicates/overlaps (preferring the longest span) beforehand.
        doc.ents = filter_spans(ents)
        db.add(doc)
    db.to_disk(f"../data/{name}.spacy")

# Serialize every split, in train / dev / test order, to ../data/<name>.spacy.
for split_name, split_data in (("train", train), ("dev", dev), ("test", test)):
    make_spacy(data=split_data, name=split_name)

  0%|          | 0/3205 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


  0%|          | 0/1068 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


  0%|          | 0/1070 [00:00<?, ?it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


In [7]:
# Example where the character offsets line up with token boundaries,
# so char_span yields a Span.
text = "Sansan株式会社は、「出会いからイノベーションを生み出す」をミッションとして掲げています"
start, end = 0, 10
label = "法人名"

nlp = spacy.blank("ja")
doc = nlp(text)
span = doc.char_span(start, end, label=label)

# Show the extracted span followed by its type, one per line.
print(span, type(span), sep="\n")

Sansan株式会社
<class 'spacy.tokens.span.Span'>


In [8]:
# Example where char_span yields None: the end offset (9) falls inside the
# token "神社前" (see the tokenization printed in the following cell), so the
# requested characters do not form a whole-token span.
text = "また、草戸稲荷神社前には遊女町を造ったといわれる。"
start, end = 3, 9
label = "施設名"

nlp = spacy.blank("ja")
doc = nlp(text)
span = doc.char_span(start, end, label=label)

# Expected substring, actual char_span result, and the result's type.
print(
    f"期待出力: {text[start:end]}",
    f"実際の出力: {span}",
    f"データ型: {type(span)}",
    sep="\n",
)

期待出力: 草戸稲荷神社
実際の出力: None
データ型: <class 'NoneType'>


In [9]:
# Print the tokens the blank Japanese pipeline produces for the sentence above.
nlp = spacy.blank("ja")
doc = nlp("また、草戸稲荷神社前には遊女町を造ったといわれる。")
print(list(doc))

[また, 、, 草戸, 稲荷, 神社前, に, は, 遊女, 町, を, 造っ, た, と, いわ, れる, 。]


# Configファイルの作成

In [10]:
# Expand ../config/base_config.cfg into the complete ../config/config.cfg.
!python -m spacy init fill-config ../config/base_config.cfg ../config/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# 学習

In [11]:
# Train with the generated config on train.spacy / dev.spacy; output goes to the current directory.
!python -m spacy train ../config/config.cfg --output ./ --paths.train ../data/train.spacy --paths.dev ../data/dev.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-07-20 23:08:45,447] [INFO] Set up nlp object from config
[2023-07-20 23:08:45,454] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-07-20 23:08:45,455] [INFO] Created vocabulary
[2023-07-20 23:08:46,067] [INFO] Added vectors: ja_core_news_lg
[2023-07-20 23:08:46,067] [INFO] Finished initializing nlp object
[2023-07-20 23:08:51,046] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     51.59    0.00    0.00    0.00    0.00
  0     200        241.81   3057.86    3.64    6.87    2.48    0.04
  0     400       1078.83   4190.00   26.86   30.05   24.28    0.27
  0     600       1349.77   4286.86   39.50   40.39   38.65    0.40

# 評価

In [12]:
# Evaluate model-best on test.spacy; metrics JSON and displaCy HTML go to ../evaluate_result.
!python -m spacy benchmark accuracy model-best ../data/test.spacy --output ../evaluate_result/test_metrics.json --displacy-path ../evaluate_result

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   70.85 
NER R   68.16 
NER F   69.48 
SPEED   1604  

[1m

              P       R       F
人名        74.68   78.90   76.73
法人名       71.00   66.60   68.73
イベント名     77.99   62.31   69.27
地名        70.90   85.49   77.51
施設名       59.05   53.68   56.24
製品名       56.70   51.84   54.16
政治的組織名    80.33   73.28   76.65
その他の組織名   69.09   41.53   51.88

[38;5;2m✔ Generated 25 parses as HTML[0m
../evaluate_result
[38;5;2m✔ Saved results to ../evaluate_result/test_metrics.json[0m


# 推論

In [13]:
# Load the "model-best" pipeline from the working directory
# (presumably the one written by the `spacy train --output ./` cell).
model = spacy.load("model-best")

# Sample text quoted from: https://jp.corp-sansan.com/news/2023/0526.html
text = """
働き方を変えるDXサービスを提供するSansan株式会社は、契約DXサービス「Contract One」がサービス価値向上を目的に、マイクロソフト社が提供するAzure OpenAI Serviceを活用した「Contract One AI」を搭載したことを発表します。
今回は第一弾として文章内検索機能を追加します。契約書の内容について、定型質問から選択または質問内容を直接問いかけると、「Contract One AI」が適切な情報を抽出し質問に回答します。本機能の追加によって、法務担当者に限らず誰もが早く、簡単に契約情報を把握することが可能となります。「Contract One AI」は順次アップデートしていく予定です。
"""

# Custom highlight colors for the 法人名 and 製品名 entity labels.
colors = {"法人名": "#F67DE3", "製品名": "#7DF6D9"}
options = {"colors": colors} 

# Run the pipeline on the sample text.
doc = model(text)

# Render the recognized entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)