In [37]:
import spacy
import json
import random
from tqdm.auto import tqdm

from spacy.tokens import DocBin
from spacy.util import filter_spans

# データの読み込み

In [38]:
# Load the raw annotated dataset from JSON.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII text.
with open("../ner-wikipedia-dataset/ner.json", encoding="utf-8") as f:
    stock_mark_data = json.load(f)

# データをspaCyで読み込める形式に変換

In [39]:
def stockmark_to_spacy_train_data(stock_mark_data: list) -> list:
    """Convert dataset records into ``{"text", "entities"}`` dictionaries.

    Args:
        stock_mark_data: Records that each provide a ``"text"`` value and an
            ``"entities"`` list. Every entity must provide ``"span"`` (its
            first two items are used as start and end) and ``"type"``.

    Returns:
        A new list with one dict per record. ``"text"`` is copied as-is and
        ``"entities"`` is a list of ``(start, end, type)`` tuples.
    """
    return [
        {
            "text": record["text"],
            "entities": [
                (ent["span"][0], ent["span"][1], ent["type"])
                for ent in record["entities"]
            ],
        }
        for record in stock_mark_data
    ]

# Convert every loaded record into the {"text", "entities"} form.
spacy_data = stockmark_to_spacy_train_data(stock_mark_data)

In [40]:
def random_split_to_train_dev_test(spacy_data: list, seed=None) -> tuple[list, list, list]:
    """Randomly split examples into train / dev / test lists (about 60 / 20 / 20).

    The input list is left untouched: a shallow copy is shuffled, so calling
    this function again on the same list never shuffles already-shuffled data.

    Args:
        spacy_data: Examples to split.
        seed: Optional seed for a reproducible split. When ``None`` (default)
            the module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        ``(train, dev, test)``. ``train`` holds ``int(n * 0.6)`` items, ``dev``
        holds ``int(n * 0.2)`` items and ``test`` holds the remainder.
    """
    all_len = len(spacy_data)

    train_end = int(all_len * 0.6)
    dev_end = train_end + int(all_len * 0.2)

    # A dedicated generator keeps a seeded split independent of global state.
    rng = random if seed is None else random.Random(seed)

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(spacy_data)
    rng.shuffle(shuffled)

    train = shuffled[:train_end]
    dev = shuffled[train_end:dev_end]
    test = shuffled[dev_end:]

    return train, dev, test

# Randomly split the converted examples into train / dev / test sets.
train, dev, test = random_split_to_train_dev_test(spacy_data)

In [41]:
# Show the size of each split, one per line, in train / dev / test order.
for split in (train, dev, test):
    print(len(split))

3205
1068
1070


In [42]:
def make_spacy(data: list, name: str, alignment_mode: str = "strict") -> None:
    """Serialize annotated examples to ``../data/{name}.spacy`` as a DocBin.

    Args:
        data: Examples with a ``"text"`` string and an ``"entities"`` iterable
            of ``(start, end, label)`` triples, where start/end are character
            offsets into the text.
        name: Stem of the output file written under ``../data``.
        alignment_mode: Forwarded to ``Doc.char_span``. ``"strict"`` (the
            default, matching the previous behaviour) yields ``None`` for
            offsets that do not fall on token boundaries, and such entities
            are skipped. ``"contract"`` or ``"expand"`` snap the span to token
            boundaries instead of dropping it.

    Returns:
        None. The DocBin is written to disk as a side effect.
    """
    nlp = spacy.blank("ja")  # blank Japanese pipeline, used here only to tokenize
    db = DocBin()
    for training_example in tqdm(data):
        text = training_example["text"]
        annotations = training_example["entities"]
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label, alignment_mode=alignment_mode)

            if span is None:
                # Character offsets could not be mapped onto whole tokens.
                print("Skipping entity")
            else:
                ents.append(span)
        # doc.ents raises on overlapping spans; filter_spans keeps the longest
        # span of each overlapping group and returns the rest unchanged.
        doc.ents = filter_spans(ents)
        db.add(doc)
    db.to_disk(f"../data/{name}.spacy")

# Write one .spacy file per split, named after the split.
for split_name, split_data in (("train", train), ("dev", dev), ("test", test)):
    make_spacy(data=split_data, name=split_name)

  6%|▌         | 193/3205 [00:00<00:01, 1927.65it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 13%|█▎        | 423/3205 [00:00<00:01, 2142.43it/s]

Skipping entity
Skipping entity
Skipping entity


 28%|██▊       | 883/3205 [00:00<00:01, 2242.02it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 35%|███▍      | 1116/3205 [00:00<00:00, 2268.81it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 78%|███████▊  | 2484/3205 [00:01<00:00, 2181.49it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 92%|█████████▏| 2936/3205 [00:01<00:00, 2221.87it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 3205/3205 [00:01<00:00, 2232.97it/s]
 42%|████▏     | 453/1068 [00:00<00:00, 2263.04it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 1068/1068 [00:00<00:00, 2376.23it/s]


Skipping entity


 20%|██        | 215/1070 [00:00<00:00, 2148.78it/s]

Skipping entity
Skipping entity


 64%|██████▍   | 690/1070 [00:00<00:00, 2335.94it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 87%|████████▋ | 930/1070 [00:00<00:00, 2360.94it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 1070/1070 [00:00<00:00, 2325.05it/s]


In [43]:
# Example where char_span succeeds: the recorded result is a Span covering
# characters 0-10 of the text.
text = "Sansan株式会社は、「出会いからイノベーションを生み出す」をミッションとして掲げています"
start, end = 0, 10
label = "法人名"

nlp = spacy.blank("ja")
doc = nlp(text)
span = doc.char_span(start, end, label=label)

# Print the span and its type on separate lines.
print(span, type(span), sep="\n")

Sansan株式会社
<class 'spacy.tokens.span.Span'>


In [44]:
# Example where char_span fails: the expected substring exists in the text,
# but the recorded result of char_span is None.
text = "また、草戸稲荷神社前には遊女町を造ったといわれる。"
start, end = 3, 9
label = "施設名"

nlp = spacy.blank("ja")
doc = nlp(text)
span = doc.char_span(start, end, label=label)

# Expected substring, actual char_span result, and the result's type.
print(
    f"期待出力: {text[start:end]}",
    f"実際の出力: {span}",
    f"データ型: {type(span)}",
    sep="\n",
)

期待出力: 草戸稲荷神社
実際の出力: None
データ型: <class 'NoneType'>


In [45]:
# List the tokens the blank Japanese pipeline produces for this sentence.
nlp = spacy.blank("ja")
doc = nlp("また、草戸稲荷神社前には遊女町を造ったといわれる。")
print(list(doc))

[また, 、, 草戸, 稲荷, 神社前, に, は, 遊女, 町, を, 造っ, た, と, いわ, れる, 。]


# Configファイルの作成

In [50]:
# Auto-fill ../config/base_config.cfg and save the result as ../config/config.cfg.
!python -m spacy init fill-config ../config/base_config.cfg ../config/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
../config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# 学習

In [51]:
# Train with the filled config on the train/dev .spacy files.
# Output goes to the current directory (--output ./).
!python -m spacy train ../config/config.cfg --output ./ --paths.train ../data/train.spacy --paths.dev ../data/dev.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-07-18 13:22:36,897] [INFO] Set up nlp object from config
[2023-07-18 13:22:36,903] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-07-18 13:22:36,904] [INFO] Created vocabulary
[2023-07-18 13:22:37,509] [INFO] Added vectors: ja_core_news_lg
[2023-07-18 13:22:37,509] [INFO] Finished initializing nlp object
[2023-07-18 13:22:42,643] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     51.24    0.39    0.44    0.34    0.00
  0     200        190.58   3211.08    9.95   13.77    7.79    0.10
  0     400        347.40   3210.15   22.88   25.64   20.66    0.23
  0     600        434.84   3822.97   25.66   26.96   24.48    0.26

# 評価

In [52]:
# Evaluate the model-best pipeline on the test split; metrics JSON and
# displaCy HTML renderings are saved under ../evaluate_result.
# NOTE(review): model-best is presumably produced by the training run — confirm.
!python -m spacy benchmark accuracy model-best ../data/test.spacy --output ../evaluate_result/test_metrics.json --displacy-path ../evaluate_result

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   68.37 
NER R   67.26 
NER F   67.81 
SPEED   1556  

[1m

              P       R       F
イベント名     74.68   55.02   63.36
地名        78.49   83.86   81.09
製品名       40.33   53.30   45.92
施設名       71.60   56.02   62.86
法人名       64.05   69.50   66.67
政治的組織名    74.65   72.65   73.64
人名        75.18   74.52   74.85
その他の組織名   65.77   44.14   52.83

[38;5;2m✔ Generated 25 parses as HTML[0m
../evaluate_result
[38;5;2m✔ Saved results to ../evaluate_result/test_metrics.json[0m


# 推論

In [58]:
# Load the pipeline stored in ./model-best.
model = spacy.load("model-best")

# Sample text to annotate (the literal starts and ends with a newline).
text = """
働き方を変えるDXサービスを提供するSansan株式会社は、契約DXサービス「Contract One」がサービス価値向上を目的に、マイクロソフト社が提供するAzure OpenAI Serviceを活用した「Contract One AI」を搭載したことを発表します。
今回は第一弾として文章内検索機能を追加します。契約書の内容について、定型質問から選択または質問内容を直接問いかけると、「Contract One AI」が適切な情報を抽出し質問に回答します。本機能の追加によって、法務担当者に限らず誰もが早く、簡単に契約情報を把握することが可能となります。「Contract One AI」は順次アップデートしていく予定です。
"""

doc = model(text)

# Per-label colours handed to displaCy through its options dict.
colors = {"法人名": "#F67DE3", "製品名": "#7DF6D9"}
options = dict(colors=colors)

# Render the entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)