## First solution to assignment 3 using spaCy

In [19]:
import os
import re
import random
import string
from typing import List
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding
from spacy.training import Example
import json
from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
import pandas as pd
from tqdm import tqdm

# Start from a blank Russian pipeline; the spancat component is added to it in a later cell.
nlp = spacy.blank('ru')
# Key under which span annotations are stored in doc.spans; also passed to spancat as "spans_key".
span_key = "sc"

In [20]:
%%capture
from datasets import load_dataset

In [21]:
def read_jsonl(path: str) -> List[dict]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        The decoded records in file order; blank lines are ignored.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8,
    # which would corrupt or reject the Cyrillic text.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle directly instead of materialising every line with readlines().
        return [json.loads(line) for line in f if line.strip()]


# loading data
train_data = read_jsonl("../../data/train.jsonl")
dev_data = read_jsonl("../../data/dev.jsonl")

Here we prepare the data for training. During training, spaCy requires `Example` objects that contain the full text together with the span annotations and their labels. Let's create them.

In [24]:
train_data_examples = []
counter = 0  # number of samples skipped because no Example could be built from them

for sample in train_data:
  text = sample["sentences"]
  # NOTE(review): only the first two annotated entities of every sample are kept.
  # If the whole annotation is meant to be used for training, drop the [:2] slice.
  entities = sample["ners"][:2]

  # The source annotation has an inclusive end offset; add 1 to make it exclusive
  entities = [[tup[0], tup[1] + 1, tup[2]] for tup in entities]

  # Annotation dictionary in the shape Example.from_dict expects for span groups
  annotation = {'spans': {span_key: entities}}
  try:
    # Build the Example from the tokenised text and its span annotations
    example = Example.from_dict(nlp.make_doc(text), annotation)
  except Exception:
    # The Example could not be created for this text, so the sample is skipped.
    # This happens if boundaries of words are not spaces or punctuation.
    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
    counter += 1
  else:
    train_data_examples.append(example)
counter

2

In [26]:
%%capture
# Load the 'ent_types' configuration of the RuNNE dataset; its "type" field is used
# below as the list of labels for the spancat component.
# Note: this rebinds `entities`, which the data-preparation loop above used as a temporary.
entities = load_dataset('MalakhovIlya/RuNNE', 'ent_types')['ent_types']

In [27]:
# Settings for the span categorizer component
config = dict(
    # minimum probability for a prediction to count as positive
    threshold=0.85,
    # key in doc.spans under which spans are read and written
    spans_key=span_key,
    # maximum number of positive labels per span (None = no limit)
    max_positive=None,
    # model that receives the documents plus the start/end indices of candidate spans
    model=DEFAULT_SPANCAT_MODEL,
    # span suggester: every n-gram of 1 up to 5 tokens is a candidate
    suggester={"@misc": "spacy.ngram_suggester.v1", "sizes": list(range(1, 6))},
)

# Register the spancat component; add_pipe hands back the created component
span = nlp.add_pipe("spancat", config=config)

# Register every entity type as a label of the component
for entity_type in entities["type"]:
    span.add_label(entity_type)

In [28]:
# Components that are trained; everything else is switched off during training
pipe_exceptions = ["spancat"]

# disable redundant pipelines
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Initialise the pipeline on the real training examples. Without `get_examples`
# spaCy initialises the components on a dummy document instead of the data
# they will actually be trained on.
nlp.initialize(get_examples=lambda: train_data_examples)
sgd = nlp.create_optimizer()

Next is the training process.

In [34]:
N_ITERATIONS = 50  # number of passes over the training examples
DROPOUT = 0.1      # dropout rate passed to nlp.update

# select_pipes replaces disable_pipes, which is deprecated in spaCy v3;
# the listed components are restored when the block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    for iteration in tqdm(range(N_ITERATIONS)):
        random.shuffle(train_data_examples)
        # Losses accumulate over all batches of this iteration
        losses = {}
        # The compounding schedule is re-created every iteration, so the batch size
        # starts again from 4 at the beginning of each pass.
        batches = minibatch(train_data_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(list(batch), losses=losses, drop=DROPOUT, sgd=sgd)

        # tqdm.write prints without breaking up the progress bar
        tqdm.write(f"iteration {iteration} train loss: {losses['spancat']:.2f}")

  2%|▏         | 1/50 [00:32<26:30, 32.47s/it]

iteration 0 train loss: 1018.34


  4%|▍         | 2/50 [01:04<25:32, 31.93s/it]

iteration 1 train loss: 1018.05


  6%|▌         | 3/50 [01:35<24:49, 31.70s/it]

iteration 2 train loss: 1017.93


  8%|▊         | 4/50 [02:06<24:07, 31.46s/it]

iteration 3 train loss: 1017.87


 10%|█         | 5/50 [02:39<23:57, 31.95s/it]

iteration 4 train loss: 1017.79


 12%|█▏        | 6/50 [03:12<23:46, 32.42s/it]

iteration 5 train loss: 1016.30


 14%|█▍        | 7/50 [03:45<23:26, 32.71s/it]

iteration 6 train loss: 1036.72


 16%|█▌        | 8/50 [04:17<22:38, 32.35s/it]

iteration 7 train loss: 999.38


 18%|█▊        | 9/50 [04:52<22:39, 33.17s/it]

iteration 8 train loss: 994.12


 20%|██        | 10/50 [05:23<21:35, 32.38s/it]

iteration 9 train loss: 985.98


 22%|██▏       | 11/50 [05:54<20:53, 32.13s/it]

iteration 10 train loss: 983.48


 24%|██▍       | 12/50 [06:25<20:06, 31.75s/it]

iteration 11 train loss: 966.39


 26%|██▌       | 13/50 [06:57<19:34, 31.74s/it]

iteration 12 train loss: 942.70


 28%|██▊       | 14/50 [07:28<19:00, 31.68s/it]

iteration 13 train loss: 865.10


 30%|███       | 15/50 [08:06<19:30, 33.44s/it]

iteration 14 train loss: 834.97


 32%|███▏      | 16/50 [08:38<18:38, 32.90s/it]

iteration 15 train loss: 829.74


 34%|███▍      | 17/50 [09:07<17:35, 31.99s/it]

iteration 16 train loss: 799.82


 36%|███▌      | 18/50 [09:37<16:40, 31.26s/it]

iteration 17 train loss: 778.50


 38%|███▊      | 19/50 [10:06<15:50, 30.65s/it]

iteration 18 train loss: 771.64


 40%|████      | 20/50 [10:35<15:05, 30.19s/it]

iteration 19 train loss: 765.75


 42%|████▏     | 21/50 [11:04<14:23, 29.77s/it]

iteration 20 train loss: 761.32


 44%|████▍     | 22/50 [11:38<14:26, 30.93s/it]

iteration 21 train loss: 760.88


 46%|████▌     | 23/50 [12:17<15:04, 33.51s/it]

iteration 22 train loss: 750.54


 48%|████▊     | 24/50 [12:47<14:05, 32.51s/it]

iteration 23 train loss: 737.51


 50%|█████     | 25/50 [13:18<13:20, 32.01s/it]

iteration 24 train loss: 731.08


 52%|█████▏    | 26/50 [13:55<13:22, 33.42s/it]

iteration 25 train loss: 729.73


 54%|█████▍    | 27/50 [14:31<13:05, 34.14s/it]

iteration 26 train loss: 728.37


 56%|█████▌    | 28/50 [15:02<12:11, 33.23s/it]

iteration 27 train loss: 720.37


 58%|█████▊    | 29/50 [15:33<11:23, 32.53s/it]

iteration 28 train loss: 719.21


 60%|██████    | 30/50 [16:02<10:29, 31.49s/it]

iteration 29 train loss: 715.58


 62%|██████▏   | 31/50 [16:32<09:48, 30.99s/it]

iteration 30 train loss: 711.86


 64%|██████▍   | 32/50 [17:04<09:25, 31.39s/it]

iteration 31 train loss: 703.22


 66%|██████▌   | 33/50 [17:40<09:17, 32.76s/it]

iteration 32 train loss: 688.80


 68%|██████▊   | 34/50 [18:17<09:05, 34.11s/it]

iteration 33 train loss: 669.39


 70%|███████   | 35/50 [18:50<08:23, 33.59s/it]

iteration 34 train loss: 662.49


 72%|███████▏  | 36/50 [19:20<07:35, 32.56s/it]

iteration 35 train loss: 655.98


 74%|███████▍  | 37/50 [19:51<06:59, 32.31s/it]

iteration 36 train loss: 650.81


 76%|███████▌  | 38/50 [20:24<06:29, 32.49s/it]

iteration 37 train loss: 649.98


 78%|███████▊  | 39/50 [20:57<05:56, 32.37s/it]

iteration 38 train loss: 644.73


 80%|████████  | 40/50 [21:32<05:32, 33.22s/it]

iteration 39 train loss: 637.02


 82%|████████▏ | 41/50 [22:04<04:56, 32.98s/it]

iteration 40 train loss: 636.34


 84%|████████▍ | 42/50 [22:35<04:17, 32.20s/it]

iteration 41 train loss: 630.62


 86%|████████▌ | 43/50 [23:04<03:39, 31.35s/it]

iteration 42 train loss: 627.27


 88%|████████▊ | 44/50 [23:33<03:04, 30.83s/it]

iteration 43 train loss: 622.97


 90%|█████████ | 45/50 [24:03<02:31, 30.34s/it]

iteration 44 train loss: 617.32


 92%|█████████▏| 46/50 [24:33<02:00, 30.22s/it]

iteration 45 train loss: 608.24


 94%|█████████▍| 47/50 [25:03<01:30, 30.13s/it]

iteration 46 train loss: 590.04


 96%|█████████▌| 48/50 [25:33<01:00, 30.11s/it]

iteration 47 train loss: 586.13


 98%|█████████▊| 49/50 [26:06<00:31, 31.22s/it]

iteration 48 train loss: 583.00


100%|██████████| 50/50 [26:44<00:00, 32.10s/it]

iteration 49 train loss: 572.41





With the trained model, we can run it on the test/dev data to get the predictions.

In [35]:
# Write one JSON object per dev sample: {"id": ..., "ners": [[start, end, label], ...]}
with open("test.jsonl", "w", encoding="utf-8") as f:
    for sample in dev_data:
        # The dev records are read under the key "senences" (sic); fall back to the
        # "sentences" spelling used by the training records if that key is absent.
        text = sample["senences"] if "senences" in sample else sample["sentences"]
        doc = nlp(text)
        # Emit spans in the same convention as the training "ners": character offsets
        # with an inclusive end. span.start / span.end are token indices and do not
        # match that format, hence start_char and end_char - 1.
        ners = [
            [span.start_char, span.end_char - 1, span.label_]
            for span in doc.spans[span_key]
        ]
        # json.dumps yields valid JSON even if a value contains quote characters,
        # unlike replacing quotes in str(dict).
        f.write(json.dumps({"id": sample["id"], "ners": ners}, ensure_ascii=False))
        f.write("\n")

In [36]:
# Shell out to `zip` to add test.jsonl to the archive test.zip (requires the zip CLI on PATH).
!zip test test.jsonl

updating: test.jsonl (164 bytes security) (deflated 84%)


In [1]:
import shutil

In [2]:
import os

# shutil.move does not create missing parent directories, so make sure the target exists.
os.makedirs("./output", exist_ok=True)

# Move the predictions file and its archive into the output directory
shutil.move("./test.jsonl", "./output/test.jsonl")
shutil.move("./test.zip", "./output/test.zip")

'./output/test.zip'