    author: Roman Makarov
    e-mail: mcronomus@gmail.com

# Downloading and preprocessing training data

In [None]:
import locale
# Replace locale.getpreferredencoding for the whole process so that every
# caller gets "UTF-8", regardless of the runtime's actual locale settings.
# NOTE(review): presumably a workaround for shell (`!`) commands failing under
# a non-UTF-8 locale in the hosted runtime — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Downloading training and testing data from drive
!gdown 1fBbJhp7ZkaJWXCdV4XwJRAtdrG4J3BUT

Downloading...
From: https://drive.google.com/uc?id=1fBbJhp7ZkaJWXCdV4XwJRAtdrG4J3BUT
To: /content/data.zip
  0% 0.00/3.83M [00:00<?, ?B/s]100% 3.83M/3.83M [00:00<00:00, 224MB/s]


In [None]:
%%capture
!unzip data.zip

In [None]:
%%capture
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import os

dir_name = 'train_data'

# os.walk yields the root directory first; dropping that first entry keeps
# only the directories found underneath dir_name.
data_folders = [folder for folder, _subdirs, _files in os.walk(dir_name)][1:]

In [None]:
from nltk.tokenize import word_tokenize
import os
import pandas as pd

# A function to read the text and annotation files
def read_data(data_dir, file_id):
    """Read one document and its entity annotations.

    Args:
        data_dir: Directory that holds both ``<file_id>.txt`` and
            ``<file_id>.ann``.
        file_id: File name without its extension.

    Returns:
        A ``(text, annotations)`` tuple. ``text`` is the full content of the
        ``.txt`` file. ``annotations`` is a list of
        ``(annotation_type, start, end)`` tuples with integer offsets, in the
        order they appear in the ``.ann`` file.

    Only lines starting with ``T`` are considered. Such a line is skipped when
    its second tab-separated field is missing or is not exactly
    ``type start end`` with integer offsets (e.g. ``TYPE 0 5;10 15``).
    """
    # NOTE(review): text mode translates "\r\n" to "\n"; if the corpus contains
    # CRLF line endings the annotation offsets may no longer match — confirm.
    with open(os.path.join(data_dir, f'{file_id}.txt'), 'r', encoding='utf-8') as f:
        text = f.read()

    annotations = []
    with open(os.path.join(data_dir, f'{file_id}.ann'), 'r', encoding='utf-8') as f:
        for line in f:
            if not line.startswith('T'):
                continue

            # Drop the leading ID column; the next field is "type start end".
            fields = line.strip().split('\t')[1:]
            try:
                annotation_type, start, end = fields[0].split()
                annotations.append((annotation_type, int(start), int(end)))
            except (IndexError, ValueError):
                # IndexError: no field after the ID. ValueError: wrong number
                # of parts or non-integer offsets. Anything else (including
                # KeyboardInterrupt) is not a parse problem and must propagate.
                continue

    return text, annotations

# Load the dataset into `data` as (text, {'entities': [(start, end, label), ...]}) pairs
def _select_entities(annotations):
    """Pick a non-overlapping set of entity spans from one document's annotations.

    Args:
        annotations: Iterable of ``(label, start, end)`` tuples as returned by
            ``read_data``; ``end`` is treated as exclusive.

    Returns:
        A list of ``(start, end, label)`` tuples sorted by ``start`` in which
        no two spans share a character.

    Spans are compared by their offsets, not by their surface text, so an
    entity that is mentioned several times in a document keeps every mention.
    When spans do overlap, the longest one is kept; among spans of equal
    length the one listed first in the annotation file wins.
    """
    # start - end is the negated length, so the longest spans come first.
    # sorted() is stable, which preserves file order among equal lengths.
    candidates = sorted(annotations, key=lambda ann: ann[1] - ann[2])

    accepted = []
    for label, start, end in candidates:
        # An empty or inverted span cannot be turned into an entity.
        if end <= start:
            continue

        # Half-open interval test: spans that merely touch
        # (end == other start) do not overlap.
        overlaps = any(
            start < kept_end and kept_start < end
            for kept_start, kept_end, _ in accepted
        )
        if overlaps:
            continue

        accepted.append((start, end, label))

    return sorted(accepted, key=lambda span: span[0])


data = []
for data_dir in data_folders:
    # splitext (rather than split('.')) keeps file names that contain dots intact.
    file_ids = sorted(
        os.path.splitext(filename)[0]
        for filename in os.listdir(data_dir)
        if filename.endswith('.txt')
    )

    for file_id in file_ids:
        text, annotations = read_data(data_dir, file_id)
        data.append((text, {'entities': _select_entities(annotations)}))

In [None]:
len(data)

841

# Preparing the spaCy model

In [None]:
%%capture
!python -m spacy download ru_core_news_lg

In [None]:
import spacy
# Load the ru_core_news_lg pipeline downloaded in the previous cell. Until
# `nlp` is rebound to the trained model, the notebook only uses it through
# nlp.make_doc() when building the train.spacy / dev.spacy files.
nlp = spacy.load("ru_core_news_lg")

In [None]:
!gdown 1MSMwpbMi7QAbUzqtq8A1_aUppL-ke7GY

Downloading...
From: https://drive.google.com/uc?id=1MSMwpbMi7QAbUzqtq8A1_aUppL-ke7GY
To: /content/base_config.cfg
  0% 0.00/1.80k [00:00<?, ?B/s]100% 1.80k/1.80k [00:00<00:00, 2.79MB/s]


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

2023-04-14 06:40:07.444500: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Converting training and testing data to train.spacy and dev.spacy files

*Reference: https://spacy.io/usage/training*

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation. The fixed random_state makes
# the shuffle, and therefore the train/dev split, identical on every run so
# the scores reported by training are reproducible.
data_train, data_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

# Serialize the training split to ./train.spacy
db = DocBin()
num_skipped = 0

for text, annot in tqdm(data_train):
    doc = nlp.make_doc(text)

    # char_span returns None when the (contracted) character range does not
    # cover at least one whole token; those annotations are counted and dropped.
    candidate_spans = [
        doc.char_span(start, end, label=label, alignment_mode="contract")
        for start, end, label in annot["entities"]
    ]
    ents = [span for span in candidate_spans if span is not None]
    num_skipped += len(candidate_spans) - len(ents)

    doc.ents = ents
    db.add(doc)

print(f'Skipped: {num_skipped} entities')
db.to_disk("./train.spacy")

100%|██████████| 672/672 [00:03<00:00, 207.29it/s]


Skipped: 64 entities


In [None]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

# Serialize the held-out split to ./dev.spacy
db = DocBin()
num_skipped = 0

for text, annot in tqdm(data_test):
    doc = nlp.make_doc(text)

    # char_span returns None when the (contracted) character range does not
    # cover at least one whole token; those annotations are counted and dropped.
    candidate_spans = [
        doc.char_span(start, end, label=label, alignment_mode="contract")
        for start, end, label in annot["entities"]
    ]
    ents = [span for span in candidate_spans if span is not None]
    num_skipped += len(candidate_spans) - len(ents)

    doc.ents = ents
    db.add(doc)

print(f'Skipped: {num_skipped} entities')
db.to_disk("./dev.spacy")

100%|██████████| 169/169 [00:00<00:00, 240.60it/s]


Skipped: 17 entities


## Starting the training process with the initialized config and training data

In [None]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-04-14 04:23:28,161] [INFO] Set up nlp object from config
[2023-04-14 04:23:28,177] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-04-14 04:23:28,183] [INFO] Created vocabulary
[2023-04-14 04:23:28,184] [INFO] Finished initializing nlp object
[2023-04-14 04:23:36,562] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    121.67    0.00    0.00    0.00    0.00
  0     200       2188.53  14585.28   27.82   47.59   19.66    0.28
  0     400        463.43  10835.43   27.71   39.40   21.36    0.28
  0     600       1510.30  10774.82   38.69   53.89   30.18    0.39
  1     800        965.99  10081.80   46.37   53.47   4

# Checking how the model was trained

In [None]:
import spacy
# Rebind `nlp` to the best checkpoint that `spacy train --output ./output`
# wrote to ./output/model-best; from here on `nlp` is the trained NER model.
nlp = spacy.load("./output/model-best")

In [None]:
doc = nlp("""Глава департамента ЦБ РФ Надежда Иванова получила статус зампреда

Иванова, которой 13 июня исполнилось 60 лет, всю свою жизнь проработала в системе ЦБ. Сводный экономический департамент Банка России возглавляет с 1995 года.
Здание Центрального банка РФ. Архив

Директор сводного экономического департамента Банка России Надежда Иванова назначена также на должность заместителя председателя ЦБ, сообщил в четверг регулятор.



Иванова, у которой 13 июня был 60-летний юбилей, работает в системе Банка России (ранее — Госбанка СССР) с окончания института, то есть с 1975 года. Сводный экономический департамент возглавляет почти 20 лет — с 1995 года.

Иванова входит в совет директоров Центробанка. До сводного экономического департамента она трудилась в департаменте банковского надзора.

Сводный экономический департамент входит в блок денежно-кредитной политики.

Это первое назначение нового председателя Банка России Эльвиры Набиуллиной на этом посту. Раньше в руководстве Центробанка преобладали мужчины. Эксперты ждут назначения на пост первого зампреда ЦБ по вопросам денежно-кредитной политики Ксении Юдаевой, возглавляющей экспертное управление президента РФ. Ранее этот пост занимал Алексей Улюкаев, который в понедельник стал руководителем Минэкономразвития.""")

# Print one line per predicted entity: label, start/end character offsets,
# and the matched text, for a quick visual sanity check.
for ent in doc.ents:
    print(f'{ent.label_} {ent.start_char} {ent.end_char} for {ent}')

PROFESSION 0 24 for Глава департамента ЦБ РФ
PERSON 25 40 for Надежда Иванова
CITY 67 74 for Иванова
DATE 84 91 for 13 июня
AGE 104 110 for 60 лет
ORGANIZATION 149 151 for ЦБ
DATE 212 223 for с 1995 года
ORGANIZATION 232 253 for Центрального банка РФ
PERSON 321 336 for Надежда Иванова
EVENT 337 346 for назначена
PROFESSION 366 393 for заместителя председателя ЦБ
DATE 403 412 for в четверг
PERSON 427 434 for Иванова
DATE 446 453 for 13 июня
AGE 458 467 for 60-летний
ORGANIZATION 495 507 for Банка России
PERSON 517 530 for Госбанка СССР
DATE 563 574 for с 1975 года
NUMBER 622 630 for почти 20
DATE 637 648 for с 1995 года
PERSON 651 658 for Иванова
ORGANIZATION 668 696 for совет директоров Центробанка
ORGANIZATION 754 786 for департаменте банковского надзора
ORDINAL 870 876 for первое
EVENT 877 887 for назначение
PROFESSION 895 920 for председателя Банка России
PERSON 921 940 for Эльвиры Набиуллиной
PERSON 977 988 for Центробанка
ORDINAL 1043 1050 for первого
ORGANIZATION 1060 1101 for ЦБ

# Predicting tags for test data

In [None]:
import os
import re
import string

def spacy(model, out_file='answer.txt', test_dir='test_data'):
    """Run ``model`` over every ``.txt`` file in ``test_dir`` and save the predictions.

    NOTE(review): this function shares its name with the ``spacy`` package
    imported earlier in the notebook, so once this cell has run the name
    ``spacy`` refers to this function and ``spacy.load(...)`` no longer works.
    The name is kept because the notebook calls ``spacy(nlp)``; consider
    renaming the function and its call together.

    Args:
        model: Callable that takes a text and returns an object whose ``ents``
            items expose ``label_``, ``start_char`` and ``end_char``.
        out_file: Path of the text file to write (UTF-8).
        test_dir: Directory that contains the ``.txt`` documents to annotate.

    The output holds, for each document, a header line ``<name>.ann`` followed
    by one ``<label> <start_char> <end_char>`` line per predicted entity.
    """
    lines = []

    # Only .txt documents are annotated; sorting makes the output independent
    # of the arbitrary order in which os.listdir returns directory entries.
    filenames = sorted(name for name in os.listdir(test_dir) if name.endswith('.txt'))

    for filename in filenames:
        lines.append(f'{os.path.splitext(filename)[0]}.ann')

        # Decode leniently: undecodable bytes are replaced instead of aborting
        # the whole run on a single badly encoded file.
        with open(os.path.join(test_dir, filename), 'rb') as f:
            file_text = f.read().decode(errors='replace')

        doc = model(file_text)
        lines.extend(f'{ent.label_} {ent.start_char} {ent.end_char}' for ent in doc.ents)

    # Explicit encoding so the result does not depend on the platform default.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.writelines(f'{line}\n' for line in lines)

In [None]:
# Annotate the documents in test_data/ with the trained model and write the
# predictions to answer.txt (both paths are the function's defaults).
spacy(nlp)

# Saving the model parameters for future use

In [None]:
!zip -r output.zip /content/output

  adding: content/output/ (stored 0%)
  adding: content/output/model-best/ (stored 0%)
  adding: content/output/model-best/meta.json (deflated 71%)
  adding: content/output/model-best/vocab/ (stored 0%)
  adding: content/output/model-best/vocab/strings.json (deflated 90%)
  adding: content/output/model-best/vocab/key2row (stored 0%)
  adding: content/output/model-best/vocab/lookups.bin (stored 0%)
  adding: content/output/model-best/vocab/vectors (deflated 45%)
  adding: content/output/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/output/model-best/ner/ (stored 0%)
  adding: content/output/model-best/ner/moves (deflated 80%)
  adding: content/output/model-best/ner/model (deflated 8%)
  adding: content/output/model-best/ner/cfg (deflated 33%)
  adding: content/output/model-best/tokenizer (deflated 84%)
  adding: content/output/model-best/config.cfg (deflated 60%)
  adding: content/output/model-best/tok2vec/ (stored 0%)
  adding: content/output/model-best/tok2vec/model (defl