# Prepare data


## Prepare environment

In [None]:
%%capture
# Install the libraries used later in the notebook. %%capture hides all pip output
# for this cell, including any installation errors.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
!pip install -q peft
!pip install -q accelerate
# bitsandbytes is installed with the package index passed explicitly via -i.
!pip install -q -i https://pypi.org/simple/ bitsandbytes
!pip install --upgrade -q huggingface_hub
!pip install -q wandb pybrat razdel
!pip install jsonlines

In [None]:
# Authenticate with the Hugging Face Hub and with Weights & Biases
from huggingface_hub import login

# Enter your Hugging Face token here (the value is intentionally left blank;
# do not commit a real token with the notebook).
login(token="")
# `wandb login` is run as a shell command.
!wandb login

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Data handling
import pandas as pd
import numpy as np
import jsonlines
# Helpers used to download and unpack the dataset archive
import requests, zipfile, io
import json
import string
import warnings

# Silence every Python warning for the rest of the session
# (this also hides deprecation notices from the libraries above).
warnings.filterwarnings("ignore")

## Download data

In [None]:
# Download the dataset archive and unpack it into the current directory
url = "https://codalab.lisn.upsaclay.fr/my/datasets/download/2be26d3f-9630-46d5-8a68-414034ba4bdc"
# A timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(url, timeout=60)
if r.ok:
    # The context manager closes the archive once extraction is finished.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(".")
else:
    # Best effort: report the HTTP status and continue, so that files which are
    # already present locally can still be used by the cells below.
    print("Downloading error", r.status_code, r.reason)

In [None]:
# Read each JSON-lines split into its own pandas DataFrame
train_df, dev_df, test_df = (
    pd.read_json(f"{split_name}.jsonl", lines=True)
    for split_name in ("train", "dev", "test")
)

In [None]:
# Preview the first three training rows
train_df.head(3)

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2


## Explore entities

In [None]:
# Count how often each entity label occurs in the training annotations
entities_count = {}
for _, row in train_df.iterrows():
    # Each annotation is a (start, end, label) triple; only the label matters here.
    for _start, _end, label in row["ners"]:
        entities_count[label] = entities_count.get(label, 0) + 1

# The instruction-building cells further down read `unique_entities`; define it here
# so that a fresh "Restart & Run All" does not fail with a NameError.
# NOTE(review): alphabetical order matches the key order shown in the saved prediction
# output; confirm it also matches the label list in nerel_utils.py.
unique_entities = sorted(entities_count)

print(entities_count)

{'CITY': 1261, 'PERSON': 5119, 'LOCATION': 314, 'EVENT': 3335, 'AGE': 657, 'DATE': 2689, 'ORGANIZATION': 4088, 'ORDINAL': 614, 'PROFESSION': 5039, 'COUNTRY': 2510, 'NUMBER': 1107, 'CRIME': 221, 'STATE_OR_PROVINCE': 412, 'DISTRICT': 103, 'FAMILY': 24, 'AWARD': 404, 'TIME': 182, 'FACILITY': 424, 'DISEASE': 220, 'WORK_OF_ART': 270, 'LAW': 405, 'MONEY': 179, 'RELIGION': 89, 'NATIONALITY': 437, 'IDEOLOGY': 273, 'PRODUCT': 245, 'PERCENT': 68, 'LANGUAGE': 54, 'PENALTY': 92}


In [None]:
# Ten most frequent labels, most frequent first (ties keep their first-seen order)
ranked_labels = sorted(entities_count.items(), key=lambda pair: pair[1], reverse=True)
chosen_entities = [label for label, _count in ranked_labels[:10]]
print(chosen_entities)

['PERSON', 'PROFESSION', 'ORGANIZATION', 'EVENT', 'DATE', 'COUNTRY', 'CITY', 'NUMBER', 'AGE', 'ORDINAL']


In [None]:
# Swap every newline for a single space; this is a 1:1 character replacement,
# so the annotated character offsets stay valid.
def _newline_to_space(text):
    """Return `text` with each newline character replaced by one space."""
    return text.replace("\n", " ")


train_df["sentences"] = train_df["sentences"].map(_newline_to_space)
# The test column is spelled "senences" everywhere in this notebook; presumably that is
# the key used by the test file, so keep the spelling unless the data says otherwise.
test_df["senences"] = test_df["senences"].map(_newline_to_space)

In [None]:
# Check whether an entity span starts or ends on whitespace, or contains a double
# space / newline. The offending rows are collected instead of being hard-coded,
# so the cell can be re-run safely and adapts if the data changes.
rows_with_bad_spans = set()
for row_idx, row in train_df.iterrows():
    sentence = row["sentences"]
    for st, end, lab in row["ners"]:
        span = sentence[st:end + 1]  # `end` is an inclusive offset
        if (
            sentence[st].isspace()
            or sentence[end].isspace()
            or "  " in span
            or "\n" in span
        ):
            print(row_idx, lab, span)
            rows_with_bad_spans.add(row_idx)

# Drop every training row (the whole text, not only the entity) that has such a span.
# On a second run nothing is detected any more, so this becomes a no-op.
train_df = train_df.drop(index=sorted(rows_with_bad_spans))

6 PERSON Ильи  Ноябрева


## Build dataset

The Instruct NER project works with instructions, each of which is simply a dictionary with 6 keys:
instruction - the input task,
input - the text to be analyzed,
output - a string with the result expected from the model,
source - the prompt without the answer,
raw_entities - a dictionary whose keys are labels and whose values are lists of the entities with that label,
id - the query id.
Once all the data was preprocessed, it was saved to `.jsonl` files (each one is written with `json.dump` as a single JSON array).


In [None]:
# Build one instruction record per training text
labels_text = " ".join(unique_entities)
instrs = []
for _, row in train_df.iterrows():
    task = f'Ты решаешь задачу NER. Извлеки из текста слова, относящиеся к каждой из следующих сущностей: {labels_text}.'
    sentence = row['sentences']

    # Group the annotated surface strings by label (`end` is inclusive).
    found = {ent: [] for ent in unique_entities}
    for st, end, label in row["ners"]:
        found[label].append(sentence[st:end+1])

    # Expected answer: one "LABEL: ent , ent" line per label, including empty labels.
    answer_text = "\n".join(
        f"{label}: {' , '.join(mentions)}" for label, mentions in found.items()
    )

    instrs.append({
        'instruction': task,
        'input': sentence,
        'output': answer_text,
        # The prompt without the answer; the task and the text are each followed by ".".
        'source': f'### Задание: {task}.\n### Вход: {sentence}.\n### Ответ: ',
        'raw_entities': found,
        'id': row["id"],
    })

In [None]:
# Build one instruction record per test text (no gold entities are available)
labels_text = " ".join(unique_entities)
test_instrs = []
for _, row in test_df.iterrows():
    task = f'Ты решаешь задачу NER. Извлеки из текста слова, относящиеся к каждой из следующих сущностей: {labels_text}.'
    sentence = row['senences']

    # Every label maps to an empty list because the test set carries no annotations.
    found = {ent: [] for ent in unique_entities}

    # Same layout as for training: one "LABEL: " line per label, with nothing after it.
    answer_text = "\n".join(
        f"{label}: {' , '.join(mentions)}" for label, mentions in found.items()
    )

    test_instrs.append({
        'instruction': task,
        'input': sentence,
        'output': answer_text,
        # The prompt without the answer; the task and the text are each followed by ".".
        'source': f'### Задание: {task}.\n### Вход: {sentence}.\n### Ответ: ',
        'raw_entities': found,
        'id': row["id"],
    })

In [None]:
# Number of instruction records built for the training and the test texts
len(instrs), len(test_instrs)

(518, 65)

In [None]:
# Save data to files.
# Each file holds a single JSON array (json.dump of a list), despite the .jsonl extension.
VAL_SIZE = 64  # number of leading training records held out for validation

with open("model/test_data.jsonl", "w") as f:
    json.dump(test_instrs, f)

# Train/validation split: the first VAL_SIZE records are held out, the rest are for training.
# The validation file is named dev_data.jsonl because that is the file the
# "Move data" cell copies into the instruct-ner project.
with open("model/dev_data.jsonl", "w") as f:
    json.dump(instrs[:VAL_SIZE], f)

with open("model/train_data.jsonl", "w") as f:
    json.dump(instrs[VAL_SIZE:], f)

# Instruct NER

## Prepare algorithms

Unfortunately, the Instruct NER project does not support custom datasets out of the box, so I changed the source code to include the NEREL dataset. First, I created nerel_utils.py, which contains information about the dataset, such as the unique and the most frequent entities. Then I created nerel_reader.py to load the data from a file and pass it on in the required format. I also had to change the code in train_instruct.py and inference_instruct.py so that the process no longer stops on an unknown dataset. Finally, I moved the data into the project folder to make it easier to reference.


In [None]:
# Clone the Instruct NER project into ./instruct-ner
# NOTE(review): no commit or tag is pinned, so the checked-out code may change between runs.
!git clone https://github.com/poteminr/instruct-ner.git

Cloning into 'instruct-ner'...
remote: Enumerating objects: 720, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 720 (delta 52), reused 50 (delta 34), pack-reused 647[K
Receiving objects: 100% (720/720), 297.20 KiB | 3.10 MiB/s, done.
Resolving deltas: 100% (433/433), done.


In [None]:
%%capture
# %%capture hides all output of this cell, including any "No such file" errors
# from mkdir/mv; remove it temporarily when debugging a missing file.
# Move files to overwrite files
!mkdir instruct-ner/instruction_ner/utils/nerel
# Dataset-specific modules go into the new nerel package folder.
!mv model/nerel_utils.py instruct-ner/instruction_ner/utils/nerel/nerel_utils.py
!mv model/nerel_reader.py instruct-ner/instruction_ner/utils/nerel/nerel_reader.py
# These two replace the project's own entry-point scripts.
!mv model/inference_instruct.py instruct-ner/instruction_ner/inference_instruct.py
!mv model/train_instruct.py instruct-ner/instruction_ner/train_instruct.py

# Move data
!mv model/train_data.jsonl instruct-ner/instruction_ner/utils/nerel/train_data.jsonl
!mv model/dev_data.jsonl instruct-ner/instruction_ner/utils/nerel/dev_data.jsonl
!mv model/test_data.jsonl instruct-ner/instruction_ner/utils/nerel/test_data.jsonl


In [None]:
# Change working directory.
# Every relative path used after this cell is resolved from instruct-ner/instruction_ner.
%cd instruct-ner/instruction_ner

/kaggle/working/instruct-ner/instruction_ner


## Train model

After all the algorithms and data were ready, I started the training process. Although the model was pre-trained and only 20% of the weights were tuned, the process still required a large amount of resources, both computational and time. The available processing power was not enough to use the Mistral model, so the training was done on the T5 model. Over 4 epochs, the loss dropped to 2.2 while processing about 1 sample per second.  

In [None]:
# Train t5 model with the project's training script, using the LoRA config
# and the nerel dataset.
# NOTE(review): `--max_instances -1` presumably means "use every example" — confirm in train_instruct.py.
!python train_instruct.py \
        --config_file configs/t5_lora.json \
        --model_type t5 \
        --dataset_name nerel \
        --max_instances -1

2024-04-28 15:50:19.992433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 15:50:19.992496: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 15:50:19.994125: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Vocab size:  50257
PAD:  0 <pad>
BOS:  1 <s>
EOS:  2 </s>
UNK:  3 <unk>
SEP:  1 <s>
100%|████████████████████████████████████████

# Make prediction

In [None]:
# Inference on test data (the test set has no gold annotations to compare against)
# NOTE(review): --model_name points at poteminr/t5-rudrec; confirm this is the intended
# checkpoint rather than the model trained in the previous section.
!python inference_instruct.py \
        --batch_size 4 \
        --dataset_name nerel \
        --model_type t5 \
        --config_file configs/t5_lora.json \
        --model_name poteminr/t5-rudrec \
        --max_instances -1

2024-04-28 13:12:17.996327: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 13:12:17.996393: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 13:12:17.998035: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  return self.fget.__get__(instance, owner)()
100%|███████████████████████████████████████| 65/65 [00:00<00:00, 559814.70it/s]
100%|████████████████████████████████████████

In [None]:
# Load predictions from prediction.json in the current working directory
# (presumably written by the inference script above) and preview the first three rows
pred_df = pd.read_json('prediction.json')
pred_df.head(3)

Unnamed: 0,id,extracted,target,generated_text
0,584,"{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...","{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...",\n### Выход: Владелец «Бирмингема» получил шес...
1,585,"{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...","{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...",\n### Задание: Ты решаешь задачу NER. Извлеки ...
2,586,"{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...","{'AGE': [], 'AWARD': [], 'CITY': [], 'COUNTRY'...",\n\n\n\n.\n### Вход: Фольксваген может перейти...


In [None]:
# Show the extracted-entities dictionary for the prediction with index label 6
pred_df.loc[6, "extracted"]

{'AGE': [],
 'AWARD': [],
 'CITY': [],
 'COUNTRY': [],
 'CRIME': [],
 'DATE': [],
 'DISEASE': [],
 'DISTRICT': [],
 'EVENT': [],
 'FACILITY': [],
 'FAMILY': [],
 'IDEOLOGY': [],
 'LANGUAGE': [],
 'LAW': [],
 'LOCATION': [],
 'MONEY': [],
 'NATIONALITY': [],
 'NUMBER': [],
 'ORDINAL': ['первые'],
 'ORGANIZATION': [],
 'PENALTY': [],
 'PERCENT': [],
 'PERSON': [],
 'PRODUCT': [],
 'PROFESSION': ['юристом'],
 'RELIGION': [],
 'STATE_OR_PROVINCE': [],
 'TIME': [],
 'WORK_OF_ART': []}

In [None]:
# Find the start and end offsets of every predicted entity and convert the
# predictions to the same format as the training annotations:
# one flat list of [start, end, label] triples per text, with `end` inclusive.

# Look the text up by id, so a prediction is always paired with its own text
# even if the prediction file is ordered differently from test_df.
assert test_df["id"].is_unique, "test ids must be unique to look texts up by id"
test_texts = test_df.set_index("id")["senences"]

result = {}
for _, row in pred_df.iterrows():
    doc_id = row["id"]
    text = test_texts.loc[doc_id]
    spans = []

    for label, ents in row["extracted"].items():
        # De-duplicate the entity strings, because every occurrence in the text
        # is labelled below anyway.
        for ent in sorted(set(ents)):
            if len(ent) < 2:
                # Don't add entities 1 char long
                continue

            # Label all non-overlapping substrings that match the entity text
            pos = text.find(ent)
            while pos != -1:
                spans.append([pos, pos + len(ent) - 1, label])
                pos = text.find(ent, pos + len(ent))

    result[doc_id] = spans

# Convert result to a dataframe with one row per text
res = [[doc_id, spans] for doc_id, spans in result.items()]
answer = pd.DataFrame(res, columns=["id", "ners"])
answer.head()

Unnamed: 0,id,ners
0,584,"[[137, 137, NUMBER], [534, 537, NUMBER]]"
1,585,[]
2,586,"[[248, 258, ORGANIZATION], [501, 511, ORGANIZA..."
3,587,"[[368, 376, PROFESSION], [1053, 1061, ORGANIZA..."
4,588,"[[322, 325, PERSON]]"


In [None]:
# Serialise the answers as JSON lines (one record per line) and write them to disk
output_path = "./test.jsonl"

jsonl_payload = answer.to_json(orient='records', lines=True)
with open(output_path, "w") as out_file:
    out_file.write(jsonl_payload)

In [None]:
# Zip to submit predictions: packs test.jsonl into an archive named test (zip adds the .zip suffix)
!zip test test.jsonl

  adding: test.jsonl (deflated 80%)
