In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import evaluate
import os
import json
import random

warnings.filterwarnings("ignore")


data = pd.read_json("data/train.json")
data = data.drop_duplicates(subset="text")

2024-05-25 17:48:56.504385: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def createATE_dataset(sample, prompts_file_path):
    """function to create ATE dataset for FLAN"""
    with open(prompts_file_path, encoding = 'UTF-8') as fp:
        template = json.load(fp)

    num = random.randint(1, len(template))
    instruction = template["ATE"][str(num)]

    sample["aspect_list"] = ",".join([item["word"] for item in sample["entities"]])
    sample["ner_list"] = ",".join(
        [item["entity_group"] for item in sample["entities"]]
    )
    sample["aspect_ner_list"] = ",".join(
        [f"{item['word']}:{item['entity_group']}" for item in sample["entities"]]
    )
    sample["aspect_ner_output"] = f"Ответ: \n{sample['aspect_list']}</s>"
    sample["aspect_ner_input"] = (
        f"<LM>Задача: Извлечение именованных сущностей \n{instruction}\n{sample['text']}\n"
    )
    return sample

In [3]:
def createASC_dataset(sample, prompts_file_path):
    """function to create ASC dataset for FLAN"""
    with open(prompts_file_path, encoding="UTF-8") as fp:
        template = json.load(fp)

    num = random.randint(1, len(template))
    instruction = template["ASC"][str(num)]

    sample["aspect_list"] = ",".join([item["word"] for item in sample["entities"]])
    sample["ner_list"] = ",".join(
        [item["entity_group"] for item in sample["entities"]]
    )
    sample["aspect_ner_list"] = ",".join(
        [f"{item['word']}:{item['entity_group']}" for item in sample["entities"]]
    )
    sample["aspect_ner_output"] = f"Ответ: \n{sample['ner_list']}</s>"
    sample["aspect_ner_input"] = (
        f"<LM>Задача: Классификация именованных сущностей \n{instruction}\nТекст: \n{sample['text']}\nВыделенные именованные сущности: {sample['ner_list']}\n"
    )
    return sample

In [4]:
def createABSA_dataset(sample, prompts_file_path):
    """function to create ABSA dataset for FLAN"""
    with open(prompts_file_path, encoding="UTF-8") as fp:
        template = json.load(fp)

    num = random.randint(1, len(template))
    instruction = template["ABSA"][str(num)]

    sample["aspect_list"] = ",".join([item["word"] for item in sample["entities"]])
    sample["ner_list"] = ",".join(
        [item["entity_group"] for item in sample["entities"]]
    )
    sample["aspect_ner_list"] = ",".join(
        [f"{item['word']}:{item['entity_group']}" for item in sample["entities"]]
    )

    sample["aspect_ner_output"] = f"Ответ: \n{sample['aspect_ner_list']}</s>"
    sample["aspect_ner_input"] = (
        f"<LM>Задача: Извлечение и классификация именованных сущностей \n{instruction}\n{sample['text']}\n"
    )
    return sample

In [5]:
from datasets import concatenate_datasets
dataset = Dataset.from_pandas(data)
dataset = dataset.shuffle()
train_test_split = dataset.train_test_split(test_size=0.1)

ATE_dataset_1 = train_test_split["train"].map(
    createATE_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ATE_dataset_2 = train_test_split["train"].map(
    createATE_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ASC_dataset_1 = train_test_split["train"].map(
    createASC_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ASC_dataset_2 = train_test_split["train"].map(
    createASC_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ABSA_dataset_1 = train_test_split["train"].map(
    createABSA_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ABSA_dataset_2 = train_test_split["train"].map(
    createABSA_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
ABSA_test = train_test_split["test"].map(
    createABSA_dataset, fn_kwargs={"prompts_file_path": "prompts.json"}
)
combined_datasets = concatenate_datasets(
    [ATE_dataset_1, ASC_dataset_1, ABSA_dataset_1,ATE_dataset_2, ASC_dataset_2, ABSA_dataset_2 ],
)
combined_datasets = combined_datasets.shuffle()
dataset_train_test = combined_datasets.train_test_split(test_size=0.1)

final_ds = DatasetDict(
    {
        "train": dataset_train_test["train"],
        "test": dataset_train_test["test"],
        "val": ABSA_test,
    }
)

Map: 100%|██████████| 296/296 [00:00<00:00, 3641.67 examples/s]
Map: 100%|██████████| 296/296 [00:00<00:00, 3928.63 examples/s]
Map: 100%|██████████| 296/296 [00:00<00:00, 3980.86 examples/s]
Map: 100%|██████████| 296/296 [00:00<00:00, 3940.50 examples/s]
Map: 100%|██████████| 296/296 [00:00<00:00, 4018.62 examples/s]
Map: 100%|██████████| 296/296 [00:00<00:00, 4015.04 examples/s]
Map: 100%|██████████| 33/33 [00:00<00:00, 2845.76 examples/s]


In [6]:
from datasets import concatenate_datasets

tokenizer = AutoTokenizer.from_pretrained("ai-forever/FRED-T5-large", eos_token='</s>')
tokenized_inputs = concatenate_datasets(
    [
        final_ds["train"],
        final_ds["test"],
        final_ds["val"],
    ],
).map(
    lambda x: tokenizer(x["aspect_ner_input"], truncation=True),
    batched=True,
    remove_columns=[
        "text",
        "entities",
        "aspect_list",
        "ner_list",
        "aspect_ner_list",
        "aspect_ner_output",
        "aspect_ner_input",
        "__index_level_0__",
    ],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])

tokenized_targets = concatenate_datasets(
    [
        final_ds["train"],
        final_ds["test"],
        final_ds["val"],
    ],
).map(
    lambda x: tokenizer(x["aspect_ner_output"], truncation=True),
    batched=True,
    remove_columns=[
        "text",
        "entities",
        "aspect_list",
        "ner_list",
        "aspect_ner_list",
        "aspect_ner_output",
        "aspect_ner_input",
        "__index_level_0__",
    ],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map:   0%|          | 0/1809 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 1809/1809 [00:00<00:00, 4434.96 examples/s]
Map: 100%|██████████| 1809/1809 [00:00<00:00, 39646.44 examples/s]


In [7]:
def tokenize_function(
    sample, tokenizer, max_source_length, max_target_length, padding="max_length"
):
    """function to tokenize data for FLAN"""
    model_inputs = tokenizer(
        sample["aspect_ner_input"],
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )
    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(
        text_target=sample["aspect_ner_output"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )
    # If we are padding here, replace all tokenizer.pad_token_id in the labels
    # by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_dataset = final_ds.map(
    tokenize_function,
    batched=True,
    fn_kwargs={
        "max_source_length": max_source_length,
        "max_target_length": max_target_length,
        "tokenizer": tokenizer,
    },
    remove_columns=[
        "text",
        "entities",
        "aspect_list",
        "ner_list",
        "aspect_ner_list",
        "aspect_ner_output",
        "aspect_ner_input",
        "__index_level_0__",
    ],
)

Map: 100%|██████████| 1598/1598 [00:00<00:00, 2626.48 examples/s]
Map: 100%|██████████| 178/178 [00:00<00:00, 2567.42 examples/s]
Map: 100%|██████████| 33/33 [00:00<00:00, 1873.34 examples/s]


In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/FRED-T5-large")
    # we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

In [10]:
repository_id = f"fred-test"

    # Define training args
training_args = Seq2SeqTrainingArguments(
        output_dir=repository_id,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        predict_with_generate=True,
        fp16=False,  # Overflows with fp16
        learning_rate=1e-4,
        num_train_epochs=10,
        warmup_ratio=0.1,
        weight_decay=0.01,
        optim="adamw_torch",
        # logging & evaluation strategies
        evaluation_strategy="epoch",
        save_strategy="no",
        save_total_limit=1,
        push_to_hub=False,

    )
    # Create Trainer instance
trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
    )
trainer.train()


 10%|█         | 200/2000 [03:19<27:45,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.51it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.67it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.51it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s]

{'eval_loss': 0.19681577384471893, 'eval_runtime': 6.9257, 'eval_samples_per_second': 25.701, 'eval_steps_per_second': 3.321, 'epoch': 1.0}


 20%|██        | 400/2000 [06:43<24:40,  1.08it/s]  
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.50it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.67it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.51it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s

{'eval_loss': 0.08956189453601837, 'eval_runtime': 6.9244, 'eval_samples_per_second': 25.706, 'eval_steps_per_second': 3.322, 'epoch': 2.0}


 25%|██▌       | 500/2000 [08:29<24:45,  1.01it/s]  

{'loss': 0.3963, 'grad_norm': 0.9926210045814514, 'learning_rate': 8.333333333333334e-05, 'epoch': 2.5}


 30%|███       | 600/2000 [10:08<21:36,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.51it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.67it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.51it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.23it/s]

{'eval_loss': 0.04632798582315445, 'eval_runtime': 6.9234, 'eval_samples_per_second': 25.71, 'eval_steps_per_second': 3.322, 'epoch': 3.0}


 40%|████      | 800/2000 [13:33<18:29,  1.08it/s]  
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.50it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.68it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.52it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s

{'eval_loss': 0.03354667127132416, 'eval_runtime': 6.9235, 'eval_samples_per_second': 25.71, 'eval_steps_per_second': 3.322, 'epoch': 4.0}


 50%|█████     | 1000/2000 [16:58<15:25,  1.08it/s] 

{'loss': 0.0326, 'grad_norm': 0.7232740521430969, 'learning_rate': 5.555555555555556e-05, 'epoch': 5.0}



  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.51it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.68it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.51it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s][A
 87%|████████▋ | 20/23 [00:05<00:00,  3.24it/s]

{'eval_loss': 0.028165750205516815, 'eval_runtime': 6.8963, 'eval_samples_per_second': 25.811, 'eval_steps_per_second': 3.335, 'epoch': 5.0}


 60%|██████    | 1200/2000 [20:22<12:19,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.51it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.68it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.52it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s]

{'eval_loss': 0.020645666867494583, 'eval_runtime': 6.9222, 'eval_samples_per_second': 25.714, 'eval_steps_per_second': 3.323, 'epoch': 6.0}


 70%|███████   | 1400/2000 [23:47<09:15,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.51it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.68it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.52it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.28it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.26it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.25it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s]

{'eval_loss': 0.015695836395025253, 'eval_runtime': 6.9213, 'eval_samples_per_second': 25.718, 'eval_steps_per_second': 3.323, 'epoch': 7.0}


 75%|███████▌  | 1500/2000 [25:33<08:17,  1.00it/s]

{'loss': 0.0129, 'grad_norm': 0.17571555078029633, 'learning_rate': 2.777777777777778e-05, 'epoch': 7.5}


 80%|████████  | 1600/2000 [27:12<06:10,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.50it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.67it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.52it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s]

{'eval_loss': 0.014588749967515469, 'eval_runtime': 6.9243, 'eval_samples_per_second': 25.706, 'eval_steps_per_second': 3.322, 'epoch': 8.0}


 90%|█████████ | 1800/2000 [30:36<03:05,  1.08it/s]
  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.55it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.59it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.97it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.68it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.52it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.28it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s]

{'eval_loss': 0.011799040250480175, 'eval_runtime': 6.9227, 'eval_samples_per_second': 25.713, 'eval_steps_per_second': 3.322, 'epoch': 9.0}


100%|██████████| 2000/2000 [34:01<00:00,  1.08it/s]

{'loss': 0.0069, 'grad_norm': 0.8691480755805969, 'learning_rate': 0.0, 'epoch': 10.0}



  0%|          | 0/23 [00:00<?, ?it/s][A
  9%|▊         | 2/23 [00:00<00:03,  6.52it/s][A
 13%|█▎        | 3/23 [00:00<00:04,  4.58it/s][A
 17%|█▋        | 4/23 [00:00<00:04,  3.96it/s][A
 22%|██▏       | 5/23 [00:01<00:04,  3.67it/s][A
 26%|██▌       | 6/23 [00:01<00:04,  3.51it/s][A
 30%|███       | 7/23 [00:01<00:04,  3.42it/s][A
 35%|███▍      | 8/23 [00:02<00:04,  3.36it/s][A
 39%|███▉      | 9/23 [00:02<00:04,  3.32it/s][A
 43%|████▎     | 10/23 [00:02<00:03,  3.29it/s][A
 48%|████▊     | 11/23 [00:03<00:03,  3.27it/s][A
 52%|█████▏    | 12/23 [00:03<00:03,  3.26it/s][A
 57%|█████▋    | 13/23 [00:03<00:03,  3.25it/s][A
 61%|██████    | 14/23 [00:04<00:02,  3.25it/s][A
 65%|██████▌   | 15/23 [00:04<00:02,  3.24it/s][A
 70%|██████▉   | 16/23 [00:04<00:02,  3.24it/s][A
 74%|███████▍  | 17/23 [00:04<00:01,  3.24it/s][A
 78%|███████▊  | 18/23 [00:05<00:01,  3.24it/s][A
 83%|████████▎ | 19/23 [00:05<00:01,  3.24it/s][A
 87%|████████▋ | 20/23 [00:05<00:00,  3.24it/s]

{'eval_loss': 0.010190648958086967, 'eval_runtime': 6.8984, 'eval_samples_per_second': 25.803, 'eval_steps_per_second': 3.334, 'epoch': 10.0}
{'train_runtime': 2048.8741, 'train_samples_per_second': 7.799, 'train_steps_per_second': 0.976, 'train_loss': 0.11216087079048156, 'epoch': 10.0}





TrainOutput(global_step=2000, training_loss=0.11216087079048156, metrics={'train_runtime': 2048.8741, 'train_samples_per_second': 7.799, 'train_steps_per_second': 0.976, 'total_flos': 2.713057896824832e+16, 'train_loss': 0.11216087079048156, 'epoch': 10.0})

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_response(
    model, tokenizer, question, top_p, temperature, prompts_path, device
):
    """function to generate response from FLAN models"""
    with open(prompts_path, encoding = 'UTF-8') as fp:
        template = json.load(fp)
    num = random.randint(1, len(template))
    instruction =template["ABSA"][str(num)]
    input = f"<LM>Задача: Извлечение и классификация именованных сущностей \n{instruction}\n{question}\n"
    input_ids = tokenizer.encode(input, return_tensors="pt")
    sample_output = model.generate(
        input_ids=input_ids.to(device),
        do_sample=True,
        max_length=100,
        top_p=top_p,
        temperature=temperature,
        top_k=70,
        early_stopping=True,
        #no_repeat_ngram_size=2, 
        eos_token_id=tokenizer.eos_token_id

    )
    out = tokenizer.decode(sample_output[0][1:], skip_special_tokens=True)
    if "</s>" in out:
        out = out[: out.find("</s>")].strip()
    return out

In [31]:
def evaluate(
    trial_dataset,
    model,
    tokenizer,
    device,
    prompts_path,
    top_p=0.5,
    temperature=0.5,
):
    """function to evaluate FLAN generation results"""

    TP_aspect = 0
    FN_aspect = 0
    FP_aspect = 0
    TP_sent = 0
    FN_sent = 0
    FP_sent = 0
    answers = pd.DataFrame()

    for i in trial_dataset:
        answer = generate_response(
            model=model,
            tokenizer=tokenizer,
            question=i["text"],
            top_p=0.6,
            temperature=0.7,
            prompts_path=prompts_path,
            device=device,
        )
        answer = answer.split("Ответ:")[1].strip().replace(": ", ":")
        new_row = {"y_pred": answer, "y_true": i["aspect_ner_list"]}
        answers = pd.concat([answers, pd.DataFrame([new_row])])

        y_pred = answer.split(",")
        y_true = i["aspect_ner_list"].split(",")

        aspects_true_lst = [item.split(":")[0] for item in y_true]
        aspects_pred_lst = [item.split(":")[0] for item in y_pred]

        for aspect in aspects_true_lst:
            if aspect in aspects_pred_lst:
                TP_aspect += 1
            else:
                FN_aspect += 1
        for aspect in aspects_pred_lst:
            if aspect not in aspects_true_lst:
                FP_aspect += 1
                FP_sent += 1

        for item in y_true:
            if item in y_pred:
                TP_sent += 1
            else:
                FN_sent += 1

    F1_aspect = 2 * TP_aspect / (2 * TP_aspect + FN_aspect + FP_aspect)
    F1_sent = 2 * TP_sent / (2 * TP_sent + FN_sent + FP_sent)
    F1_macro = (F1_aspect + F1_sent) / 2
    F1_micro = (
        2
        * (TP_aspect + TP_sent)
        / ((2 * (TP_aspect + TP_sent)) + (FN_aspect + FN_sent + FP_aspect + FP_sent))
    )

    print(F1_sent)
    display(answers)
    return answers

In [32]:
answers = evaluate(
        trial_dataset=final_ds["val"],
        model=model,
        tokenizer=tokenizer,
        device=device,
        prompts_path="prompts.json"
    )

0.8614232209737828


Unnamed: 0,y_pred,y_true
0,"Табакошоп:ORG,Алкошоп:ORG,9:00:00:DATE,22:00:D...",potap64@npo.biz:MAIL
0,"KPR:TECH,+7 010 618 1753:TELEPHONE,HMN:ACRONYM...","KPR:ACRONYM,+7 010 618 1753:TELEPHONE,HMN:ACRO..."
0,"Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG,Кока...","Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG,Кока..."
0,"OguI1537:NUM,СК Согласие:ORG,Восточная горнору...","OguI1537:NUM,16.03.2006:DATE,СК Согласие:ORG,В..."
0,"SFB:TECH,ChekhovChain:TECH,SFB:TECH,ChekhovCha...","SFB:TECH,ChekhovChain:TECH,SFB:TECH,ChekhovCha..."
0,"028.36%:PERCENT,10.06.2000:DATE,28.12.2010:DAT...","28.36%:PERCENT,10.06.2000:DATE,10.06.2000:DATE..."
0,"Railgo:ORG,Трест КХМ:ORG,Концерн Титан-2:ORG,R...","Railgo:ORG,Трест КХМ:ORG,Концерн Титан-2:ORG,R..."
0,"WRL:ACRONYM,CYU:ACRONYM,OZP:TECH,RMV:ACRONYM","WRL:ACRONYM,CYU:ACRONYM,OZP:TECH,RMV:ACRONYM"
0,"MDW:ACRONYM,YGL:TECH,YTM:ACRONYM,YGL:TECH","MDW:ACRONYM,YGL:TECH,YTM:ACRONYM,YGL:TECH"
0,"ZMV:TECH,+7 010 618 1753:TELEPHONE,RMV:ACRONYM...","ZMV:TECH,+7 010 618 1753:TELEPHONE,RMV:ACRONYM..."


In [33]:
predictions = list(answers['y_pred'])
labels = list(answers['y_true'])

for pred, label in zip(predictions, labels):
    print(pred)
    print(label)
    print('===================')


Табакошоп:ORG,Алкошоп:ORG,9:00:00:DATE,22:00:DATE:TELEPHONE,potap64@npo.biz:MAIL
potap64@npo.biz:MAIL
KPR:TECH,+7 010 618 1753:TELEPHONE,HMN:ACRONYM,PLU:TECH,5800:NUM,6016:NUM,1612:NUM,3805:NUM,8512:NUM,9603:NUM,9332:NUM,QGN:ACRONYM
KPR:ACRONYM,+7 010 618 1753:TELEPHONE,HMN:ACRONYM,5800:NUM,6016:NUM,989:NUM,1612:NUM,3805:NUM,8512:NUM,637:NUM,9603:NUM,9332:NUM,QGN:ACRONYM
Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG,Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG
Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG,Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG,Кока-Кола Эйчбиси Евразия (Coca-Cola):ORG
OguI1537:NUM,СК Согласие:ORG,Восточная горнорудная компания:ORG
OguI1537:NUM,16.03.2006:DATE,СК Согласие:ORG,Восточная горнорудная компания:ORG
SFB:TECH,ChekhovChain:TECH,SFB:TECH,ChekhovChain:TECH
SFB:TECH,ChekhovChain:TECH,SFB:TECH,ChekhovChain:TECH
028.36%:PERCENT,10.06.2000:DATE,28.12.2010:DATE,6.24%:PERCENT
28.36%:PERCENT,10.06.2000:DATE,10.06.2000:DATE,6.24%:PERCENT
Railgo:ORG,Трест КХМ:ORG,Концерн Титан-2:

In [34]:
model.save_pretrained("./results/fred")
tokenizer.save_pretrained("./results/fred")

('./results/fred/tokenizer_config.json',
 './results/fred/special_tokens_map.json',
 './results/fred/vocab.json',
 './results/fred/merges.txt',
 './results/fred/added_tokens.json',
 './results/fred/tokenizer.json')