In [None]:
# Download datasets: the two training CSVs ("our" and "public") and the validation CSV.
# `-O` sets the local file name, `-q` silences wget's output.
!wget https://github.com/sevskii111/one-hot-gen/blob/main/datasets/dataset_train_our.csv?raw=true -O dataset_train_our.csv -q
!wget https://github.com/sevskii111/one-hot-gen/blob/main/datasets/dataset_train_public.csv?raw=true -O dataset_train_public.csv -q
!wget https://github.com/sevskii111/one-hot-gen/blob/main/datasets/dataset_valid.csv?raw=true -O dataset_valid.csv -q

In [None]:
# Install or upgrade (-U) the third-party packages used by this notebook, quietly (-q).
!pip install -Uq transformers rich[jupyter] sentencepiece gdown

[K     |████████████████████████████████| 2.6 MB 7.6 MB/s 
[K     |████████████████████████████████| 209 kB 54.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 61.9 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 895 kB 67.6 MB/s 
[K     |████████████████████████████████| 636 kB 66.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 28.4 MB/s 
[K     |████████████████████████████████| 51 kB 8.2 MB/s 
[?25h  Building wheel for gdown (PEP 517) ... [?25l[?25hdone


In [None]:
# Tokenizer max_length for the source text (longer inputs are truncated, shorter ones padded).
MAX_SOURCE_TEXT_LENGTH = 512
# Tokenizer max_length for the target text; also the max_length given to model.generate().
MAX_TARGET_TEXT_LENGTH = 17
# Per-story cap on news items, handed to limit_news_per_story() when the training CSVs are read.
NEWS_PER_STORY_PUBLIC = 5
# Presumably the per-story cap meant for the non-public ("our") data — TODO confirm.
NEWS_PER_STORY_OTHER = 1
# Batch size used for both the training and the validation DataLoader.
BATCH_SIZE = 8
# Number of passes over the training set.
TRAIN_EPOCHS = 3

# Directory that receives the saved model files, predictions.csv and logs.txt.
OUTPUT_DIR = 'output_dir'

In [None]:
import os
import requests
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Loading the datasets
def limit_news_per_story(df, news_per_story):
  """Keep at most ``news_per_story`` rows for every distinct story label in ``df["y"]``.

  Rows keep their original relative order and the first rows of each story
  are the ones retained. Rows whose label is missing (NaN/None) are never
  dropped. The input frame is not modified.

  Args:
    df (pandas.DataFrame): frame with a ``"y"`` column holding the story label.
    news_per_story (int): maximum number of rows to keep per story. A negative
      value ``-k`` drops the last ``k`` rows of every story instead.

  Returns:
    pandas.DataFrame: the filtered rows with a fresh 0..n-1 index.
  """
  # Work on a uniquely indexed copy: selecting by label on the caller's index
  # would pick up unrelated rows if that index contained duplicates.
  frame = df.reset_index(drop=True)
  # head() returns the leading rows of each group while preserving row order.
  kept_labels = frame.groupby("y", sort=False).head(news_per_story).index
  # Rows without a label belong to no story, so they are always kept.
  keep = frame.index.isin(kept_labels) | frame["y"].isna().to_numpy()
  return frame[keep].reset_index(drop=True)


# Each training CSV gets its own per-story cap: the public dataset keeps up to
# NEWS_PER_STORY_PUBLIC items per story, our own dataset up to NEWS_PER_STORY_OTHER.
train_dfs_files = [
    ('./dataset_train_public.csv', NEWS_PER_STORY_PUBLIC),
    ('./dataset_train_our.csv', NEWS_PER_STORY_OTHER),
]
train_df = pd.concat(
    [limit_news_per_story(pd.read_csv(f), nps) for f, nps in train_dfs_files],
    ignore_index=True,
)

# Some story names in our dataset end with " Фото" ("Photo"); strip that suffix.
# Only the label column is touched, so an article text that happens to equal a
# story name is left alone.
PHOTO_SUFFIX = " Фото"
has_photo_suffix = train_df["y"].str.endswith(PHOTO_SUFFIX, na=False)
train_df.loc[has_photo_suffix, "y"] = train_df.loc[has_photo_suffix, "y"].str[: -len(PHOTO_SUFFIX)]

valid_df = pd.read_csv('./dataset_valid.csv')

# Mark the origin of every row, then stack both parts into one frame;
# ignore_index keeps the row labels of the combined frame unique.
train_df['is_valid'] = False
valid_df['is_valid'] = True
train_df = pd.concat([train_df, valid_df], ignore_index=True)

In [None]:
# The code is largely borrowed from https://shivanandroy.com/fine-tune-t5-transformer-with-pytorch/ and adapted to our task

# rich: for a better display on terminal
from rich.table import Column, Table
from rich import box
from rich.console import Console

# Module-level rich console used by the training and validation code below.
# record=True is set so the output can later be exported with console.save_text().
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """Print the first two columns of ``df`` as an ASCII table.

    The first column is shown as "source_text" and the second as
    "target_text"; any further columns are ignored. The table is printed
    through a fresh, non-recording console.

    Args:
        df (pandas.DataFrame): frame with at least two columns.
    """

    def _cell_text(value):
        # rich only accepts renderables: strings pass through unchanged, missing
        # values (None / NaN) become an empty cell, everything else is stringified.
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    table_console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for row in df.values.tolist():
        table.add_row(_cell_text(row[0]), _cell_text(row[1]))

    table_console.print(table)

# Table that collects one row (epoch, step, loss) per logged training step
training_logger = Table(
    *(Column(header, justify="center") for header in ("Epoch", "Steps", "Loss")),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# Use the GPU when torch reports one, otherwise fall back to the CPU
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class NewsDataset(Dataset):
    """
    Torch dataset that turns the rows of a dataframe into tokenized
    (source, target) pairs, ready to be batched by a DataLoader for
    fine-tuning the model.

    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, truncated or padded to exactly ``max_length`` tokens."""
        # padding="max_length" already pads to the full length; the deprecated
        # pad_to_max_length flag is therefore not passed.
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """
        Return the tokenized sample at position ``index``.

        Returns:
            dict: ``source_ids`` and ``source_mask`` (input ids and attention
            mask of the source text) plus ``target_ids`` and ``target_ids_y``
            (both hold the input ids of the target text), all 1-D long tensors.
        """

        # Positional access: works whatever index labels the dataframe carries.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse every run of whitespace (including newlines) into one space
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # Drop only the leading batch dimension of size 1; a bare squeeze()
        # would also collapse a sequence of length 1 into a 0-d tensor.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Run one training epoch over ``loader``.

    Args:
        epoch (int): epoch number, used only for the progress table
        tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets
        model: seq2seq model that returns the loss as its first output when
            called with ``labels``
        device: device the batches are moved to
        loader: iterable of batches with "source_ids", "source_mask" and
            "target_ids" tensors
        optimizer: optimizer stepped once per batch

    """

    model.train()
    for step, data in enumerate(loader, 0):
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        # Pass the complete target sequence as labels and let the model derive
        # the decoder inputs from them (T5 shifts the labels right and prepends
        # its decoder start token). Slicing the target by hand would leave the
        # first target token without a loss term and would train the decoder
        # without the start token that generate() later begins with.
        labels = data["target_ids"].to(device, dtype=torch.long).clone()
        # -100 is the index ignored by the loss, so padding is not penalised
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=labels,
        )
        loss = outputs[0]

        if step % 100 == 0:
            # log the scalar value rather than the tensor repr
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validate(epoch, tokenizer, model, device, loader):

    """
    Generate a prediction for every sample of ``loader`` and decode it
    together with its reference target.

    Returns a pair of lists (generated texts, reference texts) in loader
    order. ``epoch`` is accepted but not used.

    """

    def _to_text(batch_of_ids):
        # decode every id sequence of the batch back into plain text
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in batch_of_ids
        ]

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for step, batch in enumerate(loader, 0):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            # beam search with two beams, capped at the configured target length
            generation_options = dict(
                max_length=MAX_TARGET_TEXT_LENGTH,
                num_beams=2,
                repetition_penalty=1.0,
                length_penalty=1.0,
                early_stopping=True,
            )
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_options,
            )

            batch_predictions = _to_text(generated_ids)
            batch_references = _to_text(reference_ids)

            # progress message every tenth batch
            if step % 10 == 0:
                console.print(f'Completed {step}')

            predictions.extend(batch_predictions)
            actuals.extend(batch_references)
    return predictions, actuals

def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):

    """
    Fine-tune a T5 checkpoint on ``dataframe`` and write the model, the
    validation predictions and the console log below ``output_dir``.

    Args:
        dataframe (pandas.DataFrame): must contain the ``source_text`` and
            ``target_text`` columns plus a boolean ``is_valid`` column; rows
            with ``is_valid == False`` are trained on, rows with
            ``is_valid == True`` are used for validation.
        source_text (str): name of the column holding the model input.
        target_text (str): name of the column holding the expected output.
        model_params (dict): reads the keys "SEED", "MODEL",
            "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH",
            "TRAIN_BATCH_SIZE", "VALID_BATCH_SIZE", "LEARNING_RATE",
            "TRAIN_EPOCHS" and "VAL_EPOCHS".
        output_dir (str): directory for ``model_files/``, ``predictions.csv``
            and ``logs.txt``.

    Returns:
        None. The trained model is only available through the saved files.

    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained checkpoint named by model_params["MODEL"]
    # and move it to the selected device (module-level `device`).
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Keep only the columns that are needed and show a two-row sample
    dataframe = dataframe[[source_text, target_text, 'is_valid']]
    display_df(dataframe.head(2))

    # Creation of Dataset and Dataloader
    # The split is given by the is_valid flag: False -> training rows, True -> validation rows.
    train_dataset = dataframe[dataframe["is_valid"] == False].drop(columns="is_valid").reset_index(drop=True)
    val_dataset = dataframe[dataframe["is_valid"] == True].drop(columns="is_valid").reset_index(drop=True)


    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"TEST Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = NewsDataset(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = NewsDataset(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    # (training batches are shuffled, validation batches keep the dataframe order)
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for training and validation.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model and the tokenizer side by side after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # Generating predictions for the validation rows; the pass is repeated
    # VAL_EPOCHS times and all results are concatenated into one CSV
    console.log(f"[Initiating Validation]...\n")
    valid_dfs = []
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        valid_dfs.append(pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals}))
    pd.concat(valid_dfs).to_csv(os.path.join(output_dir, "predictions.csv"))

    # The log is exported here, so the messages printed below are not part of logs.txt
    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [None]:
# Model choice and hyper-parameters read by T5Trainer
model_params = {
    "MODEL": "cointegrated/rut5-base-multitask",
    "TRAIN_BATCH_SIZE": BATCH_SIZE,  # training batch size
    "VALID_BATCH_SIZE": BATCH_SIZE,  # validation batch size
    "TRAIN_EPOCHS": TRAIN_EPOCHS,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": MAX_SOURCE_TEXT_LENGTH,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": MAX_TARGET_TEXT_LENGTH,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}

# Prepend "headline | " to every article text
# (presumably the task prefix this multitask checkpoint expects — confirm against its model card)
train_df["X_headline"] = "headline | " + train_df["X"]

# Fine-tune on the prefixed text, with the story name `y` as the generation target
T5Trainer(
    dataframe=train_df,
    source_text="X_headline",
    target_text="y",
    model_params=model_params,
    output_dir=OUTPUT_DIR,
)

Downloading:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/726 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/977M [00:00<?, ?B/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [None]:
# Pack the saved model files into model.tar.gz; the archive is then uploaded to GDrive manually.
# NOTE(review): $OUTPUT_DIR is presumably filled in by IPython from the Python variable OUTPUT_DIR — confirm.
!tar -czvf model.tar.gz $OUTPUT_DIR/model_files

output_dir/
output_dir/model_files/
output_dir/model_files/config.json
output_dir/model_files/spiece.model
output_dir/model_files/pytorch_model.bin
output_dir/model_files/tokenizer_config.json
output_dir/model_files/special_tokens_map.json
output_dir/logs.txt
output_dir/predictions.csv


In [None]:
def get_stopwords(encoding='utf-8', to_lower=True):
    """Download the stopwords-iso Russian stop-word list and return it as one string.

    Args:
        encoding (str): encoding used to decode the response body; a falsy
            value leaves the choice to ``requests``.
        to_lower (bool): lower-case the text before returning it.

    Returns:
        str: the raw file content (callers split it into words themselves).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    if encoding:
        # Decode with the requested encoding instead of the one requests guesses.
        r.encoding = encoding
    text = r.text
    return text.lower() if to_lower else text

# Fit the TF-IDF vectorizer on the training articles only (validation rows are left out)
stop_words_ru = get_stopwords().split('\n')
train_articles = train_df[train_df['is_valid'] == False]['X']
tfidf = TfidfVectorizer(stop_words=stop_words_ru)
tfidf.fit(train_articles)
# tfidf.pickle is uploaded to GDrive manually
with open('./tfidf.pickle', 'wb') as handle:
    pickle.dump(tfidf, handle)

  'stop_words.' % sorted(inconsistent))


In [None]:
# Optional cell: shows the results on the validation set
predictions = pd.read_csv(OUTPUT_DIR + '/predictions.csv', index_col=0)
# An empty generated string is read back from CSV as NaN; restore it as text
# so the string operations further down do not fail on a float.
predictions["Generated Text"] = predictions["Generated Text"].fillna("").astype(str)
gts = predictions["Actual Text"].unique()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    feature_names = np.array(tfidf.get_feature_names())

def get_top_tf_idf_words(response, top_n=2, vocabulary=None):
    """Return the ``top_n`` highest-weighted terms of one TF-IDF row.

    Args:
        response: single-row sparse matrix exposing ``.data`` (the non-zero
            weights) and ``.indices`` (their column numbers).
        top_n (int): number of terms to return; fewer come back when the row
            has fewer non-zero entries.
        vocabulary (array-like of str, optional): term for every column
            number. Defaults to the module-level ``feature_names``.

    Returns:
        numpy.ndarray: the terms ordered from highest to lowest weight.
    """
    if vocabulary is None:
        vocabulary = feature_names
    vocabulary = np.asarray(vocabulary)
    # argsort is ascending, so walk backwards from the end to take the largest weights
    top_positions = np.argsort(response.data)[:-(top_n + 1):-1]
    return vocabulary[response.indices[top_positions]]

def _overlap_score(candidate, key_words):
    """Number of key words found in ``candidate``, divided by its word count when it has several words."""
    tokens = candidate.lower().split(' ')
    hits = len(key_words.intersection(set(tokens)))
    if len(tokens) > 1:
        return hits / len(tokens)
    return hits


# For every story keep the generated headline that best matches the story's key words
dl_results = []

for gt in gts:
    story_preds = predictions[predictions['Actual Text'] == gt]["Generated Text"]
    # all headlines generated for this story, joined into one pseudo-document
    story_vector = tfidf.transform(['. '.join(story_preds.values)])

    key_words = set(get_top_tf_idf_words(story_vector, 2))
    scored = [(_overlap_score(candidate, key_words), candidate, gt) for candidate in story_preds]
    # highest score first; ties are broken by the headline text itself
    best = sorted(scored, reverse=True)[0]
    dl_results.append(best[1:])

pd.DataFrame(dl_results, columns=["predicted", "ground_truth"])

Unnamed: 0,predicted,ground_truth
0,ЯНДЕКС.ТАКСИ И ГК,Сделка Яндекс.Такси и Везет
1,Курсы валют,Курсы валют
2,Кросс-курсы основных мировых валют,Кросс-курсы валют
3,Стратегия Х5,X5
4,"Впервые российский танк ""Армата""",Оборонная выставка IDEX-2021
5,Значения индексов Мосбиржи и РТС,Значения индексов Мосбиржи и РТС
6,Авиасообщение в Ереване,Авиасообщение РФ с Белоруссией и Арменией
7,Российские вакцины от COVID-19,Вакцина от COVID-19 КовиВак
8,Большой землетрясение в Монголии,Землетрясение в Монголии
9,Вспышка коронавируса,Заражения COVID-19
