In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive

# Attach Google Drive to this runtime; the dataset CSV is read from under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Horoscope corpus stored on the mounted Drive (columns visible in the output:
# date, zodiac, horo, source, len, plus two leftover index columns).
csv_path = '/content/drive/MyDrive/Data/fin_horo.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0.1,Unnamed: 0,date,zodiac,horo,source,len
0,50,2023-08-07,Овен,Сегодня Овнам стоит провести утро не просто сп...,https://horoscopes.rambler.ru,380
1,51,2023-08-07,Телец,Сегодня Тельцам стоит лениться до обеда – пуст...,https://horoscopes.rambler.ru,369
2,52,2023-08-07,Близнецы,Сегодня Близнецы думают только о деньгах – их ...,https://horoscopes.rambler.ru,378
3,53,2023-08-07,Рак,Сегодня Ракам стоит провести утро лениво – для...,https://horoscopes.rambler.ru,369
4,54,2023-08-07,Лев,Сегодня у Львов есть возможность исполнить нес...,https://horoscopes.rambler.ru,357
...,...,...,...,...,...,...
87937,256050,2023-08-29,Рак,"Дорогие Раки, а не пора ли вам заняться своими...",https://astroscope.ru/horoskop/ejednevniy_goro...,403
87938,256052,2023-08-29,Дева,"Если вас, дорогие Девы, так и подмывает ввязат...",https://astroscope.ru/horoskop/ejednevniy_goro...,490
87939,256056,2023-08-29,Козерог,Неуёмные амбиции Козерогов требуют повышения с...,https://astroscope.ru/horoskop/ejednevniy_goro...,455
87940,256057,2023-08-29,Водолей,"Успешный день для тех Водолеев, кто занимается...",https://astroscope.ru/horoskop/ejednevniy_goro...,389


In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers



In [5]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sentencepiece



In [6]:
from sklearn.model_selection import train_test_split

# Keep only the (sign, text) pairs and hold out 30% of them for validation;
# the fixed random_state makes the split repeatable.
sign_text_pairs = df[['zodiac', 'horo']]
train_df, val_df = train_test_split(sign_text_pairs, test_size=0.3, random_state=42)

# Drop the shuffled original row labels so both frames are indexed 0..n-1.
train_df, val_df = (part.reset_index(drop=True) for part in (train_df, val_df))
train_df.shape, val_df.shape

((61559, 2), (26383, 2))

In [7]:
import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [8]:
# Zodiac sign is the model input, horoscope text is the generation target.
seq2seq_columns = {'zodiac': 'source_text', 'horo': 'target_text'}
train_df = train_df.rename(columns=seq2seq_columns)
val_df = val_df.rename(columns=seq2seq_columns)

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
checkpoint_name = "ai-forever/ruT5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Dataset preparation and fine-tuning

In [10]:
from torch.utils.data import Dataset
class AdditionDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on access.

    Args:
        sentences: indexable collection of source texts; each item is converted
            with ``str`` and has its whitespace runs collapsed to single spaces.
        targets: indexable collection of target texts, aligned with ``sentences``.
        tokenizer: callable returning a mapping with an ``'input_ids'`` entry;
            when ``max_len`` is set it is called with ``max_length`` and
            ``truncation`` keyword arguments.
        max_len: maximum number of token ids kept per sequence, or ``None`` to
            disable truncation.

    Each item is a dict with ``"input_ids"`` (encoded source) and ``"labels"``
    (encoded target). No padding is applied here.
    """

    def __init__(self, sentences, targets, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = sentences
        self.targets = targets

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.sentences)

    def _encode(self, text):
        """Tokenize ``text`` and return its ids, truncated to ``max_len`` when set."""
        if self.max_len is None:
            return self.tokenizer(text)['input_ids']
        return self.tokenizer(text, max_length=self.max_len, truncation=True)['input_ids']

    def __getitem__(self, index):
        """Return the encoded pair at ``index`` as ``{"input_ids", "labels"}``."""
        # Normalize whitespace in the source text before tokenizing.
        sentence = " ".join(str(self.sentences[index]).split())
        target = self.targets[index]

        # Each text is tokenized exactly once.
        return {"input_ids": self._encode(sentence), "labels": self._encode(target)}



In [11]:
def _to_dataset(frame):
    """Wrap a frame's source/target text columns in an AdditionDataset."""
    sources = frame['source_text'].tolist()
    targets = frame['target_text'].tolist()
    return AdditionDataset(sources, targets, tokenizer, 500)


train_dataset = _to_dataset(train_df)
test_dataset = _to_dataset(val_df)

# Show the first encoded validation example as a sanity check.
next(iter(test_dataset))

{'input_ids': [165, 1479, 2],
 'labels': [3452,
  2650,
  122,
  4275,
  81,
  110,
  178,
  12083,
  13,
  14460,
  6524,
  4,
  104,
  2848,
  364,
  147,
  3,
  16,
  178,
  293,
  3578,
  11275,
  22,
  3966,
  75,
  1872,
  3,
  1219,
  964,
  17,
  1343,
  194,
  291,
  698,
  5,
  10,
  22728,
  232,
  17,
  16468,
  158,
  4,
  929,
  3,
  31,
  220,
  5358,
  1327,
  9261,
  19,
  5,
  10650,
  3,
  47,
  178,
  336,
  622,
  3578,
  21351,
  24,
  26315,
  7533,
  5,
  10,
  7749,
  276,
  1533,
  9,
  51,
  7003,
  3,
  2529,
  1147,
  31,
  1321,
  9,
  901,
  4,
  4753,
  12,
  1200,
  3051,
  6,
  155,
  5,
  10,
  9515,
  940,
  426,
  4,
  2]}

In [12]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): this upgrade runs after transformers has already been imported in an
# earlier cell; a runtime restart may be needed for it to take effect — confirm.
%pip install accelerate -U



In [13]:
from transformers import Seq2SeqTrainingArguments

# Training hyperparameters, collected in one place before building the arguments object.
training_config = {
    "evaluation_strategy": 'epoch',      # run evaluation once per epoch
    "learning_rate": 2e-4,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.01,
    "num_train_epochs": 3,
    "predict_with_generate": True,
    "push_to_hub": False,
    "output_dir": "outputs",
}

args = Seq2SeqTrainingArguments(**training_config)

In [14]:
from transformers import DataCollatorForSeq2Seq
# Collator that batches the variable-length encoded examples produced by the datasets.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
from transformers import Seq2SeqTrainer

# Wire the model, arguments, datasets and collator together, then run training.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,3.1242,2.861024
2,2.9132,2.754408
3,2.7945,2.710234


TrainOutput(global_step=11544, training_loss=3.0148070937465317, metrics={'train_runtime': 7571.7741, 'train_samples_per_second': 24.39, 'train_steps_per_second': 1.525, 'total_flos': 868125373424640.0, 'train_loss': 3.0148070937465317, 'epoch': 3.0})

In [16]:
# Write the tokenizer, then the model, into the same local directory.
pt_save_directory = "t5"
for artifact in (tokenizer, model):
    artifact.save_pretrained(pt_save_directory)