In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import re
import random
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import BertTokenizerFast
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from src.utils import clean_string
from src.constants import PATH_DATA, PATH_MODEL

In [None]:
# One seed for every stochastic step (data split, loader shuffling, weight
# initialisation) so that Restart Kernel -> Run All reproduces the same numbers.
random_state = 42
random.seed(random_state)
torch.manual_seed(random_state)

In [None]:
# Raw corpus file. The path is built by plain string concatenation, so PATH_DATA
# is presumably expected to end with a path separator — TODO confirm.
data_file_path = f'{PATH_DATA}tweets.txt'
# Revision counter; it is incremented before saving and embedded in the checkpoint file name.
model_revision = 0

# Этап 1. Сбор и подготовка данных

##  1.1 Загрузка данных

In [None]:
# Read the raw corpus into a list with one entry per line of the file.
with open(data_file_path, mode="r", encoding="utf8") as tweets_file:
    data_raw = list(tweets_file)

In [None]:
# Wrap the raw lines in a single-column frame.
df_raw = pd.DataFrame(data=data_raw, columns=['tweet'])

In [None]:
df_raw

In [None]:
# df_raw.to_csv(f'{PATH_DATA}raw_dataset.csv')

In [None]:
# df_raw = pd.read_csv(f'{PATH_DATA}raw_dataset.csv', index_col=0)

## 1.2 Предобработка данных

In [None]:
# Run the project-level cleaning helper over every tweet.
df_processed = df_raw['tweet'].apply(clean_string)

In [None]:
# df_processed.to_csv(f'{PATH_DATA}dataset_processed.csv')

## 1.3 Разбиение на train / valid / test

In [None]:
# 80 % of the texts go to train; the remaining 20 % is halved into validation and test.
all_texts = df_processed.tolist()
train_texts, valtest_texts = train_test_split(
    all_texts,
    test_size=0.2,
    random_state=random_state,
)
val_texts, test_texts = train_test_split(
    valtest_texts,
    test_size=0.5,
    random_state=random_state,
)

In [None]:
len(train_texts), len(val_texts), len(test_texts)

In [None]:
# Show only a few examples; dumping the whole list floods the notebook output.
train_texts[:5]

In [None]:
# The three splits must partition the processed corpus exactly.
assert len(df_processed) == len(train_texts) + len(val_texts) + len(test_texts)

## 1.4 Создание объектов Dataset и Dataloader

In [None]:
# BERT word-piece tokenizer extended with explicit begin/end-of-sequence tokens.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
extra_special_tokens = {'bos_token': '[BOS]', 'eos_token': '[EOS]'}
tokenizer.add_special_tokens(extra_special_tokens)

# Quick look at how a sample sentence is split into word pieces.
sample_sentence = 'ai can be misinterpreted as artificial intelligence'
print(tokenizer.tokenize(sample_sentence))

In [None]:
# # Access the vocabulary dictionary
# vocab_dict = tokenizer.vocab
# new_dict = {value:key for (key,value) in vocab_dict.items()}

In [None]:
# Dataset class
class TextGenerationDataset(Dataset):
    """Next-token prediction samples built with a sliding window over tokenised texts.

    Every text is wrapped in the tokenizer's BOS/EOS tokens and encoded. For each
    position ``i >= 1`` of the encoded sequence one sample is produced:

    * context - the ``input_seq_len`` token ids that precede position ``i``
      (left-padded with the pad token id when fewer are available), followed by
      the mask token id, i.e. ``input_seq_len + 1`` ids in total;
    * target - the token id at position ``i``.

    Args:
        texts: iterable of strings to turn into samples.
        tokenizer: object exposing ``bos_token``, ``eos_token``, ``pad_token_id``,
            ``mask_token_id`` and ``encode(text, add_special_tokens, max_length, truncation)``.
        input_seq_len: number of (real or pad) tokens placed before the mask token.

    Raises:
        ValueError: if the tokenizer defines no pad or mask token id.
    """

    def __init__(self, texts, tokenizer, input_seq_len=4):
        # The padding *token* id is required here. ``pad_token_type_id`` is the
        # segment id assigned to padding and only happens to equal the pad token
        # id for some vocabularies.
        pad_id = tokenizer.pad_token_id
        mask_id = tokenizer.mask_token_id
        if pad_id is None or mask_id is None:
            raise ValueError("tokenizer must define both a pad token and a mask token")

        self.samples = []

        for line in texts:
            line = ' '.join([tokenizer.bos_token, line, tokenizer.eos_token])
            token_ids = tokenizer.encode(line, add_special_tokens=False, max_length=512, truncation=True)
            if len(token_ids) < 2:
                # Nothing to predict from a sequence with fewer than two tokens.
                continue

            for i in range(1, len(token_ids)):
                if i >= input_seq_len:
                    context = token_ids[i - input_seq_len:i]
                else:
                    # Not enough history yet: left-pad up to the window size.
                    context = [pad_id] * (input_seq_len - i) + token_ids[:i]

                # The mask token marks the slot whose content the model must predict.
                context = context + [mask_id]

                target = token_ids[i]
                self.samples.append((context, target))

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of tensors ``(context_ids, target_id)``."""
        x, y = self.samples[idx]
        return torch.tensor(x), torch.tensor(y)

In [None]:
def text_generation_inference(input_text, model, tokenizer, input_seq_len=4, verbose=True):
    """Predict the single most likely next token for ``input_text``.

    The text is cleaned with ``clean_string``, prefixed with the BOS token and
    encoded. The model input is built the same way as the training contexts:
    the last ``input_seq_len`` token ids (left-padded with the pad token id when
    the prompt is shorter) followed by the mask token id.

    Args:
        input_text: raw prompt text.
        model: callable module mapping a ``(1, input_seq_len + 1)`` tensor of
            token ids to per-token scores with the vocabulary on dim 1.
        tokenizer: tokenizer providing ``bos_token``, ``pad_token_id``,
            ``mask_token_id``, ``encode`` and ``convert_ids_to_tokens``.
        input_seq_len: size of the context window placed before the mask token.
        verbose: when True (default) print the intermediate values for debugging.

    Returns:
        The predicted token as a string.

    Raises:
        ValueError: if the tokenizer defines no pad token id.
    """
    pad_id = tokenizer.pad_token_id  # the pad *token* id, not the pad segment (token-type) id
    if pad_id is None:
        raise ValueError("tokenizer must define a pad token")

    # text preproc
    line = ' '.join([tokenizer.bos_token, clean_string(input_text)])
    token_ids = tokenizer.encode(line, add_special_tokens=False, max_length=512, truncation=True)

    # Keep only the most recent tokens so the input has the same length as the
    # contexts used for training.
    token_ids = token_ids[max(0, len(token_ids) - input_seq_len):]
    add_pads_cnt = input_seq_len - len(token_ids)
    if add_pads_cnt > 0:
        token_ids = [pad_id] * add_pads_cnt + token_ids

    token_ids = token_ids + [tokenizer.mask_token_id]
    token_ids_tensor = torch.tensor(token_ids).unsqueeze(0)

    # Inference needs neither gradients nor training-mode layers; the previous
    # mode is restored afterwards so the call is safe in the middle of training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logit = model(token_ids_tensor)
    finally:
        model.train(was_training)

    pred = torch.argmax(logit, dim=1)
    pred_tok = tokenizer.convert_ids_to_tokens([pred.item()])[0]

    if verbose:
        print(
            f'line = {line}\n'
            f'token_ids = {token_ids}\n'
            f'token_ids_tensor = {token_ids_tensor}\n'
            f'logit = {logit}\n'
            f'pred = {pred}\n'
            f'pred_tok = {pred_tok}\n'
        )

    return pred_tok

In [None]:
# NOTE(review): `model` is first created in the "Stage 2" cells further down, so on a
# fresh kernel (Restart -> Run All) this cell raises NameError. Run it only after the
# model has been built.
input_text = 'Sorry seems to be at work'
predo = text_generation_inference(
    input_text, 
    model, 
    tokenizer
    )

In [None]:
predo

In [None]:
# # класс датасета
# class FunnyDataset(Dataset):
#     def __init__(self, texts, tokenizer, input_seq_len=4):
#         self.samples = []
#         self.empty_token = tokenizer.pad_token_type_id

#         for line in texts:
#             line = tokenizer.bos_token + line + tokenizer.eos_token
#             # token_ids = tokenizer.encode(line, add_special_tokens=False, max_length=512, truncation=True)
#             token_ids = line.split() 
#             print(f'line = {line}')
#             if len(token_ids) == 1:
#                 continue

#             for i in range(1, len(token_ids)):

#                 if i >= input_seq_len:
#                     context = token_ids[i-input_seq_len:i]
#                 else: # i < input_seq_len
#                     print(f'BINGO')
#                     context = [self.empty_token for _ in range(input_seq_len - i)] + token_ids[:i]
                
#                 print(f'context = {context}')
#                 target = token_ids[i]
#                 self.samples.append((context, target))
#             print(f'self.samples = {self.samples}')
#             break
           
#     def __len__(self):
#         return len(self.samples)

#     def __getitem__(self, idx):
#         x, y = self.samples[idx]
#         return x, y


In [None]:
# val_dataset = FunnyDataset(val_texts, tokenizer, input_seq_len=4)

In [None]:
# Training and validation datasets
train_dataset = TextGenerationDataset(texts=train_texts, tokenizer=tokenizer, input_seq_len=4)
val_dataset = TextGenerationDataset(texts=val_texts, tokenizer=tokenizer, input_seq_len=4)

In [None]:
# Data loaders: training batches are reshuffled, validation order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8)

In [None]:
# Print the first validation batch only; `batch` stays bound afterwards and is
# inspected by the shape checks in the following cells.
for batch in val_loader:
    print(batch)
    break

In [None]:
batch[0].shape

In [None]:
batch[1].shape

# Этап 2. Реализация рекуррентной сети

In [None]:
class RnnTextGenerator(nn.Module):
    """Embedding -> single recurrent layer -> linear head producing vocabulary scores.

    Args:
        vocab_size: number of embedding rows and of output scores.
        hidden_dim: width shared by the embedding and the recurrent layer.
        rnn_type: one of ``"RNN"``, ``"GRU"`` or ``"LSTM"``; any other value
            raises ``KeyError``.
    """

    def __init__(self, vocab_size, hidden_dim=128, rnn_type="GRU",):
        super().__init__()
        # Sub-modules are created in the order embedding -> rnn -> fc.
        self.embedding = nn.Embedding(vocab_size, hidden_dim)

        recurrent_layers = {"RNN": nn.RNN, "GRU": nn.GRU, "LSTM": nn.LSTM}
        recurrent_cls = recurrent_layers[rnn_type]
        self.rnn = recurrent_cls(hidden_dim, hidden_dim, batch_first=True, bidirectional=False)

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return vocabulary scores computed from the recurrent output at the last position of ``x``."""
        sequence_states, _ = self.rnn(self.embedding(x))
        # Only the final time step feeds the output layer.
        last_state = sequence_states[:, -1, :]
        return self.fc(last_state)

In [None]:
def count_parameters(model):
    """Return the total number of scalar values across all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

In [None]:
# Vocabulary size is taken from len(tokenizer) — presumably so that the added
# [BOS]/[EOS] tokens are counted as well; TODO confirm.
vocab_size = len(tokenizer)  
hidden_dim = 128

# Recurrent cell types compared in the parameter-count table below.
rnn_types = ["RNN", "GRU", "LSTM"]

In [None]:
# Parameter-count comparison across the recurrent cell types
table_header = f"{'RNN Type':<8} | {'Params':>10}"
separator = "-" * 35
print(table_header)
print(separator)
for rnn_type in rnn_types:
    model = RnnTextGenerator(vocab_size, hidden_dim, rnn_type,)
    param_count = count_parameters(model)
    row = f"{rnn_type:<8} | {param_count:>10,}"
    print(row)

## 2.1 Создание модели-LSTM

In [None]:
# LSTM variant with the default hidden size, trained with Adam on cross-entropy.
model = RnnTextGenerator(vocab_size=vocab_size, rnn_type="LSTM")
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.002)
criterion = nn.CrossEntropyLoss()

# Этап 3. Тренировка модели

In [None]:
def evaluate(model, loader):
    """Compute the mean per-batch loss and the accuracy of ``model`` over ``loader``.

    The model is switched to eval mode and left in it. The loss function is the
    notebook-level ``criterion``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the sum of batch losses divided by the
        number of batches, and the share of correctly predicted targets.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    loss_total = 0
    with torch.no_grad():
        for x_batch, y_batch in loader:
            scores = model(x_batch)
            batch_loss = criterion(scores, y_batch)
            predicted = scores.argmax(dim=1)
            n_correct += (predicted == y_batch).sum().item()
            n_seen += y_batch.size(0)
            loss_total += batch_loss.item()
    mean_loss = loss_total / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

In [None]:
# Main training loop
n_epochs = 5
# Per-epoch cap on the number of training batches (the former `if i > 1000: break`
# let batches 0..1000 through, i.e. 1001 of them).
max_train_batches = 1001

for epoch in range(n_epochs):
    model.train()
    train_loss = 0.
    batches_done = 0

    planned_batches = min(len(train_loader), max_train_batches)
    progress = tqdm(train_loader, total=planned_batches, desc=f"Epoch {epoch+1}")
    # zip() asks range() first, so the loader is never advanced past the cap and
    # the progress bar ends exactly at `planned_batches`.
    for _, (x_batch, y_batch) in zip(range(max_train_batches), progress):
        optimizer.zero_grad()

        # Single forward pass; its output is reused for the loss.
        x_output = model(x_batch)
        loss = criterion(x_output, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        batches_done += 1
    progress.close()

    # Average over the batches actually processed, not over the full loader length,
    # otherwise the capped epoch reports a loss that is too small.
    train_loss /= max(batches_done, 1)
    val_loss, val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | Val Accuracy: {val_acc:.2%}")

In [None]:
model

In [None]:
# Bump the revision so that the checkpoint file name carries a new number.
# NOTE(review): this cell is not idempotent — every re-run increments the counter again.
model_revision += 1
model_save_path = f'{PATH_MODEL}rnn_model_weights_rev{model_revision}.pth'

In [None]:
torch.save(model.state_dict(), model_save_path)

In [None]:
# Rebuild the network and restore the weights saved above (this rebinds `model`).
model = RnnTextGenerator(vocab_size, rnn_type="LSTM",)  # the architecture must match the saved one
model.load_state_dict(torch.load(model_save_path))

In [None]:
# The reloaded network is bound to `model`; no `model_loaded` name exists in this notebook.
model

In [None]:
def text_generation_inference(input_text, model, tokenizer, gen_max_length=50, input_seq_len=4):
    """Greedily continue ``input_text`` one token at a time.

    Note: this definition replaces the single-token helper of the same name
    defined earlier in the notebook.

    At every step the model receives the last ``input_seq_len`` token ids of the
    prompt plus everything generated so far (left-padded with the pad token id
    when shorter), followed by the mask token id. This is the same layout as
    the training contexts. The highest-scoring token is appended and the loop
    stops at the EOS token or after ``gen_max_length`` generated tokens.

    Args:
        input_text: raw prompt text; it is cleaned with ``clean_string``.
        model: module mapping a ``(1, input_seq_len + 1)`` tensor of token ids
            to scores with the vocabulary on dim 1.
        tokenizer: tokenizer providing ``bos_token``, ``eos_token_id``,
            ``pad_token_id``, ``mask_token_id``, ``encode`` and ``decode``.
        gen_max_length: maximum number of tokens to generate.
        input_seq_len: size of the context window placed before the mask token.

    Returns:
        The generated continuation (without the prompt) decoded to a string.

    Raises:
        ValueError: if the tokenizer defines no pad token id.
    """
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError("tokenizer must define a pad token")

    # text preproc
    clean_text = clean_string(input_text)
    line = ' '.join([tokenizer.bos_token, clean_text])
    token_ids = tokenizer.encode(line, add_special_tokens=False, max_length=512, truncation=True)

    generated_ids = []

    # Generation needs neither gradients nor training-mode layers; the previous
    # mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # make inference in a loop
            for _ in range(gen_max_length):
                context = token_ids[max(0, len(token_ids) - input_seq_len):]
                context = [pad_id] * (input_seq_len - len(context)) + context
                context = context + [tokenizer.mask_token_id]

                logits = model(torch.tensor(context).unsqueeze(0))
                next_id = int(torch.argmax(logits, dim=1).item())

                if next_id == tokenizer.eos_token_id:
                    break

                token_ids.append(next_id)
                generated_ids.append(next_id)
    finally:
        model.train(was_training)

    return tokenizer.decode(generated_ids, skip_special_tokens=True)


## 3.1 Код замера и вывода метрики ROUGE

## 3.2 Код тренировки модели

## 3.3 Обучение модели с подбором оптимальных параметров

# Этап 4. Использование предобученного трансформера

## 4.1 Воспользуйтесь моделью трансформера distilgpt2 из Transformers и дополните тексты

## 4.2 Код замера и вывода метрики ROUGE с использованием трансформера

## 4.3 Подберите параметры генерации, замерьте качество модели на валидационной выборке, выведите примеры предсказаний

# Этап 5. Формулирование выводов

## 5.1 Сравните примеры предсказаний двух моделей, а также получившиеся метрики.

## 5.2 Сделайте выводы