# Автодополнение текстов

## 0. Подготовка окружения
Окружение настроено, этап пропускаем.

In [2]:
# Environment switch: True when running on Kaggle, False for a local run.
# It selects the import path / package install step and the data and output directories below.
kaggle_test = False

In [None]:
# Environment-specific setup, selected by the `kaggle_test` flag.
if kaggle_test:
    import sys
    # Put the attached Kaggle dataset directory on the import path
    # (presumably this is where the `src` package lives on Kaggle — verify).
    sys.path.append('/kaggle/input/datasets/ruslanmushtakov/ya-dl-nlp-sprint2-project-dataset')

    # Quietly (-q) install the `evaluate` package.
    !pip install -q evaluate
else:
    # Local run: reload edited modules automatically before each cell executes,
    # so changes under `src` are picked up without restarting the kernel.
    %load_ext autoreload
    %autoreload 2

In [None]:
from pathlib import Path

import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader

from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split

# Обработка датасета
from src.data_utils import (
    read_raw_dataset,
    process_dataset
)

# Подготовка датасетов
from src.next_token_dataset import NextTokenDataset, collate_fn

# LSTM модель
# from src.lstm_model import LSTMAutocompleteModel
import src.lstm_model as lstm_model

# Обучение модели
# from src.lstm_train import train_lstm_model
import src.lstm_train as lstm_train


from src.eval_transformer_pipeline import create_distilgpt2_generator, evaluate_transformer_rouge

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Seed passed as `random_state` to both train/validation/test splits.
RAND_SEED = 42

## 1. Сбор и подготовка данных

1. Скачайте датасет, положите его в папку data.
2. «Почистите» тексты в датасете, а затем токенизируйте их. Для удобства можете сохранить почищенный и токенизированный датасет.
3. Разбейте датасет на трейн, валидацию и тест.
4. Создайте torch.Dataset и torch.DataLoader для обучения модели.

Определим путь до директории с данными.

In [None]:
# Root data directory: the attached Kaggle dataset or the local `data` folder.
DATA_DIR = (
    Path('/kaggle/input/datasets/ruslanmushtakov/ya-dl-nlp-sprint2-project-dataset/data')
    if kaggle_test
    else Path('data')
)

# Path of the raw CSV inside the data directory.
RAW_PATH = DATA_DIR.joinpath('raw_dataset.csv')

Читаем сырой датасет.

In [5]:
# Load the raw dataset from RAW_PATH and preview its first rows.
raw_df = read_raw_dataset(RAW_PATH)
raw_df.head()

Unnamed: 0,raw_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Run the project's preprocessing on the raw frame and preview the result;
# the preview shows a `clean_text` column next to `raw_text`.
processed_df = process_dataset(raw_df)
processed_df.head()

Unnamed: 0,raw_text,clean_text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got david..."
1,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...
2,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...
3,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ..."


Разбиваем датасет на обучающую, валидационную и тестовую выборки. Проверяем объём данных в каждой выборке.

In [7]:
# First hold out 20% of the processed rows, then halve that hold-out into
# validation and test parts; both splits use the same seed.
train_df, val_test_df = train_test_split(
    processed_df,
    test_size=0.2,
    random_state=RAND_SEED,
)
val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,
    random_state=RAND_SEED,
)

# Replace the shuffled original row labels with a fresh 0..n-1 index.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

# Report the size of every stage.
print(f'raw: {len(raw_df)}')
print(f'processed: {len(processed_df)}')
print(f'train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}')

raw: 139852
processed: 136026
train: 108820, val: 13603, test: 13603


Сохраняем выборки

In [None]:
# Save the full processed frame and the three splits as UTF-8 CSV files.
# On Kaggle they go to /kaggle/working/; locally they are written to DATA_DIR.
if kaggle_test:
    OUTPUT_DIR = Path('/kaggle/working/')
    _save_dir = OUTPUT_DIR
else:
    _save_dir = DATA_DIR

# One (file stem, frame) pair per CSV; every file is written the same way.
for _stem, _frame in (
    ('processed_df', processed_df),
    ('train_df', train_df),
    ('val_df', val_df),
    ('test_df', test_df),
):
    _frame.to_csv(_save_dir / f'{_stem}.csv', encoding='utf-8')

In [9]:
# Load the tokenizer of the pretrained `bert-base-uncased` checkpoint.
# It is used to build the datasets and supplies the vocabulary size and
# padding id for the LSTM model.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# Number of samples per batch for the train and validation DataLoaders.
batch_size = 64

In [None]:
# Plain Python lists with the cleaned texts of the train and validation splits.
train_texts = train_df['clean_text'].tolist()
val_texts = val_df['clean_text'].tolist()

In [None]:
# Wrap the text lists and the tokenizer into the project's next-token datasets.
train_dataset = NextTokenDataset(train_texts, tokenizer)
val_dataset = NextTokenDataset(val_texts, tokenizer)
# NOTE(review): the disabled line below passes the DataFrame itself, whereas the
# datasets above receive lists of texts — convert before re-enabling.
# test_dataset = NextTokenDataset(test_df, tokenizer)

In [13]:
# Batch the datasets with the project's collate_fn. Only the training data is
# shuffled; validation keeps its order.
# (The sample batch printed further down has `input_ids`, `labels` and `lengths`,
# with 0 / -100 at padded positions — presumably produced by collate_fn; confirm.)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [14]:
# Report how many batches each loader yields per epoch.
print(f'Количество батчей в train_dataloader: {len(train_dataloader)}')
print(f'Количество батчей в val_dataloader: {len(val_dataloader)}')

Количество батчей в train_dataloader: 1701
Количество батчей в val_dataloader: 1


In [15]:
# Take one batch from the training loader and check the tensor shapes.
batch = next(iter(train_dataloader))
print('input_ids shape:', tuple(batch['input_ids'].shape))
print('labels shape:', tuple(batch['labels'].shape))

input_ids shape: (64, 40)
labels shape: (64, 40)


In [16]:
# Display the full contents of the sample batch.
batch

{'input_ids': tensor([[ 101, 2003, 5881,  ...,    0,    0,    0],
         [ 101, 2307, 2305,  ...,    0,    0,    0],
         [ 101, 5683, 6659,  ...,    0,    0,    0],
         ...,
         [ 101, 1048, 2863,  ...,    0,    0,    0],
         [ 101, 2125, 2000,  ...,    0,    0,    0],
         [ 101, 2026, 2905,  ...,    0,    0,    0]]),
 'labels': tensor([[ 2003,  5881,  1999,  ...,  -100,  -100,  -100],
         [ 2307,  2305,  1010,  ...,  -100,  -100,  -100],
         [ 5683,  6659,  1012,  ...,  -100,  -100,  -100],
         ...,
         [ 1048,  2863,  9541,  ...,  -100,  -100,  -100],
         [ 2125,  2000,  2131,  ...,  -100,  -100,  -100],
         [ 2026,  2905, 18138,  ...,  -100,  -100,  -100]]),
 'lengths': tensor([ 5, 33, 11, 19, 29, 40, 24,  9, 22, 37, 14, 16, 15, 17, 32, 25, 24, 30,
         13, 19, 10, 25,  4, 21, 25, 38, 27,  5, 19, 35,  7, 11, 24,  7, 23,  9,
         11,  5, 19, 19, 21, 15, 28, 21, 18, 10, 35, 14, 23, 32, 11, 26,  4, 35,
         30, 30, 23

## Этап 2. Реализация рекуррентной сети

Создание модели на основе LSTM

In [None]:
# Build the LSTM autocomplete model. The vocabulary size and padding id come
# from the tokenizer used to build the datasets.
model = lstm_model.LSTMAutocompleteModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=256,
    hidden_dim=128,
    num_layers=2,
    dropout=0.2,
    pad_token_id=tokenizer.pad_token_id,
)

: 

## Этап 3. Тренировка модели

Определяем, где будем запускать обучение

In [18]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

Создание оптимизатора и функции потерь

In [24]:
# Optimiser hyperparameters: learning rate and weight decay.
lr = 1e-3
weight_decay = 1e-5

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# Targets equal to -100 are excluded from the loss; the sample batch shown
# earlier has -100 at the padded label positions.
criterion = nn.CrossEntropyLoss(ignore_index=-100)

Обучаем модель

In [None]:
# Train the LSTM model for 3 epochs with the project's training routine.
# NOTE(review): `print_examples=3` presumably controls how many sample
# completions are printed during training — confirm in src/lstm_train.
history = lstm_train.train_lstm_model(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    tokenizer=tokenizer,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=3,
    device=device,
    print_examples=3,
)

# Show the returned training history as a table.
pd.DataFrame(history)

tokenizer.vocab_size: 30522
emb.shape: torch.Size([64, 38, 256])
outputs.shape: torch.Size([64, 38, 128])
logits.shape: torch.Size([64, 38, 30522])
labels.shape: torch.Size([64, 38])
emb.shape: torch.Size([64, 45, 256])
outputs.shape: torch.Size([64, 45, 128])
logits.shape: torch.Size([64, 45, 30522])
labels.shape: torch.Size([64, 45])


## Этап 4. Использование предобученного трансформера



In [None]:
# Initialise the pretrained distilgpt2 transformer (generator and its tokenizer).
transformer_generator, transformer_tokenizer = create_distilgpt2_generator('distilgpt2', device)

# Evaluate how well the last quarter of each validation text is completed.
# NOTE(review): `do_sample=True` with `top_k=50` presumably enables stochastic
# sampling during generation, so scores may vary between runs unless a seed is
# fixed — confirm in src/eval_transformer_pipeline.
transformer_scores = evaluate_transformer_rouge(
    generator=transformer_generator,
    tokenizer=transformer_tokenizer,
    texts=val_texts,
    print_examples=3,
    do_sample=True,
    top_k=50
)

# Display the resulting scores.
transformer_scores

## Этап 5. Формулирование выводов