In [21]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2Tokenizer
from torch.nn.utils.rnn import pad_sequence
from transformers import pipeline
from rouge_score import rouge_scorer
from tqdm import tqdm

from src.data_utils import clean_text, prepare_data, train_test_val
from src.next_token_dataset import NextTokenDataset
from src.lstm_model import LSTMGenerateWord
from src.lstm_train import model_train
from src.eval_lstm import model_eval
from src.eval_transformer_pipeline import evaluate_transformer

In [3]:
# Read the raw corpus line by line; every line (trailing newline included)
# becomes one row of the `text` column.
with open('data/tweets.txt', 'r', encoding='utf-8') as tweets_file:
    lines = list(tweets_file)

data = pd.DataFrame({'text': lines})

In [4]:
# Preprocess the raw frame, then build the train / validation / test splits
# (the recorded output reports both steps and the resulting split shapes).
# NOTE(review): neither return value is used and `train_test_val` takes no
# arguments, so both helpers presumably exchange data through files on disk
# (a later cell reads data/train.csv and data/val.csv) — confirm in src/data_utils.
prepare_data(data)
train_test_val()

Удалено 3563 пропусков.
Датасет предобработан.
Разделение на трейн, валидацию и тест прошло успешно.
Train: (1277548, 1)
Val: (159693, 1)
Test: (159694, 1)


In [23]:
def _load_texts(csv_path):
    """Return the `text_clean` column of the CSV at `csv_path` as a Python list."""
    frame = pd.read_csv(csv_path)
    return frame['text_clean'].tolist()


# Cleaned texts for the training and validation splits.
train = _load_texts('data/train.csv')
val = _load_texts('data/val.csv')

In [6]:
# Debug switch: cap both splits at N_SAMPLES texts for a quick smoke run.
# Set N_SAMPLES = None to keep the full splits (a slice bound of None keeps every element).
# NOTE(review): the execution counts show this cell (In [6]) was run before the
# splits were re-read (In [23]), so the recorded training/evaluation outputs may
# come from the full data rather than from this 100-sample subset — confirm
# which is intended before relying on Restart & Run All.
N_SAMPLES = 100

train = train[:N_SAMPLES]
val = val[:N_SAMPLES]


In [24]:
# Load the pretrained DistilGPT2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token as the padding token so batches can be padded
# (presumably because this tokenizer defines no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [25]:
def collate_fn(batch, pad_token_id=50256):
    """Pad a list of variable-length samples into one rectangular batch.

    Args:
        batch: sequence of mappings, each holding ``'input_ids'`` and
            ``'labels'`` as a 1-D sequence of token ids (list or tensor).
        pad_token_id: value appended to shorter sequences in both
            ``input_ids`` and ``labels``. The default 50256 is presumably the
            GPT-2 end-of-sequence id that the notebook reuses as the pad token.

    Returns:
        dict with ``'input_ids'`` and ``'labels'``: int64 tensors of shape
        ``(len(batch), longest_sequence_in_batch)``.

    NOTE(review): labels are padded with the same id as the inputs, so the
    training loss has to ignore ``pad_token_id`` explicitly — confirm in
    ``model_train``.
    """
    def _as_ids(sequence):
        # as_tensor avoids the extra copy (and the UserWarning) that
        # torch.tensor() produces when the dataset already yields tensors.
        # The explicit dtype keeps an empty sample integer-typed instead of
        # letting it default to float32 and poison the padded batch.
        return torch.as_tensor(sequence, dtype=torch.long)

    input_ids = [_as_ids(item['input_ids']) for item in batch]
    labels = [_as_ids(item['labels']) for item in batch]

    return {
        'input_ids': pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id),
        'labels': pad_sequence(labels, batch_first=True, padding_value=pad_token_id),
    }

In [27]:
# Both splits share the same max_length and batching settings; only the
# training loader shuffles.
shared_loader_args = dict(batch_size=256, collate_fn=collate_fn)

train_dataset, val_dataset = (
    NextTokenDataset(texts, tokenizer, max_length=20) for texts in (train, val)
)

train_dataloader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_dataloader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

In [28]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [29]:
# Train the LSTM on the training loader; the recorded output shows 5 epochs
# followed by a "model saved" message.
# NOTE(review): epoch count, optimiser and save path are decided inside
# `model_train` (src/lstm_train) and are not visible here — confirm there.
model = model_train(train_dataloader, tokenizer, device)

Epoch 1/5, Loss: 6.4881
Epoch 2/5, Loss: 5.5851
Epoch 3/5, Loss: 5.3827
Epoch 4/5, Loss: 5.2791
Epoch 5/5, Loss: 5.2126
Модель сохранена


In [30]:
# Score the trained LSTM on the validation loader (ROUGE-1 / ROUGE-2).
# NOTE(review): the recorded scores are exactly 0.0000 although the model's
# sample generations further down look like plausible text — this presumably
# points at a problem in how `model_eval` builds or compares predictions and
# references; verify in src/eval_lstm before drawing conclusions.
rouge1_lstm, rouge2_lstm = model_eval(model, val_dataloader, tokenizer, device)

LSTM ROUGE-1: 0.0000
LSTM ROUGE-2: 0.0000


In [31]:
# Score pretrained DistilGPT2 on at most 30 validation examples.
# NOTE(review): `device` is cuda in this notebook, yet the recorded output says
# "Using device: -1" / "Device set to use cpu" — the device argument is
# presumably not mapped correctly inside `evaluate_transformer`; confirm in
# src/eval_transformer_pipeline.
scores, examples = evaluate_transformer(val_dataloader, tokenizer, device=device, max_examples=30)

Using device: -1


Device set to use cpu


Preparing prompts and targets.....
Evaluating on 30 examples...


Evaluating DistilGPT2: 100%|██████████| 30/30 [00:08<00:00,  3.59it/s]


DistilGPT2 Evaluation Results (30 examples)
ROUGE-1: 0.0454
ROUGE-2: 0.0076

Examples:

Example 1:
Prompt: ...peace
Target:  good
Generated: It is not that the same laws or practices as those
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 2:
Prompt: ...taking my rotten apple to
Target:  the doc
Generated: the point that I could see a little more of it.
ROUGE-1: 0.154, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 3:
Prompt: ...having a gappyf
Target: ringe day
Generated: ag about how it works. It is a lot more complex
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 4:
Prompt: ...missing squints terribly ill
Target:  always remember
Generated: , and the rest of the people around you are just waiting
ROUGE-1: 0.000, ROUGE-2: 0.000
------------------------------------------------------




In [38]:
# Fix the RNG so the sampled continuations below are repeatable on re-run
# (DistilGPT2 is called with do_sample=True).
torch.manual_seed(42)

generator_DistilGPT = pipeline(
    "text-generation",
    model="distilgpt2",
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
)

# Example prompts — sentence beginnings for both models to continue.
# Stored as `prompts` so the `examples` value returned by
# evaluate_transformer() in the evaluation cell is not overwritten.
prompts = [
    "i love",
    "this person is",
    "she go",
    "we work",
    "i want"
]

# NOTE(review): the LSTM is limited with max_length=20 while DistilGPT2 gets
# max_new_tokens=10; whether these budgets are comparable depends on how
# `model.generate` interprets max_length — confirm in src/lstm_model.
print("LSTM:")
for prompt in prompts:
    generated = model.generate(tokenizer, prompt, max_length=20, device=device)
    print(f"Промпт: {prompt}")
    print(f"Дополнение LSTM: {generated}")

print("\nDistilGPT:")

for prompt in prompts:
    result = generator_DistilGPT(prompt, max_new_tokens=10, do_sample=True, temperature=0.8, top_k=50)
    # The pipeline returns a list of candidates; take the text of the first one.
    generated = result[0]['generated_text']
    print(f"Промпт: {prompt}")
    print(f"Дополнение DistilGPT: {generated}")


Device set to use cuda:0


LSTM:
Промпт: i love
Дополнение LSTM: i love you guys and i love you guys i love you guys i love you guys i love you
Промпт: this person is
Дополнение LSTM: this person is a little bit of a little bit of a little bit of a headache i hate it
Промпт: she go
Дополнение LSTM: she goin to bed i miss you guys too much i love you guys and i love you guys
Промпт: we work
Дополнение LSTM: we work on the new album i have to go to the gym and i have to go to work
Промпт: i want
Дополнение LSTM: i want to go to the beach and i have to go to work tomorrow i have to go to

DistilGPT:
Промпт: i love
Дополнение DistilGPT: i love and hate.‬We‬g love
Промпт: this person is
Дополнение DistilGPT: this person is a good man, but you just can't do
Промпт: she go
Дополнение DistilGPT: she go to sleep after you're gone.”
Промпт: we work
Дополнение DistilGPT: we work on the “‘‘‘
Промпт: i want
Дополнение DistilGPT: i want to talk to them.››�


Выводы:
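1) У LSTM ROUGE-1 и ROUGE-2 равны 0.0. У DistilGPT ROUGE-1 и ROUGE-2 равны 0.0454 и 0.0076 соответственно (оценка DistilGPT проведена на 30 примерах). Строго нулевой ROUGE у LSTM при осмысленных генерациях в примерах выше, вероятно, указывает на ошибку в процедуре оценки и требует проверки.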
2) DistilGPT дополняет текст более логично, генерируя более связные слова. LSTM с такой задачей справляется хуже.

Итог: DistilGPT лучше подходит для задачи автодополнения текста.