In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2Tokenizer
from torch.nn.utils.rnn import pad_sequence

from src.data_utils import clean_text, prepare_data, train_test_val
from src.next_token_dataset import NextTokenDataset
from src.lstm_model import LSTMGenerateWord
from src.lstm_train import model_train
from src.eval_lstm import model_eval
#from src.eval_transformer_pipeline import evaluate_transformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw tweet dump; each element of `lines` is one line of the file
# with its trailing newline kept.
with open('data/tweets.txt', mode='r', encoding='utf-8') as file:
    lines = list(file)

# Wrap the raw lines into a single-column frame for the preprocessing step.
data = pd.DataFrame(data={'text': lines})

In [3]:
# Preprocess the raw tweets, then split them into train / validation / test.
# NOTE(review): neither call returns anything that is used here; judging by the
# printed log and the later reads of 'data/train.csv' / 'data/val.csv', the helpers
# persist their results to disk — confirm in src.data_utils.
prepare_data(data)
train_test_val()

Удалено 3563 пропусков.
Датасет предобработан.
Разделение на трейн, валидацию и тест прошло успешно.
Train: (1277548, 1)
Val: (159693, 1)
Test: (159694, 1)


In [2]:
# Load the 'text_clean' column of the train and validation splits as plain Python lists.
train = pd.read_csv('data/train.csv')['text_clean'].tolist()
val = pd.read_csv('data/val.csv')['text_clean'].tolist()

In [3]:
# Keep only the first 100 texts of each split (small subset for a quick run).
# NOTE(review): the dataset sizes printed further down (~1.27M training samples)
# come from a run in which this truncation was not applied.
train = train[:100]
val = val[:100]


In [4]:
# Load the distilgpt2 tokenizer and reuse its end-of-sequence token as the padding token,
# so pad_token_id and eos_token_id refer to the same id from here on.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def collate_fn(batch, pad_token_id=50256, label_pad_id=None):
    """
    Collate variable-length samples into right-padded batch tensors.

    Args:
        batch: sequence of samples, each a mapping with 'input_ids' and 'labels'
            holding a sequence (list or tensor) of token ids.
        pad_token_id: value used to pad 'input_ids'. The default 50256 is presumably
            the id of the tokenizer's eos token, which the notebook reuses as pad.
        label_pad_id: value used to pad 'labels'. Defaults to `pad_token_id`, which is
            the historical behaviour. Pass -100 to make padded positions invisible to
            `nn.CrossEntropyLoss(ignore_index=-100)`.

    Returns:
        dict with 'input_ids' and 'labels' int64 tensors, each shaped
        (len(batch), length of the longest sequence in the batch).
    """
    if label_pad_id is None:
        label_pad_id = pad_token_id

    # Force int64 explicitly: torch.tensor([]) would be float32, and an empty first
    # sample would then make the whole padded batch float, which an embedding rejects.
    # as_tensor also avoids the copy warning when a sample is already a tensor.
    input_ids = [torch.as_tensor(item['input_ids'], dtype=torch.long) for item in batch]
    labels = [torch.as_tensor(item['labels'], dtype=torch.long) for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    labels = pad_sequence(labels, batch_first=True, padding_value=label_pad_id)

    return {
        'input_ids': input_ids,
        'labels': labels
    }

In [6]:
# Wrap the text lists into next-token-prediction datasets.
# NOTE(review): max_length=20 is presumably a per-sample token cap — confirm in
# src.next_token_dataset.
train_dataset = NextTokenDataset(train, tokenizer, max_length=20)
val_dataset = NextTokenDataset(val, tokenizer, max_length=20)

# Batches of 64 samples, padded per batch by collate_fn; only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [8]:
# Train the LSTM with the project helper; it logs the loss per epoch and returns the model.
# NOTE(review): epoch count, optimizer and checkpoint path live in src.lstm_train — not visible here.
model = model_train(train_dataloader, tokenizer, device)

Epoch 1/5, Loss: 10.8261
Epoch 2/5, Loss: 10.8020
Epoch 3/5, Loss: 10.7724
Epoch 4/5, Loss: 10.7317
Epoch 5/5, Loss: 10.6677
Модель сохранена


In [9]:
# Score the trained LSTM on the validation loader; the helper returns ROUGE-1 and ROUGE-2.
rouge1_lstm, rouge2_lstm = model_eval(model, val_dataloader, tokenizer, device)

LSTM ROUGE-1: 0.0000
LSTM ROUGE-2: 0.0000


In [10]:

from transformers import pipeline
from rouge_score import rouge_scorer
from tqdm import tqdm
import torch

def _pipeline_device_index(device):
    """Translate a device spec (str, torch.device or int) into the integer index passed to `pipeline`."""
    if isinstance(device, int):
        return device
    resolved = torch.device(device)
    if resolved.type != "cuda":
        return -1
    return 0 if resolved.index is None else resolved.index


def _collect_continuation_pairs(dataloader, tokenizer, max_examples, split_ratio):
    """Decode up to `max_examples` (prompt, target) text pairs from the batches' 'input_ids'."""
    pairs = []
    for batch in dataloader:
        for sequence in batch['input_ids']:
            if len(pairs) >= max_examples:
                return pairs

            # Drop padding before measuring the real sequence length.
            sequence = sequence[sequence != tokenizer.pad_token_id]

            # Split into prompt (first `split_ratio` of the tokens) and target (the rest).
            split_point = int(len(sequence) * split_ratio)
            if split_point == 0 or split_point == len(sequence):
                # An empty prompt cannot be continued and an empty target always scores 0.
                continue

            prompt_text = tokenizer.decode(sequence[:split_point], skip_special_tokens=True)
            target_text = tokenizer.decode(sequence[split_point:], skip_special_tokens=True)
            pairs.append((prompt_text, target_text))

        # Stop before fetching another batch once enough pairs are collected.
        if len(pairs) >= max_examples:
            break
    return pairs


def evaluate_transformer(dataloader, tokenizer, device='cpu', max_examples=50, split_ratio=0.75):
    """
    Evaluate pretrained distilgpt2 on text continuation using a `pipeline`.

    Each sequence taken from the batches' 'input_ids' is split into a prompt (the first
    `split_ratio` of its non-padding tokens) and a target (the remainder). The model
    continues the prompt and the continuation is compared with the target via
    ROUGE-1 / ROUGE-2 F-measure.

    Args:
        dataloader: iterable of batches, each a mapping with an 'input_ids' 2-D tensor.
        tokenizer: tokenizer used to strip padding, decode tokens and drive the pipeline.
        device: 'cpu' / 'cuda' / 'cuda:N', a torch.device, or an integer pipeline index.
        max_examples: maximum number of (prompt, target) pairs to evaluate.
        split_ratio: fraction of each sequence used as the prompt.

    Returns:
        (avg_scores, examples): `avg_scores` maps 'rouge1' / 'rouge2' to the mean
        F-measure over the examples that were actually scored; `examples` holds details
        for examples among the first five. Returns (None, []) when no usable example
        was found.

    Note:
        Generation is sampled (do_sample=True), so the scores differ between runs
        unless the random seed is fixed by the caller.
    """
    # Resolve the device; comparing a torch.device with the string "cuda" is always
    # False, so the device type has to be inspected instead.
    device_id = _pipeline_device_index(device)
    print(f"Using device: {device_id}")

    # Load the model through a pipeline.
    generator = pipeline(
        "text-generation",
        model="distilgpt2",
        device=device_id,
        tokenizer=tokenizer,
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False  # return only the generated part
    )

    # Collect prompts and target texts.
    print("Preparing prompts and targets.....")
    pairs = _collect_continuation_pairs(dataloader, tokenizer, max_examples, split_ratio)

    if not pairs:
        print("Нет подходящих данных для оценки.")
        return None, []

    print(f"Evaluating on {len(pairs)} examples...")

    # Initialise the ROUGE scorer.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)
    total_scores = {'rouge1': 0.0, 'rouge2': 0.0}
    examples = []
    scored = 0  # examples that were generated and scored without an error

    # Generation and scoring.
    for i, (prompt, target) in enumerate(tqdm(pairs,
                                              desc="Evaluating DistilGPT2",
                                              total=len(pairs))):
        try:
            outputs = generator(
                prompt,
                max_new_tokens=len(target.split()) + 10,  # target length in words + margin
                temperature=0.7,
                top_k=50,
                top_p=0.9,
                do_sample=True,
                num_return_sequences=1,
                truncation=True
            )

            generated_text = outputs[0]['generated_text'].strip()

            # ROUGE between the generated continuation and the target.
            scores = scorer.score(target, generated_text)
        except Exception as e:
            # Best effort: report the failure and keep evaluating the remaining examples.
            print(f"Error on example {i}: {e}")
            continue

        scored += 1
        for key in total_scores:
            total_scores[key] += scores[key].fmeasure

        # Keep the first few examples for the printed report.
        if i < 5:
            examples.append({
                'prompt': prompt[-100:],  # last 100 characters of the prompt
                'target': target,
                'generated': generated_text,
                'rouge1': scores['rouge1'].fmeasure,
                'rouge2': scores['rouge2'].fmeasure
            })

    # Average only over examples that were really scored, so failed generations
    # do not silently pull the mean towards zero.
    count = scored
    avg_scores = {key: (total_scores[key] / count if count else 0.0) for key in total_scores}

    # Print the results.
    print(f"\n{'='*60}")
    print(f"DistilGPT2 Evaluation Results ({count} examples)")
    print(f"{'='*60}")
    print(f"ROUGE-1: {avg_scores['rouge1']:.4f}")
    print(f"ROUGE-2: {avg_scores['rouge2']:.4f}")

    # Print the examples.
    print(f"\n{'='*60}")
    print("Examples:")
    print(f"{'='*60}")

    for i, example in enumerate(examples):
        print(f"\nExample {i+1}:")
        print(f"Prompt: ...{example['prompt']}")
        print(f"Target: {example['target']}")
        print(f"Generated: {example['generated']}")
        print(f"ROUGE-1: {example['rouge1']:.3f}, ROUGE-2: {example['rouge2']:.3f}")
        print("-" * 80)

    return avg_scores, examples


In [11]:
# Evaluate distilgpt2 on at most 30 validation examples; `device` here is a torch.device object.
scores, examples = evaluate_transformer(val_dataloader, tokenizer, device=device, max_examples=30)

Using device: -1


Device set to use cpu


Preparing prompts and targets.....
Evaluating on 30 examples...


Evaluating DistilGPT2:  63%|██████▎   | 19/30 [00:03<00:01,  5.97it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Evaluating DistilGPT2: 100%|██████████| 30/30 [00:04<00:00,  6.14it/s]


DistilGPT2 Evaluation Results (30 examples)
ROUGE-1: 0.0552
ROUGE-2: 0.0000

Examples:

Example 1:
Prompt: ...peace
Target:  good
Generated: It's time to stop playing the card. I'm
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 2:
Prompt: ...taking my rotten apple to
Target:  the doc
Generated: be a friend to my sister.
I‪m
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 3:
Prompt: ...having a gappyf
Target: ringe day
Generated: iddle.
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 4:
Prompt: ...missing squints terribly ill
Target:  always remember
Generated: 
ROUGE-1: 0.000, ROUGE-2: 0.000
--------------------------------------------------------------------------------

Example 5:
Prompt: ...i dont feel good i want to
Target:  go out ton
Generated: be the next one, i




In [24]:
# Text-generation pipeline for distilgpt2 that reuses the notebook tokenizer.
# No device is passed here, so the pipeline picks its own default.
generator_DistilGPT = pipeline(
    "text-generation", 
    model="distilgpt2", 
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id
    )

# Example prompts — beginnings of phrases
# NOTE(review): this rebinds `examples`, which the evaluate_transformer call cell also assigns;
# after this cell the evaluation examples are no longer reachable under that name.
examples = [
    "i love",
    "today is",
    "i feel",
    "this is",
    "i want"
]
# Greedy completions from the LSTM (total length capped at 20 tokens, prompt included).
print("Оценка автодополнения LSTM:")
for prompt in examples:
    generated = model.generate(tokenizer, prompt, max_length=20, device=device)
    print(f"Промпт: {prompt}")
    print(f"Дополнение LSTM: {generated}")
# Sampled completions from DistilGPT2 (up to 10 new tokens); output varies between runs.
print("\nОценка автодополнения DistilGPT:")
for prompt in examples:
    result = generator_DistilGPT(prompt, max_new_tokens=10, do_sample=True, temperature=0.8, top_k=50)
    generated = result[0]['generated_text']
    print(f"Промпт: {prompt}")
    print(f"Дополнение DistilGPT: {generated}")


Device set to use cpu


Оценка автодополнения LSTM:
Промпт: i love
Дополнение LSTM: i love 630 630 the the the the the the the the the the the the the the the the
Промпт: today is
Дополнение LSTM: today is not not not not not not not not not the the the the the the the the the
Промпт: i feel
Дополнение LSTM: i feel 630 and the the the the the the the the the the the the the the the the
Промпт: this is
Дополнение LSTM: this is not not not not not not not not not the the the the the the the the the
Промпт: i want
Дополнение LSTM: i want 630 not not not not not not not not the the the the the the the the the

Оценка автодополнения DistilGPT:
Промпт: i love
Дополнение DistilGPT: i love you. In an era of unprecedented political polarization,
Промпт: today is
Дополнение DistilGPT: today is a new concept to build on the original research done
Промпт: i feel
Дополнение DistilGPT: i feel as though they’re in the right place
Промпт: this is
Дополнение DistilGPT: this is going to be a fantastic story and I want to
Промп

In [24]:
# Inspect basic information about the datasets
print(f"Размер train_dataset: {len(train_dataset)} примеров")
print(f"Размер val_dataset: {len(val_dataset)} примеров")

# Look at a few samples from the dataset: raw ids plus their decoded text.
# Each sample is indexed as a mapping with 'input_ids' and 'labels'.
print("\nПервые 3 примера из train_dataset:")
for i in range(min(3, len(train_dataset))):
    sample = train_dataset[i]
    print(f"\nПример {i}:")
    print(f"Input IDs: {sample['input_ids']}")
    print(f"Labels: {sample['labels']}")
    print(f"Декодированный input: {tokenizer.decode(sample['input_ids'], skip_special_tokens=True)}")
    print(f"Декодированный target: {tokenizer.decode(sample['labels'], skip_special_tokens=True)}")

Размер train_dataset: 1272634 примеров
Размер val_dataset: 159106 примеров

Первые 3 примера из train_dataset:

Пример 0:
Input IDs: [11195, 422, 670, 1110, 734, 19698, 257, 7534, 3052, 287, 1022, 2111, 284, 1334, 290, 21509, 617, 2568, 26645]
Labels: [422, 670, 1110, 734, 19698, 257, 7534, 3052, 287, 1022, 2111, 284, 1334, 290, 21509, 617, 2568, 26645, 1660]
Декодированный input: home from work day two updating a clients website in between trying to rest and regain some energy retaining
Декодированный target:  from work day two updating a clients website in between trying to rest and regain some energy retaining water

Пример 1:
Input IDs: [10919, 389, 345, 991, 1804, 510, 1862]
Labels: [389, 345, 991, 1804, 510, 1862, 10846]
Декодированный input: what are you still doing up young
Декодированный target:  are you still doing up young lady

Пример 2:
Input IDs: [361, 7722, 373, 281, 267, 6760, 291, 6332, 4686, 423, 1760, 35918, 1044, 6613, 428, 5041, 1312, 765, 284]
Labels: [7722, 373, 

In [27]:
# Inspect the first batch produced by the DataLoader
print("\nПервый батч из train_dataloader:")
for batch_idx, batch in enumerate(train_dataloader):
    print(f"\nБатч {batch_idx}:")
    print(f"Input IDs shape: {batch['input_ids'].shape}")
    print(f"Labels shape: {batch['labels'].shape}")
    #print(f"Attention mask shape: {batch['attention_mask'].shape}")
    
    print(f"\nInput IDs:\n{batch['input_ids']}")
    print(f"\nLabels:\n{batch['labels']}")
    #print(f"\nAttention mask:\n{batch['attention_mask']}")
    
    # Decode a few samples from the batch
    print(f"\nДекодированные примеры:")
    for i in range(min(2, batch['input_ids'].shape[0])):  # first 2 samples
        print(f"\nПример {i} в батче:")
        input_text = tokenizer.decode(batch['input_ids'][i], skip_special_tokens=True)
        # For labels, drop -100 (padding).
        # NOTE(review): collate_fn as defined in this notebook pads labels with the pad token id,
        # not -100, so this filter removes nothing; the padding is instead hidden by
        # skip_special_tokens=True when decoding.
        label_ids = [x for x in batch['labels'][i].tolist() if x != -100]
        label_text = tokenizer.decode(label_ids, skip_special_tokens=True)
        
        print(f"Input: '{input_text}'")
        print(f"Target: '{label_text}'")
        #print(f"Attention mask: {batch['attention_mask'][i].tolist()}")
    
    break  # only look at the first batch


Первый батч из train_dataloader:

Батч 0:
Input IDs shape: torch.Size([64, 19])
Labels shape: torch.Size([64, 19])

Input IDs:
tensor([[ 6814,  7252,   321,  ...,  1381,   407,   257],
        [ 3137,   925,   340,  ...,     0,     0,     0],
        [   72,  1842,   345,  ...,     0,     0,     0],
        ...,
        [ 1820,  3329,   318,  ...,     0,     0,     0],
        [   72,  8138,   479,  ...,   788,  1312,  3285],
        [20342,     0,     0,  ...,     0,     0,     0]])

Labels:
tensor([[ 7252,   321,  3020,  ...,   407,   257,   922],
        [  925,   340,   832,  ...,     0,     0,     0],
        [ 1842,   345,  1165,  ...,     0,     0,     0],
        ...,
        [ 3329,   318,  8805,  ...,     0,     0,     0],
        [ 8138,   479,  5892,  ...,  1312,  3285, 23611],
        [19462,     0,     0,  ...,     0,     0,     0]])

Декодированные примеры:

Пример 0 в батче:
Input: 'daaaammmnnn and i wear a sz 9 soo dats not a'
Target: 'aaammmnnn and i wear a sz 9 soo 

In [26]:
# Quick sanity check without details: dtype and shape of one training batch
print("Быстрая проверка DataLoader:")
batch = next(iter(train_dataloader))
print(f"Типы данных:")
print(f"  Input IDs: {batch['input_ids'].dtype}, shape: {batch['input_ids'].shape}")
print(f"  Labels: {batch['labels'].dtype}, shape: {batch['labels'].shape}")
#print(f"  Attention mask: {batch['attention_mask'].dtype}, shape: {batch['attention_mask'].shape}")

# Check for NaN values
# NOTE(review): both tensors are int64 according to the printed dtypes, and integer tensors
# cannot hold NaN, so this check can only ever print False.
print(f"\nПроверка на NaN:")
print(f"  Input IDs has NaN: {torch.isnan(batch['input_ids']).any()}")
print(f"  Labels has NaN: {torch.isnan(batch['labels']).any()}")

Быстрая проверка DataLoader:
Типы данных:
  Input IDs: torch.int64, shape: torch.Size([64, 19])
  Labels: torch.int64, shape: torch.Size([64, 19])

Проверка на NaN:
  Input IDs has NaN: False
  Labels has NaN: False


In [28]:
# Bare expression: display the most recently fetched `batch` dict ('input_ids' and 'labels' tensors).
batch

{'input_ids': tensor([[ 6814,  7252,   321,  ...,  1381,   407,   257],
         [ 3137,   925,   340,  ...,     0,     0,     0],
         [   72,  1842,   345,  ...,     0,     0,     0],
         ...,
         [ 1820,  3329,   318,  ...,     0,     0,     0],
         [   72,  8138,   479,  ...,   788,  1312,  3285],
         [20342,     0,     0,  ...,     0,     0,     0]]),
 'labels': tensor([[ 7252,   321,  3020,  ...,   407,   257,   922],
         [  925,   340,   832,  ...,     0,     0,     0],
         [ 1842,   345,  1165,  ...,     0,     0,     0],
         ...,
         [ 3329,   318,  8805,  ...,     0,     0,     0],
         [ 8138,   479,  5892,  ...,  1312,  3285, 23611],
         [19462,     0,     0,  ...,     0,     0,     0]])}

In [12]:
# Select the compute device again (same expression as the earlier device cell):
# CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [13]:
import torch
from torch import nn

class LSTMGenerateWord(nn.Module):
    """
    Token-level LSTM language model: embedding -> stacked LSTM -> dropout -> linear vocabulary head.

    Args:
        vocab_size: number of token ids; sizes the embedding table and the output layer.
        embedding_dim: size of the token embeddings.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: probability of the dropout applied to the LSTM outputs before the
            linear head (it is not applied between the LSTM layers).
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, num_layers=2, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """
        Compute next-token logits for every position of `x`.

        Args:
            x: integer token ids, batch first, i.e. shaped (batch, seq_len).
            hidden: optional (h, c) LSTM state to continue from; None starts from zeros.

        Returns:
            (logits, hidden): logits shaped (batch, seq_len, vocab_size) and the LSTM
            state after the last position.
        """
        x = self.embedding(x)
        # nn.LSTM treats hidden=None as a zero initial state, so no branching is needed.
        lstm_out, hidden = self.lstm(x, hidden)
        lstm_out = self.dropout(lstm_out)
        out = self.fc(lstm_out)

        return out, hidden

    def generate(self, tokenizer, prompt, max_length=20, device='cpu'):
        """
        Greedily continue `prompt` until `max_length` total tokens or an eos token.

        Args:
            tokenizer: object providing `encode(text, return_tensors='pt')`,
                `decode(ids, skip_special_tokens=True)` and `eos_token_id`.
            prompt: text to continue; it is lower-cased before encoding.
            max_length: maximum total number of tokens (prompt included).
            device: device the encoded prompt is moved to; must match the model's device.

        Returns:
            The decoded text of the prompt followed by the generated tokens.

        Raises:
            ValueError: if the prompt encodes to zero tokens.
        """
        tokens = tokenizer.encode(prompt.lower(), return_tensors='pt').to(device)
        if tokens.size(1) == 0:
            raise ValueError("prompt must encode to at least one token")

        # Remember the current mode so generating in the middle of training does not
        # silently leave the model in eval mode (dropout disabled).
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                generated = tokens
                # Feed the whole prompt once, then only the newest token together with
                # the carried LSTM state instead of re-running the full prefix each step.
                step_input, hidden = tokens, None

                for _ in range(max_length - tokens.size(1)):
                    logits, hidden = self(step_input, hidden)
                    next_token_logits = logits[:, -1, :]  # logits at the last position
                    next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
                    generated = torch.cat([generated, next_token], dim=1)

                    if next_token.item() == tokenizer.eos_token_id:
                        break
                    step_input = next_token

                return tokenizer.decode(generated[0], skip_special_tokens=True)
        finally:
            self.train(was_training)

In [14]:
from torch import nn
from torch.optim import Adam
#from src.lstm_model import LSTMGenerateWord

# Fresh LSTM language model sized to the tokenizer vocabulary, plus its optimizer and loss.
model = LSTMGenerateWord(vocab_size=tokenizer.vocab_size).to(device)
optimizer = Adam(model.parameters(), lr=1e-3)
# NOTE(review): ignore_index=-100 skips targets equal to -100, but collate_fn pads labels with
# its pad_token_id by default, so padded positions are not ignored unless labels are remapped.
criterion = nn.CrossEntropyLoss(ignore_index=-100)

In [55]:
from torch import nn
from torch.optim import Adam
#from src.lstm_model import LSTMGenerateWord

# Fresh LSTM language model sized to the tokenizer vocabulary, plus its optimizer and loss.
model = LSTMGenerateWord(vocab_size=tokenizer.vocab_size).to(device)
optimizer = Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
num_epochs = 5
for epoch in range(num_epochs):

    model.train()
    running_loss = 0.0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # collate_fn pads labels with the pad token id, while the loss only ignores -100.
        # Remap padded targets so they do not contribute to the loss; labels that are
        # already -100 are left untouched.
        # NOTE(review): pad and eos share one id, so a genuine eos target would be masked
        # too; the dataset samples printed in this notebook contain none — confirm.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        optimizer.zero_grad()
        outputs, _ = model(input_ids)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets for token-level CE.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean batch loss for the epoch; max() guards against an empty loader.
    epoch_loss = running_loss / max(len(train_dataloader), 1)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')


KeyboardInterrupt: 

In [20]:
import evaluate
from rouge_score import rouge_scorer

#rouge = evaluate.load('rouge')
# ROUGE-1 / ROUGE-2 evaluation of the LSTM on the validation set: each sequence is split
# into a prompt (first 75% of its tokens) and a target (the rest); the model continues the
# prompt greedily and only the continuation is compared with the target.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=True)

model.eval()
total_rouge1 = 0
total_rouge2 = 0
count = 0
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        for i in range(input_ids.size(0)):
            # Real (non-padding) targets; padding may be either -100 or the pad token id.
            keep = (labels[i] != -100) & (labels[i] != tokenizer.pad_token_id)
            label_ids = labels[i][keep]
            if label_ids.numel() == 0:
                continue

            # Labels are the inputs shifted left by one (see the dataset samples printed
            # in this notebook), so first input token + labels is the full sequence.
            sequence = torch.cat([input_ids[i][:1], label_ids])

            # Split on the SEQUENCE length (not the batch size). The sequence has at
            # least two tokens here, so both sides of the split are non-empty.
            seg_len = int(sequence.size(0) * 0.75)

            prompt = tokenizer.decode(sequence[:seg_len], skip_special_tokens=True)
            target = tokenizer.decode(sequence[seg_len:], skip_special_tokens=True)
            if not prompt.strip() or not target.strip():
                # Nothing to continue from, or nothing to compare against.
                continue

            generated = model.generate(tokenizer, prompt, max_length=30, device=device)

            # generate() returns prompt + continuation; score the continuation only so
            # the prompt words do not distort the overlap with the target.
            continuation = generated[len(prompt):] if generated.startswith(prompt) else generated

            results = scorer.score(target, continuation)

            total_rouge1 += results['rouge1'].fmeasure
            total_rouge2 += results['rouge2'].fmeasure
            count += 1

if count:
    print(f"LSTM ROUGE-1: {total_rouge1/count:.4f}")
    print(f"LSTM ROUGE-2: {total_rouge2/count:.4f}")
else:
    print("No examples were scored.")



KeyboardInterrupt: 