In [4]:
# %% [markdown]
# # Запуск проекта автодополнения текста из модулей

# %%
# Автоматическая перезагрузка модулей при изменениях кода
%load_ext autoreload
%autoreload 2

# %%
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split

# Imports from the project's own modules
from data_utils import process_file_and_save, tokens_to_indices, pad_sequences_torch, compute_rouge
from next_token_dataset import TextDataset
from lstm_model import LSTMModel
from eval_lstm import train_epoch, eval_epoch

# Report the directory the notebook is running from.
current_dir = os.getcwd()
print("Текущая рабочая директория:", current_dir)

# %%
# Device selection: use CUDA when torch reports it as available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Используем устройство: {device}")

# %%
# Path to the raw input data (one text per line; read again by the distilgpt2 evaluation).
input_file = '/home/assistant/text-autocomplete/data/tweets.txt'

# %%
# Data preparation: load the samples produced by the project's preprocessing helper.
samples = process_file_and_save(input_file)

# Collect every token: first all 'X' tokens of all samples, then all 'Y' tokens.
all_tokens = [
    token
    for field in ('X', 'Y')
    for sample in samples
    for token in sample[field]
]

# Vocabulary ids are assigned in sorted token order and start at 1, so id 0 is
# never given to a token (presumably reserved for padding -- TODO confirm in
# pad_sequences_torch). vocab_size counts that extra id 0 as well.
unique_tokens = sorted(set(all_tokens))
vocab = {token: index for index, token in enumerate(unique_tokens, start=1)}
vocab_size = 1 + len(vocab)

X_indices, Y_indices = tokens_to_indices(samples, vocab)

# Both inputs and targets are padded to the longest *input* sequence.
# NOTE(review): assumes no target sequence is longer than that -- confirm.
max_len = max(map(len, X_indices))
X_pad, Y_pad = (
    pad_sequences_torch(sequences, max_len, torch)
    for sequences in (X_indices, Y_indices)
)

# %%
# Split into training, validation and test sets: 20% is held out first and then
# halved, giving an 80% / 10% / 10% split with a fixed random_state.
X_train, X_temp, Y_train, Y_temp = train_test_split(X_pad, Y_pad, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)


def tensor_to_str_list(tensor):
    """Render every sequence of ``tensor`` as a space-separated string.

    Args:
        tensor: Iterable whose elements provide ``.tolist()`` returning a
            sequence of values (e.g. the rows of a 2-D torch tensor).

    Returns:
        list[str]: One string per element, values joined by single spaces.
    """
    return [' '.join(map(str, seq.tolist())) for seq in tensor]


# Save the splits as CSV (optional).
# The directory that is created must be the one the files are written to: the
# previous relative os.makedirs('data') created a folder under the current
# working directory while the CSVs go to this absolute location.
data_dir = '/home/assistant/text-autocomplete/data'
os.makedirs(data_dir, exist_ok=True)

splits = {
    'train': (X_train, Y_train),
    'val': (X_val, Y_val),
    'test': (X_test, Y_test),
}
for split_name, (X_split, Y_split) in splits.items():
    split_frame = pd.DataFrame({'X': tensor_to_str_list(X_split), 'Y': tensor_to_str_list(Y_split)})
    split_frame.to_csv(os.path.join(data_dir, f'{split_name}.csv'), index=False, encoding='utf-8')

# %%
# Wrap the training and validation splits in Dataset objects and build their DataLoaders.
batch_size = 64
train_dataset, val_dataset = TextDataset(X_train, Y_train), TextDataset(X_val, Y_val)

# Only the training loader shuffles; the validation loader keeps DataLoader's default order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

# %%
# Model, loss function and optimizer.
embedding_dim, hidden_dim = 128, 256
model = LSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)

# Targets equal to 0 do not contribute to the loss (id 0 is never assigned to a vocabulary token).
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)

# Adam with its default hyperparameters.
trainable_parameters = model.parameters()
optimizer = torch.optim.Adam(trainable_parameters)

# %%
# Train the model and evaluate it on the validation loader after every epoch,
# recording both losses so they can be plotted afterwards.
epochs = 10
train_losses, val_losses = [], []

for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = eval_epoch(model, val_loader, criterion, device)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    progress_line = f"Epoch {epoch+1}/{epochs} - train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}"
    print(progress_line)

# %%
# Plot the training and validation loss per epoch.
# The x-axis follows the number of losses actually recorded instead of `epochs`:
# when training is interrupted early the lists are shorter than `epochs`, and
# plotting them against range(1, epochs + 1) fails with a shape mismatch.
plt.figure(figsize=(10,6))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

# %%
# Save the trained model weights.
# The directory is derived from the target path so that the folder being created
# is the one torch.save writes into (a relative 'models' folder would be created
# under the current working directory instead), and the message reports the
# path that was really used.
model_path = '/home/assistant/text-autocomplete/models/model_lstm_weights.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)
print(f"Модель сохранена по пути {model_path}")

# %%
# Интерактивная генерация текста LSTM (пример)
# Пример использования генерации можно добавить здесь

# %% 
# Дополнительно: выполнение кода из eval_transformer_pipeline.py
# импортируем необходимые функции и запускаем оценку трансформера distilgpt2

from transformers import pipeline, AutoTokenizer
import re
from data_utils import compute_rouge

# Text-generation pipeline and tokenizer, both loaded from the same checkpoint name.
distilgpt2_checkpoint = "distilgpt2"
generator_distilgpt2 = pipeline("text-generation", model=distilgpt2_checkpoint)
tokenizer_distilgpt2 = AutoTokenizer.from_pretrained(distilgpt2_checkpoint)

def split_text_for_completion(text):
    """Split ``text`` into a prompt and the reference continuation.

    The text is tokenized with ``tokenizer_distilgpt2``. The first three
    quarters of the tokens (integer division) are converted back into the
    prompt string, the remaining tokens into the reference string.

    Args:
        text: The string to split.

    Returns:
        tuple: ``(input_text, target_text)``; the prompt is empty when the
        text yields fewer than two tokens.
    """
    pieces = tokenizer_distilgpt2.tokenize(text)
    boundary = 3 * len(pieces) // 4
    head, tail = pieces[:boundary], pieces[boundary:]
    to_string = tokenizer_distilgpt2.convert_tokens_to_string
    return to_string(head), to_string(tail)

def clean_text_for_distilgpt2(text):
    """Normalize raw text before it is passed to distilgpt2.

    Steps, in order: lowercase; drop substrings starting with ``http`` or
    ``www`` up to the next whitespace; drop ``@mentions``; drop every character
    that is neither a word character nor whitespace; collapse whitespace runs
    into a single space and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        str: The cleaned string (possibly empty).
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r'http\S+|www\S+', '', 0),
        (r'@\w+', '', 0),
        (r'[^\w\sа-яёa-z]', '', re.UNICODE),
        (r'\s+', ' ', 0),
    )
    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

# Read the raw lines, keep those longer than 10 characters once stripped, and
# clean them. The file is opened in a `with` block so the handle is closed as
# soon as the lines have been read.
with open(input_file, encoding='utf-8') as raw_file:
    texts = [clean_text_for_distilgpt2(line) for line in raw_file if len(line.strip()) > 10]

# Hold out 20% of the cleaned texts for evaluation (fixed random_state).
_, val_texts = train_test_split(texts, test_size=0.2, random_state=42)

rouge1_gpt2_scores, rouge2_gpt2_scores = [], []

print("Оцениваем модель distilgpt2...")

# Only the first 50 held-out texts are scored.
for idx, text in enumerate(val_texts[:50]):
    input_text, ref_text = split_text_for_completion(text)

    # The length filter above runs before cleaning, so a cleaned text can be
    # empty or too short to split. Without a prompt or a reference there is
    # nothing to generate from or to score against, so such texts are skipped.
    if not input_text.strip() or not ref_text.strip():
        continue

    # max_length is set to the token count of prompt + reference.
    # Sampling is enabled and no seed is set in this cell, so scores vary between runs.
    total_token_count = len(tokenizer_distilgpt2.encode(input_text + ref_text))
    gpt2_out = generator_distilgpt2(input_text,
                                    max_length=total_token_count,
                                    do_sample=True, top_k=50, num_return_sequences=1)

    # The generated text starts with the prompt; keep only what follows it.
    gpt2_pred = gpt2_out[0]['generated_text'][len(input_text):].strip()
    r1_gpt2, r2_gpt2 = compute_rouge(ref_text, gpt2_pred)
    rouge1_gpt2_scores.append(r1_gpt2)
    rouge2_gpt2_scores.append(r2_gpt2)

    # Show the first few examples in detail.
    if idx < 5:
        print(f"\nПример #{idx+1}")
        print("Вход:", input_text)
        print("Эталон:", ref_text)
        print(f"distilgpt2 предсказание: {gpt2_pred}")
        print(f"ROUGE-1: {r1_gpt2:.3f}, ROUGE-2: {r2_gpt2:.3f}")

print(f"\nСреднее ROUGE-1 distilgpt2: {np.mean(rouge1_gpt2_scores):.3f}")
print(f"Среднее ROUGE-2 distilgpt2: {np.mean(rouge2_gpt2_scores):.3f}")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Текущая рабочая директория: /home/assistant/text-autocomplete/src
Используем устройство: cuda
Raw dataset saved to data/raw_dataset.csv
Tokenized dataset saved to data/dataset_processed.csv
Epoch 1/10 - train_loss: 3.9236, val_loss: 3.6846


KeyboardInterrupt: 