# Автодополнение текста —

### Этап 0. Подготовка данных.


In [1]:
import re
import zipfile

# Unpack the archive
with zipfile.ZipFile('data/raw_data.txt.zip', 'r') as z:
    z.extractall('data/')

# Read the raw data
with open('data/raw_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Original length: {len(text)} chars")
sample_chars = 300
print(f"Sample (first {sample_chars} chars):")
print(text[:sample_chars])

Original length: 120184721 chars
Sample (first 300 chars):
@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
@Kenichan I dived many times for the ball. Managed to save 50%  The rest


In [2]:
import emoji

# Lowercase
text = text.lower()

# Remove URLs (http/https/www links)
text = re.sub(r'http\S+|www\.\S+', '', text)

# Remove mentions (@username)
text = re.sub(r'@\w+', '', text)

# Remove emojis
text = emoji.replace_emoji(text, replace='')

# Remove all symbols except letters, numbers, and whitespace
text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s]', '', text)

# Remove duplicate whitespaces (spaces, tabs, etc.) and strip lines
text = re.sub(r'[^\S\n]+', ' ', text)   # collapse whitespaces to single space
text = re.sub(r' *\n *', '\n', text)    # clean spaces around newlines
text = re.sub(r'\n{2,}', '\n', text)    # collapse multiple newlines
text = text.strip()

print(f"Cleaned length: {len(text)} chars")
print(f"\nCleaned text (first {sample_chars} chars):")
print(text[:sample_chars])

Cleaned length: 101272763 chars

Cleaned text (first 300 chars):
awww thats a bummer you shoulda got david carr of third day to do it d
is upset that he cant update his facebook by texting it and might cry as a result school today also blah
i dived many times for the ball managed to save 50 the rest go out of bounds
my whole body feels itchy and like its on fire



In [3]:
# Save cleaned data
with open('data/clean_data.txt', 'w', encoding='utf-8') as f:
    f.write(text)

print("Saved to data/clean_data.txt")

Saved to data/clean_data.txt


### Этап 1. Токенизация и подготовка словаря

In [4]:
import json
from collections import Counter

# Загрузка очищенных данных
with open('data/clean_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Разбиваем текст на токены
tokens = text.split()
print(f"Total tokens: {len(tokens):,}")

# Строим словарь из тренинг данных
n = len(tokens)
train_end = int(0.8 * n)
train_tokens = tokens[:train_end]

# Считаем частоту слов из выборки для тренинга
counter = Counter(train_tokens)
VOCAB_SIZE = 20_000  # Размер словаря + <UNK>

vocab = {'<UNK>': 0}
for word, _ in counter.most_common(VOCAB_SIZE - 1):
    vocab[word] = len(vocab)

print(f"Vocabulary size: {len(vocab):,}")
print(f"Most common words: {counter.most_common(10)}")

# Save vocabulary
with open('data/vocab.json', 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False)

print("Saved vocabulary to data/vocab.json")

Total tokens: 19,995,368
Vocabulary size: 20,000
Most common words: [('i', 629331), ('to', 460330), ('the', 411280), ('a', 296646), ('my', 261286), ('and', 237489), ('you', 195161), ('is', 191042), ('it', 184570), ('in', 172956)]
Saved vocabulary to data/vocab.json


### Разбиение данных: Тренировка / Валидация / Тестирование

In [5]:
import csv

SEQ_LEN = 20  # размер входной последовательности для модели

val_end = int(0.9 * n)
val_tokens = tokens[train_end:val_end]
test_tokens = tokens[val_end:]

# сохраняем токены для тренировки
with open('data/train_data.txt', 'w', encoding='utf-8') as f:
    f.write(' '.join(train_tokens))

print(f"Train: {len(train_tokens):,} tokens -> data/train_data.txt")

# Нарезает список токенов на неперекрывающиеся окна и сохраняет как CSV с двумя колонками: input (входная последовательность) и target (слово, которое нужно предсказать).   Допустим tokens = ['i', 'love', 'my', 'cat', 'and', 'dog', ...] и seq_len = 3. Input =  i love my ; target = cat.
def save_as_csv(tokens, filepath, seq_len):
    rows = 0
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['input', 'target'])
        for i in range(0, len(tokens) - seq_len, seq_len):
            input_seq = ' '.join(tokens[i:i + seq_len])
            target = tokens[i + seq_len]
            writer.writerow([input_seq, target])
            rows += 1
    return rows

val_rows = save_as_csv(val_tokens, 'data/val_data.csv', SEQ_LEN)
test_rows = save_as_csv(test_tokens, 'data/test.csv', SEQ_LEN)

print(f"Val:   {len(val_tokens):,} tokens -> {val_rows:,} samples -> data/val_data.csv")
print(f"Test:  {len(test_tokens):,} tokens -> {test_rows:,} samples -> data/test.csv")

Train: 15,996,294 tokens -> data/train_data.txt
Val:   1,999,537 tokens -> 99,976 samples -> data/val_data.csv
Test:  1,999,537 tokens -> 99,976 samples -> data/test.csv


### Подготовка Dataset и DataLoader

In [16]:
import json
import torch
from torch.utils.data import DataLoader
from src.next_token_dataset import NextTokenDataset

# Load vocabulary
with open('data/vocab.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)

# Load training tokens
with open('data/train_data.txt', 'r', encoding='utf-8') as f:
    train_tokens = f.read().split()

# Create datasets (val_tokens / test_tokens are already in memory from the split above)
train_dataset = NextTokenDataset(train_tokens, vocab, seq_len=SEQ_LEN)
val_dataset = NextTokenDataset(val_tokens, vocab, seq_len=SEQ_LEN)

# Create DataLoaders
use_pin_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2, pin_memory=use_pin_memory)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2, pin_memory=use_pin_memory)

print(f"Train dataset: {len(train_dataset):,} samples")
print(f"Val dataset:   {len(val_dataset):,} samples")
print(f"Train batches: {len(train_loader):,}")
print(f"Val batches:   {len(val_loader):,}")

# Inspect one batch
x_batch, y_batch = next(iter(train_loader))
print(f"\nBatch shapes: x={x_batch.shape}, y={y_batch.shape}")

Train dataset: 15,996,274 samples
Val dataset:   1,999,517 samples
Train batches: 62,486
Val batches:   7,811

Batch shapes: x=torch.Size([256, 20]), y=torch.Size([256])


## Этап 2. Рекуррентная сеть (LSTM)

### Архитектура модели

In [9]:
import torch
from src.lstm_model import LSTMLanguageModel

VOCAB_SIZE = len(vocab)     # Размер словаря токенов. Определяет количество строк в Embedding-таблице и размер выхода финального Linear-слоя
EMBED_DIM  = 128            # Размерность векторов слов. Каждый индекс слова превращается в вектор этой размерности перед подачей в LSTM
HIDDEN_DIM = 256            # Размер скрытого слоя LSTM. Че больше, тем больше ёмкость для запоминания контекста, но медленнее и выше риск переобучения
NUM_LAYERS = 2              # Количество слоёв LSTM. Для двух слоёв - Слой 1 обрабатывает эмбеддинги, слой 2 — выход слоя 1. Добавляет глубину модели
DROPOUT    = 0.3            # Вероятность обнуления выходов между слоями LSTM (0.3 = 30%). Применяется только между слоями, поэтому игнорируется при num_layers == 1. Регуляризация для снижения переобучения


# MPS - для запуска на M4Max
# CUDA - для запуска на NVidia Jetson Orin Nano или на видео-карте
# CPU - по дефолту
device = torch.device('mps' if torch.backends.mps.is_available()
                       else 'cuda' if torch.cuda.is_available()
                       else 'cpu')
print(f"Device: {device}")

model = LSTMLanguageModel(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
).to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params:,}")
print(model)

Device: mps
Model parameters: 8,621,600
LSTMLanguageModel(
  (embedding): Embedding(20000, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=256, out_features=20000, bias=True)
)


### Проверка forward pass

In [10]:
# Test forward pass with a single batch
x_test = x_batch.to(device)   # (256, 20)
logits, hidden = model(x_test)

print(f"Input shape:   {x_test.shape}")
print(f"Logits shape:  {logits.shape}")    # expected: (256, 20000)
print(f"Hidden h shape:{hidden[0].shape}") # expected: (num_layers, 256, hidden_dim)
print(f"Hidden c shape:{hidden[1].shape}")

Input shape:   torch.Size([256, 20])
Logits shape:  torch.Size([256, 20000])
Hidden h shape:torch.Size([2, 256, 256])
Hidden c shape:torch.Size([2, 256, 256])


### Проверка генерации текста (до обучения — случайные веса)

In [17]:
# Build reverse mapping: index -> word
idx2word = {idx: word for word, idx in vocab.items()}

#prompt = ['i', 'love', 'machine', 'learning']
prompt = ['is','upset', 'that', 'he', 'cant', 'update', 'his', 'facebook', 'by', 'texting', 'it', 'and', 'might']
generated = model.generate(
    prompt=prompt,
    vocab=vocab,
    idx2word=idx2word,
    max_new_tokens=20,
    temperature=1.0
)

print("Prompt:    ", ' '.join(prompt))
print("Generated: ", ' '.join(generated))
print("Full text: ", ' '.join(prompt + generated))

Prompt:     is upset that he cant update his facebook by texting it and might
Generated:  12pm oi boobie eye hopes zane ignorance bullies tue workstation convos mabye congratulate never katie cravin fork willing schools siento
Full text:  is upset that he cant update his facebook by texting it and might 12pm oi boobie eye hopes zane ignorance bullies tue workstation convos mabye congratulate never katie cravin fork willing schools siento
