# 1 Предобработка

### 1.1 Очистка и нормализация текста

In [None]:
import pandas as pd

data = pd.read_csv(
    r'data\raw_dataset.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text']
)

data.head()

: 

In [None]:
data['target'].hist()

В нижний регистр

In [None]:
text_lower = data['text'].str.lower()

text_lower

Удаление ссылок итд + удаление дубликатов

In [None]:
import re

# ссылки и домены
URL_RE = re.compile(r'(https?://\S+|www\.\S+|[a-zA-Z0-9\-]+\.[a-zA-Z]{2,})')
# упоминания
MENTION_RE = re.compile(r'@\w+')
# хэштеги
HASHTAG_RE = re.compile(r'#\w+')

# оставить только латиницу/цифры/пробелы и пунктуацию . , '
ALLOWED_CHARS_RE = re.compile(r"[^a-z0-9\s\.\,']")

# схлопывание повторов:
# 1) буквы: 3+ одинаковых подряд -> 2 (cooool -> coool -> coo)
LETTER_RUNS_RE = re.compile(r'([a-z])\1{2,}')
# 2) пунктуация . , ' : 2+ -> 1 (.... -> .  ,, -> ,  ''' -> ')
PUNCT_RUNS_RE = re.compile(r"([\.\,'])\1+")

def clean(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()

    # убрать ссылки/упоминания/хэштеги
    text = URL_RE.sub(" ", text)
    text = MENTION_RE.sub(" ", text)
    text = HASHTAG_RE.sub(" ", text)

    # убрать все, кроме [a-z0-9] пробелов и . , '
    text = ALLOWED_CHARS_RE.sub(" ", text)

    # схлопнуть длинные буквы (оставить максимум две подряд)
    text = LETTER_RUNS_RE.sub(r"\1\1", text)

    # схлопнуть повторы пунктуации до одной
    text = PUNCT_RUNS_RE.sub(r"\1", text)

    # убрать лишние пробелы
    text = re.sub(r"\s+", " ", text).strip()
    return text



clean_text_lower = text_lower.apply(clean)
clean_text_lower = clean_text_lower.drop_duplicates().reset_index(drop=True)

clean_text_lower

Статистика

In [None]:
lengths = [len(w) for s in clean_text_lower for w in s.split()]


print(pd.Series(lengths, name="word_len").describe().round(2))
pd.Series(lengths, name="word_len").hist(bins = 100)

### 1.2 Токенизация

In [None]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token 

texts = clean_text_lower.astype(str).tolist()

enc = tokenizer(
    texts,
    truncation=True,
    padding=False,
    max_length=32,
    return_attention_mask=True,
    return_tensors=None
)

input_ids = [ids + [tokenizer.eos_token_id] for ids in enc["input_ids"]]
input_ids

### 1.3 Разделение на трейн, валидацию и тест

In [None]:
from sklearn.model_selection import train_test_split


data = pd.DataFrame({'text': texts, "input_ids": input_ids})

train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

In [None]:
import os


os.makedirs("data", exist_ok=True)

train_df.to_csv('data/train.csv', index = False)
val_df.to_csv('data/val.csv', index = False)
test_df.to_csv('data/test.csv', index = False)

## 2 Обучение модели

In [None]:
import pandas as pd
import ast
import os
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer

from src.dataset import NextTokenDataset
from src.model import RNNAutocompletion


data_train = pd.read_csv(r"data\train.csv")
data_train["input_ids"] = data_train["input_ids"].apply(ast.literal_eval)

data_val = pd.read_csv(r"data\val.csv")
data_val["input_ids"] = data_val["input_ids"].apply(ast.literal_eval)


EXP_NAME = "exp2"
TRAIN_MAX_LENGTH = 32
EPOCHS = 100
BATCH_SIZE = 4048
LR = 2e-3
DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

dataset_train = NextTokenDataset(data_train, pad_token=pad_token_id, max_length=TRAIN_MAX_LENGTH)
dataset_val = NextTokenDataset(data_val, pad_token=pad_token_id, max_length=TRAIN_MAX_LENGTH)

dl_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
dl_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

model = RNNAutocompletion(
    vocab_size=vocab_size,
    pad_token_id=pad_token_id,
    dim=DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
    ).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=10, T_mult=2, eta_min=1e-5
)

save_path = os.path.join("exp", EXP_NAME)
os.makedirs(f"{save_path}/weights", exist_ok=True)
writer = SummaryWriter(log_dir=f"{save_path}/logs")

best_val_loss = float("inf")
train_step = 0

for epoch in range(EPOCHS):

    model.eval()
    val_losses = []
    val_correct = 0
    val_correct_top5 = 0
    val_total = 0

    with torch.no_grad(), torch.cuda.amp.autocast():
        for batch in tqdm(dl_val, desc=f"Epoch {epoch} valid", unit="batch"):
            input_ids = batch["input_ids"].to(device)
            lengths = batch["length"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            val_losses.append(loss.item())

            preds = logits.argmax(dim=-1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.numel()

            _, top5 = logits.topk(5, dim=-1)
            val_correct_top5 += (top5 == labels.unsqueeze(-1)).any(dim=-1).sum().item()

    val_loss = np.mean(val_losses)
    val_ppl = np.exp(val_loss)
    val_acc = val_correct / val_total
    val_acc_top5 = val_correct_top5 / val_total

    print(f"epoch {epoch} valid loss: {val_loss:.4f} | ppl: {val_ppl:.2f} "f"| acc@1: {val_acc:.4f} | acc@5: {val_acc_top5:.4f}")
    writer.add_scalar("Loss/valid", val_loss, epoch)
    writer.add_scalar("Perplexity/valid", val_ppl, epoch)
    writer.add_scalar("Acc/valid",  val_acc,  epoch)
    writer.add_scalar("Acc/valid_top5", val_acc_top5, epoch)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f"{save_path}/weights/best.pt")

    model.train()
    train_losses = []

    for batch in tqdm(dl_train, desc=f"Epoch {epoch} train", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        lengths = batch["length"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)                                
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
        scaler.step(optimizer)                                    
        scaler.update()

        train_losses.append(loss.item())
        writer.add_scalar("Loss/train_step", loss.item(), train_step)
        scheduler.step()
        train_step += 1

    train_loss = np.mean(train_losses)
    print(f"epoch {epoch} train loss: {train_loss:.4f}")
    writer.add_scalar("Loss/train", train_loss, epoch)
    writer.add_scalar("LR", scheduler.get_last_lr()[0], train_step)


torch.save(model.state_dict(), f"{save_path}/weights/last.pt")
writer.close()

In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import ast
from transformers import AutoTokenizer
import evaluate
import torch
from torch.utils.data import DataLoader

from src.model import RNNAutocompletion
from src.dataset import NextTokenDataset


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
MODEL_PATH = "exp/exp1/weights/best.pt"
TRAIN_MAX_LENGTH = 32
BATCH_SIZE = 1024
TEST_DATA_PATH = r"data\test.csv"

tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

data_test = pd.read_csv(TEST_DATA_PATH)
data_test["input_ids"] = data_test["input_ids"].apply(ast.literal_eval)

dataset_test = NextTokenDataset(data_test, pad_token=pad_token_id, max_length=TRAIN_MAX_LENGTH)
dl_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

model = RNNAutocompletion(
    vocab_size=vocab_size,
    pad_token_id=pad_token_id,
    dim=DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
).to(device)

checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint)
model.eval()

test_correct = 0
test_correct_top5 = 0
test_total = 0

with torch.no_grad(), torch.cuda.amp.autocast():
    for batch in tqdm(dl_test, desc=f"Test", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        lengths = batch["length"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, lengths)

        preds = logits.argmax(dim=-1)
        test_correct += (preds == labels).sum().item()
        test_total += labels.numel()

        _, top5 = logits.topk(5, dim=-1)
        test_correct_top5 += (top5 == labels.unsqueeze(-1)).any(dim=-1).sum().item()

test_acc = test_correct / test_total
test_acc_top5 = test_correct_top5 / test_total

print(f"Test acc@1: {test_acc:.4f} | acc@5: {test_acc_top5:.4f}")

## Подбор параметров генерации для distilgpt2

In [2]:
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd
import ast
import torch
from tqdm import tqdm
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# === Константы ===
VAL_DATA_PATH     = r"data\val.csv"
GPT_MODEL_NAME    = "distilgpt2"
BATCH_SIZE        = 1024
TRAIN_MAX_LENGTH  = 32    # сколько токенов берём в контекст
MAX_REF_TOKENS    = 32    # сколько токенов берём в референс (и генерим)

# === Данные ===
data_val = pd.read_csv(VAL_DATA_PATH)
data_val["input_ids"] = data_val["input_ids"].apply(ast.literal_eval)

# === Токенизатор/модель/пайплайн ===
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(GPT_MODEL_NAME).to(device)
if device.type == "cuda":
    model = model.half()

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if device.type == "cuda" else -1
)

rouge = evaluate.load("rouge")

def ids_to_text(ids):
    return tokenizer.decode(ids, skip_special_tokens=True)

# === Готовим пары (context_text, reference_text) прямо из input_ids ===
contexts, references = [], []
for ids in data_val["input_ids"]:
    n = len(ids)
    if n < 2:
        continue
    pivot = min(TRAIN_MAX_LENGTH, n-1)           # гарантируем, что GT не пустой
    ctx_ids = ids[:pivot]
    ref_ids = ids[pivot : min(n, pivot + MAX_REF_TOKENS)]

    contexts.append(ids_to_text(ctx_ids))
    references.append(ids_to_text(ref_ids))

print(len(contexts), len(data_val["input_ids"]))


# === Генерация батчами (только продолжение, без контекста) ===
predictions = []
for i in tqdm(range(0, len(contexts), BATCH_SIZE), desc="Gen", unit="batch"):
    batch_ctx = contexts[i:i+BATCH_SIZE]
    outs = generator(
        batch_ctx,
        max_new_tokens=MAX_REF_TOKENS,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        return_full_text=False,
        batch_size=BATCH_SIZE
    )
    preds_batch = [o[0]["generated_text"] for o in outs]
    predictions.extend(preds_batch)

# === ROUGE по “чистым” продолжениям
scores = rouge.compute(predictions=predictions, references=references)
print(scores)


Device: cuda


Gen: 100%|██████████| 299/299 [09:04<00:00,  1.82s/batch]


{'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}


In [None]:




device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



rouge = evaluate.load("rouge")

batch_size = 16
context_lengths = [1, 3, 9]
max_gen_tokens = 32

for n in context_lengths:
    print(f"\n==== Context length: {n} ====")
    
    all_predictions = []
    all_references = []

    for i in range(0, len(data_test.input_ids), batch_size):
        batch_messages = data_test.input_ids[i : i + batch_size]

        contexts = [msg[:n] for msg in batch_messages]
        contexts = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(c, dtype=torch.long) for c in contexts],
            batch_first=True,
            padding_value=pad_token_id
        ).to(device)

        references = [msg[n:] for msg in batch_messages]

        with torch.no_grad():
            generated_batch = model.generate(start_tokens=contexts, max_tokens=max_gen_tokens)

        for gen_tokens, ref in zip(generated_batch, references):
            pred_text = tokenizer.decode(gen_tokens.tolist(), skip_special_tokens=True)
            ref_text = tokenizer.decode(ref, skip_special_tokens=True)

            all_predictions.append(pred_text)
            all_references.append(ref_text)

    results = rouge.compute(predictions=all_predictions, references=all_references)
    print(f"ROUGE scores for context={n}: {results}")