In [1]:
import pandas as pd
import ast

data_train = pd.read_csv(r"data\train.csv")
data_train["input_ids"] = data_train["input_ids"].apply(ast.literal_eval)

data_val = pd.read_csv(r"data\val.csv")
data_val["input_ids"] = data_val["input_ids"].apply(ast.literal_eval)

data_test = pd.read_csv(r"data\test.csv")
data_test["input_ids"] = data_test["input_ids"].apply(ast.literal_eval)
data_test

Unnamed: 0,text,input_ids
0,my mom amp i love u we had a very sad day june...,"[1820, 1995, 20766, 1312, 1842, 334, 356, 550,..."
1,i'm going to prom with my lovely bride,"[72, 1101, 1016, 284, 1552, 351, 616, 14081, 2..."
2,dude that really sucks. just this morning i wa...,"[67, 2507, 326, 1107, 22523, 13, 655, 428, 332..."
3,thanks for following me back xoxoxo,"[27547, 329, 1708, 502, 736, 2124, 1140, 1140,..."
4,quot hi quot problem solved.,"[421, 313, 23105, 23611, 1917, 16019, 13, 50256]"
...,...,...
152690,"say hi to my friends kirsty, maree and anh i m...","[16706, 23105, 284, 616, 2460, 479, 667, 88, 1..."
152691,quot part of the list quot and quot kick him o...,"[421, 313, 636, 286, 262, 1351, 23611, 290, 23..."
152692,if i hind one of my hoochies thn imma slide th...,"[361, 1312, 16222, 530, 286, 616, 289, 2238, 3..."
152693,no more heat ftw,"[3919, 517, 4894, 277, 4246, 50256]"


In [None]:
import os
from tqdm import tqdm

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer

from src.dataset import NextTokenDataset
from src.model import RNNAutocompletion


EXP_NAME = "exp1"
TRAIN_MAX_LENGTH = 32
EPOCHS = 10
BATCH_SIZE = 2048
LR = 2e-3
DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

tokenizer = AutoTokenizer.from_pretrained("distilgpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

dataset_train = NextTokenDataset(data_train, pad_token=pad_token_id, max_length=TRAIN_MAX_LENGTH)
dataset_val = NextTokenDataset(data_val, pad_token=pad_token_id, max_length=TRAIN_MAX_LENGTH)

dl_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
dl_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

model = RNNAutocompletion(
    vocab_size=vocab_size,
    pad_token_id=pad_token_id,
    dim=DIM,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT
    ).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

save_path = os.path.join("exp", EXP_NAME)
os.makedirs(f"{save_path}/weights", exist_ok=True)
writer = SummaryWriter(log_dir=f"{save_path}/logs")

best_val_loss = float("inf")
train_step = 0

for epoch in range(EPOCHS):

    model.eval()
    val_losses = []
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in tqdm(dl_val, desc=f"Epoch {epoch} valid", unit="batch"):
            input_ids = batch["input_ids"].to(device)
            lengths = batch["length"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, lengths)
            loss = criterion(logits, labels)
            val_losses.append(loss.item())

            preds = logits.argmax(dim=-1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.numel()

    val_loss = np.mean(val_losses)
    val_acc = val_correct / val_total

    print(f"epoch {epoch} valid loss: {val_loss:.4f} | acc: {val_acc:.4f}")
    writer.add_scalar("Loss/valid", val_loss, epoch)
    writer.add_scalar("Acc/valid",  val_acc,  epoch)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), f"{save_path}/weights/best.pt")

    model.train()
    train_losses = []

    for batch in tqdm(dl_train, desc=f"Epoch {epoch} train", unit="batch"):
        input_ids = batch["input_ids"].to(device)
        lengths = batch["length"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, lengths)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        writer.add_scalar("Loss/train_step", loss.item(), train_step)
        train_step += 1

    train_loss = np.mean(train_losses)
    print(f"epoch {epoch} train loss: {train_loss:.4f}")
    writer.add_scalar("Loss/train", train_loss, epoch)


torch.save(model.state_dict(), f"{save_path}/weights/last.pt")
writer.close()

  from .autonotebook import tqdm as notebook_tqdm
Disabling PyTorch because PyTorch >= 2.1 is required but found 2.0.1+cu118
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


cuda


Epoch 0 valid:  68%|██████▊   | 823/1205 [00:51<00:21, 17.39batch/s]