# Recurrent Neural Networks and Language Models

In [1]:
import math
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
# seed for reproducability
SEED = 122
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load Data  
Source Credit: TinyStories Dataset by Ronen Eldan (Hugging Face)  
https://huggingface.co/datasets/roneneldan/TinyStories'

Contains short stories.

In [4]:

# Convert to the same structure you had before
# (list of {"text": ...})
MAX_STORIES = 20_000

hf_dataset = datasets.load_dataset(
    "roneneldan/TinyStories",
    split=f"train[:{MAX_STORIES}]"
)

data = [{"text": row["text"]} for row in hf_dataset]
dataset = datasets.Dataset.from_list(data)

dataset

Dataset({
    features: ['text'],
    num_rows: 20000
})

In [5]:
from datasets import DatasetDict

train_test = dataset.train_test_split(test_size=0.2)

# 10% test set and 10% validation set
train_test_valid = train_test['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test_valid['test'],
    'validation': train_test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2000
    })
})

In [6]:
print(dataset['train'][333]['text']) # empty string

'''
If you try to change the index you might notice that sometimes there is no paragraph 
and rather an empty string so we will have to care of that later.
'''

Once upon a time, there was a young dog named Max. Max loved to play with his ball. One day, Max saw a big goal in the park. He wanted to kick the ball into the goal. Max thought it would be a tough game, but he was excited to play.

Max met his friend, a cat named Luna. Luna wanted to play too. Max said, "Let's see who can kick the ball into the goal." Luna agreed, and they started to play. Max tried to prevent Luna from scoring, but she was very fast.

In the end, Max kicked the ball into the goal. He was very happy. Luna was a little sad, but Max said, "You played very well. Next time, we will play together and prevent others from scoring." Luna smiled, and they both went home, excited for their next game.


'\nIf you try to change the index you might notice that sometimes there is no paragraph \nand rather an empty string so we will have to care of that later.\n'

# Preprocessing

## Tokenizing

In [7]:
from torchtext.data.utils import get_tokenizer

In [8]:
tokenizer = get_tokenizer('basic_english')

#function to tokenize
tokenize_data = lambda example, tokenizer: {'tokens': tokenizer(example['text'])}  

#map the function to each example
tokenized_dataset = dataset.map(tokenize_data, remove_columns=['text'], fn_kwargs={'tokenizer': tokenizer})
print(tokenized_dataset['train'][33]['tokens'])

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

['lily', 'and', 'ben', 'are', 'friends', '.', 'they', 'like', 'to', 'play', 'with', 'toys', '.', 'but', 'today', 'they', 'disagree', '.', 'lily', 'wants', 'to', 'play', 'with', 'the', 'doll', '.', 'ben', 'wants', 'to', 'play', 'with', 'the', 'car', '.', 'they', 'both', 'say', ',', 'it', 'is', 'mine', '!', 'a', 'woman', 'sees', 'them', '.', 'she', 'is', 'their', 'teacher', '.', 'she', 'says', ',', 'lily', 'and', 'ben', ',', 'why', 'are', 'you', 'fighting', '?', 'you', 'can', 'share', 'the', 'toys', '.', 'it', 'is', 'easy', '.', 'you', 'can', 'take', 'turns', '.', 'or', 'you', 'can', 'play', 'together', '.', 'how', 'about', 'that', '?', 'lily', 'and', 'ben', 'look', 'at', 'each', 'other', '.', 'they', 'think', 'for', 'a', 'moment', '.', 'they', 'do', 'not', 'want', 'to', 'be', 'angry', '.', 'they', 'want', 'to', 'have', 'fun', '.', 'they', 'say', ',', 'ok', ',', 'teacher', '.', 'we', 'can', 'share', '.', 'we', 'can', 'play', 'together', '.', 'we', 'are', 'sorry', '.', 'the', 'woman', 'sm

## Numericializing

In [10]:
from torchtext.vocab import build_vocab_from_iterator

In [11]:
from collections import Counter
from torchtext.vocab import Vocab

counter = Counter()
for tokens in tokenized_dataset["train"]["tokens"]:
    counter.update(tokens)

# filter tokens by min_freq
counter = Counter({k: v for k, v in counter.items() if v >= 3})

# IMPORTANT: <unk> must be first
vocab = Vocab(
    counter,
    specials=["<unk>", "<eos>"]
)

unk_idx = vocab.stoi["<unk>"]

print("Vocab size:", len(vocab))
print("First 10 tokens:", vocab.itos[:10])




Vocab size: 7238
First 10 tokens: ['<unk>', '<eos>', '.', 'the', 'and', ',', 'to', 'a', 'was', 'he']


In [12]:
def numericalize(tokens, vocab, unk_idx):
    return [vocab.stoi[t] if t in vocab.stoi else unk_idx for t in tokens]

example = tokenized_dataset["train"][0]["tokens"] + ["<eos>"]
print(numericalize(example, vocab, unk_idx))


[45, 34, 8, 7, 54, 2, 13, 390, 8, 4880, 2, 4880, 72, 6, 43, 2, 161, 23, 10, 91, 63, 6, 3, 100, 5, 4, 532, 4, 43, 20, 13, 66, 2, 32, 23, 11, 8, 35, 528, 147, 5, 26, 4880, 216, 49, 6, 43, 2, 13, 504, 15, 5, 0, 5, 16, 730, 152, 7, 820, 4, 2688, 530, 75, 528, 2, 243, 24, 4880, 1036, 126, 4, 174, 6, 63, 6, 630, 2, 26, 10, 8, 75, 121, 4, 2070, 59, 7, 1481, 17, 73, 10, 27, 135, 275, 2, 10, 62, 85, 6, 3, 100, 110, 5, 4, 173, 21, 3, 1908, 2, 10, 565, 34, 200, 11, 8, 37, 6, 186, 108, 2, 10, 76, 406, 4, 2311, 5, 4, 27, 332, 1527, 21, 0, 530, 75, 528, 2, 4880, 62, 6, 312, 5, 38, 4, 1318, 2, 10, 27, 2123, 22, 10, 82, 2688, 530, 75, 528, 217, 112, 21, 3, 1908, 2, 1]


In [13]:
with open('vocab_lm.pkl', 'wb') as f:
    pickle.dump(vocab, f)

After loading and splitting the personally chosen dataset a DatasetDictonary is created. Then on that object the preprocessing steps are applied. Firstly, we tokenize the dataset using torchtext's `get_tokenizer`. The `tokenize_data` function is applied to each example where the `text` column is removed and a new `tokens` column containing the tokenized text is added.

Then our vocabulary is made using the `build_vocab_from_iterator` method from torchtext. We use the training dataset, and consider words that has occured at least three times. This is done to make sure that our vocab does not get too big. Then we add `<unk>` to signify unknown and `<eos>` to signify end of sentence. After all this the vocab size came out to be 11082.

## Prepare the batch loader

### Prepare data

In [14]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)          
    return data #[batch size, bunch of tokens]

In [15]:
batch_size = 128
train_data = get_data(tokenized_dataset['train'], vocab, batch_size)
valid_data = get_data(tokenized_dataset['validation'], vocab, batch_size)
test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

## Modeling

In [16]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

The LSTM contains memory cells and gates that allows it to selectively remember and forget information from previous times steps. The input gate controls the flow of information from the previous memeory cell to the current memory cell. The forget gate controls the flow of information from previous cell to the current cell and allows it to selectively forget or remember information from previous steps. The memory cell stores information that can be selectively modified by the input and forget gates. And the output gate controls the flow of information from the memory cell to the hidden state and output.

Our defined class is firstly converting tokens into embeddings then processing those embeddings with stacked LSTM layers to capture temporal dependencies (relationships between past and future events or states in a time series). Then we apply dropout to embeddings to prevent overfitting by randomly zero-ing inputs during training. Then finally we use a linear layer to map LSTM outputs to vocabulary logits for predicting the next word.

## Training

Follows very basic procedure.  One note is that some of the sequences that will be fed to the model may involve parts from different sequences in the original dataset or be a subset of one (depending on the decoding length). For this reason we will reset the hidden state every epoch, this is like assuming that the next batch of sequences is probably always a follow up on the previous in the original dataset.

In [17]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [18]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 31,624,262 trainable parameters


In [19]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [20]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, bunch of tokens]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [21]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [23]:
n_epochs = 50
seq_len  = 50 #<----decoding length
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	Train Perplexity: 74.434
	Valid Perplexity: 38.419


                                                           

	Train Perplexity: 31.041
	Valid Perplexity: 22.550


                                                           

	Train Perplexity: 24.594
	Valid Perplexity: 19.322


                                                           

	Train Perplexity: 21.641
	Valid Perplexity: 17.559


                                                           

	Train Perplexity: 19.741
	Valid Perplexity: 16.394


                                                           

	Train Perplexity: 18.453
	Valid Perplexity: 15.541


                                                           

	Train Perplexity: 17.437
	Valid Perplexity: 14.903


                                                           

	Train Perplexity: 16.636
	Valid Perplexity: 14.376


                                                           

	Train Perplexity: 15.995
	Valid Perplexity: 13.979


                                                           

	Train Perplexity: 15.442
	Valid Perplexity: 13.676


                                                           

	Train Perplexity: 14.985
	Valid Perplexity: 13.384


                                                           

	Train Perplexity: 14.566
	Valid Perplexity: 13.163


                                                           

	Train Perplexity: 14.208
	Valid Perplexity: 12.923


                                                           

	Train Perplexity: 13.888
	Valid Perplexity: 12.742


                                                           

	Train Perplexity: 13.611
	Valid Perplexity: 12.580


                                                           

	Train Perplexity: 13.344
	Valid Perplexity: 12.410


                                                           

	Train Perplexity: 13.101
	Valid Perplexity: 12.306


                                                           

	Train Perplexity: 12.888
	Valid Perplexity: 12.162


                                                           

	Train Perplexity: 12.670
	Valid Perplexity: 12.063


                                                           

	Train Perplexity: 12.485
	Valid Perplexity: 11.919


                                                           

	Train Perplexity: 12.312
	Valid Perplexity: 11.848


                                                           

	Train Perplexity: 12.151
	Valid Perplexity: 11.752


                                                           

	Train Perplexity: 12.001
	Valid Perplexity: 11.689


                                                           

	Train Perplexity: 11.871
	Valid Perplexity: 11.619


                                                           

	Train Perplexity: 11.731
	Valid Perplexity: 11.557


                                                           

	Train Perplexity: 11.609
	Valid Perplexity: 11.471


                                                           

	Train Perplexity: 11.489
	Valid Perplexity: 11.453


                                                           

	Train Perplexity: 11.369
	Valid Perplexity: 11.383


                                                           

	Train Perplexity: 11.267
	Valid Perplexity: 11.352


                                                           

	Train Perplexity: 11.160
	Valid Perplexity: 11.287


                                                           

	Train Perplexity: 11.087
	Valid Perplexity: 11.263


                                                           

	Train Perplexity: 10.993
	Valid Perplexity: 11.255


                                                           

	Train Perplexity: 10.904
	Valid Perplexity: 11.192


                                                           

	Train Perplexity: 10.829
	Valid Perplexity: 11.164


                                                           

	Train Perplexity: 10.750
	Valid Perplexity: 11.181


                                                           

	Train Perplexity: 10.418
	Valid Perplexity: 11.063


                                                           

	Train Perplexity: 10.285
	Valid Perplexity: 11.039


                                                           

	Train Perplexity: 10.196
	Valid Perplexity: 11.034


                                                           

	Train Perplexity: 10.136
	Valid Perplexity: 10.990


                                                           

	Train Perplexity: 10.061
	Valid Perplexity: 11.011


                                                           

	Train Perplexity: 9.893
	Valid Perplexity: 10.900


                                                           

	Train Perplexity: 9.828
	Valid Perplexity: 10.898


                                                           

	Train Perplexity: 9.729
	Valid Perplexity: 10.866


                                                           

	Train Perplexity: 9.702
	Valid Perplexity: 10.857


                                                           

	Train Perplexity: 9.665
	Valid Perplexity: 10.860


                                                           

	Train Perplexity: 9.620
	Valid Perplexity: 10.844


                                                           

	Train Perplexity: 9.595
	Valid Perplexity: 10.838


                                                           

	Train Perplexity: 9.576
	Valid Perplexity: 10.830


                                                           

	Train Perplexity: 9.567
	Valid Perplexity: 10.829


                                                           

	Train Perplexity: 9.544
	Valid Perplexity: 10.827


Firstly, we initialize hyperparameters such as vocab size, embedding dimensions, hidden dimensions, number of layers, dropout rate and learning rate. We then move the model to device of user's choice (GPU in our case). Then adam optimizer is defined to optimize the model's parameters and CrossEntropyLoss criterion is use to compute the loss during training.

We train for 50 epochs. In each epoch, the training data is deivided into batches of specific sequece using the `get_batch` method that we defined. At the start of each epoch hidden state is reset and for each batch the model parameters are zeroed and forward pass is done. The loss is then calculated by using the predicted probabilites for a token compared with actual next token. The gradient is calculated using backpropagation and the model's parameters are updated using the optimizer. We also keep track of loss for each epoch.

At the end of an epoch, the model is put into eval mode and the validation data is processed with same process as stated before and validation loss is calculated. The learning rate scheduler is used to adjust the learning rate based on the validation loss. Finally, the model params are saved if the current validation loss is the best one observed so far. 

## Testing

In [25]:
# Load the best model state from the saved checkpoint
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))

# Evaluate the model on the test data
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)

# Print the test perplexity
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 10.620


# Real-world inference

In [29]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)

    model.eval()

    # tokenize input prompt
    tokens = tokenizer(prompt)
    indices = [vocab[token] for token in tokens]

    batch_size = 1
    hidden = model.init_hidden(batch_size, device)

    # get index-to-string mapping (torchtext-safe)
    itos = vocab.get_itos() if hasattr(vocab, "get_itos") else vocab.itos

    with torch.no_grad():
        for _ in range(max_seq_len):

            src = torch.LongTensor([indices]).to(device)
            output, hidden = model(src, hidden)

            # output: [batch_size, seq_len, vocab_size]
            logits = output[:, -1, :] / temperature
            probs = torch.softmax(logits, dim=-1)

            next_idx = torch.multinomial(probs, num_samples=1).item()

            # resample if <unk>
            while itos[next_idx] == '<unk>':
                next_idx = torch.multinomial(probs, num_samples=1).item()

            # stop if <eos>
            if itos[next_idx] == '<eos>':
                break

            indices.append(next_idx)

    # convert indices back to tokens
    generated_tokens = [itos[i] for i in indices]
    return generated_tokens


In [30]:
prompt = 'Once upon a time'
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.25,0.5,0.75,1]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.25
once upon a time , there was a little girl named lily . she loved to play with her toys and run around outside . one day , she went to the park with

0.5
once upon a time , there was a little girl named lily . she loved to play with her toys and run around outside . one day , she decided to take a walk

0.75
once upon a time , there was a little girl named lily . she loved to play with her toys and give it a big hug . lily was very happy to have her

1
once upon a time , there was a little girl named alice . jane loved to play in the grass and give send more clothes . but the big one was missing and then



In [28]:
type(vocab)

torchtext.vocab.Vocab

**Training Metrics and Visualization**

This section runs a short training session (or continues an existing one) while recording epoch-wise training loss. It then shows a summary table and a plot of loss vs epoch. The artifacts are saved to `model/training_history.csv` and `model/training_loss.png`.

Notes:
- Adjust `n_epochs`, `batch_size`, `seq_len`, and `clip` below to continue training for more epochs.
- The code will reuse `model`, `optimizer`, `criterion`, and `train` defined earlier in this notebook if present.

In [None]:
# Record training history and plot
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# fallback defaults if variables are missing
n_epochs = globals().get('n_epochs', 3)
batch_size = globals().get('batch_size', 128)
seq_len = globals().get('seq_len', 32)
clip = globals().get('clip', 0.25)
device = globals().get('device', torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

# ensure train function and model exist
if 'train' not in globals():
    raise RuntimeError('train() function not found in the notebook. Run the earlier cells that define training first.')
if 'model' not in globals():
    raise RuntimeError('`model` not found. Build the model by running the modeling cells first.')
if 'train_data' not in globals():
    raise RuntimeError('`train_data` not found. Prepare data by running preprocessing cells first.')
if 'optimizer' not in globals() or 'criterion' not in globals():
    raise RuntimeError('`optimizer` or `criterion` not found. Run the model/optimizer setup cells first.')

history = {'epoch': [], 'train_loss': []}

for epoch in range(1, n_epochs + 1):
    loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    history['epoch'].append(epoch)
    history['train_loss'].append(loss)
    print(f'Epoch {epoch} loss: {loss:.4f}')

# show table
df = pd.DataFrame(history)
display(df)

# plot
plt.figure(figsize=(7,4))
plt.plot(df['epoch'], df['train_loss'], marker='o')
plt.title('Training Loss per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.tight_layout()

os.makedirs('model', exist_ok=True)
plot_path = os.path.join('model', 'training_loss.png')
plt.savefig(plot_path)
plt.show()

csv_path = os.path.join('model', 'training_history.csv')
df.to_csv(csv_path, index=False)
print(f'Saved history to {csv_path} and plot to {plot_path}')
