In [1]:
!pip install sentencepiece

In [2]:
import glob
import os

# Directories holding the per-source `*.train` shards of each corpus.
babylm_10M = '/kaggle/input/babylm-train-10m-cleaned'
babylm_100M = '/kaggle/input/babylm-train-100m-clean'


def merge_train_files(source_dir, merged_path, pattern='*.train'):
    """Concatenate every file in ``source_dir`` matching ``pattern`` into ``merged_path``.

    Args:
        source_dir: Directory that is searched (non-recursively) for shards.
        merged_path: Output file; created or overwritten, written as UTF-8.
        pattern: Glob pattern selecting the shards to merge.

    Shards are merged in sorted path order because ``glob.glob`` gives no
    ordering guarantee; sorting makes the merged corpus reproducible.
    A newline is appended after a shard whose last line is not terminated,
    so that line is not fused with the first line of the next shard.
    """
    shard_paths = sorted(glob.glob(os.path.join(source_dir, pattern)))
    with open(merged_path, "w", encoding="utf-8") as outfile:
        for shard_path in shard_paths:
            # Start from "\n" so an empty shard does not add a blank line.
            last_line = "\n"
            with open(shard_path, "r", encoding="utf-8") as shard:
                # Stream line by line; the shards may be too large to hold in memory.
                for last_line in shard:
                    outfile.write(last_line)
            if not last_line.endswith("\n"):
                outfile.write("\n")


# Output names are relative, so the merged files are written to the current
# working directory of the notebook.
merge_train_files(babylm_10M, "babylm-train-10m-cleaned-merged")
merge_train_files(babylm_100M, "babylm-train-100m-clean-merged")

# BPE

In [None]:
import sentencepiece as spm

# Merged corpus files used as tokenizer training input.
# NOTE(review): presumably the files produced by the merge cell; this assumes
# the notebook's working directory is /kaggle/working — confirm.
babylm_10M = '/kaggle/working/babylm-train-10m-cleaned-merged'
babylm_100M = '/kaggle/working/babylm-train-100m-clean-merged'

# One BPE model per corpus: (training file, output prefix, vocabulary size).
for bpe_corpus, bpe_prefix, bpe_vocab in (
    (babylm_10M, "sentencepiece-10M-bpe", 16000),
    (babylm_100M, "sentencepiece-100M-bpe", 50000),
):
    spm.SentencePieceTrainer.train(
        input=bpe_corpus,
        model_prefix=bpe_prefix,
        vocab_size=bpe_vocab,
        model_type="bpe",
        character_coverage=1.0,
    )

# Unigram

In [None]:
# Options common to both unigram runs; only the corpus, the output prefix and
# the vocabulary size differ between them.
unigram_options = {"model_type": "unigram", "character_coverage": 1.0}

# Unigram model trained on the file referenced by `babylm_10M`.
spm.SentencePieceTrainer.train(
    input=babylm_10M,
    model_prefix="sentencepiece-10M-unigram",
    vocab_size=16000,
    **unigram_options,
)

# Unigram model trained on the file referenced by `babylm_100M`.
spm.SentencePieceTrainer.train(
    input=babylm_100M,
    model_prefix="sentencepiece-100M-unigram",
    vocab_size=50000,
    **unigram_options,
)

# Char

In [None]:
# Per-run arguments for the two character-level models.
# NOTE(review): a character model cannot have more pieces than the corpus has
# distinct characters, so `vocab_size` presumably acts as an upper bound here —
# confirm against the SentencePiece documentation.
char_runs = (
    {"input": babylm_10M, "model_prefix": "sentencepiece-10M-char", "vocab_size": 16000},
    {"input": babylm_100M, "model_prefix": "sentencepiece-100M-char", "vocab_size": 50000},
)

for char_run in char_runs:
    spm.SentencePieceTrainer.train(
        model_type="char",
        character_coverage=1.0,
        **char_run,
    )

# Word

In [None]:
def _train_word_model(corpus_path, prefix, n_pieces):
    """Train one word-level SentencePiece model and write `<prefix>.model` / `<prefix>.vocab`."""
    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=prefix,
        vocab_size=n_pieces,
        model_type="word",
        character_coverage=1.0,
    )


# Same two corpora and vocabulary sizes as the other tokenizer cells.
_train_word_model(babylm_10M, "sentencepiece-10M-word", 16000)
_train_word_model(babylm_100M, "sentencepiece-100M-word", 50000)

# RNN

In [None]:
import torch
import torch.nn as nn

class LSTMLanguageModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of
            the output logits.
        embed_size: Embedding dimension fed into the LSTM.
        hidden_size: LSTM hidden state dimension.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between LSTM layers. It is only passed to
            ``nn.LSTM`` when ``num_layers > 1``; with a single layer there is
            no inter-layer position and PyTorch would emit a warning.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super().__init__()
        # Remembered so `init_hidden` can build states without being told the sizes.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        Args:
            x: Integer tensor of token ids, batch dimension first
                (the LSTM is built with ``batch_first=True``).
            hidden: ``(h0, c0)`` tuple as produced by :meth:`init_hidden`, or
                ``None`` to let ``nn.LSTM`` start from zero states.

        Returns:
            ``logits`` with one vocabulary-sized vector per input position and
            the final ``(h_n, c_n)`` LSTM state.
        """
        embed = self.embedding(x)
        out, hidden = self.lstm(embed, hidden)
        logits = self.fc(out)
        return logits, hidden

    def init_hidden(self, batch_size, hidden_size=None, num_layers=None, device=None):
        """Create zero ``(h0, c0)`` states of shape ``(num_layers, batch_size, hidden_size)``.

        ``hidden_size`` and ``num_layers`` default to the values the model was
        built with; ``device`` defaults to the device of the model weights.
        The states use the dtype of the model weights so they stay compatible
        if the module is converted to another floating-point precision.
        """
        if hidden_size is None:
            hidden_size = self.hidden_size
        if num_layers is None:
            num_layers = self.num_layers
        reference = self.fc.weight
        if device is None:
            device = reference.device
        h0 = torch.zeros(num_layers, batch_size, hidden_size, device=device, dtype=reference.dtype)
        c0 = torch.zeros(num_layers, batch_size, hidden_size, device=device, dtype=reference.dtype)
        return (h0, c0)

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

class LanguageModelingDataset(Dataset):
    """Non-overlapping next-token prediction windows over a 1-D token tensor.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is the ``i``-th window of
    ``seq_length`` tokens and ``y`` is the same window shifted right by one
    token. One token is held back so the last window always has a full target.

    Args:
        data_tensor: Sequence of token ids supporting ``len()`` and slicing.
        seq_length: Window length; must be a positive integer.

    Raises:
        ValueError: If ``seq_length`` is not positive.
    """

    def __init__(self, data_tensor, seq_length):
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")

        self.data = data_tensor
        self.seq_length = seq_length
        # Clamp at zero: an empty tensor would otherwise give (0 - 1) // n == -1,
        # and a negative __len__ makes len() raise.
        self.length = max(0, (len(data_tensor) - 1) // seq_length)

    def __len__(self):
        """Number of complete (input, target) windows."""
        return self.length

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` window.

        Negative indices count from the end. Out-of-range indices raise
        ``IndexError`` instead of silently returning short or empty slices,
        which also lets plain iteration over the dataset terminate.
        """
        if idx < 0:
            idx += self.length
        if not 0 <= idx < self.length:
            raise IndexError(f"index out of range for dataset of length {self.length}")
        start = idx * self.seq_length
        end = start + self.seq_length
        x = self.data[start:end]
        y = self.data[start+1:end+1]
        return x, y

In [None]:
import pytorch_lightning as pl
import torch.optim as optim

class LanguageModelingModule(pl.LightningModule):
    """Lightning wrapper that trains an ``LSTMLanguageModel`` with cross-entropy loss.

    Args:
        vocab_size, embed_size, hidden_size, num_layers, dropout: Forwarded to
            ``LSTMLanguageModel``.
        learning_rate: Initial learning rate of the AdamW optimizer.
        seq_length: Not used by the module itself; it is recorded in
            ``self.hparams`` by ``save_hyperparameters``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout, learning_rate, seq_length):
        super().__init__()
        # Stores every constructor argument under self.hparams.
        self.save_hyperparameters()
        self.model = LSTMLanguageModel(vocab_size, embed_size, hidden_size, num_layers, dropout)
        self.learning_rate = learning_rate

    def forward(self, x, hidden):
        """Delegate to the wrapped model; returns ``(logits, hidden)``."""
        return self.model(x, hidden)

    def _shared_step(self, batch):
        """Compute the token-level cross-entropy loss for one ``(x, y)`` batch.

        A fresh zero hidden state is created for every batch, so no state is
        carried over between batches.
        """
        x, y = batch
        hidden = self.model.init_hidden(x.size(0), self.hparams.hidden_size,
                                        self.hparams.num_layers, self.device)
        logits, _ = self(x, hidden)
        # Flatten all positions so every token is one classification example.
        return nn.functional.cross_entropy(
            logits.reshape(-1, self.hparams.vocab_size), y.reshape(-1)
        )

    def training_step(self, batch, batch_idx):
        """Return the training loss and log it per step and per epoch as ``train_loss``."""
        loss = self._shared_step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return the validation loss and log it as ``val_loss`` (shown in the progress bar)."""
        loss = self._shared_step(batch)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        """Print the logged ``val_loss`` after each validation epoch, if available."""
        val_loss = self.trainer.callback_metrics.get("val_loss")

        if val_loss is not None:
            print(f"[Epoch {self.current_epoch}] Validation loss: {val_loss:.4f}", flush=True)

    def configure_optimizers(self):
        """AdamW plus a ReduceLROnPlateau scheduler driven by ``val_loss``.

        The learning rate is halved after ``val_loss`` has not improved for
        more than 2 scheduler steps.
        """
        optimizer = optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=0.01)
        # `verbose` is deliberately not passed: the argument is deprecated in
        # recent PyTorch releases.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2
        )
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
                'frequency': 1
            }
        }

    def configure_gradient_clip_val(self):
        """Default gradient clipping threshold used by ``configure_gradient_clipping``.

        This method is not a Lightning hook by itself; it is kept for
        backward compatibility and consulted by the real hook below.
        """
        return 1.0

    def configure_gradient_clipping(self, optimizer, gradient_clip_val=None, gradient_clip_algorithm=None):
        """Lightning hook that actually applies gradient clipping.

        A value configured on the Trainer takes precedence; otherwise the
        threshold from ``configure_gradient_clip_val`` is used. The clipping
        algorithm is passed through unchanged so the Trainer/Lightning default
        applies when none is configured.
        """
        if gradient_clip_val is None:
            gradient_clip_val = self.configure_gradient_clip_val()
        self.clip_gradients(
            optimizer,
            gradient_clip_val=gradient_clip_val,
            gradient_clip_algorithm=gradient_clip_algorithm,
        )

# PREPARE DATA

In [10]:
import sentencepiece as spm
from tokenizers import Tokenizer

# Disabled alternative: load a SentencePiece model through `spm` instead.
# NOTE(review): this section was labelled "BPE-10M", but the path below names
# the unigram 10M model — confirm which model is intended before re-enabling.
# tokenizer_path = "/kaggle/input/sentencepiece-tokenizers/sentencepiece-10M-unigram.model"
# sp = spm.SentencePieceProcessor()
# sp.load(tokenizer_path)
# Active tokenizer (SuperBPE 10M): loaded from its JSON file via `Tokenizer.from_file`.
tokenizer_path = "/kaggle/input/sentencepiece-tokenizers/superbpe-10M-16k-final.json"
tokenizer = Tokenizer.from_file(tokenizer_path)

# Disabled diagnostics for the SentencePiece alternative: print the special-token ids.
# print("unk_id:", sp.unk_id())
# print("bos_id:", sp.bos_id())
# print("eos_id:", sp.eos_id())
# print("pad_id:", sp.pad_id())  # -1 if it does not exist

In [11]:
merged_file = "/kaggle/working/babylm-train-10m-cleaned-merged"

# Read the complete merged corpus into memory as one string.
with open(merged_file, mode="r", encoding="utf-8") as corpus_handle:
    text = corpus_handle.read()

# SentencePiece variant (disabled), which wraps the ids in BOS/EOS:
# token_ids = [sp.bos_id()] + sp.encode(text, out_type=int) + [sp.eos_id()]

# Encode the whole corpus in a single call and keep only the token ids.
encoding = tokenizer.encode(text)
token_ids = encoding.ids

# int64 tensor of ids, the input for LanguageModelingDataset.
data_tensor = torch.tensor(token_ids, dtype=torch.long)

print("Tokenized training dataset")

Tokenized training dataset


In [12]:
# print(sp.get_piece_size())

In [13]:
# print(data_tensor.shape)

# TRAINING

In [None]:
# Cut the token stream into windows of 50 tokens.
seq_length = 50
dataset = LanguageModelingDataset(data_tensor, seq_length)

# 90% of the windows for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# Seeded generator so the train/validation split is identical on every run;
# without it `random_split` draws from the global RNG and the split changes.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size],
                                          generator=split_generator)

batch_size = 32
# Only the training loader shuffles; validation order is irrelevant for the loss.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=3)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=3)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import sentencepiece as spm


# Seed Python, NumPy and PyTorch (and DataLoader workers) so weight
# initialisation and batch shuffling are reproducible.
pl.seed_everything(42, workers=True)

# Model and optimisation hyperparameters.
vocab_size = tokenizer.get_vocab_size()
embed_size = 512
hidden_size = 1024
num_layers = 2
dropout = 0.3
learning_rate = 0.001
num_epochs = 10

model_module = LanguageModelingModule(vocab_size, embed_size, hidden_size,
                                       num_layers, dropout, learning_rate, seq_length)

# Stop when val_loss has not improved for 3 validation epochs.
early_stop_callback = EarlyStopping(monitor='val_loss', patience=3, verbose=True, mode='min')

# Keep the checkpoint with the lowest val_loss under ./checkpoints.
checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath='./checkpoints',
                                      filename='best-checkpoint', mode='min')

trainer = pl.Trainer(
    max_epochs=num_epochs,
    accelerator="gpu",
    devices=1,
    callbacks=[early_stop_callback, checkpoint_callback]
)

print("Started training...")
trainer.fit(model_module, train_loader, val_loader)
print("Ended training...")
# Saves the weights as they are when `fit` returns; the best-val_loss weights
# are the ones stored by `checkpoint_callback`.
# NOTE(review): the file name says "sentencepiece-unigram" while `tokenizer` in
# this notebook is loaded from a SuperBPE JSON file — confirm the intended name.
torch.save(model_module.model.state_dict(), "rnn-sentencepiece-unigram-10M.pt")
print("Saved model!")