In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mosaic-train/train_set_f.xlsx
/kaggle/input/mosaic-train/test_set_f.xlsx


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
from tqdm.auto import tqdm
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Loading dataset from Excel
train_path = "/kaggle/input/mosaic-train/train_set_f.xlsx"
df = pd.read_excel(train_path)
lines = df["SENTENCES"].dropna().tolist()
random.shuffle(lines)

# Split into train, validation, and test sets
train_lines, val_lines = train_test_split(lines, test_size=0.1, shuffle=True)
val_lines, test_lines = train_test_split(val_lines, test_size=0.2, shuffle=True)

print(f"Total: {len(lines)}, Train: {len(train_lines)}, Val: {len(val_lines)}, Test: {len(test_lines)}")

Total: 50000, Train: 45000, Val: 4000, Test: 1000


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

In [5]:
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
bert_tokenizer.pre_tokenizer = Whitespace()
bert_tokenizer.decoder = decoders.WordPiece()

# here we are training the tokenizer on the dataset
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]", "[MASK]"], vocab_size=8192)
bert_tokenizer.train_from_iterator(lines, trainer)

# enabling padding & truncation
bert_tokenizer.enable_padding(pad_id=bert_tokenizer.token_to_id("[PAD]"), length=128)
bert_tokenizer.enable_truncation(128)

# we have saved the tokenizer
tokenizer_path = Path("mlm-baby-bert/tokenizer")
tokenizer_path.mkdir(exist_ok=True, parents=True)
bert_tokenizer.save(str(tokenizer_path / "custom_tokenizer.json"))

print("Tokenizer trained and saved successfully!")

Tokenizer trained and saved successfully!


In [42]:
from tokenizers import Tokenizer
from pathlib import Path

# Loading the trained tokenizer from our saved path
tokenizer_path = Path("mlm-baby-bert/tokenizer/custom_tokenizer.json")
tokenizer = Tokenizer.from_file(str(tokenizer_path))

# testing by encoding a sample sentence
sample_text = "Hello our team name is Bizli"
encoded = tokenizer.encode(sample_text)

# Print tokenized output
for token_id, token in zip(encoded.ids, encoded.tokens):
    print(f"{token_id}:{token} ", end=" ")

# Decoding back to text
decoded_text = tokenizer.decode(encoded.ids)
print("\nDecoded text:", decoded_text)

3936:hell  61:##o  989:our  1213:team  654:name  136:is  2208:bi  83:##z  62:##l  70:##i  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1:[PAD]  1

In [7]:
class MLMDataset:
    def __init__(self,lines):
        self.lines = lines
    def __len__(self,):
        return len(self.lines)
    def __getitem__(self,idx):
        line = self.lines[idx]
        ids = tokenizer.encode(line).ids
        labels = ids.copy()
        return ids, labels

In [8]:
def collate_fn(batch):
    input_ids = [torch.tensor(i[0]) for i in batch]
    labels = [torch.tensor(i[1]) for i in batch]
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels)
    # mask 15% of text leaving [PAD]
    mlm_mask = torch.rand(input_ids.size()) < 0.15 * (input_ids!=1)
    masked_tokens = input_ids * mlm_mask
    labels[masked_tokens==0]=-100 # seting all tokens except masked tokens to -100
    input_ids[masked_tokens!=0]=2 # MASK TOKEN
    return input_ids, labels

In [9]:
ds = MLMDataset(lines)
dl = torch.utils.data.DataLoader(ds,batch_size=2,shuffle=True,collate_fn=collate_fn)

In [10]:
i,l = next(iter(dl))
print(i[1])
print(l[1])

tensor([   2,    2,  303, 3529,   74,  127,  718,  787,  606,  111, 1379,  265,
         127,  718,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1])
tensor([ 300,  282, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -

In [12]:

class RMSNorm(nn.Module):
    def __init__(self, d, p=-1., eps=1e-8, bias=False):
        """
            Root Mean Square Layer Normalization
        :param d: model size
        :param p: partial RMSNorm, valid value [0, 1], default -1.0 (disabled)
        :param eps:  epsilon value, default 1e-8
        :param bias: whether use bias term for RMSNorm, disabled by
            default because RMSNorm doesn't enforce re-centering invariance.
        """
        super(RMSNorm, self).__init__()

        self.eps = eps
        self.d = d
        self.p = p
        self.bias = bias

        self.scale = nn.Parameter(torch.ones(d))
        self.register_parameter("scale", self.scale)

        if self.bias:
            self.offset = nn.Parameter(torch.zeros(d))
            self.register_parameter("offset", self.offset)

    def forward(self, x):
        if self.p < 0. or self.p > 1.:
            norm_x = x.norm(2, dim=-1, keepdim=True)
            d_x = self.d
        else:
            partial_size = int(self.d * self.p)
            partial_x, _ = torch.split(x, [partial_size, self.d - partial_size], dim=-1)

            norm_x = partial_x.norm(2, dim=-1, keepdim=True)
            d_x = partial_size

        rms_x = norm_x * d_x ** (-1. / 2)
        x_normed = x / (rms_x + self.eps)

        if self.bias:
            return self.scale * x_normed + self.offset

        return self.scale * x_normed

In [13]:
class MultiheadAttention(nn.Module):
    def __init__(self, dim, n_heads, dropout=0.):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        assert dim % n_heads == 0, 'dim should be div by n_heads'
        self.head_dim = self.dim // self.n_heads
        self.in_proj = nn.Linear(dim,dim*3,bias=False)
        self.attn_dropout = nn.Dropout(dropout)
        self.scale = self.head_dim ** -0.5
        self.out_proj = nn.Linear(dim,dim)

    def forward(self,x,mask=None):
        b,t,c = x.shape
        q,k,v = self.in_proj(x).chunk(3,dim=-1)
        q = q.view(b,t,self.n_heads,self.head_dim).permute(0,2,1,3)
        k = k.view(b,t,self.n_heads,self.head_dim).permute(0,2,1,3)
        v = v.view(b,t,self.n_heads,self.head_dim).permute(0,2,1,3)

        qkT = torch.matmul(q,k.transpose(-1,-2)) * self.scale
        qkT = self.attn_dropout(qkT)

        if mask is not None:
            mask = mask.to(dtype=qkT.dtype,device=qkT.device)
            qkT = qkT.masked_fill(mask==0,float('-inf'))

        qkT = F.softmax(qkT,dim=-1)
        attn = torch.matmul(qkT,v)
        attn = attn.permute(0,2,1,3).contiguous().view(b,t,c)
        out = self.out_proj(attn)

        return out

In [14]:
class FeedForward(nn.Module):
    def __init__(self,dim,dropout=0.):
        super().__init__()
        self.feed_forward = nn.Sequential(
            nn.Linear(dim,dim*4),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(dim*4,dim)
        )

    def forward(self, x):
        return self.feed_forward(x)

In [15]:
class EncoderBlock(nn.Module):
    def __init__(self, dim, n_heads, attn_dropout=0., mlp_dropout=0.):
        super().__init__()
        self.attn = MultiheadAttention(dim,n_heads,attn_dropout)
        self.ffd = FeedForward(dim,mlp_dropout)
        self.ln_1 = RMSNorm(dim)
        self.ln_2 = RMSNorm(dim)

    def forward(self,x,mask=None):
        x = self.ln_1(x)
        x = x + self.attn(x,mask)
        x = self.ln_2(x)
        x = x + self.ffd(x)
        return x

In [16]:
class Embedding(nn.Module):
    def __init__(self,vocab_size,max_len,dim):
        super().__init__()
        self.max_len = max_len
        self.class_embedding = nn.Embedding(vocab_size,dim)
        self.pos_embedding = nn.Embedding(max_len,dim)
    def forward(self,x):
        x = self.class_embedding(x)
        pos = torch.arange(0,x.size(1),device=x.device)
        x = x + self.pos_embedding(pos)
        return x

In [17]:
class MLMBERT(nn.Module):
    def __init__(self, config):

        super().__init__()

        self.embedding = Embedding(config['vocab_size'],config['max_len'],config['dim'])

        self.depth = config['depth']
        self.encoders = nn.ModuleList([
            EncoderBlock(
                dim=config['dim'],
                n_heads=config['n_heads'],
                attn_dropout=config['attn_dropout'],
                mlp_dropout=config['mlp_dropout']
            ) for _ in range(self.depth)
        ])

        self.ln_f = RMSNorm(config['dim'])

        self.mlm_head = nn.Linear(config['dim'],config['vocab_size'],bias=False)

        self.embedding.class_embedding.weight = self.mlm_head.weight #  tying the weights

        self.pad_token_id = config['pad_token_id']
        self.mask_token_id = config['mask_token_id']

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def create_src_mask(self,src):
        return (src != self.pad_token_id).unsqueeze(1).unsqueeze(2) 

    def forward(self,input_ids,labels=None):

        src_mask = self.create_src_mask(input_ids)
        enc_out = self.embedding(input_ids)
        for layer in self.encoders:
            enc_out = layer(enc_out,mask=src_mask)

        enc_out = self.ln_f(enc_out)

        logits = self.mlm_head(enc_out)

        if labels is not None:
            loss = F.cross_entropy(logits.view(-1,logits.size(-1)),labels.view(-1))
            return {'loss': loss, 'logits': logits}
        else:
            # inference should have only one masked token
            mask_idx = (input_ids==self.mask_token_id).flatten().nonzero().item()
            mask_preds = F.softmax(logits[:,mask_idx,:],dim=-1).argmax(dim=-1)
            return {'mask_predictions':mask_preds}

In [18]:
#setting the parameters of the model
config = {
    'dim': 256,
    'n_heads': 8,
    'attn_dropout': 0.1,
    'mlp_dropout': 0.1,
    'depth': 6,
    'vocab_size': 8192,
    'max_len': 128,
    'pad_token_id': 1,
    'mask_token_id': 2
}

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLMBERT(config).to(device)
print(f'Model is running on: {device}')


Model is running on: cuda


In [20]:
print('trainable:',sum([p.numel() for p in model.parameters() if p.requires_grad]))

trainable: 6861056


In [21]:
train_ds = MLMDataset(train_lines)
val_ds = MLMDataset(val_lines)

In [22]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=collate_fn)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=128, shuffle=False, collate_fn=collate_fn)

In [23]:
# TEST : SINGLE TOKEN MASKING

test_actuals = []
test_batches = []
for ln in tqdm(test_lines):
    tokenized = tokenizer.encode(ln)
    fi = len(tokenized.ids)
    if 1 in tokenized.special_tokens_mask:
        fi = torch.tensor(tokenized.special_tokens_mask).nonzero()[0].item() # ignore [PAD]
    m = torch.randint(0,fi,(1,)).item() # select random token to mask
    input_ids = torch.tensor(tokenized.ids)
    test_actuals.append(input_ids[m].item())
    input_ids[m]=2 # replace with [MASK]
    test_batches.append(input_ids)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
epochs = 60
train_losses = []
valid_losses = []
test_accuracies = []
best_val_loss = 1e9

In [25]:
from torch.optim import AdamW
optim = AdamW(model.parameters(),lr=6e-4 / 25., weight_decay=0.01)
sched = torch.optim.lr_scheduler.OneCycleLR(optim,max_lr=6e-4,steps_per_epoch=len(train_dl),epochs=epochs)

In [26]:
import os
import torch

# Path to save checkpoint
checkpoint_path = './mlm-baby-bert/checkpoint.pth'

# Checking if a checkpoint exists and resuming from the last epoch
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path)
    start_epoch = checkpoint['epoch'] + 1  # Resume from next epoch
    model.load_state_dict(checkpoint['model_state_dict'])
    optim.load_state_dict(checkpoint['optimizer_state_dict'])
    sched.load_state_dict(checkpoint['scheduler_state_dict'])
    train_losses = checkpoint['train_losses']
    valid_losses = checkpoint['valid_losses']
    test_accuracies = checkpoint['test_accuracies']
    best_val_loss = checkpoint['best_val_loss']
    print(f"Resuming training from epoch {start_epoch}")
else:
    start_epoch = 0  # Starting from scratch if no previous checkpoint is found
    print("No checkpoint found. Training from scratch.")

for ep in tqdm(range(start_epoch, epochs)):  # Starting from saved epoch
    model.train()
    trl = 0.
    tprog = tqdm(enumerate(train_dl), total=len(train_dl))
    for i, (input_ids, labels) in tprog:
        input_ids = input_ids.to('cuda')
        labels = labels.to('cuda')
        loss = model(input_ids, labels)['loss']
        loss.backward()
        optim.step()
        optim.zero_grad()
        sched.step()
        trl += loss.item()
        tprog.set_description(f'train step loss: {loss.item():.4f}')
    train_losses.append(trl / len(train_dl))

    model.eval()
    with torch.no_grad():
        vrl = 0.
        vprog = tqdm(enumerate(val_dl), total=len(val_dl))
        for i, (input_ids, labels) in vprog:
            input_ids = input_ids.to('cuda')
            labels = labels.to('cuda')
            loss = model(input_ids, labels)['loss']
            vrl += loss.item()
            vprog.set_description(f'valid step loss: {loss.item():.4f}')
        vloss = vrl / len(val_dl)
        valid_losses.append(vloss)
        print(f'epoch {ep} | train_loss: {train_losses[-1]:.4f} valid_loss: {valid_losses[-1]:.4f}')

        if vloss < best_val_loss:
            best_val_loss = vloss
            print('PREDICTING!')
            test_predictions = []
            for input_ids in tqdm(test_batches):
                input_ids = input_ids.unsqueeze(0)
                input_ids = input_ids.to('cuda')
                mask_preds = model(input_ids)['mask_predictions']
                test_predictions.extend(list(mask_preds.detach().cpu().flatten().numpy()))

            tacc = accuracy_score(test_actuals, test_predictions)
            test_accuracies.append(tacc)
            print(f'SINGLE MASK TOKEN PREDICTION ACCURACY: {tacc:.4f}')
            print('saving best model...')
            torch.save(model.state_dict(), './mlm-baby-bert/model.pt')

    # *Saving training progress (checkpoint)*
    checkpoint = {
        'epoch': ep,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optim.state_dict(),
        'scheduler_state_dict': sched.state_dict(),
        'train_losses': train_losses,
        'valid_losses': valid_losses,
        'test_accuracies': test_accuracies,
        'best_val_loss': best_val_loss
    }
    torch.save(checkpoint, checkpoint_path)  # Saving after every epoch

    if test_accuracies:
        print(f'epoch {ep} | accuracy: {test_accuracies[-1]:.4f}')

Resuming training from epoch 45


  checkpoint = torch.load(checkpoint_path)


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 45 | train_loss: 3.5055 valid_loss: 3.3448
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4710
saving best model...
epoch 45 | accuracy: 0.4710


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 46 | train_loss: 3.3120 valid_loss: 3.2532
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4710
saving best model...
epoch 46 | accuracy: 0.4710


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 47 | train_loss: 3.2011 valid_loss: 3.3464
epoch 47 | accuracy: 0.4710


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 48 | train_loss: 3.1405 valid_loss: 3.2728
epoch 48 | accuracy: 0.4710


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 49 | train_loss: 3.0821 valid_loss: 3.2055
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4820
saving best model...
epoch 49 | accuracy: 0.4820


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 50 | train_loss: 3.0477 valid_loss: 3.2619
epoch 50 | accuracy: 0.4820


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 51 | train_loss: 3.0105 valid_loss: 3.2236
epoch 51 | accuracy: 0.4820


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 52 | train_loss: 3.0067 valid_loss: 3.2291
epoch 52 | accuracy: 0.4820


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 53 | train_loss: 2.9549 valid_loss: 3.1898
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4840
saving best model...
epoch 53 | accuracy: 0.4840


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 54 | train_loss: 2.9380 valid_loss: 3.1653
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4790
saving best model...
epoch 54 | accuracy: 0.4790


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 55 | train_loss: 2.9337 valid_loss: 3.1860
epoch 55 | accuracy: 0.4790


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 56 | train_loss: 2.9297 valid_loss: 3.1593
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4900
saving best model...
epoch 56 | accuracy: 0.4900


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 57 | train_loss: 2.8993 valid_loss: 3.2641
epoch 57 | accuracy: 0.4900


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 58 | train_loss: 2.9107 valid_loss: 3.0955
PREDICTING!


  0%|          | 0/1000 [00:00<?, ?it/s]

SINGLE MASK TOKEN PREDICTION ACCURACY: 0.4890
saving best model...
epoch 58 | accuracy: 0.4890


  0%|          | 0/352 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

epoch 59 | train_loss: 2.9174 valid_loss: 3.1610
epoch 59 | accuracy: 0.4890


In [39]:

sd = torch.load('./mlm-baby-bert/model.pt')
model.load_state_dict(sd)

  sd = torch.load('./mlm-baby-bert/model.pt')


<All keys matched successfully>

In [40]:
def predict_mask(sentence):
    x = tokenizer.encode(sentence)
    
    try:
        idx = x.ids.index(tokenizer.token_to_id('[MASK]'))  
    except ValueError:
        print("No [MASK] token found in the sentence!")
        return
    
    input_ids = x.ids.copy()
    
    # Preparing input
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to('cuda')
    
    # Model prediction
    out = model(input_tensor)
    predicted_id = out['mask_predictions'].item()
    predicted_token = tokenizer.decode([predicted_id])
    
    # Replacing [MASK] with the predicted token
    predicted_sentence = input_ids.copy()
    predicted_sentence[idx] = predicted_id
    
    print(f'ACTUAL:  {sentence}')
    print(f'PREDICTED: {tokenizer.decode(predicted_sentence, skip_special_tokens=True)}')

    return predicted_token


In [41]:
predict_mask("His second [MASK] Chaya Malka was a daughter of Rabbi Yisroel Friedman of Ruzhin .")


ACTUAL:  His second [MASK] Chaya Malka was a daughter of Rabbi Yisroel Friedman of Ruzhin .
PREDICTED: his second wife chaya malka was a daughter of rabbi yisroel friedman of ruzhin.


'wife'

In [30]:
import pandas as pd
import torch

# Load Excel file
df = pd.read_excel("/kaggle/input/mosaic-train/test_set_f.xlsx")  # Change file path if needed

# List for storing predictions
predictions = []

# Function to predict masked words
def predict_mask(sentence):
    x = tokenizer.encode(sentence)
    
    # Find the index of [MASK] token in the input sentence
    try:
        idx = x.ids.index(tokenizer.token_to_id('[MASK]'))  # Get index of [MASK]
    except ValueError:
        print("No [MASK] token found in the sentence!")
        return
    
    input_ids = x.ids.copy()
    
    # Preparing input
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to('cuda')
    
    # Model prediction
    out = model(input_tensor)
    predicted_id = out['mask_predictions'].item()
    predicted_token = tokenizer.decode([predicted_id])
    
    # Replacing [MASK] with the predicted token
    predicted_sentence = input_ids.copy()
    predicted_sentence[idx] = predicted_id
    
   # print(f'ACTUAL:  {sentence}')
    #print(f'PREDICTED: {tokenizer.decode(predicted_sentence, skip_special_tokens=True)}')

    return predicted_token

# Apply prediction to each sentence
for sentence in df["MASKED SENTENCES"].fillna("NO_SENTENCE"):  # Handle missing values
    pred_word = predict_mask(sentence)
    predictions.append(pred_word)

# Add predictions to DataFrame
df["PREDICTED_WORD"] = predictions

# **Updated Output Path**
output_path = "/kaggle/working/predictions.csv"  # Ensure it's saved in Kaggle working directory
df.to_csv(output_path, index=False)

print(f"✅ Predictions saved to {output_path}")


✅ Predictions saved to /kaggle/working/predictions.csv


'INVALID_MASK_INDEX_10'