In [17]:
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.metrics import accuracy_score

In [36]:
# Step 1: Data Simulation (Random Sentences)
random_sentences = [
    "The cat sat on the mat", "The dog barked loudly", 
    "She loves programming", "The sky is blue", 
    "He enjoys reading books", "Birds are flying high", 
    "The sun is shining bright", "I like watching movies",
    "They are playing football", "He is eating dinner"
]

In [37]:
# Step 2: Preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def mask_tokens(sentences, tokenizer):
    """Tokenizes sentences and randomly masks a word in each sentence."""
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    labels = inputs.input_ids.clone()  # Keep a copy of original inputs for labels

    # Randomly select one token per sentence to be masked (except special tokens)
    mask_indices = torch.randint(1, inputs.input_ids.shape[1] - 1, (inputs.input_ids.shape[0],))
    for i in range(inputs.input_ids.shape[0]):
        labels[i, :] = -100  # Default ignore index for loss calculation
        labels[i, mask_indices[i]] = inputs.input_ids[i, mask_indices[i]]  # Only mask one token
        inputs.input_ids[i, mask_indices[i]] = tokenizer.mask_token_id  # Replace with [MASK]

    return inputs.input_ids, labels

In [38]:
# Create masked input and labels
masked_inputs, labels = mask_tokens(random_sentences, tokenizer)

In [39]:
# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(masked_inputs, labels, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [40]:
# Step 4: Train Base Model (Masked Language Modeling)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train the model
model.train()
epochs = 10  # Increase training time
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=inputs, labels=labels)
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Loss: 8.3489
Epoch 2, Loss: 6.2921
Epoch 3, Loss: 5.1332
Epoch 4, Loss: 4.3844
Epoch 5, Loss: 3.6391
Epoch 6, Loss: 2.8742
Epoch 7, Loss: 2.7872
Epoch 8, Loss: 2.6044
Epoch 9, Loss: 2.3988
Epoch 10, Loss: 2.1033


In [41]:
# Step 5: Evaluate the Model
model.eval()
with torch.no_grad():
    outputs = model(X_test.to(device))
    predictions = torch.argmax(outputs.logits, dim=-1).cpu()

    # Compute accuracy only on masked tokens
    mask_indices = (y_test != -100)  # Get positions where actual words were masked
    correct = (predictions[mask_indices] == y_test[mask_indices]).sum().item()
    total = mask_indices.sum().item()
    accuracy = correct / total if total > 0 else 0

    print(f"Accuracy on test set: {accuracy * 100:.2f}%")

Accuracy on test set: 50.00%


In [42]:
# Step 6: Predict New Data
new_sentences = ["The weather is nice today", "She is learning machine learning"]
new_inputs, _ = mask_tokens(new_sentences, tokenizer)

model.eval()
with torch.no_grad():
    new_outputs = model(new_inputs.to(device))
    new_predictions = torch.argmax(new_outputs.logits, dim=-1)
    predicted_tokens = [tokenizer.decode(new_predictions[i], skip_special_tokens=True) for i in range(len(new_sentences))]

print(f"Predicted sentences: {predicted_tokens}")

Predicted sentences: ['. the weather is nice day.', '. it is learning machine..']
