# BERT with Masked Language Modeling and Named Entity Recognition
---

## Masked Language Model

### Define libraries and dataset

In [None]:
!pip install datasets transformers torch tqdm

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForMaskedLM
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import requests
import re

In [None]:
# Download the book straight from Project Gutenberg (no local file needed).
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
# A timeout stops the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
print("Downloaded the book successfully!")

# Extract the main content
text = response.text
print("Raw text length:", len(text))


def _marker_regex(marker):
    """Compile *marker* so any whitespace run (including a line wrap) may separate its words."""
    return re.compile(r"\s+".join(re.escape(word) for word in marker.split()))


# Locate the novel itself: from its opening sentence through its closing words.
start_marker = "It is a truth universally acknowledged"
end_marker = "had been the means of uniting them."
start_match = _marker_regex(start_marker).search(text)
end_matches = list(_marker_regex(end_marker).finditer(text))  # keep the LAST occurrence
if start_match is None or not end_matches:
    # str.find()/rfind() would return -1 here and silently slice the wrong span.
    raise ValueError("Could not locate the start/end markers in the downloaded text.")

start_index = start_match.start()
# Use the END of the closing marker so the final sentence is not cut off.
end_index = end_matches[-1].end()
if end_index <= start_index:
    raise ValueError("End marker was found before the start marker.")
clean_text = text[start_index:end_index].strip()

# Remove unwanted formatting using regex
clean_text = re.sub(r"Heading to", "", clean_text)  # Remove 'Heading to'
# NOTE(review): '.' does not match newlines here, so bracketed spans that wrap
# across lines are left in place -- confirm whether that is intended.
clean_text = re.sub(r"\[.*?\]", "", clean_text)  # Remove content inside square brackets
clean_text = re.sub(r"\d+", "", clean_text)  # Remove numbers
clean_text = re.sub(r"\s+", " ", clean_text).strip()  # Normalize spaces

print("Cleaned text length:", len(clean_text))
print(clean_text[:100])

In [None]:
# Split the cleaned text into fixed-size chunks of whitespace-separated words
def chunk_text(text, chunk_size=512):
    """Return *text* split into strings of at most *chunk_size* words each.

    Words are whatever ``str.split()`` yields; within each chunk they are
    re-joined with single spaces. The last chunk may be shorter.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start : start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Words per chunk; the same value is reused as the tokenizer's max_length.
# NOTE(review): 512 words will usually tokenize to more than 512 tokens, so the
# truncation applied at tokenization time likely drops the tail of each chunk --
# confirm this is acceptable.
chunk_size = 512
text_chunks = chunk_text(text=clean_text, chunk_size=chunk_size)

# Wrap the chunks in a Hugging Face Dataset with a single "text" column
dataset = Dataset.from_dict(dict(text=text_chunks))

### Define Tokenizer

In [None]:
# Start from the pretrained BERT tokenizer and retrain its vocabulary on the book
base_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
new_tokenizer = base_tokenizer.train_new_from_iterator(
    dataset["text"],
    vocab_size=30522,
)

# Persist the retrained tokenizer to disk
new_tokenizer.save_pretrained("./custom_tokenizer")


def tokenize_function(examples):
    """Tokenize a batch's "text" column, truncating/padding to ``chunk_size`` tokens."""
    return new_tokenizer(
        examples["text"],
        max_length=chunk_size,
        padding="max_length",
        truncation=True,
    )


# Apply the tokenizer to the whole dataset, one batch of rows at a time
tokenized_data = dataset.map(tokenize_function, batched=True)

print("Tokenization complete.")

In [None]:
# Print the tokenized dataset object as a quick sanity check of the mapping step.
print(tokenized_data)

### Preprocessing for training loop

In [None]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

In [None]:
# Examples per batch; also used by the DataLoaders created after the split
batch_size = 8

# Expose only the model inputs, returned as PyTorch tensors
tokenized_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Collator that applies masked-language-modeling masking to each batch
data_collator = DataCollatorForLanguageModeling(
    tokenizer=new_tokenizer,
    mlm=True,
    mlm_probability=0.15,  # mask 15% of tokens
)

# Loader over the FULL dataset; the split cell that follows rebinds
# train_dataloader to the training subset only.
train_dataloader = DataLoader(
    tokenized_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

print("Data preprocessing complete. Ready for training!")


Split the data into training and validation sets before training.

In [None]:
# 90% of the examples go to training, the remaining 10% to validation
train_size = 0.9

# Split example indices with a fixed seed so the split is reproducible
dataset_size = len(tokenized_data)
train_indices, val_indices = train_test_split(
    range(dataset_size),
    train_size=train_size,
    random_state=42,
)

# Index-based views over the tokenized dataset
train_dataset = Subset(tokenized_data, train_indices)
val_dataset = Subset(tokenized_data, val_indices)

# Both loaders share batch size and collator; only training is shuffled
loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print(f"Training samples: {len(train_dataset)} | Validation samples: {len(val_dataset)}")
print("Data split complete. Ready for model training!")


### Load Pretrained BERT Model

In [None]:
# Load the pretrained BERT checkpoint with its masked-language-modeling head
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# The tokenizer was retrained on the book, so its vocabulary size can differ
# from the checkpoint's. Align the embedding/output size with the tokenizer so
# the model can only predict ids that the tokenizer can decode.
# NOTE(review): the retrained tokenizer assigns ids independently of the
# checkpoint's vocabulary, so the pretrained embedding rows no longer line up
# with the same tokens. Resizing only aligns the sizes -- confirm whether the
# original "bert-base-uncased" tokenizer should be used instead.
model.resize_token_embeddings(len(new_tokenizer))

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded and moved to:", device)


### Setup Optimizer and Scheduler

In [None]:
# Number of passes over the training set, and the optimizer for all parameters
epochs = 3
optimizer = AdamW(model.parameters(), lr=5e-5)

# One scheduler step per batch: linear schedule with no warmup
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print(f"Total training steps: {num_training_steps}")

### Training loop

In [None]:
# Switch to training mode
model.train()

# One tick per optimizer step across all epochs
progress_bar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    for raw_batch in train_dataloader:
        # Move every tensor in the batch onto the model's device
        batch = {}
        for key, tensor in raw_batch.items():
            batch[key] = tensor.to(device)

        # Forward pass; the model output carries the loss
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, then optimizer and scheduler updates
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  # clear gradients before the next batch

        progress_bar.update(1)

print("Training complete!")

### Evaluate Model Performance

In [None]:
# Switch to evaluation mode
model.eval()
total_loss = 0

# No gradients are needed while scoring the validation set
with torch.no_grad():
    for raw_batch in val_dataloader:
        batch = {key: tensor.to(device) for key, tensor in raw_batch.items()}
        outputs = model(**batch)
        total_loss += outputs.loss.item()

# Mean of the per-batch losses
avg_loss = total_loss / len(val_dataloader)
print(f"Final Validation Loss: {avg_loss:.4f}")


### Save the Trained Model

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
save_dir = "./bert_mlm_model"
for artifact in (model, new_tokenizer):
    artifact.save_pretrained(save_dir)
print("Model saved successfully!")


### Test predictions

In [None]:
from transformers import pipeline

# Sentence containing one [MASK] token for the model to fill in
test_sentence = "I would wish not to be hasty in censuring anyone; but I always [MASK] what I think."

# Fill-mask pipeline built from the fine-tuned model and the retrained tokenizer
fill_mask = pipeline("fill-mask", model=model, tokenizer=new_tokenizer)

# Print each candidate completion with its score
predictions = fill_mask(test_sentence)
for pred in predictions:
    sequence, score = pred["sequence"], pred["score"]
    print(f"{sequence} (Score: {score:.4f})")
