In [1]:
pip install torch transformers datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
from torch.utils.data import DataLoader
import os

# Suppress symlink warnings
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Load a smaller subset of Wikipedia data
dataset = load_dataset("wikipedia", "20220301.en", split="train[:1%]", trust_remote_code=True)

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

# Tokenize data in batches
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask"])

# Data loader preparation
batch_size = 8
train_dataloader = DataLoader(tokenized_datasets, batch_size=batch_size, shuffle=True)

# Define training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 1  # For demonstration; typically, more epochs are used
model.train()

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["input_ids"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Testing the Model
model.eval()

test_sentence = "The capital of France is [MASK]."
inputs = tokenizer(test_sentence, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits

masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
predicted_id = predictions[0, masked_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_id)

print(f"Test sentence: {test_sentence}")
print(f"Predicted token for [MASK]: {predicted_token}")

# Saving the model and tokenizer to Google Drive
save_path = '/content/snehashish_model/pretrained_wiki_model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model pretrained, tested, and saved to Google Drive successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

train-00000-of-00041.parquet:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

train-00001-of-00041.parquet:   0%|          | 0.00/705M [00:00<?, ?B/s]

train-00002-of-00041.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00003-of-00041.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00004-of-00041.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00005-of-00041.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

train-00006-of-00041.parquet:   0%|          | 0.00/366M [00:00<?, ?B/s]

train-00007-of-00041.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

train-00008-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00009-of-00041.parquet:   0%|          | 0.00/312M [00:00<?, ?B/s]

train-00010-of-00041.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00011-of-00041.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00012-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00013-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

train-00014-of-00041.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00015-of-00041.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

train-00016-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00017-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00018-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00019-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00020-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00021-of-00041.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00022-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00023-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00024-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00025-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

train-00026-of-00041.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00027-of-00041.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

train-00028-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00029-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

train-00030-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00031-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00032-of-00041.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00034-of-00041.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00035-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00036-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00037-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00038-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00039-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00040-of-00041.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/64587 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 1, Loss: 0.00011279949831077829
Epoch 1, Loss: 0.00013447094534058124
Epoch 1, Loss: 0.0001721275330055505
Epoch 1, Loss: 0.00030622543999925256
Epoch 1, Loss: 9.93839930742979e-05
Epoch 1, Loss: 8.39512431412004e-05
Epoch 1, Loss: 0.002063441788777709
Epoch 1, Loss: 0.00027953143580816686
Epoch 1, Loss: 0.00012119608436478302
Epoch 1, Loss: 7.056500180624425e-05
Epoch 1, Loss: 0.00010490403656149283
Epoch 1, Loss: 7.493332668673247e-05
Epoch 1, Loss: 7.964578253449872e-05
Epoch 1, Loss: 6.793598004151136e-05
Epoch 1, Loss: 8.810351573629305e-05
Epoch 1, Loss: 0.00019968532433267683
Epoch 1, Loss: 0.00010989278234774247
Epoch 1, Loss: 5.873338523088023e-05
Epoch 1, Loss: 6.388216570485383e-05
Epoch 1, Loss: 0.00010190640023211017
Epoch 1, Loss: 8.197835268219933e-05
Epoch 1, Loss: 0.00031365727772936225
Epoch 1, Loss: 0.0008913861238397658
Epoch 1, Loss: 0.0002912449708674103
Epoch 1, Loss: 0.00016133871395140886
Ep

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Define model path
model_path = "/content/snehashish_model/pretrained_wiki_model"

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# List of 20 questions with a masked token
questions = [
    "The capital of France is [MASK].",
    "The largest planet in the solar system is [MASK].",
    "The chemical symbol for water is [MASK].",
    "The Great Wall is located in [MASK].",
    "The smallest country in the world is [MASK].",
    "The most abundant gas in Earth's atmosphere is [MASK].",
    "The speed of light is approximately [MASK] km/s.",
    "The primary language spoken in Brazil is [MASK].",
    "The currency used in Japan is [MASK].",
    "The Nobel Peace Prize is awarded in [MASK].",
    "The human body typically has [MASK] bones.",
    "The Eiffel Tower is located in [MASK].",
    "The atomic number of hydrogen is [MASK].",
    "The square root of 64 is [MASK].",
    "The largest ocean on Earth is the [MASK] Ocean.",
    "The highest mountain in the world is [MASK].",
    "The currency of the United States is the [MASK].",
    "The capital of Italy is [MASK].",
    "The longest river in the world is the [MASK].",
    "The author of 'Pride and Prejudice' is [MASK]."
]

# Generate answers for each question
model.eval()  # Set model to evaluation mode
for question in questions:
    inputs = tokenizer(question, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits

    # Get the predicted token
    masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    predicted_id = predictions[0, masked_index].argmax(dim=-1)
    predicted_token = tokenizer.decode(predicted_id)

    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_token}\n")


Question: The capital of France is [MASK].
Predicted Answer: nearby

Question: The largest planet in the solar system is [MASK].
Predicted Answer: [PAD]

Question: The chemical symbol for water is [MASK].
Predicted Answer: [PAD]

Question: The Great Wall is located in [MASK].
Predicted Answer: scotland

Question: The smallest country in the world is [MASK].
Predicted Answer: australia

Question: The most abundant gas in Earth's atmosphere is [MASK].
Predicted Answer: [PAD]

Question: The speed of light is approximately [MASK] km/s.
Predicted Answer: 100

Question: The primary language spoken in Brazil is [MASK].
Predicted Answer: english

Question: The currency used in Japan is [MASK].
Predicted Answer: [PAD]

Question: The Nobel Peace Prize is awarded in [MASK].
Predicted Answer: october

Question: The human body typically has [MASK] bones.
Predicted Answer: no

Question: The Eiffel Tower is located in [MASK].
Predicted Answer: scotland

Question: The atomic number of hydrogen is [MAS

In [5]:
!zip -r pretrained_wiki_model.zip /content/snehashish_model/pretrained_wiki_model


  adding: content/snehashish_model/pretrained_wiki_model/ (stored 0%)
  adding: content/snehashish_model/pretrained_wiki_model/config.json (deflated 47%)
  adding: content/snehashish_model/pretrained_wiki_model/tokenizer.json (deflated 71%)
  adding: content/snehashish_model/pretrained_wiki_model/model.safetensors (deflated 7%)
  adding: content/snehashish_model/pretrained_wiki_model/special_tokens_map.json (deflated 42%)
  adding: content/snehashish_model/pretrained_wiki_model/tokenizer_config.json (deflated 76%)
  adding: content/snehashish_model/pretrained_wiki_model/vocab.txt (deflated 53%)
  adding: content/snehashish_model/pretrained_wiki_model/generation_config.json (deflated 8%)


In [1]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful