# Installing Transformers

In [None]:
# Install the development build of transformers directly from its GitHub repository.
!pip install git+https://github.com/huggingface/transformers.git
# Hugging Face datasets library (used below via load_dataset).
!pip install datasets
# Requests transformers again (already installed by the first line) together with torch.
!pip install transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-zfaiz559
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-zfaiz559
  Resolved https://github.com/huggingface/transformers.git to commit 5b28b7833297adf65c5160a685425ddb1eee5ce2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.many

In [None]:
import os
import torch

# Ask CUDA to run kernel launches synchronously so that device-side errors
# are reported at the call that caused them.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [None]:
from datasets import list_datasets, load_dataset, DatasetDict
from collections import Counter
from typing import List, Dict, Union, Callable, Any
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pprint import pprint
import torch
import torch.nn as nn

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


# Data preprocessing
Loading the data and appending the end-of-sequence token `</s>` to every line

In [None]:
# Load the train/test splits of the child-stories dataset via Hugging Face `datasets`.
wiki_ds = load_dataset('Sree1994/babylm_childstories')  # NOTE: despite the `wiki_ds` name, this is the BabyLM child-stories dataset
# Keep only the "text" field of each split.
train = wiki_ds["train"]["text"]
test = wiki_ds["test"]["text"]
# Number of raw lines in each split.
print(len(train), len(test))

Downloading readme:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/Sree1994___parquet/Sree1994--babylm_childstories-3b1a91b15b07a2d3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/934k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/240k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/4800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/Sree1994___parquet/Sree1994--babylm_childstories-3b1a91b15b07a2d3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

4800 1200


Splitting the corpus into sequences of at most 256 words each (lines longer than 256 words are cut into 256-word chunks; shorter lines are kept whole)

In [None]:
max_seqlen = 256


def split_sentence(sentence, chunk_size):
    """Cut a whitespace-separated string into pieces of at most `chunk_size` words.

    Args:
        sentence: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per returned piece.

    Returns:
        A list of strings, each holding up to `chunk_size` words joined by a
        single space. An empty or whitespace-only input yields an empty list.
    """
    tokens = sentence.split()
    return [
        ' '.join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

def _chunk_lines(lines, limit):
    """Return `lines` with every line longer than `limit` words split into chunks.

    Lines with at most `limit` words are kept exactly as they are (including
    surrounding whitespace and empty lines).
    """
    pieces = []
    for text in lines:
        if len(text.split()) > limit:
            pieces.extend(split_sentence(text, limit))
        else:
            pieces.append(text)
    return pieces


def _with_eos(lines):
    """Strip each line, drop the blank ones, and append the ' </s>' marker."""
    return [f"{text.strip()} </s>" for text in lines if text.strip() != '']


# Word-limited lines for each split.
train_data = _chunk_lines(train, max_seqlen)
test_data = _chunk_lines(test, max_seqlen)

# Cleaned lines terminated by the end-of-sequence marker.
traindata = _with_eos(train_data)
testdata = _with_eos(test_data)

# Tokenizer and model selection

In [None]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Flat streams of token ids for each split (all lines concatenated).
train_tokens = []
test_tokens = []

# Tokenize the cleaned, "</s>"-terminated lines for BOTH splits. Previously the
# training stream was built from `train_data` (unstripped, no "</s>" suffix,
# blank lines included) while the test stream used `testdata`, so the two
# splits were preprocessed differently.
# NOTE(review): encode() adds the tokenizer's own special tokens by default, so
# together with the literal " </s>" suffix each line probably ends with two
# end-of-sequence ids — confirm this is intended.
for line in traindata:
  ids = tokenizer.encode(line)
  train_tokens.extend(ids)

for line in testdata:
  ids = tokenizer.encode(line)
  test_tokens.extend(ids)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Creating X and Y: for each input token x_i, the target y_i is the next token x_{i+1} (Y is X shifted left by one position)

In [None]:
chunk_size = max_seqlen


def _shifted_chunks(tokens, size):
    """Cut a token stream into aligned (input, next-token target) chunks.

    Input chunk k is ``tokens[k*size : (k+1)*size]`` and its target chunk is
    the same window shifted right by one token.

    Only windows that have at least one target token are produced, so the two
    returned lists always have the same length. (Building them with two
    independent loops left a dangling one-token input chunk with no target
    whenever ``len(tokens) % size == 1``.)

    Args:
        tokens: Flat list of token ids.
        size: Maximum number of tokens per chunk.

    Returns:
        A tuple ``(inputs, targets)`` of equally long lists of token-id lists.
        The final pair may be shorter than ``size``.
    """
    inputs, targets = [], []
    # Stop at len(tokens) - 1: a window starting on the very last token has
    # nothing left to predict.
    for start in range(0, len(tokens) - 1, size):
        inputs.append(tokens[start:start + size])
        targets.append(tokens[start + 1:start + 1 + size])
    return inputs, targets


x_train, y_train = _shifted_chunks(train_tokens, chunk_size)
x_test, y_test = _shifted_chunks(test_tokens, chunk_size)

# Spot-check that an input chunk and its target chunk have matching lengths.
len(x_train[704]), len(y_train[704])

(256, 256)

Creating Datasets and DataLoaders

In [None]:
from torch.utils.data import DataLoader, Dataset

#hyperparameters
batch_size=4
max_len=256

# from torch.utils.data import Dataset, DataLoader
# class imdb(Dataset):
#   def __init__(self, data: pd.DataFrame, vocab_dict, max_len):
#     self.data = data
#     self.vocab_dict = vocab_dict
#     self.default = self.vocab_dict['<pad>']
#     self.max_len = max_len

#   def tokenize(self, text: list):
#     return [word for word in text]

#   def encode_tokens(self, tokens):
#     encoded = [self.vocab_dict.get(token, self.default) for token in tokens]
#     encoded += [0 for _ in range(self.max_len-len(tokens))]
#     return torch.tensor(encoded, device=device)

#   def encode_label(self, label: str):
#     return torch.tensor(0, device=device) if label == 'neg' else torch.tensor(1, device=device)    
  
#   def __getitem__(self, n: int):
#     textstr = self.data['text'].iloc[n]
#     classes = self.data['label'].iloc[n]
#     return self.encode_tokens(self.tokenize(textstr)), self.encode_label(classes)

#   def __len__(self):
#     return len(self.data)

class BLM_wiki(Dataset):
  """Dataset of (input ids, target ids) pairs padded to a fixed length.

  Args:
    x: List of input token-id lists.
    y: List of target token-id lists, one per entry of `x`.
    max_len: Length every sequence is right-padded to. Sequences that are
      already at least this long are returned unchanged.
    pad_token_id: Id used to pad input sequences. Defaults to 0, the value
      this class has always used. NOTE(review): for roberta-base, id 0 is
      `<s>` and `<pad>` is 1 — consider passing `tokenizer.pad_token_id`.
    label_pad_id: Id used to pad target sequences. Defaults to `pad_token_id`.
      Passing -100 makes `nn.CrossEntropyLoss` (default `ignore_index`) skip
      the padded positions.

  Raises:
    ValueError: If `x` and `y` do not contain the same number of sequences.
  """

  def __init__(self, x: list, y: list, max_len: int,
               pad_token_id: int = 0, label_pad_id=None):
    # Fail early instead of raising IndexError in the middle of an epoch.
    if len(x) != len(y):
      raise ValueError(
          f"x and y must have the same number of sequences, got {len(x)} and {len(y)}")
    self.x = x
    self.y = y
    self.max_len = max_len
    self.pad_token_id = pad_token_id
    self.label_pad_id = pad_token_id if label_pad_id is None else label_pad_id

  def encode_tokens(self, tokens, pad_value=None):
    """Return `tokens` right-padded to `max_len` as a tensor on `device`.

    The padding is applied to a copy, so the list stored in the dataset is
    never modified (in-place `+=` used to grow the caller's lists).

    Args:
      tokens: List of token ids.
      pad_value: Padding id; defaults to `self.pad_token_id`.
    """
    fill = self.pad_token_id if pad_value is None else pad_value
    # A negative repeat count yields an empty list, so over-long input is kept as is.
    padded = list(tokens) + [fill] * (self.max_len - len(tokens))
    return torch.tensor(padded, device=device)

  def __getitem__(self, n: int):
    """Return the n-th (padded input tensor, padded target tensor) pair."""
    ip_seq = self.x[n]
    op_seq = self.y[n]
    return self.encode_tokens(ip_seq), self.encode_tokens(op_seq, self.label_pad_id)

  def __len__(self):
    """Number of sequence pairs in the dataset."""
    return len(self.x)


# Wrap the shifted token chunks in datasets that pad every sequence to max_len.
train_dataset = BLM_wiki(x_train, y_train, max_len)
test_dataset = BLM_wiki(x_test, y_test, max_len)

# Batch both splits; iterating a loader yields (inputs, targets) pairs.
# NOTE(review): shuffle=True on the test loader is not needed for evaluation — confirm it is intended.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
# Show one test batch as the cell output to sanity-check the tensors.
next(iter(test_dataloader))

[tensor([[   37, 12960,     7,  ..., 28960,     6, 15092],
         [    4,    17,    46,  ...,     6,    53, 36167],
         [   45,  2509,    11,  ...,     8,    11,    10],
         [    5,   514,    21,  ..., 16634,  2508,     6]]),
 tensor([[12960,     7,  1095,  ...,     6, 15092,     8],
         [   17,    46,  1437,  ...,    53, 36167,  7015],
         [ 2509,    11,  5151,  ...,    11,    10,  2664],
         [  514,    21,    25,  ...,  2508,     6,    38]])]

In [None]:
# Fetch one shuffled training batch: x[0] is the input tensor, x[1] the target tensor.
x = next(iter(train_dataloader))
x[0].shape  # evaluated but not displayed (not the last expression of the cell)
row = x[0].size(0)  # size of the first dimension of the input tensor
col = x[0].size(1)  # size of the second dimension of the input tensor
row, col  # likewise evaluated but not displayed
# for j in range(0, col):
#   for i in range(0, row):

# Print the first token id of the first two input sequences in the batch.
print(torch.stack((x[0][0][0], x[0][1][0])))

tensor([1892,    2])


In [None]:
# Smoke test: walk through every batch of both loaders once so that any
# indexing or collation error surfaces here rather than during training.
# (Calling next(iter(loader)) inside a counted loop built a brand-new iterator
# on every step and therefore only ever fetched the first batch of a fresh
# shuffle — the loaders were never actually traversed.)
for _ in train_dataloader:
  pass
for _ in test_dataloader:
  pass

# RoBERTa custom configuration (not used for now)

In [None]:
from transformers import RobertaConfig, RobertaModel
# Initializing a RoBERTa configuration
# NOTE(review): token ids above 20_000 appear in the batches displayed earlier in this
# notebook, so vocab_size=20_000 looks too small for ids produced by the roberta-base
# tokenizer — confirm before feeding those batches to a model built from this config.
config = RobertaConfig(
    vocab_size=20_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# configuration = RobertaConfig()
# Initializing a model (with random weights) from the configuration
# NOTE: `model` is reassigned by the from_pretrained(...) call in the next section,
# so this randomly initialized instance is not the one that gets trained.
model = RobertaForMaskedLM(config)
# Accessing the model configuration
configuration = model.config

# Model, Loss Criterion, Optimizer

In [None]:
# Model definition
# class RobertaEncoder(nn.Module):
#     def __init__(self):
#         super(RobertaEncoder, self).__init__()
#         self.roberta = RobertaForMaskedLM.from_pretrained('roberta-base')

#     def forward(self, input_ids):
#         outputs = self.roberta(input_ids)
#         return outputs.last_hidden_state.mean(dim=1)

# Load the pretrained roberta-base checkpoint with its masked-LM head.
model = RobertaForMaskedLM.from_pretrained('roberta-base')
# Re-derive the device as a torch.device (replaces the plain string assigned earlier).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Cross-entropy between the vocabulary logits and the target token ids.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a small fine-tuning learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

# Training, Evaluation and Perplexity Calculation

In [None]:

# train model
# NOTE(review): RobertaForMaskedLM attends bidirectionally, so the target for
# position i (the token at i+1) is visible in the input — TODO confirm this
# next-token objective is intended; a causal LM head may be more appropriate.
model.train()
for epoch in range(3):
    for batch_x, batch_y in train_dataloader:
        optimizer.zero_grad()
        # Move both tensors to the model's device and use the moved copies
        # everywhere below (the loss previously used the un-moved batch_y).
        input_ids = batch_x.to(model.device)
        output_ids = batch_y.to(model.device)
        # No `labels=` here: the loss is computed once, explicitly, with
        # `criterion`, instead of also being computed (and discarded) inside
        # the model.
        outputs = model(input_ids=input_ids)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets for cross-entropy.
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), output_ids.view(-1))
        loss.backward()
        optimizer.step()

# calculate perplexity on test data
model.eval()
total_loss = 0.0   # sum of per-token losses over the whole test set
total_tokens = 0   # number of target tokens that contributed to the loss
with torch.no_grad():
    for batch_x, batch_y in test_dataloader:
        input_ids = batch_x.to(model.device)
        output_ids = batch_y.to(model.device)
        outputs = model(input_ids=input_ids)
        # `criterion` returns the MEAN loss over the non-ignored target tokens of the batch.
        loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), output_ids.view(-1))
        # Turn the mean back into a sum by weighting with the number of scored
        # tokens. Weighting by the batch size (number of sequences) and then
        # dividing by the token count shrank the average loss by a factor of
        # the sequence length, giving a perplexity close to 1.
        n_tokens = int((output_ids != criterion.ignore_index).sum())
        if n_tokens == 0:
            # Nothing was scored in this batch (mean loss would be NaN).
            continue
        total_loss += loss.item() * n_tokens
        total_tokens += n_tokens

# Perplexity = exp(average per-token cross-entropy).
perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
print("Perplexity:", perplexity)

# Unused scratch work

In [None]:
# import torch
# from transformers import RobertaTokenizer, RobertaForMaskedLM
# from torch.utils.data import Dataset, DataLoader

# # define training dataset
# class MyDataset(Dataset):
#     def __init__(self, texts, tokenizer):
#         self.texts = texts
#         self.tokenizer = tokenizer
    
#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         tokens = self.tokenizer.encode(text, add_special_tokens=True)
#         return torch.tensor(tokens)

# # initialize tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# model = RobertaForMaskedLM.from_pretrained('roberta-base')

# # set up optimizer and loss function
# optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# criterion = torch.nn.CrossEntropyLoss()

# # set up data loader
# texts = ["example sentence 1", "example sentence 2"]
# dataset = MyDataset(texts, tokenizer)
# dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# # train model
# model.train()
# for epoch in range(10):
#     for batch in dataloader:
#         optimizer.zero_grad()
#         input_ids = batch.to(model.device)
#         outputs = model(input_ids=input_ids, labels=input_ids)
#         loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), input_ids.view(-1))
#         loss.backward()
#         optimizer.step()

# # calculate perplexity on test data
# model.eval()
# test_texts = ["test sentence 1", "test sentence 2"]
# test_dataset = MyDataset(test_texts, tokenizer)
# test_dataloader = DataLoader(test_dataset, batch_size=8)
# total_loss = 0
# total_tokens = 0
# with torch.no_grad():
#     for batch in test_dataloader:
#         input_ids = batch.to(model.device)
#         outputs = model(input_ids=input_ids, labels=input_ids)
#         loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), input_ids.view(-1))
#         total_loss += loss.item() * input_ids.size(0)
#         total_tokens += input_ids.size(0) * input_ids.size(1)

# perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
# print("Perplexity:", perplexity)