### Dataset Class

In [1]:
import torch

class Dataset(torch.utils.data.Dataset):
    """
    This class loads and preprocesses the given text data
    """
    def __init__(self, paths, tokenizer):
        """
        This function initialises the object. It takes the given paths and tokeniser.
        """
        self.paths = paths
        self.tokenizer = tokenizer
        self.data = self.read_file(self.paths[0])
        self.current_file = 0
        self.offset = 0
        self.remaining = len(self.data)
        
         # get length
        self.length = 0
        for path in self.paths: 
            print(len(self.read_file(path)))
            self.length += len(self.read_file(path))

    def __len__(self):
        """
        returns the length of the ds
        """
        return self.length
        #return 1058750 # pre-calculated length of 10M data set
        #return 10587561
    
    def read_file(self, path):
        """
        reads a given file
        """
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')
        return lines

    def get_encodings(self, lines_all):
        """
        Creates encodings for a given text input
        """
        # tokenise all text 
        batch = self.tokenizer(lines_all, max_length=128, padding='max_length', truncation=True)

        # Ground Truth
        labels = torch.tensor(batch['input_ids'])
        # Attention Masks
        mask = torch.tensor(batch['attention_mask'])

        # Input to be masked
        input_ids = labels.detach().clone()
        rand = torch.rand(input_ids.shape)

        # with a probability of 15%, mask a given word, leave out CLS, SEP and PAD
        mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 2) * (input_ids != 3)
        # assign token 4 (=MASK)
        input_ids[mask_arr] = 4
        
        return {'input_ids':input_ids, 'attention_mask':mask, 'labels':labels}

    def __getitem__(self, i):
        """
        returns item i
        Note: do not use shuffling for this dataset
        """
        # if we have looked at all items in the file - take next
        if self.remaining == 0:
            self.offset += len(self.data)
            self.current_file += 1
            # if we are at the end of the dataset, start over again
            if self.current_file == len(self.paths):
                self.current_file = 0
            # self.get_encodings(self.data)
            print("reading {}".format(self.paths[self.current_file]))
            self.data = self.read_file(self.paths[self.current_file])
            self.remaining = len(self.data)
        
        # reset offset when i is reset
        if i == 0:
            self.offset = 0
        
        self.remaining -= 1

        encodings = self.get_encodings(self.data[i - self.offset])

        return encodings 

### Set up electra tokenizer

In [2]:
from pathlib import Path
from transformers import ElectraTokenizerFast
from transformers import ElectraConfig
from transformers import ElectraForMaskedLM

tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')

tokens = tokenizer('Hello, how are you?')
print(tokens)
# {'input_ids': [2, 21694, 16, 2287, 2009, 1991, 35, 3],
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

tokenizer.decode(tokens['input_ids'])
# '[CLS] hello, how are you? [SEP]'

{'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] hello, how are you? [SEP]'

### Grab all data

In [4]:
# load dataset files one by one
paths = [str(x) for x in Path('train_10M').glob('**/*.train')]
ds = Dataset(paths, tokenizer=tokenizer)
# tokenize data with batch size 16
loader = torch.utils.data.DataLoader(ds, batch_size=16)

90001
580001
66015
360001
65001
18001


### Get Electra Config

In [5]:
import requests

# URL to the config file
config_url = "https://huggingface.co/bsu-slim/electra-tiny/resolve/main/config.json"

# Download the config file
response = requests.get(config_url)

# Load the JSON content into ElectraConfig using .from_dict
config = ElectraConfig.from_dict(response.json())
model = ElectraForMaskedLM(config)
optim=torch.optim.Adam(model.parameters())

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 64, padding_idx=0)
      (position_embeddings): Embedding(512, 64)
      (token_type_embeddings): Embedding(2, 64)
      (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=64, out_features=196, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-17): 18 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=196, out_features=196, bias=True)
              (key): Linear(in_features=196, out_features=196, bias=True)
              (value): Linear(in_features=196, out_features=196, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=19

In [6]:
config

ElectraConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "electra_owt_full_b256_hs196_ah4_is128_l18_es64_vs30522_pytorch",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 64,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 196,
  "initializer_range": 0.02,
  "intermediate_size": 128,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 18,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [7]:
from tqdm.notebook import tqdm
import numpy as np

epochs = 10

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)

    # set model to training mode
    model.train()
    losses = []

    # iterate over dataset
    for batch in loop:
        optim.zero_grad()

        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # predict
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # update weights
        loss = outputs.loss
        loss.backward()

        optim.step()

        # output current loss
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        losses.append(loss.item())

    print("Mean Training Loss", np.mean(losses))
    losses = []
    loop = tqdm(test_loader, leave=True)

    # set model to evaluation mode
    model.eval()

    # iterate over dataset
    for batch in loop:
        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # predict
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # update weights
        loss = outputs.loss

        # output current loss
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        losses.append(loss.item())
    print("Mean Test Loss", np.mean(losses))

  0%|          | 0/73689 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train


NameError: name 'np' is not defined

In [None]:
# save the pretrained model
torch.save(model, "electra_tiny.model")

In [None]:
model = torch.load("electra_tiny.model")