### Dataset Class

In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

class Dataset(torch.utils.data.Dataset):
    """
    This class loads and preprocesses the given text data
    """
    def __init__(self, paths, tokenizer):
        """
        This function initialises the object. It takes the given paths and tokeniser.
        """
        self.paths = paths
        self.tokenizer = tokenizer
        self.data = self.read_file(self.paths[0])
        self.current_file = 0
        self.offset = 0
        self.remaining = len(self.data)
        
         # get length
        self.length = 0
        for path in self.paths: 
            print(len(self.read_file(path)))
            self.length += len(self.read_file(path))

    def __len__(self):
        """
        returns the length of the ds
        """
        return self.length
        #return 1058750 # pre-calculated length of 10M data set
        #return 10587561
    
    def read_file(self, path):
        """
        reads a given file
        """
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n')
        return lines

    def get_encodings(self, lines_all):
        """
        Creates encodings for a given text input
        """
        # tokenise all text 
        batch = self.tokenizer(lines_all, max_length=128, padding='max_length', truncation=True)

        # Ground Truth
        labels = torch.tensor(batch['input_ids'])
        # Attention Masks
        mask = torch.tensor(batch['attention_mask'])

        # Input to be masked
        input_ids = labels.detach().clone()
        rand = torch.rand(input_ids.shape)

        # with a probability of 15%, mask a given word, leave out CLS, SEP and PAD
        mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 2) * (input_ids != 3)
        # assign token 4 (=MASK)
        input_ids[mask_arr] = 4
        
        return {'input_ids':input_ids, 'attention_mask':mask, 'labels':labels}

    def __getitem__(self, i):
        """
        returns item i
        Note: do not use shuffling for this dataset
        """
        # if we have looked at all items in the file - take next
        if self.remaining == 0:
            self.offset += len(self.data)
            self.current_file += 1
            # if we are at the end of the dataset, start over again
            if self.current_file == len(self.paths):
                self.current_file = 0
            # self.get_encodings(self.data)
            print("reading {}".format(self.paths[self.current_file]))
            self.data = self.read_file(self.paths[self.current_file])
            self.remaining = len(self.data)
        
        # reset offset when i is reset
        if i == 0:
            self.offset = 0
        
        self.remaining -= 1

        encodings = self.get_encodings(self.data[i - self.offset])

        return encodings 

cuda


### Set up electra tokenizer

In [2]:
from pathlib import Path
from transformers import ElectraTokenizerFast, ElectraModel, ElectraConfig, ElectraForMaskedLM

tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')

tokens = tokenizer('Hello, how are you?')
print(tokens)
# {'input_ids': [2, 21694, 16, 2287, 2009, 1991, 35, 3],
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

tokenizer.decode(tokens['input_ids'])
# '[CLS] hello, how are you? [SEP]'

{'input_ids': [101, 7592, 1010, 2129, 2024, 2017, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] hello, how are you? [SEP]'

### Grab all data

In [10]:
# load dataset files one by one
paths = [str(x) for x in Path('train_10M').glob('**/*.train')]
ds = Dataset(paths, tokenizer=tokenizer)
# tokenize data with batch size 16
loader = torch.utils.data.DataLoader(ds, batch_size=32)

90001
580001
66015
360001
65001
18001


In [11]:
import numpy as np

i = iter(ds)

for j in range(10):
    sample = next(i)
    
    input_ids = sample['input_ids']
    attention_masks = sample['attention_mask']
    labels = sample['labels']
    
    # check if the dimensions are right
    assert input_ids.shape[0] == (128)
    assert attention_masks.shape[0] == (128)
    assert labels.shape[0] == (128)
    
    # if the input ids are not masked, the labels are the same as the input ids
    assert np.array_equal(input_ids[input_ids != 4].numpy(),labels[input_ids != 4].numpy())
    # input ids are zero if the attention masks are zero
    assert np.all(input_ids[attention_masks == 0].numpy()==0)
    # check if input contains masked tokens (we can't guarantee this 100% but this will apply) most likely
    #assert np.any(input_ids.numpy() == 4)
print("Passed")

Passed


### Get Electra Config

In [12]:
import requests

# URL to the config file
config_url = "https://huggingface.co/bsu-slim/electra-tiny/resolve/main/config.json"

# Download the config file
response = requests.get(config_url)

# Load the JSON content into ElectraConfig using .from_dict
config = ElectraConfig.from_dict(response.json())
model = ElectraForMaskedLM(config)
optim=torch.optim.Adam(model.parameters())

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 64, padding_idx=0)
      (position_embeddings): Embedding(512, 64)
      (token_type_embeddings): Embedding(2, 64)
      (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=64, out_features=196, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-17): 18 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=196, out_features=196, bias=True)
              (key): Linear(in_features=196, out_features=196, bias=True)
              (value): Linear(in_features=196, out_features=196, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=19

In [13]:
config

ElectraConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "electra_owt_full_b256_hs196_ah4_is128_l18_es64_vs30522_pytorch",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 64,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 196,
  "initializer_range": 0.02,
  "intermediate_size": 128,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 18,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [14]:
from tqdm.notebook import tqdm
from torch.optim import AdamW
import numpy as np

In [15]:
# we use AdamW as the optimiser
optim = AdamW(model.parameters(), lr=1e-4)

In [16]:
epochs = 10

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)

    # set model to training mode
    model.train()
    losses = []

    # iterate over dataset
    for batch in loop:
        optim.zero_grad()

        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # predict
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # update weights
        loss = outputs.loss
        loss.backward()

        optim.step()

        # output current loss
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        losses.append(loss.item())

    print("Mean Training Loss", np.mean(losses))

  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.24338584062980162


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.0801780707593125


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.07501595280145068


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.07169152818006169


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.068760689341164


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.0663094697193877


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.06451682619906615


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.06279580554748


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.061605543963385405


  0%|          | 0/36845 [00:00<?, ?it/s]

reading train_10M\childes.train
reading train_10M\gutenberg.train
reading train_10M\open_subtitles.train
reading train_10M\simple_wiki.train
reading train_10M\switchboard.train
reading train_10M\bnc_spoken.train
Mean Training Loss 0.06059068696942291


In [17]:
# save the pretrained model
torch.save(model.state_dict(), "pytorch_model.bin")

In [18]:
# save config
config.to_json_file("config.json")

### Evaluation and Fine Tuning

#### Load Data

In [21]:
X = [line.strip() for line in open('X.txt').readlines()]
y = train_data = [int(line.strip()) for line in open('YL1.txt').readlines()]

train_X = X[:46000]
train_y = np.array(y[:46000])
test_X = X[46000:]
test_y = np.array(y[46000:])

labels = {
    0:'Computer Science',
    1:'Electrical Engineering',
    2:'Psychology',
    3:'Mechanical Engineering',
    4:'Civil Engineering',
    5:'Medical Science',
    6:'Biochemistry'
}

#### Define Data Structures

In [26]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

class ELECTRAClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(ELECTRAClass, self).__init__()
                   
        self.l1 = ElectraModel.from_pretrained("./electra_tiny_model")
        # ELECTRA tiny has 196 hidden units
        self.classifier = torch.nn.Linear(196, NUM_OUT)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        output = self.classifier(pooler)

        return output

#### Define Loss and Train Function

In [27]:
def loss_fn(outputs, targets):
    # Change to cross entropy
    return torch.nn.CrossEntropyLoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            #outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs.cpu().detach())
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

#### Format DataLoader

In [40]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 5
NUM_OUT = 7
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

#### Train and Evaluate

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
model = ELECTRAClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss: {loss.mean().item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1).indices
    targets = targs
    print('Accuracy on test set: {}'.format(accuracy_score(guesses, targets)))

  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 0, Loss: 0.5766854286193848


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.749238578680203


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 1, Loss: 0.28415295481681824


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.7746192893401015


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 2, Loss: 0.7769172191619873


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.7888324873096447


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 3, Loss: 0.13983726501464844


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.7918781725888325


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 4, Loss: 0.4454813301563263


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.782741116751269


Questions

1. How well did your model perform on the evaluation compared to BERT?
Compared to BERT, which I was able to get 84% accuracy on the same fine-tuning test, it did worse, only getting a high of 79% on epoch 3.
2. What is the difference between pre training and fine-tuning?
Pre training requires more computational resources becuase it requires more data as the weights in the model are randomly initialized.  Fine-tuning is different because it requires less data and uses a pre trained model which has trained weights, this makes it more computationally effective.
3. Why do you think pre training requires more compute resources than fine-tuning?
- The amount of data required to pre train is far greater than for fine-tuning
- Weights are randomly initialized during pre training
- Diversity of data probably needs to be greater for pre training
- Pre training is an unsupervised learning task while fine tuning is supervised https://www.ibm.com/think/topics/supervised-vs-unsupervised-learning