# Imports and preparing

In [None]:
!pip install evaluate

In [8]:
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AutoTokenizer
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
import numpy as np
import pandas as pd
from datasets import load_dataset
import os
import warnings
warnings.simplefilter("ignore")

In [13]:
def train_data_preprocess(examples):
    
    """
    generate start and end indexes of answer in context
    """
    
    def find_context_start_end_index(sequence_ids):
        """
        returns the token index in whih context starts and ends
        """
        token_idx = 0
        while sequence_ids[token_idx] != 1:  #means its special tokens or tokens of queston
            token_idx += 1                   # loop only break when context starts in tokens
        context_start_idx = token_idx
    
        while sequence_ids[token_idx] == 1:
            token_idx += 1
        context_end_idx = token_idx - 1
        return context_start_idx,context_end_idx  
    
    
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    answers = examples["answers"]
    
    inputs = tokenizer(
        questions,
        context,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,  #returns id of base context
        return_offsets_mapping=True,  # returns (start_index,end_index) of each token
        padding="max_length"
    )


    start_positions = []
    end_positions = []

    
    for i,mapping_idx_pairs in enumerate(inputs['offset_mapping']):
        context_idx = inputs['overflow_to_sample_mapping'][i]
    
        answer = answers[context_idx]
        answer_start_char_idx = answer['answer_start'][0]
        answer_end_char_idx = answer_start_char_idx + len(answer['text'][0])

        tokens = inputs['input_ids'][i]
        sequence_ids = inputs.sequence_ids(i)
   
        context_start_idx,context_end_idx = find_context_start_end_index(sequence_ids)

        context_start_char_index = mapping_idx_pairs[context_start_idx][0]
        context_end_char_index = mapping_idx_pairs[context_end_idx][1]
    
        if (context_start_char_index > answer_start_char_idx) or (
            context_end_char_index < answer_end_char_idx):
            start_positions.append(0)
            end_positions.append(0)
    
        else:

            idx = context_start_idx
            while idx <= context_end_idx and mapping_idx_pairs[idx][0] <= answer_start_char_idx:
                idx += 1
            start_positions.append(idx - 1)  
        
            idx = context_end_idx
            while idx >= context_start_idx and mapping_idx_pairs[idx][1] > answer_end_char_idx:
                idx -= 1
            end_positions.append(idx + 1)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [14]:
def preprocess_validation_examples(examples):
    """
    preprocessing validation data
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")

    base_ids = []

    for i in range(len(inputs["input_ids"])):
        
        # take the base id (ie in cases of overflow happens we get base id)
        base_context_idx = sample_map[i]
        base_ids.append(examples["id"][base_context_idx])
        
        # sequence id indicates the input. 0 for first input and 1 for second input
        # and None for special tokens by default
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        # for Question tokens provide offset_mapping as None
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["base_id"] = base_ids
    return inputs

In [15]:
def predict_answers_and_evaluate(start_logits,end_logits,eval_set,examples):
    """
    make predictions 
    Args:
    start_logits : strat_position prediction logits
    end_logits: end_position prediction logits
    eval_set: processed val data
    examples: unprocessed val data with context text
    """
    # appending all id's corresponding to the base context id
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(eval_set):
        example_to_features[feature["base_id"]].append(idx)

    n_best = 20
    max_answer_length = 30
    predicted_answers = []

    for example in examples:
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = eval_set["offset_mapping"][feature_index]
        
            # sorting the predictions of all hidden states and taking best n_best prediction
            # means taking the index of top 20 tokens
            start_indexes = np.argsort(start_logit).tolist()[::-1][:n_best]
            end_indexes = np.argsort(end_logit).tolist()[::-1][:n_best]
        
    
            for start_index in start_indexes:
                for end_index in end_indexes:
                
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                       ):
                        continue

                    answers.append({
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                        })

    
            # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
    
    metric = evaluate.load("squad")

    theoretical_answers = [
            {"id": ex["id"], "answers": ex["answers"]} for ex in examples
    ]
    
    metric_ = metric.compute(predictions=predicted_answers, references=theoretical_answers)
    return predicted_answers,metric_

In [16]:
class DataQA(Dataset):
    def __init__(self, dataset,mode="train"):
        self.mode = mode
        
        
        if self.mode == "train":
            # sampling
            self.dataset = dataset["train"]
            self.data = self.dataset.map(train_data_preprocess,
                                                      batched=True,
                            remove_columns= dataset["train"].column_names)
        
        else:
            self.dataset = dataset["validation"]
            self.data = self.dataset.map(preprocess_validation_examples,
            batched=True,remove_columns = dataset["validation"].column_names,
               )
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):

        out = {}
        example = self.data[idx]
        out['input_ids'] = torch.tensor(example['input_ids'])
        out['attention_mask'] = torch.tensor(example['attention_mask'])

        
        if self.mode == "train":

            out['start_positions'] = torch.unsqueeze(torch.tensor(example['start_positions']),dim=0)
            out['end_positions'] = torch.unsqueeze(torch.tensor(example['end_positions']),dim=0)
            
        return out

# DistilBERT

In [17]:
dataset = load_dataset("squad")

dataset['train'] = dataset['train'].select([i for i in range(5000)])
dataset['validation'] = dataset['validation'].select([i for i in range(500)])

In [18]:
trained_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)

train_dataset = DataQA(dataset,mode="train")
val_dataset = DataQA(dataset,mode="validation")

In [19]:
from transformers import default_data_collator
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=2,
)
eval_dataloader = DataLoader(
    val_dataset, collate_fn=default_data_collator, batch_size=2
)

In [20]:
from transformers import DistilBertForQuestionAnswering
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Available device: {device}')

checkpoint =  "distilbert-base-uncased"
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint)
model = model.to(device)

Available device: cuda


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import AdamW
from tqdm.notebook import tqdm
import datetime
import numpy as np
import collections
import evaluate

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print(total_steps)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

5010


In [23]:
validation_processed_dataset = dataset["validation"].map(preprocess_validation_examples,
            batched=True,remove_columns = dataset["validation"].column_names,)

In [24]:
import random,time
import numpy as np

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

stats = []

total_train_time_start = time.time()

for epoch in range(epochs):
    print(' ')
    print(f'=====Epoch {epoch + 1}=====')
    print('Training....')

    t0 = time.time()
     
    training_loss = 0
    model.train()
    for step,batch in enumerate(train_dataloader):
         
        if step%40 == 0 and not step == 0:
              elapsed_time = format_time(time.time() - t0)
              # Report progress.
              print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed_time))   
       
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
            
        model.zero_grad()

        result = model(input_ids = input_ids, 
                        attention_mask = attention_mask,
                        start_positions = start_positions,
                        end_positions = end_positions,
                        return_dict=True)
         
        loss = result.loss
    
        training_loss += loss.item()      

        loss.backward()

        optimizer.step()

    avg_train_loss = training_loss/len(train_dataloader) 
 
    training_time = format_time(time.time() - t0)
     
    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
     
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    start_logits,end_logits = [],[]
    for step,batch in enumerate(eval_dataloader):
       
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():  
             result = model(input_ids = input_ids, 
                        attention_mask = attention_mask,return_dict=True)

        start_logits.append(result.start_logits.cpu().numpy())
        end_logits.append(result.end_logits.cpu().numpy())
   

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)

    answers,metrics_ = predict_answers_and_evaluate(start_logits,end_logits,validation_processed_dataset,dataset["validation"])
    print(f'Exact match: {metrics_["exact_match"]}, F1 score: {metrics_["f1"]}')

    print('')
    validation_time = format_time(time.time() - t0)

    print("  Validation took: {:}".format(validation_time))

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_train_time_start)))

 
=====Epoch 1=====
Training....
  Batch    40  of  2,505.    Elapsed: 0:00:06.
  Batch    80  of  2,505.    Elapsed: 0:00:11.
  Batch   120  of  2,505.    Elapsed: 0:00:15.
  Batch   160  of  2,505.    Elapsed: 0:00:20.
  Batch   200  of  2,505.    Elapsed: 0:00:24.
  Batch   240  of  2,505.    Elapsed: 0:00:29.
  Batch   280  of  2,505.    Elapsed: 0:00:34.
  Batch   320  of  2,505.    Elapsed: 0:00:38.
  Batch   360  of  2,505.    Elapsed: 0:00:43.
  Batch   400  of  2,505.    Elapsed: 0:00:48.
  Batch   440  of  2,505.    Elapsed: 0:00:53.
  Batch   480  of  2,505.    Elapsed: 0:00:58.
  Batch   520  of  2,505.    Elapsed: 0:01:02.
  Batch   560  of  2,505.    Elapsed: 0:01:07.
  Batch   600  of  2,505.    Elapsed: 0:01:12.
  Batch   640  of  2,505.    Elapsed: 0:01:17.
  Batch   680  of  2,505.    Elapsed: 0:01:22.
  Batch   720  of  2,505.    Elapsed: 0:01:27.
  Batch   760  of  2,505.    Elapsed: 0:01:32.
  Batch   800  of  2,505.    Elapsed: 0:01:37.
  Batch   840  of  2,505.  

# DiffEvo

In [25]:
import random
import copy

import numpy as np
from tqdm import tqdm

all_mode = False
device = 'cpu'
number_of_samples = 1
number_of_iterations = 10
losses = {}


def get_states(model, list_mode=False):
    if not list_mode:
        states = {}
        for key in model.state_dict().keys():
            if all_mode:
                states[key] = model.state_dict()[key]
            else:
                if 'qa_outputs' not in key:
                    continue
                else:
                    states[key] = model.state_dict()[key]
        return states
    else:
        states = []
        for key in model.state_dict().keys():
            if all_mode:
                states.append((model.state_dict()[key].cpu().numpy()))
            else:
                if 'qa_outputs' not in key:
                    continue
                else:
                    states.append((model.state_dict()[key].cpu().numpy()))
        return states


def bert_fobj(model, states, _popsize):
    for i, key in enumerate(x for x in list(model.state_dict().keys()) if 'qa_outputs' in x):
        if not all_mode:
            if 'qa_outputs' in key:
                pass
                model.state_dict()[key] = states[i]
        else:
            model.state_dict()[key] = states[i]

    results = []
    data = list(train_dataloader)

    for i in range(number_of_samples):
        a = random.choice(data)

        input_ids = a['input_ids'].to(device)
        attention_mask = a['attention_mask'].to(device)
        start_positions = a['start_positions'].to(device)
        end_positions = a['end_positions'].to(device)

        _result = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions,
                        return_dict=True)

        results.append(_result.loss.cpu().detach().numpy())

    _avg_loss = sum(results) / len(results)
    losses[_popsize].append(_avg_loss)

    print(f'--- individual {_popsize} gets loss {_avg_loss}')

    return _avg_loss


def de(fobj, bounds, model,
       mut=0.8,
       crossp=0.7,
       popsize=5,
       its=number_of_iterations):
    model.eval().to(device)
    states = get_states(model, list_mode=True)
    dimensions = [layer.shape for layer in states]
    min_b, max_b = np.asarray(bounds).T
    diff = np.fabs(min_b - max_b)

    print('Population 0')

    pop = []
    for _popsize in range(popsize):
        losses[_popsize] = []
        _pop = []
        for state in states:
            _pop.append(min_b + np.random.rand(*state.shape) * diff)
        pop.append(_pop)

    fitness = np.asarray([fobj(model=model, states=ind, _popsize=i) for i, ind in enumerate(pop)])
    best_idx = np.argmin(fitness)
    best = pop[best_idx]

    for i_iter in range(its):
        print(f'Population {i_iter + 1}')
        for i_individual in range(popsize):
            idxs = [idx for idx in range(popsize) if idx != i_individual]
            indexes = np.random.choice(idxs, 3, replace=False)
            a, b, c = [pop[x] for x in indexes]
            mutant = [np.clip(_a + mut * (_b - _c), 0, 1) for _a, _b, _c in zip(a, b, c)]

            cross_points = [np.random.rand(*layer.shape) < crossp for layer in states]
            if not np.any(cross_points[0]):
                for i_layer, dimension in enumerate(dimensions):
                    cross_points[i_layer][np.random.randint(0, dimension)] = True

            trial_denorm = []
            trial = []
            for i_layer, dimension in enumerate(dimensions):
                
                trial.append(np.where(cross_points[i_layer], mutant[i_layer], pop[i_individual][i_layer]))
                trial_denorm.append(min_b + trial[i_layer] * diff)

            f = fobj(model=model, states=trial_denorm, _popsize=i_individual)
            if f < fitness[i_individual]:
                fitness[i_individual] = f
                pop[i_individual] = trial
                if f < fitness[best_idx]:
                    best_idx = i_individual
                    best = trial_denorm
    yield best, fitness[best_idx]

In [26]:
list(de(fobj=bert_fobj,
     bounds=[(-1, 1)],
     model=model))

Population 0
--- individual 0 gets loss 0.25523555278778076
--- individual 1 gets loss 0.40104830265045166
--- individual 2 gets loss 0.3161746859550476
--- individual 3 gets loss 0.20430442690849304
--- individual 4 gets loss 0.2114558219909668
Population 1
--- individual 0 gets loss 0.4372691810131073
--- individual 1 gets loss 0.07762440294027328
--- individual 2 gets loss 0.07414310425519943
--- individual 3 gets loss 0.06446172297000885
--- individual 4 gets loss 0.10795057564973831
Population 2
--- individual 0 gets loss 1.0606532096862793
--- individual 1 gets loss 0.501928448677063
--- individual 2 gets loss 0.8306143879890442
--- individual 3 gets loss 0.16158036887645721
--- individual 4 gets loss 0.18242719769477844
Population 3
--- individual 0 gets loss 0.6541902422904968
--- individual 1 gets loss 0.1711169183254242
--- individual 2 gets loss 0.6292484998703003
--- individual 3 gets loss 0.3854890465736389
--- individual 4 gets loss 0.13277095556259155
Population 4
--- in

[([array([[-2.00584072, -1.73222951,  0.23530529, ..., -2.04684132,
           -1.        , -1.        ],
          [-1.        , -1.        , -0.68756045, ..., -0.63040182,
           -0.49963379,  0.78474758]]),
   array([-1.        , -0.27196051])],
  0.01124594733119011)]