<a href="https://colab.research.google.com/github/salmenhsairi/EndOfStudiesProjectNotebooks/blob/main/RayTune%26PytorchHyperPrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [None]:
!pip install transformers==3.0.2
!pip install ray==0.8.7
!pip install ray[tune]

Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.5


## Get the tagging Data
*Upload the zipped IOB/IOB2 full-documents file (the cell below expects it to be named `test_stripe.zip`), then run the cell below.*

In [None]:
%%bash
# Unpack the uploaded archive in /content and delete the zip once extracted.
cd /content && unzip -q test_stripe && rm test_stripe.zip
# -p keeps the cell re-runnable: plain mkdir errors out when the directory already exists.
mkdir -p /content/data
# Move the extracted TSV to the fixed path the notebook reads from.
mv *.tsv ./data/full_documents_IOB.tsv

## BERT fine Tuning

In [None]:
# importing required libraries
import os
import pandas as pd
import torch
import numpy as np
import ray

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from tensorboard.plugins.hparams import api as hp
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from tensorboard import notebook

In [None]:
def read_iob_format(filename, idx=-1):
    """Parse a tab-separated IOB/IOB2 file into a DataFrame of token and tag lists.

    The file is cut into sentence-like chunks: on the marker line
    ``-DOCSTART- -X- O O`` when it is present (IOB2), otherwise on blank
    lines (IOB). Within a chunk, every line whose last tab-separated field
    is non-empty contributes its first field as the token and its last field
    as the tag.

    Args:
        filename: path of the file to read.
        idx: unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame with one row per chunk and two columns, ``sentence``
        (list of tokens) and ``word_labels`` (list of tags).
    """
    # A context manager closes the handle as soon as the text is read; the
    # bare open(...).read() left that to the garbage collector.
    with open(filename) as handle:
        raw = handle.read().strip()

    # find sentence-like boundaries
    docstart = '-DOCSTART- -X- O O\n'
    if docstart in raw:  # IOB2
        chunks = raw.split(docstart)
    else:  # IOB
        chunks = raw.split('\n\n')

    tokens, labels = [], []
    for chunk in chunks:
        if chunk.strip() == "":
            continue
        # Split every line once and reuse the fields for both columns.
        rows = [row.split('\t') for row in chunk.split("\n")]
        # Lines with an empty last field (e.g. blank lines) carry no tag and are dropped.
        tagged = [fields for fields in rows if fields[-1] != '']
        tokens.append([fields[0] for fields in tagged])
        labels.append([fields[-1] for fields in tagged])

    return pd.DataFrame(data={'sentence': tokens, 'word_labels': labels})

In [None]:
# Concatenate a list of lists (e.g. the per-sentence tags) into one flat list.
def flatten(l):
    """Return the items of every sub-sequence of `l`, in order, as a single list."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged

In [None]:
# garbage collector 
import gc
gc.collect()

103

In [None]:
# Get the data in pandas dataframe format
DATADIR = "/content/data/"
def get_iob2_data(trainfile=DATADIR + "full_documents_IOB.tsv"):
    """Load the tagged corpus with read_iob_format, print its size and return it.

    Args:
        trainfile: path of the IOB/IOB2 file; defaults to the file under DATADIR.

    Returns:
        The DataFrame produced by read_iob_format (columns `sentence`, `word_labels`).
    """
    data = read_iob_format(trainfile)
    # One row per sentence-like chunk; the token count is the total number of tags.
    n_sentences = len(data)
    n_tokens = len(flatten(data.word_labels))
    print("data: %d sentences, %d tokens"%(n_sentences, n_tokens))
    return data

df = get_iob2_data()
df.head()

data: 75 sentences, 12968 tokens


Unnamed: 0,sentence,word_labels
0,"[Education, ,, Key, Experiences, ,, Knowledge,...","[O, O, O, O, O, O, O, O, O, B-EXPERIENCE, I-EX..."
1,"[8, +, years, of, experience, in, developing, ...","[B-EXPERIENCE, I-EXPERIENCE, I-EXPERIENCE, O, ..."
2,"[Basic, :, High, School, Diploma, or, equivale...","[O, O, O, O, O, O, O, B-EXPERIENCE, I-EXPERIEN..."
3,"[5, -, 10, Yrs, of, Experience, ., Proven, tec...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Must, have, basic, knowledge, in, one, of, th...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
# Sorted list of the distinct tags that occur anywhere in the corpus.
label_list = list(np.unique(flatten(df['word_labels'])))

# Forward (tag -> class id) and reverse (class id -> tag) lookup tables;
# a tag's id is its position in label_list.
labels_to_ids = {label: idx for idx, label in enumerate(label_list)}
ids_to_labels = dict(enumerate(label_list))
labels_to_ids

{'B-DIPLOMA': 0,
 'B-DIPLOMA_MAJOR': 1,
 'B-EXPERIENCE': 2,
 'B-SKILLS': 3,
 'I-DIPLOMA': 4,
 'I-DIPLOMA_MAJOR': 5,
 'I-EXPERIENCE': 6,
 'I-SKILLS': 7,
 'O': 8}

In [None]:
# Collapse each row's list of tags into one comma-separated string and each
# row's list of tokens into one space-separated string;
# tokenize_and_preserve_labels later splits them apart again.
# NOTE(review): this overwrites `df` in place, so the cell is not idempotent —
# running it a second time would join the characters of the already-joined strings.
df['word_labels'] = df['word_labels'].apply(lambda l : ','.join(l))
df['sentence'] = df['sentence'].apply(lambda l : ' '.join(l))
print(df.shape)
df.head(5)

(75, 2)


Unnamed: 0,sentence,word_labels
0,"Education , Key Experiences , Knowledge and Sk...","O,O,O,O,O,O,O,O,O,B-EXPERIENCE,I-EXPERIENCE,I-..."
1,8 + years of experience in developing .Net app...,"B-EXPERIENCE,I-EXPERIENCE,I-EXPERIENCE,O,O,O,O..."
2,Basic : High School Diploma or equivalent 1 ye...,"O,O,O,O,O,O,O,B-EXPERIENCE,I-EXPERIENCE,O,O,B-..."
3,5 - 10 Yrs of Experience . Proven technical kn...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Must have basic knowledge in one of the follow...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
#check the device 
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


**We first fix every hyperparameter to a default value, then choose the ones to tune and keep the others fixed.**

In [None]:
# Fixed hyperparameters shared by the dataset, training and evaluation code.
# Length (in tokens, special tokens included) every example is truncated or padded to.
MAX_LEN = 128
# NOTE(review): not referenced by the training code in this notebook, which reads config['train_batch_size'] instead.
TRAIN_BATCH_SIZE = 4
# Batch size of the validation DataLoader.
VALID_BATCH_SIZE = 2
# Number of times train_cifar calls train().
EPOCHS = 5
# Learning rate given to Adam (kept fixed, not tuned).
LEARNING_RATE = 1e-05
# max_norm passed to torch.nn.utils.clip_grad_norm_.
MAX_GRAD_NORM = 10

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word and repeat each word's label per sub-token.

    Args:
        sentence: whitespace-separated words.
        text_labels: comma-separated labels, one per word.
        tokenizer: object exposing ``tokenize(word) -> list`` of sub-tokens.

    Returns:
        Tuple ``(sub_tokens, sub_token_labels)`` of two equally long lists.
        Words and labels are paired with ``zip``, so any surplus on either
        side is ignored.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces, piece_tags = [], []
    for word, tag in zip(words, word_tags):
        # A word may break into several sub-tokens; each one inherits the word's tag.
        sub_tokens = tokenizer.tokenize(word)
        pieces += sub_tokens
        piece_tags += [tag] * len(sub_tokens)
    return pieces, piece_tags

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
class dataset(Dataset):
    """Torch Dataset that turns rows of a sentence/word_labels DataFrame into fixed-length BERT inputs.

    Each row holds a space-separated sentence and a comma-separated string of
    per-word labels. Items are dicts with the tensors ``ids``, ``mask`` and
    ``targets``, each of length ``max_len``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the frame, the tokenizer and the target sequence length.

        Args:
            dataframe: frame with `sentence` and `word_labels` columns, looked
                up by integer index label in __getitem__.
            tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_ids`.
            max_len: length every example is truncated or padded to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Build the model inputs for the row at `index`.

        Returns:
            dict with long tensors `ids` (token ids), `mask` (1 for real
            tokens, 0 for padding) and `targets` (label ids).
        """
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        # [CLS] and [SEP] both get the outside label. The previous
        # `labels.insert(-1, "O")` inserted *before* the last element, which
        # shifted the final sub-token's label onto [SEP].
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding
        maxlen = self.max_len
        if len(tokenized_sentence) > maxlen:
            # truncate (note: this also cuts off the trailing [SEP])
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad tokens with [PAD] and labels with "O"
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        # step 5: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.len

* The train/validation split ratio is kept fixed in this notebook; it will later be added to the tuned parameters.

In [None]:
# Split the documents: sample a `train_size` fraction for training (fixed seed,
# so the split is reproducible) and use every row that was not sampled as the test set.
train_size = 0.8
train_dataset = df.sample(frac=train_size,random_state=200)
# Both frames are re-indexed from 0 because dataset.__getitem__ looks rows up by integer index label.
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so they yield fixed-length (MAX_LEN) tensors.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (75, 2)
TRAIN Dataset: (60, 2)
TEST Dataset: (15, 2)


* Define the training function; `config` holds the hyperparameters to be tuned.

In [None]:
def train(epoch,config):
    """Load a pretrained BERT token classifier, train it for one pass over the training set, then validate it.

    Args:
        epoch: epoch index supplied by the caller; not used inside this function.
        config: dict with the tuned hyperparameters `classifier_dropout`,
            `weight_decay` and `train_batch_size`.

    Returns:
        None. Progress is printed and validation (which reports to Ray Tune)
        is delegated to `valid`.

    NOTE(review): the model is loaded from the pretrained checkpoint on every
    call, so calling this once per epoch restarts training from scratch each
    time rather than continuing it. Fixing that needs a coordinated change in
    the caller.
    NOTE(review): the notebook pins transformers==3.0.2; confirm that this
    version's BertForTokenClassification actually reads `classifier_dropout`,
    otherwise that tuned value has no effect.
    """
    bert_config = BertConfig.from_pretrained('bert-base-uncased',classifier_dropout=config['classifier_dropout'], num_labels=len(labels_to_ids))
    model = BertForTokenClassification.from_pretrained('bert-base-uncased',config=bert_config)
    model.to(device)

    training_loader = DataLoader(
        training_set,
        batch_size=config['train_batch_size'],
        shuffle=True,
        num_workers=0,
    )
    # Build the optimizer once. Re-creating it for every batch (as before)
    # threw away Adam's running moment estimates at each step.
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE, weight_decay=config['weight_decay'])

    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()
    tk0 = tqdm(training_loader, total=len(training_loader))
    for idx, batch in enumerate(tk0):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        # forward pass: with labels supplied, the model returns (loss, logits)
        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=targets)
        tr_loss += loss.item()
        nb_tr_steps += 1

        # every 100 iterations, print the running mean loss
        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # training accuracy, restricted to positions where the attention mask is 1
        flattened_targets = targets.view(-1)
        active_logits = tr_logits.detach().view(-1, bert_config.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)
        active_positions = mask.view(-1) == 1
        active_targets = torch.masked_select(flattened_targets, active_positions)
        active_predictions = torch.masked_select(flattened_predictions, active_positions)
        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        # Clip AFTER backward so the freshly computed gradients are the ones
        # being clipped; clipping before zero_grad()/backward() had no effect
        # on the update.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    # valid() reports the validation metric to Ray Tune; its return value is not needed here.
    valid(model, config)

* evaluation function 

In [None]:
def valid(model, config):
    """Evaluate `model` on the test set and report the accuracy to Ray Tune.

    Args:
        model: token-classification model returning (loss, logits) when
            called with labels.
        config: trial hyperparameters; not used inside this function.

    Returns:
        Tuple ``(labels, predictions)``: the gold and predicted tag strings
        for every position whose attention mask is 1, in dataset order.
    """
    # put model in evaluation mode
    model.eval()
    # No shuffling: the metric is a mean of per-batch accuracies, so random
    # batch composition made the value reported to Tune vary between calls
    # for the same model.
    testing_loader = DataLoader(
        testing_set,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        tk0 = tqdm(testing_loader, total=len(testing_loader))
        for idx, batch in enumerate(tk0):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=targets)
            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # evaluation accuracy, restricted to positions where the attention mask is 1
            flattened_targets = targets.view(-1)
            active_logits = eval_logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)
            active_positions = mask.view(-1) == 1
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

            # Keep plain Python ints rather than one device tensor per token.
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    # tune report is important to enable ray tune to compare trials
    tune.report(eval_accuracy=eval_accuracy) #eval_accuracy should be replaced by F1 as the most unbiased metric
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    return labels, predictions

* A simple training loop. It is only defined here; Ray Tune is responsible for calling it.

In [None]:
def train_cifar(config,checkpoint_dir=None):
    """Trainable handed to Ray Tune: call train() once for each of EPOCHS epochs.

    Args:
        config: hyperparameters sampled by Tune for this trial.
        checkpoint_dir: accepted as part of the signature but not used here.

    NOTE(review): train() loads the pretrained model on every call, so the
    epochs do not build on one another — confirm whether that is intended.
    """
    epoch = 0
    while epoch < EPOCHS:
        print(f"Training epoch: {epoch + 1}")
        train(epoch,config)
        epoch += 1

In [None]:
gc.collect()

202

## Basic Hyperparameter Tuning with ray tune

In [None]:
def main_tuning(num_samples=5, max_num_epochs=5, gpus_per_trial=1):
    """Run a Ray Tune search over the training hyperparameters and print the best config.

    Args:
        num_samples: number of trials sampled from the search space.
        max_num_epochs: `max_t` of the ASHA scheduler, i.e. the largest number
            of reported iterations a trial is scheduled for.
        gpus_per_trial: GPUs reserved for each trial.

    Returns:
        The analysis object returned by `tune.run`, so callers can inspect
        all trials (previously nothing was returned).
    """
    #Tune's search space
    config = {
        # "learning_rate": tune.loguniform(1e-4, 1e-1),
        "weight_decay": tune.loguniform(1e-4, 1e-1),
        "classifier_dropout" : tune.choice(np.arange(0.1,0.6,0.1)),
        "train_batch_size": tune.choice([2, 4])
        #"valid_batch_size": tune.choice([2, 4]),
        #"max_grad_norm": tune.choice([8, 10, 12])
    }
    # The scheduler must be told which reported metric to rank trials by.
    # Without metric/mode it does not look at `eval_accuracy` (the only value
    # valid() reports) and therefore never stops a trial early.
    scheduler = ASHAScheduler(
        metric="eval_accuracy",
        mode="max",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    result = tune.run(
        train_cifar,
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler
    )
    print("Best config: ", result.get_best_config(metric="eval_accuracy", mode="max"))
    return result


In [None]:
main_tuning()

In [None]:
gc.collect()

146

## Let's visualize the results of all trials

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir ~/ray_results 

* Download the log file (optional)

In [None]:
#move the log directory to the current working directory to zip and download it 
! cp -r ~/ray_results ray_results
! zip -r ray_results.zip ray_results
! rm -r ray_results

In [None]:
! unzip ray_results.zip && rm ray_results.zip

* tensorboard dev is a tensorboard plugin to host, track, and share your ML experiments.

In [None]:
notebook.list()

Known TensorBoard instances:
  - port 6006: logdir ray_results (started 0:00:27 ago; pid 139)


In [None]:
# this cell will generate the url to the hosted experiment dashboard
!tensorboard dev upload --logdir ./ray_results \
  --name "Simple experiment" \
  --description "fine tunning BERT with Hyperparameter optimization "

In [None]:
# this line outputs all the hosted experiments
!tensorboard dev list

2022-02-21 17:05:44.727447: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
https://tensorboard.dev/experiment/Xj88X09KTzqpMfkJnFKwOA/
	Name                 Simple experiment
	Description          fine tunning BERT with Hyperparameter optimization 
	Id                   Xj88X09KTzqpMfkJnFKwOA
	Created              2022-02-21 16:53:51 (12 minutes ago)
	Updated              2022-02-21 16:53:51 (12 minutes ago)
	Runs                 5
	Tags                 8
	Scalars              125
	Tensor bytes         1290
	Binary object bytes  0
Total: 1 experiment(s)


In [None]:
# You must replace YOUR_EXPERIMENT_ID with the value output from the previous
# tensorboard `list` command or `upload` command.  For example
# `tensorboard dev delete --experiment_id pQpJNh00RG2Lf1zOe9BrQA`

## !tensorboard dev delete --experiment_id YOUR_EXPERIMENT_ID_HERE