In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install transformers
! pip install datasets 
! pip install --upgrade tqdm

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 52.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

!pip uninstall -y torch
!pip install torch==1.7.1

In [3]:
!pip install pytorch-lightning==1.3.8

Collecting pytorch-lightning==1.3.8
  Downloading pytorch_lightning-1.3.8-py3-none-any.whl (813 kB)
[K     |████████████████████████████████| 813 kB 4.2 MB/s 
Collecting tensorboard!=2.5.0,>=2.2.0
  Downloading tensorboard-2.4.1-py3-none-any.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 8.6 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 51.8 MB/s 
Collecting pyDeprecate==0.3.0
  Downloading pyDeprecate-0.3.0-py3-none-any.whl (10 kB)
Collecting torchmetrics>=0.2.0
  Downloading torchmetrics-0.4.1-py3-none-any.whl (234 kB)
[K     |████████████████████████████████| 234 kB 54.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.7.4.post0-cp37-cp37m-manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 54.8 MB/s 
Collecting async-timeout<4.0,>=3.0
  Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.6.3-cp37-cp3

In [None]:
!unzip -qq /content/drive/MyDrive/Hateful_Memes/hateful_memes.zip

In [None]:
import torch
torch.__version__

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os

In [None]:
import os
import shutil

dirpath = '/content/model-checkpoints'
if os.path.exists(dirpath) and os.path.isdir(dirpath):
    shutil.rmtree(dirpath)

In [None]:
df_train = pd.read_json('hateful_memes/train.jsonl', lines=True)
df_train.head()

In [None]:
df_train.label.value_counts()

In [None]:
val_seen = pd.read_json('hateful_memes/dev_seen.jsonl', lines=True)
val_unseen = pd.read_json('hateful_memes/dev_unseen.jsonl', lines=True)
df_val = pd.concat([val_seen, val_unseen],axis=0)
df_val.head()

In [None]:
test_seen = pd.read_json('hateful_memes/test_seen.jsonl', lines=True)
test_unseen = pd.read_json('hateful_memes/test_unseen.jsonl', lines=True)
df_test = pd.concat([test_seen, test_unseen],axis=0)
df_train.shape, df_val.shape, df_test.shape

In [None]:
df_val.label.value_counts()

In [None]:
df_test.label.value_counts()

In [None]:
df_train.head()

In [None]:

df_train['text_len'] = df_train['text'].str.split().str.len()
df_train['text_len'].describe()

In [None]:
df_train['idx'] = df_train['id'].astype(str).str.zfill(5)
df_train.head()

In [None]:
df_val['idx'] = df_val['id'].astype(str).str.zfill(5)
df_test['idx'] = df_test['id'].astype(str).str.zfill(5)

In [None]:
df_train.shape, df_val.shape

## Remove records for which features couldn't be pulled correctly

In [None]:
import pickle

# Choose which pre-extracted feature file to read: the 100-record sample when
# `subset` is on, otherwise the full feature set.
subset = False
features_path = (
    '/content/drive/MyDrive/Hateful_Memes/features_100.pickle'
    if subset
    else '/content/drive/MyDrive/Hateful_Memes/features.pickle'
)

# NOTE: unpickling runs code embedded in the file, so only load files you produced.
with open(features_path, 'rb') as handle:
    features_dict = pickle.load(handle)

In [None]:
features_idx = list(features_dict.keys())
train_idx = df_train['idx'].tolist()
val_idx = df_val['idx'].tolist()
print(len(features_idx), len(train_idx), len(val_idx))

In [None]:
# Collect the ids that have no entry in the pre-extracted feature dictionary.
# `features_idx` is a list, so testing membership against it for every id made this
# cell quadratic; a set keeps each lookup constant-time with the same result.
available_idx = set(features_idx)
missing_train = [each for each in train_idx if each not in available_idx]
missing_val = [each for each in val_idx if each not in available_idx]
print(len(missing_train), len(missing_val))

In [None]:
df_train = df_train[~df_train['idx'].isin(missing_train)]

In [None]:
df_val = df_val[~df_val['idx'].isin(missing_val)]

In [None]:
df_train.shape, df_val.shape

## Compute Class Weight

In [None]:
from sklearn.utils import class_weight

y_train = df_train["label"].values.tolist()
# `classes` and `y` must be passed by keyword on current scikit-learn releases
# (positional use raises a TypeError there); keywords also work on older versions.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
print(class_weights)

In [None]:
df_train.label.value_counts()

## Load as a dataset

In [None]:
subset = False
if subset:
    all_records = sorted(os.listdir('hateful_memes/img'))
    selection = all_records[:100]
    select_idx = [int(select.split('.')[0]) for select in selection]
    df_train = df_train[df_train['id'].isin(select_idx)]
    df_val = df_val[df_val['id'].isin(select_idx)]

df_train.shape, df_val.shape

In [None]:
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
print(metrics_list)

In [None]:
acc_metric = load_metric('accuracy')
f1_metric = load_metric('f1')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')

## Create Dataset function

In [None]:
from transformers import BertTokenizer, VisualBertForPreTraining, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

## Load Visual Embedding features

In [None]:
import torch

In [None]:
class HatefulMemesData(Dataset):
    """Map-style dataset that pairs meme text with pre-extracted visual features.

    Every item is a dict of tensors whose keys match the keyword arguments the
    notebook passes to VisualBERT (``input_ids``, ``attention_mask``,
    ``token_type_ids``, ``visual_embeds``, ``visual_token_type_ids``,
    ``visual_attention_mask``) plus a ``label`` entry.

    Visual features are looked up by the ``idx`` column in the notebook-level
    ``features_dict``, so that dictionary must exist before items are fetched.

    Args:
        df: DataFrame with ``text``, ``label`` and ``idx`` columns.
        tokenizer: callable tokenizer; called with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'``.
        sequence_length: length every text is padded / truncated to.
        print_text: when True, ``__getitem__`` prints the shape and dtype of every
            tensor it returns (debugging aid).
    """

    def __init__(self, df, tokenizer, sequence_length,
                 print_text=False):
        self.sequence_length = sequence_length
        self.tokenizer = tokenizer
        self.print_text = print_text

        # One plain dict per row keeps __getitem__ independent of the DataFrame index.
        self.dataset = [
            {"text": text, "label": label, 'idx': idx}
            for text, label, idx in zip(
                df["text"].values.tolist(),
                df["label"].values.tolist(),
                df["idx"].values.tolist(),
            )
        ]

    def __len__(self):
        """Number of records in the dataset."""
        return len(self.dataset)

    def tokenize_data(self, example):
        """Turn one record (``text`` / ``label`` / ``idx``) into model-ready tensors.

        Returns:
            dict of tensors with the leading batch dimension of size one removed.
        """
        idx = example['idx']
        # A single id arrives as a string; wrap it so the lookup below is uniform.
        idx = [idx] if isinstance(idx, str) else idx

        # Use the tokenizer given to this dataset. The previous code reached for the
        # notebook-global `tokenizer`, silently ignoring the constructor argument.
        encoded_dict = self.tokenizer(example['text'], padding='max_length',
                                      max_length=self.sequence_length, truncation=True,
                                      return_tensors='pt')
        tokens = encoded_dict['input_ids']
        token_type_ids = encoded_dict['token_type_ids']
        attn_mask = encoded_dict['attention_mask']

        targets = torch.tensor(example['label']).type(torch.int64)

        embed_list = np.array([features_dict[idval] for idval in idx])
        # float64 features pair with a model converted via `.double()`.
        # NOTE(review): a model left in float32 would presumably need float32 inputs
        # here -- confirm the dtype matches the model you train.
        visual_embeds = torch.from_numpy(embed_list).double()

        # Every visual position is attended to and tagged with token type 1.
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.int64)

        # squeeze(0) drops only the leading batch axis; a bare squeeze() would also
        # collapse a genuine size-1 axis (one visual region, sequence_length == 1).
        inputs = {
            "input_ids": tokens.squeeze(0),
            "attention_mask": attn_mask.squeeze(0),
            "token_type_ids": token_type_ids.squeeze(0),
            "visual_embeds": visual_embeds.squeeze(0),
            "visual_token_type_ids": visual_token_type_ids.squeeze(0),
            "visual_attention_mask": visual_attention_mask.squeeze(0),
            "label": targets.squeeze(),
        }

        return inputs

    def __getitem__(self, index):
        """Return the tensor dict for record ``index``."""
        inputs = self.tokenize_data(self.dataset[index])

        if self.print_text:
            for key, value in inputs.items():
                print(key, value.shape, value.dtype)

        return inputs

In [None]:
dataset = HatefulMemesData(df_val, tokenizer, 50, True)

In [None]:
example1 = dataset[5]

## Fine-Tune Model

In [None]:
from transformers import BertTokenizer, VisualBertModel, TrainingArguments, Trainer

model = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')


In [None]:
# example1 = tokenize_data(df_train.to_dict('records')[0])
print(example1)

In [None]:
example1['input_ids'].unsqueeze(0).shape

In [None]:
model = model.double()

In [None]:
outputs = model(input_ids=example1['input_ids'].unsqueeze(0),
                attention_mask=example1['attention_mask'].unsqueeze(0),
                visual_token_type_ids=example1['visual_token_type_ids'].unsqueeze(0),
                token_type_ids=example1['token_type_ids'].unsqueeze(0),
                visual_embeds=example1['visual_embeds'].unsqueeze(0),
                visual_attention_mask=example1['visual_attention_mask'].unsqueeze(0),
                )

In [None]:
pooled_outputs = outputs[1]
print(pooled_outputs.shape)

## Tuning using PyTorch Lightning

In [None]:
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.loggers import WandbLogger
from datasets import load_metric
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from transformers import (
    AdamW,
    VisualBertModel,
    get_linear_schedule_with_warmup
)
import logging
import argparse
import time
from torch.nn import CrossEntropyLoss
from sklearn.metrics import roc_auc_score

In [None]:
# from pytorch_lightning.loggers.wandb import WandbLogger
import os
from pathlib import Path
from string import punctuation
import torch.nn as nn

## Look at Model Summary

In [None]:
weights = [0.77510622, 1.40873991]
wt_tensor = torch.FloatTensor(weights).cuda()
print(wt_tensor)

In [None]:
class VisualBERTClassifier(torch.nn.Module):
    """VisualBERT encoder with a dropout + linear head for two-class classification.

    The pooled encoder output (``outputs[1]``) goes through dropout and a 768 -> 2
    linear layer. When labels are supplied, a class-weighted cross-entropy loss is
    computed as well.
    """

    def __init__(self):
        """Load the pre-trained encoder and build the classification head and loss."""
        super(VisualBERTClassifier, self).__init__()
        self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre')
        self.num_labels = 2
        self.dropout = nn.Dropout(0.1)
        self.cls = nn.Linear(768, self.num_labels)
        # Kept for reference only: nothing in this class reads `self.weight`.
        # (A stray trailing comma used to turn this attribute into a 1-tuple.)
        self.weight = torch.FloatTensor([0.77510622, 1.40873991])

        # Hard-coded per-class sample counts -- presumably the label counts of the
        # training split; TODO confirm. Each class is weighted by 1 - its share.
        nSamples = [5178, 2849]
        normedWeights = [1 - (x / sum(nSamples)) for x in nSamples]
        self.loss_fct = CrossEntropyLoss(weight=torch.FloatTensor(normedWeights))

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask,
                visual_token_type_ids, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids, attention_mask, token_type_ids: text inputs forwarded to the encoder.
            visual_embeds, visual_attention_mask, visual_token_type_ids: visual inputs
                forwarded to the encoder.
            labels: optional class ids. When omitted (inference) no loss is computed.

        Returns:
            ``(loss, logits)`` where ``logits`` has shape ``(-1, num_labels)`` and
            ``loss`` is ``None`` if ``labels`` was not given.
        """
        outputs = self.visualbert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                visual_embeds=visual_embeds,
                visual_attention_mask=visual_attention_mask,
                visual_token_type_ids=visual_token_type_ids,
            )

        pooled_output = self.dropout(outputs[1])
        logits = self.cls(pooled_output)
        reshaped_logits = logits.view(-1, self.num_labels)

        loss = None
        if labels is not None:
            loss = self.loss_fct(reshaped_logits, labels.view(-1))

        return loss, reshaped_logits

In [None]:
model = VisualBERTClassifier().to('cuda')

In [None]:
!nvidia-smi

## Using HuggingFace Trainer

In [None]:
from transformers import TrainingArguments, Trainer
batch_size = 48
seq_len = 50

In [None]:
model = VisualBERTClassifier()
model = model.cuda()

In [None]:
metric_name = "accuracy"

args = TrainingArguments(
    output_dir = "model-checkpoint",
    seed = 110, 
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=40,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    eval_steps = 500,
    save_steps = 500,
    fp16 = False,
    gradient_accumulation_steps = 2


)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and AUROC for a two-class evaluation pass.

    Args:
        eval_pred: ``(logits, labels)`` pair; ``logits`` has one column per class.

    Returns:
        dict with ``accuracy`` (from the argmax predictions) and ``auroc``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)

    # AUROC is a ranking metric, so it needs a continuous score. Feeding it the hard
    # argmax labels (as before) collapses the ROC curve to a single operating point.
    # The logit margin orders examples exactly like the softmax probability of class 1.
    positive_scores = logits[..., 1] - logits[..., 0]
    auc_score = roc_auc_score(labels, positive_scores)
    return {"accuracy": acc['accuracy'], "auroc": auc_score}

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset = HatefulMemesData(df_train,tokenizer=tokenizer, sequence_length=seq_len),
    eval_dataset =  HatefulMemesData(df_val,tokenizer=tokenizer, sequence_length=seq_len),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
## To resume from an old checkpoint, point `resume_from` at that checkpoint directory.
resume_from ='/content/model-checkpoint/checkpoint-12000'
# `resume_from` used to be defined but never handed to the trainer, so every run
# started from scratch. Resume only when the checkpoint directory actually exists;
# otherwise fall back to a fresh run.
trainer.train(resume_from_checkpoint=resume_from if os.path.isdir(resume_from) else None)

In [None]:
import numpy as np
trainer.evaluate()

In [None]:
trainer.save_model('VisualBERT_classification_model')

In [None]:
!zip -r 'VisualBERT_classification_model.zip' 'VisualBERT_classification_model'

In [None]:
!mv VisualBERT_classification_model.zip.zip /content/drive/MyDrive/Hateful_Memes

## PyTorch Lightning version of the code (may have bugs)

In [None]:
class VisualBERTFineTuner(pl.LightningModule):
    """LightningModule that fine-tunes ``VisualBERTClassifier`` on the meme frames.

    The dataloaders read the notebook-level ``df_train`` / ``df_val`` / ``df_test``
    frames and accuracy is computed with the notebook-level ``acc_metric``.

    Args:
        hparams: namespace of hyper-parameters. Fields read by this class:
            ``output_dir``, ``learning_rate``, ``weight_decay``, ``adam_epsilon``,
            ``warmup_steps``, ``max_input_length``, ``train_batch_size`` and
            ``eval_batch_size``.
    """

    def __init__(self, hparams):
        super(VisualBERTFineTuner, self).__init__()
        # float64 weights pair with the float64 visual features the dataset emits.
        self.model = VisualBERTClassifier().double()
        self.num_labels = 2
        # NOTE(review): `dropout` and `classifier` are not used by forward() -- the
        # head lives inside `self.model`. They are kept so state_dict keys of
        # existing checkpoints stay unchanged.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, self.num_labels)

        self.problem_type = 'single_label_classification'
        self.save_hyperparameters(hparams)
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.output_dir = Path(self.hparams.output_dir)
        self.total_steps = 0

    def is_logger(self):
        """True on the process whose global rank is 0 (or below)."""
        return self.trainer.global_rank <= 0

    def forward(self, input_ids, attention_mask, token_type_ids, visual_embeds, visual_attention_mask,
                visual_token_type_ids, labels):
        """Delegate to the wrapped classifier and return ``(loss, logits)``."""
        loss, preds = self.model(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                visual_embeds=visual_embeds,
                visual_attention_mask=visual_attention_mask,
                visual_token_type_ids=visual_token_type_ids,
                labels = labels
            )
        return loss, preds

    def _step(self, batch):
        """Unpack a batch dict produced by ``HatefulMemesData`` and run forward."""
        outputs = self(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=batch["token_type_ids"],
            visual_embeds=batch["visual_embeds"],
            visual_attention_mask=batch["visual_attention_mask"],
            visual_token_type_ids=batch["visual_token_type_ids"],
            labels = batch['label']
        )

        return outputs

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        loss, preds = self._step(batch)
        return loss

    def validation_step(self, batch, batch_idx):
        """Return loss, hard predictions, ranking scores and labels for one batch."""
        val_loss, logits = self._step(batch)
        preds = torch.argmax(logits, axis=1)
        # The logit margin ranks examples like the softmax probability of class 1
        # and is what AUROC should be computed from.
        scores = logits[:, 1] - logits[:, 0]
        labels = batch["label"]
        return {'loss': val_loss, "preds": preds, "scores": scores, "labels": labels}

    def validation_epoch_end(self, outputs):
        """Aggregate the per-batch results and log loss, AUROC and accuracy."""
        preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()
        scores = torch.cat([x['scores'] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('val_loss', loss, prog_bar=True)
        # roc_auc_score raises when only one class is present (possible on a short
        # sanity-check pass), so skip AUROC in that case instead of crashing the run.
        if len(np.unique(labels)) > 1:
            auc_score = roc_auc_score(labels, scores, average='weighted')
            self.log('val_auroc', auc_score, prog_bar=True)
        self.log_dict(acc_metric.compute(predictions=preds, references=labels), prog_bar=True)

    def _estimate_total_steps(self):
        """Number of optimizer steps over the whole run (batches dropped as in training)."""
        gpus = self.trainer.gpus
        num_devices = gpus if isinstance(gpus, int) and gpus > 0 else 1
        effective_batch = self.hparams.train_batch_size * num_devices
        # train_dataloader uses drop_last=True, hence the floor division.
        batches_per_epoch = len(df_train) // effective_batch
        steps_per_epoch = max(1, batches_per_epoch // self.trainer.accumulate_grad_batches)
        return steps_per_epoch * self.trainer.max_epochs

    def configure_optimizers(self):
        """Build AdamW (no weight decay on bias / LayerNorm) and a linear LR schedule."""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        # The step count must be known here. It used to be filled in by
        # train_dataloader(), which Lightning calls only after the optimizers are
        # configured, so the schedule was built with 0 steps and pinned the learning
        # rate to 0. The old formula also divided by the epoch count.
        self.total_steps = self._estimate_total_steps()
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps
        )
        scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
        return [optimizer], [scheduler]

    def get_tqdm_dict(self):
        # NOTE(review): reads `self.trainer.avg_loss` and `self.lr_scheduler`, neither
        # of which is set anywhere in this class -- confirm this hook is still called
        # before relying on it.
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Shuffled training loader; incomplete final batches are dropped."""
        train_loader = DataLoader(HatefulMemesData(df_train, self.tokenizer, self.hparams.max_input_length),
                                batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=2)
        return train_loader

    def val_dataloader(self):
        """Validation loader over ``df_val``."""
        return DataLoader(HatefulMemesData(df_val, self.tokenizer, self.hparams.max_input_length),
                          batch_size=self.hparams.eval_batch_size, num_workers=2)

    def test_dataloader(self):
        """Test loader over ``df_test``."""
        return  DataLoader(HatefulMemesData(df_test, self.tokenizer, self.hparams.max_input_length),
                          batch_size=self.hparams.eval_batch_size, num_workers=2)

    def on_save_checkpoint(self, checkpoint):
        """Export the encoder and tokenizer next to every Lightning checkpoint.

        ``self.model`` is a plain ``nn.Module`` and has no ``config`` or
        ``save_pretrained``; those belong to the wrapped ``visualbert`` encoder. The
        classification head is not exported here -- it is stored in the Lightning
        checkpoint itself.
        """
        save_path = self.output_dir.joinpath(model_prefix)
        backbone = self.model.visualbert
        backbone.config.save_step = self.global_step
        backbone.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

In [None]:
# from pytorch_lightning import loggers as pl_loggers
# tb_logger = pl_loggers.TensorBoardLogger('logs/')

logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Callback that writes the trainer's callback metrics to the module logger.

    After validation the metrics are logged; after testing they are logged and also
    written to ``test_results.txt`` inside ``pl_module.hparams.output_dir``. The
    metric lines are only produced when ``pl_module.is_logger()`` is true.
    """

    # Bookkeeping entries that are left out of the report.
    _SKIPPED_KEYS = ("log", "progress_bar")

    def _metric_lines(self, metrics):
        """Yield one ``"name = value\\n"`` line per metric, in sorted key order."""
        for name in sorted(metrics):
            if name in self._SKIPPED_KEYS:
                continue
            yield "{} = {}\n".format(name, str(metrics[name]))

    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return
        for line in self._metric_lines(trainer.callback_metrics):
            logger.info(line)

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        # Each line goes both to the logger and to the results file.
        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for line in self._metric_lines(metrics):
                logger.info(line)
                writer.write(line)

In [None]:
model_name = "visualbert"
token_len = 50
model_prefix = f"{model_name}-{token_len}"

In [None]:
# Baseline hyper-parameters for the Lightning run.
args_dict = {
    "output_dir": "",  # path to save the checkpoints
    "max_input_length": token_len,
    "learning_rate": 1e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "num_train_epochs": 5,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "resume_from_checkpoint": None,
    "val_check_interval": 0.5,
    "early_stop_callback": False,
    # if you want to enable 16-bit training then install apex and set this to true
    "fp_16": False,
    # optimisation levels are described at
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "max_grad_norm": 1.0,
    "seed": 42,
}

# Overrides applied on top of the baseline for this run.
args_dict.update(
    output_dir="./" + model_prefix + "_final",
    num_train_epochs=6,
    train_batch_size=32,
    eval_batch_size=32,
)
args = argparse.Namespace(**args_dict)


## Define Checkpoint function
# Keeps only the single checkpoint with the highest logged "accuracy".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="./" + model_prefix + "_checkpoint",
    filename=model_prefix,
    monitor="accuracy",
    mode="max",
    save_top_k=1,
)

In [None]:
print(args)

In [None]:
# Keyword arguments for pl.Trainer, derived from the `args` namespace.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=min(1, torch.cuda.device_count()),
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    resume_from_checkpoint=args.resume_from_checkpoint,
    # gradient_clip_val=args.max_grad_norm,
    val_check_interval=args.val_check_interval,
    # The ModelCheckpoint instance belongs in `callbacks`. The Trainer's
    # `checkpoint_callback` argument is a boolean switch, and pytorch-lightning 1.3
    # rejects a callback instance passed there.
    callbacks=[LoggingCallback(), checkpoint_callback],
    # logger=tb_logger
)



model = VisualBERTFineTuner(args)
trainer = pl.Trainer(**train_params)

trainer.fit(model)

## Hyperparameter Optimization

In [None]:
! pip install optuna -q
! pip install 'ray[tune]'

In [None]:
import ray
ray.__version__

In [None]:
! pip install datasets

In [None]:
import os
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining
from transformers import  AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, \
     TrainingArguments

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for the hyper-parameter search runs.

    This redefines the ``compute_metrics`` declared earlier in the notebook; the
    later definition is the one in effect from this cell onwards.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds one score
            column per class.

    Returns:
        dict with ``accuracy`` and ``f1`` (weighted average).
    """
    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    # F1 used to be computed and then thrown away; report it next to accuracy.
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

In [None]:
def tune_transformer(train_dataset,
                     test_dataset,
                     num_samples=8,
                     gpus_per_trial=0,
                     num_labels=5,
                     ray_address=None,
                     smoke_test=False):
    """Run a Population Based Training search over Trainer hyper-parameters with Ray Tune.

    Args:
        train_dataset: dataset handed to the Trainer for training.
        test_dataset: dataset handed to the Trainer for evaluation.
        num_samples: number of trials.
        gpus_per_trial: GPUs reserved per trial; ``0`` forces CPU (``no_cuda``).
        num_labels: size of the classification head.
            NOTE(review): the default of 5 does not look like it matches the
            two-class labels used elsewhere in this notebook -- pass it explicitly.
        ray_address: currently unused (the ``ray.init`` call is commented out).
        smoke_test: when True every trial stops after one optimizer step. This was
            previously hard-wired on, which made the ``num_train_epochs`` choice
            meaningless.

    Returns:
        Whatever ``Trainer.hyperparameter_search`` returns (previously discarded).
    """
    #ray.shutdown()
    #ray.init(ray_address, log_to_driver=False)
    data_dir_name = "./data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    # exist_ok makes the call safe to repeat and safe when trials race on it.
    os.makedirs(data_dir, mode=0o755, exist_ok=True)

    # Change these as needed.
    model_name = "roberta-base"

    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels )

    # Download once up front so the trials read the tokenizer and weights from cache.
    print("Downloading and caching Tokenizer")
    AutoTokenizer.from_pretrained(model_name)

    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )

    def get_model():
        """Fresh model for each trial."""
        return AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config,
        )

    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
    )

    training_args._n_gpu = gpus_per_trial

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics)

    tune_config = {
        "per_device_train_batch_size": 32,
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,
    }

    # compute_metrics reports "accuracy", which the Trainer exposes as
    # "eval_accuracy"; the former "eval_acc" key is never reported.
    eval_metric = "eval_accuracy"

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric=eval_metric,
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [16, 32, 64],
        })

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs"
        },
        metric_columns=[
            eval_metric, "eval_loss", "epoch", "training_iteration"
        ])

    return trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop=None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=False)

In [None]:
tune_transformer(encoded_train_dataset, encoded_test_dataset)

In [None]:
def model_init():
    """Build a fresh ``roberta-base`` sequence classifier.

    Passed to the Trainer as ``model_init`` so a new model can be created on demand.
    """
    checkpoint_name = 'roberta-base'
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_name, return_dict=True)
    return classifier

In [None]:
trainer = Trainer(
    args=args,
    tokenizer=tokenizer,
    train_dataset= encoded_train_dataset, 
    eval_dataset=encoded_test_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics,
)

In [None]:
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune import uniform
from random import randint
from ray import tune

# The Trainer's Ray backend reports each trial's score under the key "objective";
# "mean_accuracy" is never reported, so the scheduler could not find its metric.
scheduler = PopulationBasedTraining(
    mode = "max",
    metric='objective',
    perturbation_interval=2,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": tune.choice([16, 32, 64]),
        "num_train_epochs": tune.choice([2,3,4]),
        "warmup_steps":tune.choice(range(0, 500))
    }
)

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    n_trials=10,
    keep_checkpoints_num=1,
    scheduler=scheduler)

In [None]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [None]:
# Neither search class was imported anywhere in the notebook, so this cell raised
# NameError. `AsyncHyperBandScheduler` is the class name exported by
# ray.tune.schedulers.
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.suggest.hyperopt import HyperOptSearch

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    # Choose among many libraries:
    # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
    # The Trainer reports each trial's score as "objective"; both the searcher and
    # the scheduler need to be told which metric to optimise and in which direction.
    search_alg=HyperOptSearch(metric="objective", mode="max"),
    # Choose among schedulers:
    # https://docs.ray.io/en/latest/tune/api_docs/schedulers.html
    scheduler=AsyncHyperBandScheduler(metric="objective", mode="max"))