In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install transformers
! pip install datasets 
! pip install --upgrade tqdm

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.5 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 58.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [3]:
!unzip -qq /content/drive/MyDrive/Hateful_Memes/hateful_memes.zip

In [4]:
import torch
torch.__version__

'1.9.0+cu102'

In [5]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os

In [None]:
import os
import shutil

dirpath = '/content/test-results-concat-new'
if os.path.exists(dirpath) and os.path.isdir(dirpath):
    shutil.rmtree(dirpath)

In [6]:
df_train = pd.read_json('hateful_memes/train.jsonl', lines=True)
df_train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [7]:
df_train.label.value_counts()

0    5481
1    3019
Name: label, dtype: int64

In [8]:
val_seen = pd.read_json('hateful_memes/dev_seen.jsonl', lines=True)
val_unseen = pd.read_json('hateful_memes/dev_unseen.jsonl', lines=True)
df_val = pd.concat([val_seen, val_unseen],axis=0)
df_val.head()

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime


In [9]:
test_seen = pd.read_json('hateful_memes/test_seen.jsonl', lines=True)
test_unseen = pd.read_json('hateful_memes/test_unseen.jsonl', lines=True)
df_test = pd.concat([test_seen, test_unseen],axis=0)
df_train.shape, df_val.shape, df_test.shape

((8500, 4), (1040, 4), (3000, 4))

In [10]:
df_val.label.value_counts()

0    593
1    447
Name: label, dtype: int64

In [11]:
df_test.label.value_counts()

0    1760
1    1240
Name: label, dtype: int64

In [12]:
df_train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [13]:

df_train['text_len'] = df_train['text'].str.split().str.len()
df_train['text_len'].describe()

count    8500.000000
mean       11.742588
std         6.877021
min         1.000000
25%         7.000000
50%        10.000000
75%        15.000000
max        70.000000
Name: text_len, dtype: float64

## Tokenize data

In [14]:
from datasets import list_metrics, load_metric
metrics_list = list_metrics()
print(metrics_list)

['accuracy', 'bertscore', 'bleu', 'bleurt', 'cer', 'comet', 'coval', 'cuad', 'f1', 'gleu', 'glue', 'indic_glue', 'matthews_correlation', 'meteor', 'pearsonr', 'precision', 'recall', 'rouge', 'sacrebleu', 'sari', 'seqeval', 'spearmanr', 'squad', 'squad_v2', 'super_glue', 'wer', 'wiki_split', 'xnli']


In [15]:
acc_metric = load_metric('accuracy')
f1_metric = load_metric('f1')
precision_metric = load_metric('precision')
recall_metric = load_metric('recall')

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

## Create Dataset function

In [16]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
train_dataset, train_dataset[0]

(Dataset({
     features: ['id', 'img', 'label', 'text', 'text_len'],
     num_rows: 8500
 }),
 {'id': 42953,
  'img': 'img/42953.png',
  'label': 0,
  'text': 'its their character not their color that matters',
  'text_len': 8})

In [17]:
from datasets import Dataset
val_dataset = Dataset.from_pandas(df_val)
val_dataset, val_dataset[0]

(Dataset({
     features: ['id', 'img', 'label', 'text', '__index_level_0__'],
     num_rows: 1040
 }),
 {'__index_level_0__': 0,
  'id': 8291,
  'img': 'img/08291.png',
  'label': 1,
  'text': 'white people is this a shooting range'})

In [18]:
from transformers import AutoTokenizer
model_checkpoint = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [19]:
import torch
max_len = 50
pad_to_max = True
def tokenize_data(example):
    # Tokenize the review body
    text_ = example['text']
    encodings = tokenizer.encode_plus(text_, padding=pad_to_max, max_length=max_len,
                                            truncation=True,
                                           add_special_tokens=True,
                                            return_token_type_ids=False,
                                            return_attention_mask=True,
                                            return_overflowing_tokens=False,
                                            return_special_tokens_mask=False,
                                           )
    
    # Subtract 1 from labels to have them in range 0-4
    targets = torch.tensor(example['label'],dtype=torch.long)
    

    encodings.update({'labels': targets})
    return encodings

In [20]:
encoded_train_dataset = train_dataset.map(tokenize_data)
encoded_val_dataset = val_dataset.map(tokenize_data)

  0%|          | 0/8500 [00:00<?, ?ex/s]

  0%|          | 0/1040 [00:00<?, ?ex/s]

In [21]:
encoded_train_dataset.column_names

['id',
 'img',
 'label',
 'text',
 'text_len',
 'input_ids',
 'attention_mask',
 'labels']

## Fine Tune Model

In [28]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
batch_size = 128
num_labels = 2


model_checkpoint = 'roberta-base'
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin f

In [29]:
metric_name = "accuracy"
from sklearn.metrics import roc_auc_score

args = TrainingArguments(
    output_dir = "test-results-concat",
    seed = 125, 
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    eval_steps = 500,
    save_steps = 500,
    fp16 = False

)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    auc_score = roc_auc_score(labels, predictions)
    return {"accuracy": acc['accuracy'], "auroc": auc_score} 

In [31]:
trainer = Trainer(
    model,
    args,
    train_dataset= encoded_train_dataset, 
    eval_dataset=encoded_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [32]:
!nvidia-smi

Sat Jul 24 03:39:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    36W / 250W |   6551MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [33]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text_len, text, img, id.
***** Running training *****
  Num examples = 8500
  Num Epochs = 50
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 3350


Step,Training Loss,Validation Loss,Accuracy,Auroc
500,0.4173,1.300941,0.579808,0.531841
1000,0.2384,1.630121,0.577885,0.537039
1500,0.1863,1.955605,0.575962,0.540035
2000,0.1648,2.359348,0.575,0.532306
2500,0.1504,2.699128,0.577885,0.534561
3000,0.1419,2.887314,0.580769,0.535162


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text, img, id.
***** Running Evaluation *****
  Num examples = 1040
  Batch size = 128
Saving model checkpoint to test-results-concat/checkpoint-500
Configuration saved in test-results-concat/checkpoint-500/config.json
Model weights saved in test-results-concat/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-results-concat/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-results-concat/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text, img, id.
***** Running Evaluation *****
  Num examples = 1040
  Batch size = 128
Saving model checkpoint to test-results-concat/checkpoint-1000
Configuration saved in test-results-conca

TrainOutput(global_step=3350, training_loss=0.20832178144312616, metrics={'train_runtime': 1876.7145, 'train_samples_per_second': 226.46, 'train_steps_per_second': 1.785, 'total_flos': 1.03174889249184e+16, 'train_loss': 0.20832178144312616, 'epoch': 50.0})

In [None]:
import numpy as np
trainer.evaluate()

In [None]:
trainer.save_model('Roberta_classification_model')

In [None]:
!zip -r 'Roberta_classification_model.zip' 'Roberta_classification_model'

In [None]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        'roberta-base', return_dict=True)

In [None]:
trainer = Trainer(
    args=args,
    tokenizer=tokenizer,
    train_dataset= encoded_train_dataset, 
    eval_dataset=encoded_test_dataset,
    model_init=model_init,
    compute_metrics=compute_metrics,
)

In [None]:
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune import uniform
from random import randint
from ray import tune

scheduler = PopulationBasedTraining(
    mode = "max",
    metric='mean_accuracy',
    perturbation_interval=2,
    hyperparam_mutations={
        "weight_decay": tune.uniform(0.0, 0.3),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "per_device_train_batch_size": tune.choice([16, 32, 64]),
        "num_train_epochs": tune.choice([2,3,4]),
        "warmup_steps":tune.choice(range(0, 500))
    }
)

best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    n_trials=10,
    keep_checkpoints_num=1,
    scheduler=scheduler)

In [None]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

In [None]:
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    # Choose among many libraries:
    # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
    search_alg=HyperOptSearch(),
    # Choose among schedulers:
    # https://docs.ray.io/en/latest/tune/api_docs/schedulers.html
    scheduler=AsyncHyperBand())