# Model -> https://huggingface.co/poom-sci/bert-base-uncased-multi-emotion

# Install Dependencies

In [None]:
!pip install -q transformers transformers[sentencepiece] datasets

[K     |████████████████████████████████| 3.1 MB 11.4 MB/s 
[K     |████████████████████████████████| 290 kB 34.3 MB/s 
[K     |████████████████████████████████| 895 kB 42.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 43.7 MB/s 
[K     |████████████████████████████████| 596 kB 47.9 MB/s 
[K     |████████████████████████████████| 59 kB 7.2 MB/s 
[K     |████████████████████████████████| 243 kB 47.4 MB/s 
[K     |████████████████████████████████| 132 kB 45.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 41.0 MB/s 
[K     |████████████████████████████████| 192 kB 50.1 MB/s 
[K     |████████████████████████████████| 160 kB 49.4 MB/s 
[K     |████████████████████████████████| 271 kB 52.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 50.6 MB/s 
[?25h

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive. The dataset/model paths declared later
# ('gdrive/MyDrive/...') are relative, so they presumably rely on the working
# directory being /content — TODO confirm.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Show which GPU (if any) is attached to this runtime.
!nvidia-smi

Tue Nov  9 07:13:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# To control logging level for various modules used in the application:
import logging
import re
def set_global_logging_level(level=logging.ERROR, prefices=("",)):
    """
    Override logging levels of different modules based on their name as a prefix.
    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.

    Only loggers already registered in ``logging.root.manager.loggerDict`` at call
    time are touched; loggers created afterwards keep their own level.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: iterable of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional.
          Default is `("",)` to match all active loggers. An empty iterable also matches all loggers.
          The match is a literal, case-sensitive `module_name.startswith(prefix)`;
          characters such as "." are NOT treated as regex metacharacters.
    """
    # An empty collection used to produce an empty regex that matched every
    # logger name; keep that behaviour for backward compatibility.
    prefixes = tuple(prefices) or ("",)
    # Iterate over a snapshot of the names (defensive: getLogger() may write to loggerDict).
    for name in list(logging.root.manager.loggerDict):
        if name.startswith(prefixes):
            logging.getLogger(name).setLevel(level)

# NOTE(review): this call runs before the "Import Dependencies" cell. Per the
# function's docstring only loggers that already exist are affected, so it may need
# to be re-run after the libraries below have been imported — TODO confirm.
set_global_logging_level(logging.ERROR, ["transformers", "nlp", "torch", "tensorflow", "tensorboard", "wandb"])

# Import Dependencies

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from datasets import (concatenate_datasets,
                    Dataset,
                    load_dataset,
                    load_metric)

from transformers import (AutoTokenizer, 
                        DataCollatorWithPadding,
                        AutoModelForSequenceClassification,
                        EarlyStoppingCallback,
                        AdamW,
                        get_cosine_schedule_with_warmup,
                        pipeline,
                        AutoModelForSequenceClassification,
                        Trainer,
                        TrainingArguments)

from torch.optim.lr_scheduler import StepLR
from transformers.optimization import Adafactor, AdafactorSchedule
from sklearn.metrics import confusion_matrix,classification_report

# Declaration

In [None]:
# Locations on the mounted Google Drive (relative paths).
dataset_path = 'gdrive/MyDrive/nvidia/huggingface/dataset'
dataset_name = '/godataset'

# Training output directory and the sub-folder the final model is saved to.
model_path = 'gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion'
model_name = '/bert_goemotion'

# Pretrained checkpoint that is fine-tuned below.
checkpoint = "bert-base-uncased"

# Path of the saved fine-tuned model, loaded again for inference.
test_model = f"{model_path}{model_name}"

# Data Preparation

In [None]:
# Download (or reuse the cached copy of) the "simplified" GoEmotions configuration.
dataset = load_dataset("go_emotions", "simplified")

# Number of distinct label ids used for the one-hot encoding below.
n_labels = 28

# One pandas DataFrame per split.
train, valid, test = (
    dataset[split].to_pandas() for split in ("train", "validation", "test")
)

Reusing dataset go_emotions (/root/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:

from tqdm.notebook import tqdm

# Re-create the raw train frame from the dataset (valid/test are not reloaded here).
train=dataset["train"].to_pandas()

def one_hot_encoder(df, num_labels=None):
    """Turn the per-row label-id lists in ``df["labels"]`` into a multi-hot frame.

    Args:
        df: DataFrame with a ``labels`` column whose entries are iterables of
            integer label ids.
        num_labels: width of the encoding. Defaults to the notebook-level
            ``n_labels`` when omitted.

    Returns:
        A DataFrame with a fresh 0..len(df)-1 index and integer columns
        ``0..num_labels-1``; entry ``(i, j)`` is 1 when label ``j`` is listed
        for row ``i`` and 0 otherwise. An empty ``df`` yields a
        ``(0, num_labels)`` frame.

    Raises:
        IndexError: if a label id is outside the ``num_labels`` range.
    """
    width = n_labels if num_labels is None else num_labels
    encoded = np.zeros((len(df), width), dtype=np.int64)
    # Iterate over the column directly: `df.iloc[i]` builds a whole Series per row,
    # which made the original loop slow enough to need a progress bar.
    for row, label_indices in enumerate(df["labels"]):
        # Explicit integer dtype so that an empty label list is a valid index.
        encoded[row, np.asarray(label_indices, dtype=np.intp)] = 1

    return pd.DataFrame(encoded)

# Always start from freshly converted splits so that this cell is idempotent:
# re-running it (even after the `labels` column has been replaced by one-hot
# vectors further down) never encodes already-encoded labels and never appends
# a second set of one-hot columns.
raw_splits = {
    split: dataset[split].to_pandas() for split in ("train", "validation", "test")
}

train_ohe_labels = one_hot_encoder(raw_splits["train"])
valid_ohe_labels = one_hot_encoder(raw_splits["validation"])
test_ohe_labels = one_hot_encoder(raw_splits["test"])

print(train_ohe_labels.shape)
#(43410, 28)

# Append the one-hot columns (named 0..n_labels-1) to the original columns.
train = pd.concat([raw_splits["train"], train_ohe_labels], axis=1)
valid = pd.concat([raw_splits["validation"], valid_ohe_labels], axis=1)
test = pd.concat([raw_splits["test"], test_ohe_labels], axis=1)

  0%|          | 0/43410 [00:00<?, ?it/s]

  0%|          | 0/5426 [00:00<?, ?it/s]

  0%|          | 0/5427 [00:00<?, ?it/s]

(43410, 28)


In [None]:
# Columns that are handed to the Hugging Face datasets.
col = ['text', 'labels', 'id']

# Replace each frame's `labels` column with its row of one-hot columns as a list.
label_columns = list(range(n_labels))
for frame in (train, valid, test):
    frame['labels'] = frame[label_columns].values.tolist()

In [None]:
# Convert the selected columns of every split back into a `datasets.Dataset`.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(frame[col]) for frame in (train, valid, test)
)

In [None]:
# Display the training split's features and row count.
train_dataset

Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 43410
})

## Tokenize Text

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``text`` field of a batch, truncating to at most 512 tokens."""
    return tokenizer(example["text"], truncation=True, max_length=512)


# Tokenize every split in batches; padding is left to the collator.
train_dataset, validation_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpe6pg0er8


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
creating metadata file for /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
https://huggingface.co/bert-base-uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpq8h2ddz8


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
creating metadata file for /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range":

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpl2tktb0j


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
creating metadata file for /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://hugg

  0%|          | 0/44 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

# Training Model

## Set Training Arguments

In [None]:
training_args=TrainingArguments(
    # Batch sizes, schedule length and optimisation hyper-parameters.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=300,
    # Checkpoints and logs are written below `model_path`.
    save_steps=300,
    overwrite_output_dir=True,
    output_dir=model_path,
    logging_dir=model_path+'/logs',
    logging_steps=300,
    # Evaluate every 300 steps and restore the checkpoint with the lowest eval loss.
    load_best_model_at_end =True,
    evaluation_strategy ='steps',
    eval_steps = 300,
    metric_for_best_model ='eval_loss',
    greater_is_better=False,
    remove_unused_columns=True,
    hub_model_id ='poom-sci/bert-base-uncased-multi-emotion',
    # SECURITY: a Hub access token used to be embedded here in a commented-out
    # `hub_token=` line. Never store tokens in the notebook: to publish, set
    # `push_to_hub=True` and supply the token at runtime (e.g. from an environment
    # variable or an interactive login). The removed token should be revoked.
    # Other options that were tried and left disabled:
    #   dataloader_num_workers=0, report_to=None, lr_scheduler_type='polynomial'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
class MultilabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy over independent per-label logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the multi-label BCE-with-logits loss for one batch.

        Args:
            model: the model to run; it is called as ``model(**inputs)`` after
                the labels have been removed from ``inputs``.
            inputs: batch dictionary; its ``"labels"`` entry is popped (the
                dictionary is mutated) and used as the target.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: ignored. Accepted so that Trainer versions which pass
                additional keyword arguments to ``compute_loss`` keep working.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        # Imported locally: the notebook's only top-level `import torch` sits in a
        # cell that runs after training, so a fresh "Run All" would otherwise hit
        # a NameError on the first training step.
        import torch

        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # Labels are stored as 0/1 integers; the loss needs float targets.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

## Model and Trainer Setup (default optimizer and LR scheduler)

In [None]:
# The dropout probability must be part of the config *before* the model is built:
# the dropout layers are created from the config at construction time, so assigning
# `model.config.hidden_dropout_prob` afterwards only changes the value written to
# the saved config, not the layers that are actually trained.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=n_labels,
    hidden_dropout_prob=0.2,
    ignore_mismatched_sizes=True,
)


# No optimizer/scheduler is passed, so the Trainer builds its defaults from `training_args`.
trainer = MultilabelTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LA

## Start Training

In [None]:
# Start a fresh training run; do not resume from a previously saved checkpoint.
trainer.train(resume_from_checkpoint=False )

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text.
***** Running training *****
  Num examples = 43410
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8142


Step,Training Loss,Validation Loss
300,0.3724,0.164947
600,0.1538,0.14266
900,0.1394,0.130157
1200,0.126,0.118814
1500,0.1168,0.111974
1800,0.1098,0.104686
2100,0.1042,0.100156
2400,0.1005,0.09733
2700,0.0984,0.095235
3000,0.0907,0.093417


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****
  Num examples = 5426
  Batch size = 16
Saving model checkpoint to gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/checkpoint-300
Configuration saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/checkpoint-300/config.json
Model weights saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/checkpoint-300/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/checkpoint-300/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/checkpoint-300/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****


TrainOutput(global_step=8142, training_loss=0.10298599423193223, metrics={'train_runtime': 1437.1681, 'train_samples_per_second': 90.616, 'train_steps_per_second': 5.665, 'total_flos': 2296117152130032.0, 'train_loss': 0.10298599423193223, 'epoch': 3.0})

In [None]:
# torch is used directly by the loss helper defined further down.
import torch
# Evaluate on the validation split that was passed to the trainer as `eval_dataset`.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****
  Num examples = 5426
  Batch size = 16


{'epoch': 3.0,
 'eval_loss': 0.08542061597108841,
 'eval_runtime': 8.492,
 'eval_samples_per_second': 638.958,
 'eval_steps_per_second': 40.038}

In [None]:
# Persist the model (and tokenizer files) into the `model_name` sub-folder of the output directory.
final_model_dir = f"{model_path}{model_name}"
trainer.save_model(final_model_dir)

Saving model checkpoint to gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion
Configuration saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion/config.json
Model weights saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion/special_tokens_map.json


# Test Model and Visualize

## Initialize Model

In [None]:
# Label id -> emotion name, used to replace the generic LABEL_<n> names
# stored in the saved model config.
mapping = {
    0:"admiration",
    1:"amusement",
    2:"anger",
    3:"annoyance",
    4:"approval",
    5:"caring",
    6:"confusion",
    7:"curiosity",
    8:"desire",
    9:"disappointment",
    10:"disapproval",
    11:"disgust",
    12:"embarrassment",
    13:"excitement",
    14:"fear",
    15:"gratitude",
    16:"grief",
    17:"joy",
    18:"love",
    19:"nervousness",
    20:"optimism",
    21:"pride",
    22:"realization",
    23:"relief",
    24:"remorse",
    25:"sadness",
    26:"surprise",
    27:"neutral",
}

sentiment_model = pipeline(
    "sentiment-analysis",
    model=test_model,
    tokenizer=test_model,
    # The model was trained with an independent BCE/sigmoid objective per label,
    # so score each label with a sigmoid rather than a softmax across labels
    # (which would force the 28 scores to compete and sum to 1).
    function_to_apply="sigmoid",
)

# Report emotion names instead of LABEL_<n>, and keep the reverse mapping consistent.
sentiment_model.model.config.id2label=mapping
sentiment_model.model.config.label2id={name: idx for idx, name in mapping.items()}

loading configuration file gdrive/MyDrive/nvidia/huggingface/AWS_REVIEW/bert_goemotion/bert_goemotion/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": 

In [None]:
# Quick check of the pipeline on a single sentence.
sentiment_model('''I would fall for the trap and rub her belly!!! I am the luckiest person alive to have 2 cats that love belly rubs because when they lay like that I just can't help myself. >.<''')

[{'label': 'love', 'score': 0.48738592863082886}]

## Real Text Examples

In [None]:
# Hand-picked sentences, several of which use "hate"/"love" in non-literal ways.
text_list = [
    '''That's how I was sleeping this morning. I want to kiss him.''',
    '''BIBLICAL. That's so brilliant. Please do more cover of Oasis' song :)''',
    '''I love this cover so much😩😩😩❤️❤️''',
    '''What Is Hate ?''',
    '''The first challenge in stopping hate speech is defining its boundaries.''',
    '''I hate to ask you for another favor, but I wonder if I could pay you to drive me home''',
    '''Don't do it again''',
    '''Omg I love this version!  I'm definitely adding it to my playlist''',
    '''I hate to see you unhappy.''',
    '''I hate to love the ice-cream''',
]

# Classify the whole list in one call, then print each text followed by its prediction.
sentimental_scores = sentiment_model(text_list)

for text, prediction in zip(text_list, sentimental_scores):
    print(text)
    print(prediction)

  cpuset_checked))


That's how I was sleeping this morning. I want to kiss him.
{'label': 'desire', 'score': 0.4758101999759674}
BIBLICAL. That's so brilliant. Please do more cover of Oasis' song :)
{'label': 'admiration', 'score': 0.9848008155822754}
I love this cover so much😩😩😩❤️❤️
{'label': 'love', 'score': 0.974336564540863}
What Is Hate ?
{'label': 'curiosity', 'score': 0.5542548298835754}
The first challenge in stopping hate speech is defining its boundaries.
{'label': 'neutral', 'score': 0.9820588231086731}
I hate to ask you for another favor, but I wonder if I could pay you to drive me home
{'label': 'curiosity', 'score': 0.426676869392395}
Don't do it again
{'label': 'disapproval', 'score': 0.27320927381515503}
Omg I love this version!  I'm definitely adding it to my playlist
{'label': 'love', 'score': 0.9670977592468262}
I hate to see you unhappy.
{'label': 'anger', 'score': 0.539501965045929}
I hate to love the ice-cream
{'label': 'love', 'score': 0.4158727824687958}


## Predict on the Test Set

In [None]:
# Dataset to run predictions on; here the tokenized test split.
to_predict_data=test_dataset

In [None]:
# Run the trained model over the chosen split and show the shapes of the
# returned prediction and label arrays.
predictions = trainer.predict(to_predict_data)
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Prediction *****
  Num examples = 5427
  Batch size = 16


(5427, 28) (5427, 28)


In [None]:
# Raw model outputs for the first example (presumably unnormalised logits, one per label).
predictions.predictions[0]

array([-3.5662284 , -3.976088  , -4.5829115 , -4.2553415 , -3.848753  ,
       -3.8415399 , -4.8834248 , -4.5203176 , -3.8215868 , -3.6140032 ,
       -4.051503  , -4.7262397 , -4.418176  , -4.7006035 , -4.145585  ,
       -3.986978  , -4.5680046 , -3.61954   , -0.19181313, -4.8885818 ,
       -4.0600457 , -5.4316664 , -3.954712  , -5.1267414 , -1.6811986 ,
       -1.1542732 , -4.8230734 , -4.1545463 ], dtype=float32)

In [None]:
def compute_loss( outputs, labels, return_outputs=False):
    """Binary cross-entropy-with-logits loss between raw scores and 0/1 targets.

    Args:
        outputs: array-like of raw scores; it is copied into a new tensor.
        labels: array-like of targets with the same shape; cast to float.
        return_outputs: when true, also return the score tensor.

    Returns:
        The mean loss as a scalar tensor, or ``(loss, score_tensor)`` when
        ``return_outputs`` is true.
    """
    logits = torch.tensor(outputs)
    targets = torch.tensor(labels).float()
    criterion = torch.nn.BCEWithLogitsLoss()
    loss = criterion(logits, targets)
    if return_outputs:
        return loss, logits
    return loss

In [None]:
# BCE-with-logits loss of the predictions against the one-hot labels of the predicted split.
compute_loss(predictions.predictions,to_predict_data['labels'])

tensor(0.0841)