# Using the Hugging Face Datasets and Transformers Libraries

### Install the necessary packages

In [None]:
! pip install datasets transformers

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 54.4 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 59.6 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 56.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Co

### Checking GPU Details

In [None]:
!nvidia-smi

Thu Mar 31 15:44:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Logging into the Hugging Face Platform

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


### Importing Transformers Library

In [None]:
import transformers

print(transformers.__version__)

4.17.0


### Specifying the BERTweet model checkpoint (as named on the Hugging Face Hub) and the batch size

In [None]:
model_checkpoint = "cardiffnlp/bertweet-base-sentiment"
batch_size = 16

### Loading dataset and metric from the datasets library

In [None]:
from datasets import load_dataset, load_metric

In [None]:
dataset = load_dataset("rahulacj/SA_patra", use_auth_token=True)

Using custom data configuration rahulacj--SA_patra-82bb402a70616e33


Downloading and preparing dataset csv/rahulacj--SA_patra to /root/.cache/huggingface/datasets/csv/rahulacj--SA_patra-82bb402a70616e33/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/762k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/rahulacj--SA_patra-82bb402a70616e33/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 10079
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1262
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1260
    })
})

In [None]:
dataset["train"][:5]

{'label': [0, 1, 2, 0, 0],
 'sentence': ['till the time i used to blow chalk around the tab , bells used to ring for break . . . . i had met the teacher the next day  :-)',
  "I haven't done any numbering .",
  'Bro , those who are away from god , this is the only way to touch them .',
  'salman brother reply to my comnt plzzzzzzzzzzzz',
  '@ someUSER best year ever it will be ! i am tossing the idea of air around for mid jan to mid feb . but california dreaming for suuure']}

In [None]:
# df = dataset["train"].to_pandas()
# df.head()

In [None]:
metric_accuracy = load_metric("accuracy")
metric_f1 = load_metric("f1")

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [None]:
metric_accuracy

Metric(name: "accuracy", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions: Predicted labels, as returned by a model.
    references: Ground truth labels.
    normalize: If False, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.
    sample_weight: Sample weights.
Returns:
    accuracy: Accuracy score.
Examples:

    >>> accuracy_metric = datasets.load_metric("accuracy")
    >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1])
    >>> print(results)
    {'accuracy': 1.0}
""", stored examples: 0)

In [None]:
metric_f1

Metric(name: "f1", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions: Predicted labels, as returned by a model.
    references: Ground truth labels.
    labels: The set of labels to include when average != 'binary', and
        their order if average is None. Labels present in the data can
        be excluded, for example to calculate a multiclass average ignoring
        a majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in y_true and
        y_pred are used in sorted order.
    average: This parameter is required for multiclass/multilabel targets.
        If None, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:
            binary: Only report results for the class specified by pos

### Importing AutoTokenizer from Transformers and calling BERTweet's tokenizer

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/318 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/837 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji


### Understanding the tokenizer

In [None]:
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

{'input_ids': [0, 48518, 7, 33, 63, 31707, 11725, 12, 2, 2, 159, 33, 5199, 660, 30, 987, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
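# token_type_ids mark which sentence each token belongs to (sentence 1 or sentence 2),
# so they only matter for sentence-pair inputs. Note that this tokenizer returned all
# zeros for the sentence pair above.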

In [None]:
tokenizer("Hello, this one sentence!")

{'input_ids': [0, 48518, 7, 33, 63, 31707, 11725, 12, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer("And this sentence goes with it.")

{'input_ids': [0, 159, 33, 5199, 660, 30, 987, 4, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer("till the time i used to blow chalk around the tab , bells used to ring for break . . . . i had met the teacher the next day  :-)")

{'input_ids': [0, 842, 6, 78, 37, 372, 9, 2800, 31310, 284, 6, 14147, 7, 17897, 372, 9, 1882, 19, 691, 4, 4, 4, 4, 37, 118, 1137, 6, 1688, 6, 217, 93, 1078, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### Creating a function to tokenize the dataset

In [None]:
def preprocess_function(examples, max_length=128):
    """Tokenize the ``"sentence"`` entry of a batch of examples.

    The text is passed to the notebook-level ``tokenizer`` with truncation
    enabled. No padding is requested here.

    Args:
        examples: Mapping with a ``"sentence"`` entry, e.g. a slice of the
            dataset such as ``dataset["train"][:5]`` or a batch supplied by
            ``dataset.map(..., batched=True)``.
        max_length: Maximum length passed to the tokenizer for truncation.
            Defaults to 128, the value previously hard-coded in this function.

    Returns:
        The tokenizer's output for the given sentences.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, max_length=max_length, truncation=True)

In [None]:
preprocess_function(dataset['train'][:5])

{'input_ids': [[0, 842, 6, 78, 37, 372, 9, 2800, 31310, 284, 6, 14147, 7, 17897, 372, 9, 1882, 19, 691, 4, 4, 4, 4, 37, 118, 1137, 6, 1688, 6, 217, 93, 1078, 2], [0, 8, 58297, 15700, 270, 207, 26828, 776, 4, 2], [0, 4779, 7, 268, 87, 41, 300, 53, 697, 7, 33, 17, 6, 121, 154, 9, 1605, 106, 4, 2], [0, 4360, 171, 823, 1538, 9, 23, 2846, 3861, 2916, 59815, 20282, 2], [0, 59157, 8426, 60738, 161, 189, 179, 18, 70, 31, 12, 37, 155, 40888, 6, 707, 15, 1284, 284, 19, 5955, 10411, 9, 5955, 27344, 4, 42, 27138, 7431, 19, 1692, 713, 4318, 2]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### Mapping the tokenize function to the dataset

In [None]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
print(encoded_dataset['train'][:5])

{'sentence': ['till the time i used to blow chalk around the tab , bells used to ring for break . . . . i had met the teacher the next day  :-)', "I haven't done any numbering .", 'Bro , those who are away from god , this is the only way to touch them .', 'salman brother reply to my comnt plzzzzzzzzzzzz', '@ someUSER best year ever it will be ! i am tossing the idea of air around for mid jan to mid feb . but california dreaming for suuure'], 'label': [0, 1, 2, 0, 0], 'input_ids': [[0, 842, 6, 78, 37, 372, 9, 2800, 31310, 284, 6, 14147, 7, 17897, 372, 9, 1882, 19, 691, 4, 4, 4, 4, 37, 118, 1137, 6, 1688, 6, 217, 93, 1078, 2], [0, 8, 58297, 15700, 270, 207, 26828, 776, 4, 2], [0, 4779, 7, 268, 87, 41, 300, 53, 697, 7, 33, 17, 6, 121, 154, 9, 1605, 106, 4, 2], [0, 4360, 171, 823, 1538, 9, 23, 2846, 3861, 2916, 59815, 20282, 2], [0, 59157, 8426, 60738, 161, 189, 179, 18, 70, 31, 12, 37, 155, 40888, 6, 707, 15, 1284, 284, 19, 5955, 10411, 9, 5955, 27344, 4, 42, 27138, 7431, 19, 1692, 713, 4

### Downloading the BERTweet model from Hugging Face and importing other important functions

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

### Creating the data collator for padding

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Passing the Training Arguments

In [None]:
# Passed to TrainingArguments as the output directory (its first argument).
repo_name = "bertweet-base-finetuned-sentiment-analysis"

args = TrainingArguments(
    output_dir=repo_name,
    # Evaluate and save a checkpoint on the same schedule: once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Select the best checkpoint by the "f1" value reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
)

### Defining a function to compute the metric

In [None]:
import numpy as np
 
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``. The predicted class
            for each row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``, taken from the notebook-level
        ``metric_accuracy`` and ``metric_f1`` objects (F1 uses
        ``average="weighted"``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    accuracy_result = metric_accuracy.compute(
        predictions=predicted_classes, references=labels
    )
    f1_result = metric_f1.compute(
        predictions=predicted_classes, references=labels, average="weighted"
    )

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

### Passing the arguments to the Trainer

In [None]:
# Train on the "train" split; per-epoch evaluation uses the "validation" split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/rahulacj/bertweet-base-finetuned-sentiment-analysis into local empty directory.


### Training

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10079
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3150


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8904,0.850855,0.638095,0.63403
2,0.7655,0.834504,0.657937,0.655864
3,0.66,0.919874,0.654762,0.651422
4,0.447,1.032367,0.642857,0.641747
5,0.3585,1.123387,0.645238,0.642389


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 16
Saving model checkpoint to bertweet-base-finetuned-sentiment-analysis/checkpoint-630
Configuration saved in bertweet-base-finetuned-sentiment-analysis/checkpoint-630/config.json
Model weights saved in bertweet-base-finetuned-sentiment-analysis/checkpoint-630/pytorch_model.bin
tokenizer config file saved in bertweet-base-finetuned-sentiment-analysis/checkpoint-630/tokenizer_config.json
Special tokens file saved in bertweet-base-finetuned-sentiment-analysis/checkpoint-630/special_tokens_map.json
added tokens file saved in bertweet-base-finetuned-sentiment-analysis/checkpoint-630/added_tokens.json
tokenizer config file saved in bertweet-base-finetuned

TrainOutput(global_step=3150, training_loss=0.601625960819305, metrics={'train_runtime': 428.5437, 'train_samples_per_second': 117.596, 'train_steps_per_second': 7.35, 'total_flos': 1190356449298506.0, 'train_loss': 0.601625960819305, 'epoch': 5.0})

In [None]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1260
  Batch size = 16


{'epoch': 5.0,
 'eval_accuracy': 0.6579365079365079,
 'eval_f1': 0.6558636394079431,
 'eval_loss': 0.8345035910606384,
 'eval_runtime': 2.3355,
 'eval_samples_per_second': 539.507,
 'eval_steps_per_second': 33.826}

### Evaluating on the test set

In [None]:
trainer.evaluate(encoded_dataset["test"])

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1262
  Batch size = 16


{'epoch': 5.0,
 'eval_accuracy': 0.6426307448494454,
 'eval_f1': 0.6396824216922826,
 'eval_loss': 0.845811665058136,
 'eval_runtime': 2.6569,
 'eval_samples_per_second': 474.995,
 'eval_steps_per_second': 29.734}

### Finding out the predictions

In [None]:
predictions = trainer.predict(encoded_dataset["test"])

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1262
  Batch size = 16


In [None]:
predictions

PredictionOutput(predictions=array([[ 1.6785152 , -0.31024674, -1.2138392 ],
       [ 0.5496086 , -2.3655117 ,  1.9713081 ],
       [ 1.656848  , -0.58579785, -0.84466517],
       ...,
       [ 2.2263784 , -1.5033185 , -0.51410717],
       [-0.2842087 ,  0.08992758,  0.3747614 ],
       [-0.08858068, -1.3480327 ,  1.5883008 ]], dtype=float32), label_ids=array([2, 2, 1, ..., 0, 2, 2]), metrics={'test_loss': 0.845811665058136, 'test_accuracy': 0.6426307448494454, 'test_f1': 0.6396824216922826, 'test_runtime': 2.6464, 'test_samples_per_second': 476.865, 'test_steps_per_second': 29.851})

In [None]:
!nvidia-smi

Thu Mar 31 16:04:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    34W / 250W |   5291MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Computing metrics for prediction

In [None]:
def prediction_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a three-item prediction result.

    Args:
        eval_pred: Three-item sequence ``(logits, labels, metrics)``, such as
            the value returned by ``trainer.predict(...)`` in this notebook.
            The third item is not used here.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``, as produced by
        ``compute_metrics``.
    """
    logits, labels, _ = eval_pred
    # Reuse the metric logic defined for training-time evaluation rather than
    # keeping a second copy of it here.
    return compute_metrics((logits, labels))

In [None]:
prediction_metrics(predictions)

{'accuracy': 0.6426307448494454, 'f1': 0.6396824216922826}

### Calculating the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class for each example: index of the largest logit in its row.
preds = np.argmax(predictions.predictions, axis=-1)
# Reference labels go in as y_true, model predictions as y_pred.
confusion_matrix(y_true=predictions.label_ids, y_pred=preds)

array([[425,  69,  79],
       [113, 147,  30],
       [116,  44, 239]])

### Uploading the model to the Hugging Face Platform

In [None]:
trainer.push_to_hub()

Saving model checkpoint to bertweet-base-finetuned-sentiment-analysis
Configuration saved in bertweet-base-finetuned-sentiment-analysis/config.json
Model weights saved in bertweet-base-finetuned-sentiment-analysis/pytorch_model.bin
tokenizer config file saved in bertweet-base-finetuned-sentiment-analysis/tokenizer_config.json
Special tokens file saved in bertweet-base-finetuned-sentiment-analysis/special_tokens_map.json
added tokens file saved in bertweet-base-finetuned-sentiment-analysis/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/515M [00:00<?, ?B/s]

Upload file runs/Mar31_15-49-21_7c6b8a214a4f/events.out.tfevents.1648741792.7c6b8a214a4f.76.0:  50%|#####     …

Upload file runs/Mar31_15-49-21_7c6b8a214a4f/events.out.tfevents.1648742558.7c6b8a214a4f.76.2: 100%|##########…

To https://huggingface.co/rahulacj/bertweet-base-finetuned-sentiment-analysis
   f6040e3..4c7cc5f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.6426307448494454}, {'name': 'F1', 'type': 'f1', 'value': 0.6396824216922826}]}
To https://huggingface.co/rahulacj/bertweet-base-finetuned-sentiment-analysis
   4c7cc5f..3fb8a77  main -> main



'https://huggingface.co/rahulacj/bertweet-base-finetuned-sentiment-analysis/commit/4c7cc5f9e507aa7185539ffe3e2a697f3e9034d4'