### Guide: https://www.linkedin.com/pulse/fine-tune-nlp-model-run-sentiment-analysis-2023-updated-vergara/

#### WandB – Hugging Face integration guide: https://docs.wandb.ai/guides/integrations/huggingface

#### WandB – example Colab notebook (optimizing Hugging Face models): https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb

In [None]:
!pip install transformers
!pip install mlflow
!pip install evaluate
!pip install wandb

In [2]:
import io
import os

import evaluate
import numpy as np
import pandas as pd
import torch
import wandb
from datasets import Dataset,load_dataset, load_from_disk, DatasetDict
from imblearn.datasets import make_imbalance
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, TrainingArguments, Trainer, BertModel

In [3]:
# Login to W&B
# Runs the wandb CLI in a shell. As the cell output shows, it prompts for an
# API key interactively and appends it to the user's netrc file, so the key
# itself never has to appear in the notebook.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
# Load model and tokenizer
# Both objects come from the same Hub checkpoint, so the id is spelled once.
MODEL_CHECKPOINT = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = RobertaForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# import data
# Both annotation files are read the same way: the first CSV column becomes the index.
df, df2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('first_annotations_majority_vote.csv', 'parties_annotations.csv')
)

In [6]:
# Stack the two annotation sets row-wise into one frame.
# NOTE: `df` is overwritten, so re-running this cell alone appends df2 again;
# re-run the loading cell first.
df = pd.concat((df, df2), axis=0)

In [7]:
# Keep only the two columns used downstream: the comment text and its label.
df3 = df[['comment_text', 'label']].copy()

In [8]:
# No fine-tuning
# Baseline: wrap the pretrained checkpoint in a sentiment-analysis pipeline.

# Use the first GPU when CUDA is available, otherwise fall back to CPU (-1),
# so the baseline also runs on CPU-only runtimes instead of failing on device=0.
pipeline_device = 0 if torch.cuda.is_available() else -1
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, padding="max_length", truncation=True, max_length=128, device=pipeline_device)

In [9]:
# Map the pipeline's string labels onto the integer ids used in df3['label'].
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}

# One pipeline call per comment; each call returns a one-element list whose
# 'label' entry is the predicted class name.
pred_labels = [
    label_map[sentiment_analysis(comment)[0]['label']]
    for comment in df3['comment_text']
]



In [10]:
# Rows are the annotated labels, columns the predicted labels, both in the
# fixed order 0, 1, 2.
class_ids = [0, 1, 2]
conf_mat = confusion_matrix(y_true=df3['label'], y_pred=pred_labels, labels=class_ids)
print(conf_mat)

[[239  47  16]
 [ 34  51  17]
 [ 12   9 104]]


In [11]:

# Print classification report
# Per-class precision / recall / F1 of the un-tuned model on all annotated comments.
baseline_report = classification_report(y_true=df3['label'], y_pred=pred_labels)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.84      0.79      0.81       302
           1       0.48      0.50      0.49       102
           2       0.76      0.83      0.79       125

    accuracy                           0.74       529
   macro avg       0.69      0.71      0.70       529
weighted avg       0.75      0.74      0.75       529



In [12]:
# Fine-tuning model

# Convert the pandas frame into a Hugging Face Dataset, dropping the pandas index.
dataset = Dataset.from_pandas(df3, preserve_index=False)

# First hold out 30% of the rows, then halve that hold-out into dev and test.
# Both splits shuffle with the same fixed seed.
split_options = dict(shuffle=True, seed=200)
train_test = dataset.train_test_split(test_size=0.3, **split_options)
test_val = train_test['test'].train_test_split(test_size=0.50, **split_options)

train_val_test_dataset = DatasetDict({
    'train': train_test['train'],
    'test': test_val['test'],
    'dev': test_val['train'],
})


In [13]:
#Tokenize Dataset
def tokenize_function(examples):
    """Tokenize the `comment_text` field of a batch of examples.

    Every sequence is padded to, and truncated at, 128 tokens.
    """
    comments = examples["comment_text"]
    return tokenizer(comments, padding="max_length", truncation=True, max_length=128)

# batched=True hands the function a batch of rows per call instead of one row.
tokenized_datasets = train_val_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

In [14]:
# Accuracy metric from the `evaluate` library, loaded once and reused on every evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    index of the largest logit along the last axis.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [30]:
#Load the training arguments and hyperparameters
# Logging, evaluation and checkpointing all happen once per epoch, and the
# best checkpoint is reloaded when training ends.
training_args = TrainingArguments(output_dir="/content/roberta_model",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=20,
                                  save_strategy = "epoch",
                                  load_best_model_at_end=True,
                                  learning_rate=1e-6,
                                  report_to="wandb",
                                  # args to prevent disk overload on Google Colab:
                                  # a full checkpoint is written every epoch, so cap how many
                                  # are kept on disk (the best one is still retained for
                                  # load_best_model_at_end). The former save_steps=-1 was
                                  # dropped: save_steps only applies to save_strategy="steps".
                                  save_total_limit = 1,
                                  save_on_each_node = False,
                                  overwrite_output_dir = True
                                  )

In [31]:
#Train the model
# Fine-tune on the train split and evaluate on the dev split each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    compute_metrics=compute_metrics
)

try:
    trainer.train()
finally:
    # Close the W&B run even when training aborts (e.g. disk full), so a
    # later re-run does not continue logging into a half-finished run.
    wandb.finish()

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 28] No space left on device
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()

Epoch,Training Loss,Validation Loss


--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router_sock.py", line 27, in _read_message
    resp = self._sock_client.read_server_response(timeout=1)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/sock_client.py", line 285, in read_server_response
    data = self._read_packet_bytes(timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/sock_client.py", line 269, in _read_packet_bytes
    raise SockClientClosedError
wandb.sdk.lib.sock_client.SockClientClosedError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router.py", line 70, in message_loop
    msg = self._read_message()
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router_sock.py", line 29, in _read_message
    raise MessageRouterClosedError
wandb.sdk.interface.router.M

BrokenPipeError: ignored

Error in callback <function _WandbInit._pause_backend at 0x7f6e4c880550> (for post_run_cell):


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1104, in emit
    self.flush()
  File "/usr/lib/python3.10/logging/__init__.py", line 1084, in flush
    self.stream.flush()
OSError: [Errno 28] No space left on device
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()

BrokenPipeError: ignored