<a href="https://colab.research.google.com/github/qmeng222/transformers-for-NLP/blob/main/Fine_Tuning_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install the HF transformers and datasets libraries:
!pip install transformers datasets
# `transformers` library: for loading, training, and evaluating NLP models
# `datasets` library: for loading, processing, and evaluating datasets



In [2]:
from datasets import load_dataset # import the `load_dataset` function from HF datasets library
import numpy as np # import the NumPy library for numerical computations in Python

In [3]:
# load the Stanford Sentiment Treebank (SST-2) dataset from the GLUE (General Language Understanding Evaluation) benchmark:
raw_datasets = load_dataset("glue", "sst2")

In [4]:
# (train + validation + test) datasets:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

# Inspect the train dataset:

In [5]:
type(raw_datasets['train'])

datasets.arrow_dataset.Dataset

In [6]:
# dataset summary (cols/features, rows):
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [7]:
# tabular data:
raw_datasets['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [8]:
# the 1st row:
raw_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [9]:
# inspect rows:
raw_datasets['train'][50000:50003]

{'sentence': ['glow ',
  'a classical dramatic animated feature ',
  'best espionage picture '],
 'label': [1, 1, 1],
 'idx': [50000, 50001, 50002]}

In [10]:
# inspect cols:
raw_datasets['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [11]:
# inspect the list of available attributes and methods of the object:
dir(raw_datasets['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

# Tokenize:

In [12]:
# import the AutoTokenizer class from the transformers library
# to automatically load the appropriate tokenizer for a given pre-trained model
from transformers import AutoTokenizer

In [13]:
# load the appropriate tokenizer for a specific pre-trained model:
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased" # specify the model name
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [14]:
# tokenize:
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence']) # select rows and a col
tokenized_sentences

{'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102], [101, 2008, 7459, 2049, 3494, 1998, 10639, 2015, 2242, 2738, 3376, 2055, 2529, 3267, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [15]:
# import the pprint function from the pprint module
# (pretty-print) display complex data structures in the more readable format
from pprint import pprint

pprint(tokenized_sentences)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [16]:
# function that takes a batch of data as input and returns a tokenized version of the data:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation=True) # tokens will be truncated to a maximum length that determined by the tokenizer object

In [17]:
# tokenize a dataset using the above function:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True) # apply the function to batches of examples in the dataset & return a new dataset with the tokenized examples

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [18]:
# import the `TrainingArguments` class from the transformers library
# to store and manage settings for transformer-based models
from transformers import TrainingArguments

In [19]:
!pip install transformers[torch]



In [20]:
# create an instance of the `TrainingArguments` class with specific configuration settings:
training_args = TrainingArguments(
  'my_trainer', #  specify the directory where the trained model and associated files will be saved
  evaluation_strategy='epoch', # evaluation (validation) will be performed at the end of each epoch
  save_strategy='epoch', # the model will be saved at the end of each epoch
  num_train_epochs=1, # the model will be trained for only one epoch
)

In [21]:
# import the `AutoModelForSequenceClassification` class from the `transformers` library
# automatically loads a pre-trained model suitable for sequence classification
from transformers import AutoModelForSequenceClassification # import a class from the `transformers` library

In [22]:
# load a pre-trained model from the specified checkpoint (model name) & customize the number of output labels
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2) # binary classification

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [24]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [25]:
!pip install torchinfo # install the `torchinfo` library to obtain detailed info about the layers and params of PyTorch models



In [26]:
from torchinfo import summary # summary function to print a summary of the PyTorch model

# summary(model, input_size=(16,512), dtypes=['torch.IntTensor'], device='cpu')
summary(model) # analyze the model and prints a summary report

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [27]:
# save params before training the model:
params_before = []

for name, p in model.named_parameters():
  params_before.append(p.detach().cpu().numpy())

In [28]:
from transformers import Trainer # import the `Trainer` class for training transformer-based models
from datasets import load_metric # import the `load_metric` function for loading evaluation metrics for different tasks

In [29]:
# load an evaluation metric:
metric = load_metric("glue", "sst2")

  metric = load_metric("glue", "sst2")


In [30]:
# test with a dummy list:
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])

{'accuracy': 0.6666666666666666}

👆 The result is a dictionary.

And for the sst2 task, the only metric is accuracy.

# Write my own metric funciton instead of using the built-in ones provided by HF:

In [31]:
# function that computes evaluation metrics for a classification task:
def compute_metrics(logits_and_labels):
  # metric = load_metric("glue", "sst2") # the `metric` object has already been defined outside the function
  logits, labels = logits_and_labels # unpack the `logits_and_labels` tuple
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [32]:
# create a Trainer object with the necessary configurations for training and evaluation:
trainer = Trainer(
    model, # the model to train
    training_args, # an instance of the TrainingArguments class
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [33]:
# begin the training process for a model:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2155,0.36666,0.899083


TrainOutput(global_step=8419, training_loss=0.2677215855285014, metrics={'train_runtime': 465.979, 'train_samples_per_second': 144.532, 'train_steps_per_second': 18.067, 'total_flos': 518596929468840.0, 'train_loss': 0.2677215855285014, 'epoch': 1.0})

In [34]:
# save the trained model's weights and configuration to a specified directory:
trainer.save_model('my_saved_model')

In [35]:
!ls my_saved_model

config.json	   special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


In [36]:
# import the pipeline function from the transformers library
# to perform NLP tasks using pre-trained models
from transformers import pipeline

In [37]:
# create a text classification pipeline using 'my_saved_model':
newmodel = pipeline('text-classification', model='my_saved_model', device=0)

# Test my sentimanet analyser:

In [38]:
# classify an input text:
newmodel('This movie is great!')

[{'label': 'LABEL_1', 'score': 0.9992327690124512}]

In [39]:
newmodel('This movie sucks')

[{'label': 'LABEL_0', 'score': 0.9939330220222473}]

In [40]:
newmodel('This movie is so-so.')

[{'label': 'LABEL_0', 'score': 0.9240649342536926}]

In [41]:
# display the contents of the file:
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.1",
  "vocab_size": 30522
}


👆 The `config.json` file is a dictionary.

# Map ids (0/1) to labels (neg/pos) in the `config.json` file:

In [42]:
# import the json module
# for working with JSON (JavaScript Object Notation) data
import json

In [43]:
config_path = 'my_saved_model/config.json' # specify the file path

# the `with` statement is used here to ensure that the file is properly closed after reading,
# even if an exception occurs during the processing
with open(config_path) as f: # open the file for reading ('r' mode)
  j = json.load(f) # load the JSON content from the file

j['id2label'] = {0: 'negative', 1: 'positive'} # id (0/1) -> lable (neg/pos)

# the 'w' mode truncates the file if it exists,
# or creates a new file if it doesn't
with open(config_path, 'w') as f: # open the file for writing ('w' mode)
  json.dump(j, f, indent=2) # write the modified dictionary `j` back to the `config.json` file with indentation

In [44]:
# display the content of the file located in the directory:
!cat my_saved_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.1",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

In [45]:
newmodel = pipeline('text-classification', model='my_saved_model', device=0)

In [46]:
newmodel('This movie is great!')

[{'label': 'positive', 'score': 0.9992327690124512}]

In [47]:
newmodel('This movie sucks')

[{'label': 'negative', 'score': 0.9939330220222473}]

In [48]:
newmodel('This movie is so-so.')

[{'label': 'negative', 'score': 0.9240649342536926}]

In [49]:
params_after = []
for name, p in model.named_parameters():
  params_after.append(p.detach().cpu().numpy())

# Check whether the model weights are changed before and after training:

In [61]:
# calcuate the sum of the absolute differences of each parameter:
for p1, p2 in zip(params_before, params_after):
  print(np.sum(np.abs(p1 - p2))) # sum(abs(diff))

13347.563
87.693016
1.7295547
1.0863681
1308.5573
1.7791028
1291.3124
0.0033193836
1194.5209
1.0678046
1128.4482
0.8458084
1.753972
0.84513366
4929.4663
5.7427373
4506.981
0.7077203
1.6224422
0.67397106
1260.1796
1.4002612
1254.3839
0.0029209864
1105.4321
0.85051656
1058.9504
0.7575139
1.6045413
0.746284
4879.2056
5.358651
4456.976
0.70972806
1.5115304
0.78560394
1273.6676
1.5428607
1284.9869
0.0027768384
1113.1842
0.7579951
1096.8013
0.7125282
1.534085
0.7531638
4957.272
5.6397905
4423.3105
0.71052396
1.3993564
0.7578372
1301.8689
1.513132
1312.9058
0.00304857
1171.7545
0.7827668
1122.4364
0.72776467
1.4408332
0.7405715
4893.143
5.564643
4247.1187
0.7828486
1.3949804
0.7889575
1210.5659
1.4438974
1197.9781
0.0020871544
1030.0344
0.7888408
1038.0922
0.88561684
1.4070728
0.913869
4444.501
5.0946302
3669.5544
0.8185555
1.3470589
0.7537111
1108.9258
1.4934338
1127.5002
0.0014123274
946.3496
0.810292
917.4854
1.0957061
1.2877239
1.1339346
3644.7883
4.6397557
3316.1367
0.95170677
1.3783717


👆 All sums are non-zero, which means weights were updated during the training process.