# **Sequence Classification using Hugging Face**
Sequence classification is a technique that assigns a single label to an entire input sequence. For example, given the text of a movie review, the model decides whether the review as a whole is positive or negative. (Labeling each individual part of a sequence, such as every word in a sentence, is the related but different task of token classification.)
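Hugging Face provides many ready-to-use models and datasets that make it easy to run custom training scripts. In this project we use the `rotten_tomatoes` movie-review dataset and a DistilBERT checkpoint (`distilbert-base-uncased-finetuned-sst-2-english`, which has already been fine-tuned for sentiment analysis on SST-2) and fine-tune it further on this dataset.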

In [1]:
# download the different modules required
!pip install datasets
!pip install -U accelerate
!pip install -U transformers



In [11]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [19]:
# import essential libraries and modules
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
import evaluate
import numpy as np

In [20]:
# fetch the Rotten Tomatoes dataset from the Hugging Face dataset hub
data = load_dataset(path="rotten_tomatoes")

In [26]:
# display the splits of the loaded dataset with their columns and row counts
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [21]:
# peek at the raw text of the second training example
data["train"][1]["text"]

'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .'

# Tokenization
Tokenization converts raw text into a form the model can understand, such as numbers, vectors, or tensors. We will use `AutoTokenizer` to load the pretrained tokenizer that matches our model checkpoint.

In [22]:
# checkpoint shared by the tokenizer and the model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_dataset(dataset):
    """Tokenize the ``text`` field of a dataset example (or batch of examples).

    Args:
        dataset: A mapping with a ``"text"`` entry, as passed in by
            ``Dataset.map``; either a single string or a list of strings.

    Returns:
        The tokenizer output for that text, which ``Dataset.map`` merges
        back into the dataset as new columns.
    """
    # truncation=True caps every sequence at the model's maximum input length,
    # so an unusually long text cannot break the forward pass.
    return tokenizer(dataset["text"], truncation=True)

In [23]:
# Apply the tokenization function to every split.
# batched=True hands the tokenizer many rows per call instead of one,
# which is considerably faster and yields the same columns.
dataset = data.map(tokenize_dataset, batched=True)

In [31]:
# display the tokenized splits; the tokenizer output appears as extra columns
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [7]:
# inspect the column types of the tokenized training split
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
# dynamic padding: each batch is padded when it is assembled rather than up front
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine Tuning a pretrained model
We will take a pretrained model and fine-tune it on our dataset. First we define the training configuration with `TrainingArguments`, and then we use `Trainer` to train the model on our dataset.

In [47]:
# load the checkpoint with a two-label classification head
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# only the output directory is set; every other training option keeps its default
training_args = TrainingArguments(output_dir="test-trainer")

In [48]:
# accuracy metric reported by the trainer during evaluation
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the model outputs collected during evaluation.

    Args:
        eval_pred: The (predictions, labels) pair the Trainer passes in,
            where predictions holds one score per class for each example.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_pred
    # the predicted class is the one with the highest score
    predicted_classes = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=labels)


# Build the trainer. Without compute_metrics, trainer.evaluate() reports only
# the loss and timing figures, not the accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# fine-tune the model on the training split
trainer.train()

Step,Training Loss
500,0.3606
1000,0.3423
1500,0.1808
2000,0.1654
2500,0.0833
3000,0.0468


TrainOutput(global_step=3201, training_loss=0.18763009148513998, metrics={'train_runtime': 223.7572, 'train_samples_per_second': 114.365, 'train_steps_per_second': 14.306, 'total_flos': 294316169824056.0, 'train_loss': 0.18763009148513998, 'epoch': 3.0})

# Evaluation

In [49]:
# evaluate the fine-tuned model on the test split
trainer.evaluate(eval_dataset=dataset["test"])

{'eval_loss': 0.6589958667755127,
 'eval_accuracy': 0.8930581613508443,
 'eval_runtime': 1.8224,
 'eval_samples_per_second': 584.93,
 'eval_steps_per_second': 73.528,
 'epoch': 3.0}

In [50]:
# run the model over the test split, then report the shapes of the
# returned predictions and of the reference labels
preds = trainer.predict(test_dataset=dataset["test"])
predictions_shape = preds.predictions.shape
labels_shape = preds.label_ids.shape
print(predictions_shape, labels_shape)

(1066, 2) (1066,)


In [52]:
# accuracy: pick the highest-scoring class for each test example and
# compare it with the reference label
pred = preds.predictions.argmax(axis=-1)
metric = evaluate.load("accuracy")
metric.compute(references=preds.label_ids, predictions=pred)

{'accuracy': 0.8930581613508443}

In [62]:
# Optional: uncomment the line below to save the fine-tuned model
# (presumably into a local 'custom-model' directory — confirm before relying on it).
# trainer.save_model('custom-model')