In [1]:
!pip install transformers[torch]



In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import requests
import re
from bs4 import BeautifulSoup

In [3]:
# Extract the tokeniser
tokeniser = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Extract the model
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
emotions = 'Pizza was awesome, but service was not good'

In [5]:
tokens = tokeniser.encode(emotions, return_tensors='pt') # pytorch
print(tokens)

tensor([[  101, 59371, 10140, 37079, 42279, 10688,   117, 10502, 11416, 10140,
         10497, 12050,   102]])


In [6]:
# Decode tokens back
decoded_tokens = tokeniser.decode(tokens[0])
print(decoded_tokens)

[CLS] pizza was awesome, but service was not good [SEP]


In [7]:
# Feed the token to our pretrained model
results = model(tokens)
# Get results
print(results)
# Logits
print(results.logits)
# Max arg represents the sentiment predicted by the model
print(torch.argmax(results.logits))

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.9302,  1.1564,  0.8870, -0.5145, -1.9438]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[ 0.9302,  1.1564,  0.8870, -0.5145, -1.9438]],
       grad_fn=<AddmmBackward0>)
tensor(1)


In [8]:
# Experimenting with different emotions
happy = 'Absolutely loved the idea!'
happy_tkns = tokeniser.encode(happy, return_tensors='pt')
happy_results = model(happy_tkns)
print(happy_results.logits)
print(torch.argmax(happy_results.logits))

tensor([[-1.6949, -2.0449, -1.1944,  0.9835,  3.1879]],
       grad_fn=<AddmmBackward0>)
tensor(4)


In [9]:
sad = 'this bicycle is trash!'
sad_tkns = tokeniser.encode(sad, return_tensors='pt')
sad_results = model(sad_tkns)
print(sad_results.logits)
print(torch.argmax(sad_results.logits))

tensor([[ 4.8434,  1.8983, -0.4402, -2.7727, -2.6937]],
       grad_fn=<AddmmBackward0>)
tensor(0)


So, basically lower values represent negative emotions and higher values represent positive emotions.

# Fine-Tuning Models

In this case, I worked with [DistilBERT base model (uncased)](https://huggingface.co/distilbert/distilbert-base-uncased)

In [10]:
# Extract the dataset
df = pd.read_csv('/content/SMSSpamCollection',
                 sep='\t',
                 names=["label", "message"])
print(df.shape)
print(df.head())

(5572, 2)
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [11]:
X = list(df["message"])
print(X[:2])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...']


In [12]:
Y = list(df["label"])
print(Y[:2])

['ham', 'ham']


In [13]:
# Convert label from string to zeros and ones
Y = list(pd.get_dummies(Y, drop_first=True)['spam'].astype(int))
print(Y[:2])

[0, 0]


In [14]:
# Splitting dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
# Import the model
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification
)
from transformers import TrainingArguments, Trainer

In [16]:
# Call the distilbert tokenizer
db_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Call the model
db_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)
db_model = db_model.to('cuda')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
example_token = db_tokenizer.encode('My name is Prasoon!', return_tensors='pt')
print(example_token)

tensor([[  101,  2026,  2171,  2003, 10975,  3022,  7828,   999,   102]])


In [18]:
# Encode the training and testing datasett
encoded_train = db_tokenizer(x_train, truncation=True, padding=True)
encoded_test = db_tokenizer(x_test, truncation=True, padding=True)

In [19]:
print(encoded_train[:2])
print(encoded_test[:2])

[Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]
[Encoding(num_tokens=140, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=140, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [20]:
print(encoded_train.keys())
print(encoded_train['input_ids'][:2])

dict_keys(['input_ids', 'attention_mask'])
[[101, 7514, 2000, 2663, 27708, 4882, 999, 2073, 2097, 1996, 2294, 5713, 2088, 2452, 2022, 2218, 1029, 4604, 2644, 2000, 6584, 21926, 2683, 2000, 2203, 2326, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 7592, 1012, 4066, 1997, 2041, 1999, 2237, 2525, 1012, 2008, 1012, 2061, 2123, 2102, 5481, 2188, 1010, 1045, 2572, 5983, 6583, 9905, 2015, 1012, 2097, 2292

In [21]:
# Create torch dataset (https://huggingface.co/transformers/v3.2.0/custom_datasets.html)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [22]:
train_dataset = Dataset(encoded_train, y_train)
test_dataset = Dataset(encoded_test, y_test)
print(train_dataset[:2])

{'input_ids': tensor([[  101,  7514,  2000,  2663, 27708,  4882,   999,  2073,  2097,  1996,
          2294,  5713,  2088,  2452,  2022,  2218,  1029,  4604,  2644,  2000,
          6584, 21926,  2683,  2000,  2203,  2326,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [23]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [24]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=db_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [25]:
# Train the model
trainer.train()

Step,Training Loss
500,0.0696


Checkpoint destination directory output/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=558, training_loss=0.06841590284873936, metrics={'train_runtime': 109.209, 'train_samples_per_second': 40.812, 'train_steps_per_second': 5.109, 'total_flos': 274447094927208.0, 'train_loss': 0.06841590284873936, 'epoch': 1.0})

In [26]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.038456059992313385,
 'eval_accuracy': 0.9919282511210762,
 'eval_precision': 0.9666666666666667,
 'eval_recall': 0.9731543624161074,
 'eval_f1': 0.9698996655518394,
 'eval_runtime': 28.8816,
 'eval_samples_per_second': 38.606,
 'eval_steps_per_second': 4.847,
 'epoch': 1.0}

In [27]:
# testing the model
trainer.predict(test_dataset)

<class 'transformers.trainer_utils.EvalPrediction'>


PredictionOutput(predictions=array([[ 3.9447992, -3.7668896],
       [ 3.862001 , -3.7507174],
       [ 3.944909 , -3.7761822],
       ...,
       [ 4.0023665, -3.8157938],
       [ 3.7116566, -3.649529 ],
       [ 3.47273  , -3.4237866]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.038456059992313385, 'test_accuracy': 0.9919282511210762, 'test_precision': 0.9666666666666667, 'test_recall': 0.9731543624161074, 'test_f1': 0.9698996655518394, 'test_runtime': 41.7891, 'test_samples_per_second': 26.682, 'test_steps_per_second': 3.35})

In [53]:
test_spam_text = "WINNER!! As a valued network customer"
spam_tkns = db_tokenizer(test_spam_text, truncation=True, padding=True, return_tensors='pt').to('cuda')
print(spam_tkns)

{'input_ids': tensor([[  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,   102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [54]:
results = db_model(**spam_tkns)
print(results)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.2897,  1.0517]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [56]:
predictions = torch.nn.functional.softmax(results.logits, dim=-1)
print(predictions)
predictions = predictions.cpu().detach().numpy()
print(predictions)

tensor([[0.0878, 0.9122]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
[[0.08775879 0.9122412 ]]
