In [1]:
import os
os.environ['http_proxy'] = 'http://proxy1.bgc-jena.mpg.de:3128' 
os.environ['https_proxy'] = 'http://proxy1.bgc-jena.mpg.de:3128'

In [2]:
!pip install transformers



In [3]:
!pip install transformers[torch]



In [4]:
# Import the necessary libraries
import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

import requests
import re
from bs4 import BeautifulSoup

cuda


In [5]:
# Extract the tokeniser
tokeniser = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Extract the model
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [6]:
emotions = 'Pizza was awesome, but service was not good'

In [7]:
tokens = tokeniser.encode(emotions, return_tensors='pt') # pytorch
print(tokens)

tensor([[  101, 59371, 10140, 37079, 42279, 10688,   117, 10502, 11416, 10140,
         10497, 12050,   102]])


In [8]:
# Decode tokens back
decoded_tokens = tokeniser.decode(tokens[0])
print(decoded_tokens)

[CLS] pizza was awesome, but service was not good [SEP]


In [9]:
# Feed the token to our pretrained model
results = model(tokens)
# Get results
print(results)
# Logits
print(results.logits)
# Max arg represents the sentiment predicted by the model
print(torch.argmax(results.logits))

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.9302,  1.1564,  0.8870, -0.5145, -1.9438]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[ 0.9302,  1.1564,  0.8870, -0.5145, -1.9438]],
       grad_fn=<AddmmBackward0>)
tensor(1)


In [10]:
# Experimenting with different emotions
happy = 'Absolutely loved the idea!'
happy_tkns = tokeniser.encode(happy, return_tensors='pt')
happy_results = model(happy_tkns)
print(happy_results.logits)
print(torch.argmax(happy_results.logits))

tensor([[-1.6949, -2.0449, -1.1944,  0.9835,  3.1879]],
       grad_fn=<AddmmBackward0>)
tensor(4)


In [11]:
sad = 'this bicycle is trash!'
sad_tkns = tokeniser.encode(sad, return_tensors='pt')
sad_results = model(sad_tkns)
print(sad_results.logits)
print(torch.argmax(sad_results.logits))

tensor([[ 4.8434,  1.8983, -0.4402, -2.7727, -2.6937]],
       grad_fn=<AddmmBackward0>)
tensor(0)


So, basically lower values represent negative emotions and higher values represent positive emotions.

# Fine-Tuning Models

In this case, I worked with [DistilBERT base model (uncased)](https://huggingface.co/distilbert/distilbert-base-uncased)

In [12]:
# Extract the dataset
df = pd.read_csv('/Net/Groups/BGI/scratch/ppandey/LLMs_Playground/BERT/SMSSpamCollection',
                 sep='\t',
                 names=["label", "message"])
print(df.shape)
print(df.head())

(5572, 2)
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [13]:
X = list(df["message"])
print(X[:2])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...']


In [14]:
Y = list(df["label"])
print(Y[:2])

['ham', 'ham']


In [15]:
# Convert label from string to zeros and ones
Y = list(pd.get_dummies(Y, drop_first=True)['spam'].astype(int))
print(Y[:2])

[0, 0]


In [16]:
# Splitting dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
# Import the model
from transformers import (
    DistilBertTokenizerFast, DistilBertForSequenceClassification
)
from transformers import TrainingArguments, Trainer

2024-04-07 23:10:56.797027: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-07 23:10:56.833687: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-07 23:10:56.833744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-07 23:10:56.834707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-07 23:10:56.840968: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-07 23:10:56.841675: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [18]:
# Call the distilbert tokenizer
db_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Call the model
db_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
db_model = db_model.cuda()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
example_token = db_tokenizer.encode('My name is Prasoon!', return_tensors='pt')
print(example_token)

tensor([[  101,  2026,  2171,  2003, 10975,  3022,  7828,   999,   102]])


In [20]:
# Encode the training and testing datasett
encoded_train = db_tokenizer(x_train, truncation=True, padding=True)
encoded_test = db_tokenizer(x_test, truncation=True, padding=True)

In [21]:
print(encoded_train[:2])
print(encoded_test[:2])

[Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]
[Encoding(num_tokens=140, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=140, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [22]:
print(encoded_train.keys())
print(encoded_train['input_ids'][:2])

dict_keys(['input_ids', 'attention_mask'])
[[101, 7514, 2000, 2663, 27708, 4882, 999, 2073, 2097, 1996, 2294, 5713, 2088, 2452, 2022, 2218, 1029, 4604, 2644, 2000, 6584, 21926, 2683, 2000, 2203, 2326, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 7592, 1012, 4066, 1997, 2041, 1999, 2237, 2525, 1012, 2008, 1012, 2061, 2123, 2102, 5481, 2188, 1010, 1045, 2572, 5983, 6583, 9905, 2015, 1012, 2097, 2292

In [23]:
# Create torch dataset (https://huggingface.co/transformers/v3.2.0/custom_datasets.html)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [24]:
train_dataset = Dataset(encoded_train, y_train)
test_dataset = Dataset(encoded_test, y_test)
print(train_dataset[:2])

{'input_ids': tensor([[  101,  7514,  2000,  2663, 27708,  4882,   999,  2073,  2097,  1996,
          2294,  5713,  2088,  2452,  2022,  2218,  1029,  4604,  2644,  2000,
          6584, 21926,  2683,  2000,  2203,  2326,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [25]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [26]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=db_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
# Train the model
trainer.train()



Step,Training Loss


TrainOutput(global_step=70, training_loss=0.11615197317940848, metrics={'train_runtime': 28.1149, 'train_samples_per_second': 158.528, 'train_steps_per_second': 2.49, 'total_flos': 274447094927208.0, 'train_loss': 0.11615197317940848, 'epoch': 1.0})

In [28]:
trainer.evaluate()



<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.03388119488954544,
 'eval_accuracy': 0.9928251121076234,
 'eval_precision': 0.9795918367346939,
 'eval_recall': 0.9664429530201343,
 'eval_f1': 0.9729729729729729,
 'eval_runtime': 14.3647,
 'eval_samples_per_second': 77.621,
 'eval_steps_per_second': 1.253,
 'epoch': 1.0}

In [29]:
# testing the model
trainer.predict(test_dataset)



<class 'transformers.trainer_utils.EvalPrediction'>


PredictionOutput(predictions=array([[ 2.987157 , -2.484671 ],
       [ 3.0913355, -2.485971 ],
       [ 3.0936885, -2.5021067],
       ...,
       [ 3.1390152, -2.5200768],
       [ 3.0432382, -2.4372091],
       [ 2.9266534, -2.339787 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.03388119488954544, 'test_accuracy': 0.9928251121076234, 'test_precision': 0.9795918367346939, 'test_recall': 0.9664429530201343, 'test_f1': 0.9729729729729729, 'test_runtime': 14.3641, 'test_samples_per_second': 77.624, 'test_steps_per_second': 1.253})

In [30]:
test_spam_text = "WINNER!! As a valued network customer"
spam_tkns = db_tokenizer(test_spam_text, truncation=True, padding=True, return_tensors='pt').to('cuda')
print(spam_tkns)

{'input_ids': tensor([[  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,   102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [31]:
results = db_model(**spam_tkns)
print(results)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2705, -0.1136]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [32]:
predictions = torch.nn.functional.softmax(results.logits, dim=-1)
print(predictions)
predictions = predictions.cpu().detach().numpy()
print(predictions)

tensor([[0.5949, 0.4051]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
[[0.59486663 0.40513337]]
