In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets



In [3]:
!pip install -U transformers



In [4]:
!pip install evaluate



In [5]:
import transformers

In [6]:
transformers.__version__

'4.45.2'

In [7]:
from datasets import load_dataset, Dataset, Features, ClassLabel, Value
from functools import partial
import os

In [8]:
def get_labels(data_dir):
    return os.listdir(data_dir)

In [9]:
def get_file_from_folder_with_label(data_dir):

    for root, sub_folder, files in os.walk(data_dir):
        for file in files:
            filename = os.path.join(root, file)
            label = os.path.basename(root)
            with open(filename, "r", encoding="latin-1") as f:
                yield {"text" : f.read(), "labels": label, "filename": file}

In [10]:
data_dir = "/content/drive/MyDrive/Colab Notebooks/dataset/bbc"

**Folder structure of data_dir:**

     -- class_1
        -- file1.txt
        -- file2.txt
     -- class_2
        -- file1.txt
        -- file2.txt

### Dataset creation:

In [11]:
file_iterator = partial(get_file_from_folder_with_label,data_dir)

In [12]:
cache_dir = "hub_cache"

In [13]:
class_names = get_labels(data_dir)
print(class_names)
class_label = ClassLabel(names=class_names)

['entertainment', 'business', 'politics', 'tech', 'sport']


In [14]:
bbc_features = Features({'text': Value('string'), 'labels': class_label, 'filename': Value('string')})

In [15]:
ds = Dataset.from_generator(file_iterator, features=bbc_features,
                            cache_dir=cache_dir)

In [16]:
ds

Dataset({
    features: ['text', 'labels', 'filename'],
    num_rows: 2225
})

In [17]:
ds.features

{'text': Value(dtype='string', id=None),
 'labels': ClassLabel(names=['entertainment', 'business', 'politics', 'tech', 'sport'], id=None),
 'filename': Value(dtype='string', id=None)}

In [18]:
ds = ds.train_test_split(test_size=0.2)

In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'filename'],
        num_rows: 1780
    })
    test: Dataset({
        features: ['text', 'labels', 'filename'],
        num_rows: 445
    })
})

### Tokenizer

In [20]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [21]:
model_name = "distilbert/distilbert-base-uncased"

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir = cache_dir)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [23]:
ds["train"][0]

{'text': "Nadal puts Spain 2-0 up\n\nResult: Nadal 6-7 (6/8) 6-2 7-6 (8/6) 6-2 Roddick\n\nSpain's Rafael Nadal beats Andy Roddick of the USA in the second singles match rubber of the 2004 Davis Cup final in Seville. Spain lead 1-0 after Carlos Moya beat Mardy Fish in straight sets in the opening match of the tie.\n\nNadal holds his nerve and the crowd goes wild as Spain go 2-0 up in the tie.\n\nRoddick holds serve to force Nadal to serve for the match but the American surely cannot turn things around now.\n\nNadal works Roddick around the court on two consecutive points to earn two break points. One is enough, the Spaniard secures the double-break and Roddick is now teetering on the edge.\n\nRoddick is trying to gee himself up but the clay surface is taking its toll on his game and he is looking tired. Nadal wins the game to love.\n\nNadal steps up the pressure to break and Spain have the early initiative in the fourth set.\n\nNadal also holds convincingly as both players feel their wa

In [24]:
def preprocess_text(example):
    return tokenizer(example["text"], truncation=True)

In [25]:
tokenized_ds = ds.map(preprocess_text, batched=True)

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

In [26]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'filename', 'input_ids', 'attention_mask'],
        num_rows: 1780
    })
    test: Dataset({
        features: ['text', 'labels', 'filename', 'input_ids', 'attention_mask'],
        num_rows: 445
    })
})

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [28]:
id2label = {}
label2id = {}
for label in class_label.names:
    id2label[class_label.str2int(label)] = label
    label2id[label] = class_label.str2int(label)
print(id2label)
print(label2id)

{0: 'entertainment', 1: 'business', 2: 'politics', 3: 'tech', 4: 'sport'}
{'entertainment': 0, 'business': 1, 'politics': 2, 'tech': 3, 'sport': 4}


In [29]:
num_labels = class_label.num_classes
num_labels

5

### Model

In [30]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir = cache_dir, num_labels=num_labels, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [32]:
import torch
import numpy as np

In [33]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cuda'

In [34]:
model = model.to(device)

In [35]:
!nvidia-smi

Fri Oct 18 03:09:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0              31W /  70W |    395MiB / 15360MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Model training using HF framework

In [36]:
model_result_dir = "models_trained"

In [37]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    num_labels = max(predictions.max(), labels.max())

    fn = sum([sum(predictions[(labels==i)]!=i) for i in range(num_labels)])
    fp = sum([sum(predictions[(labels!=i)]==i) for i in range(num_labels)])
    tp = sum(predictions==labels)

    accuracy = tp/labels.shape[0]
    precision = 0 if tp==0 else tp/(tp+fp)
    recall = 0 if tp==0 else tp/(tp+fn)
    f1_score = 0 if tp==0 else (2*precision*recall)/(precision+recall)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1_score}

In [38]:
training_args = TrainingArguments(
                    output_dir= model_result_dir,
                    eval_strategy="epoch",
                    save_strategy="epoch",
                    load_best_model_at_end=True,
                    per_device_train_batch_size=16,
                    per_device_eval_batch_size=16,
                    num_train_epochs=2,
                    push_to_hub=False

                )

In [39]:
trainer = Trainer(model = model,
                 args=training_args,
                 train_dataset=tokenized_ds["train"],
                 eval_dataset=tokenized_ds["test"],
                 tokenizer=tokenizer,
                 data_collator=data_collator,
                 compute_metrics=compute_metrics)

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.133601,0.964045,0.964045,0.966216,0.965129
2,No log,0.122231,0.977528,0.977528,0.97973,0.978628


TrainOutput(global_step=224, training_loss=0.18355260576520646, metrics={'train_runtime': 206.5408, 'train_samples_per_second': 17.236, 'train_steps_per_second': 1.085, 'total_flos': 471609169305600.0, 'train_loss': 0.18355260576520646, 'epoch': 2.0})

In [41]:
!nvidia-smi

Fri Oct 18 03:22:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              34W /  70W |   6333MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Metrics evaluation

In [48]:
from torch.utils.data import DataLoader

In [49]:
class MetricEvaluation():
    def __init__(self):
        self.m = {"tp": 0, "fp": 0, "fn": 0}
        self.total = 0

    def add_batch(self, predictions=[], references=[]):


        num_labels = max(predictions.max().item(), references.max().item())

        fn = sum([sum(predictions[(references==i)]!=i) for i in range(num_labels)]).item()
        fp = sum([sum(predictions[(references!=i)]==i) for i in range(num_labels)]).item()
        tp = sum(predictions==references).item()

        self.total += predictions.size(0)

        self.m["tp"]+=tp
        self.m["fp"]+=fp
        self.m["fn"]+=fn

    def compute(self):
        tp,fp,fn = self.m["tp"],self.m["fp"],self.m["fn"]
        accuracy = tp/self.total
        precision = 0 if tp==0 else tp/(tp+fp)
        recall = 0 if tp==0 else tp/(tp+fn)
        f1_score = 0 if tp==0 else (2*precision*recall)/(precision+recall)
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1_score}


In [52]:
tokenized_ds = tokenized_ds.remove_columns(["text", "filename"])

In [59]:
train_dataloader = DataLoader(tokenized_ds["train"],batch_size=16,collate_fn=data_collator)

In [60]:
trn_metric = MetricEvaluation()

In [61]:
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    trn_metric.add_batch(predictions=predictions, references=batch["labels"])

trn_metric.compute()

{'accuracy': 0.9955056179775281,
 'precision': 0.9955056179775281,
 'recall': 0.9955056179775281,
 'f1': 0.9955056179775281}

In [62]:
eval_dataloader = DataLoader(tokenized_ds["test"],batch_size=16,collate_fn=data_collator)

In [63]:
metric = MetricEvaluation()

In [64]:
model.eval()
preds = []
references = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9775280898876404,
 'precision': 0.9775280898876404,
 'recall': 0.9797297297297297,
 'f1': 0.9786276715410572}