Hi there.
This is a demo of building a simple custom text classification model.


In [1]:
import warnings
# Ignore every Python warning for the rest of the session. Presumably this keeps cell
# output free of library deprecation notices; note that it also hides warnings that matter.
warnings.filterwarnings("ignore")

In [2]:
# Install dependencies (this is mostly for Google Colab, as the other dependences are available by default in Colab)
try:
  import datasets, evaluate, accelerate
  import gradio as gr
except ModuleNotFoundError:
  !pip install -U datasets evaluate accelerate gradio # -U stands for "upgrade" so we'll get the latest version by default
  import datasets, evaluate, accelerate
  import gradio as gr

import random

import numpy as np
import pandas as pd

import torch
import transformers

# Report the versions of the core libraries so the outputs can be tied to an environment.
for library in (transformers, datasets, torch):
    print(f"Using {library.__name__} version: {library.__version__}")

Using transformers version: 4.49.0
Using datasets version: 3.4.1
Using torch version: 2.6.0+cu126


Loading the dataset from the Hugging Face Hub

Dataset credits: Daniel Bourke

In [3]:
# Load the food / not-food image-caption dataset by its Hugging Face Hub repository id.
dataset = datasets.load_dataset(path="mrdbourke/learn_hf_food_not_food_image_captions")

# Display the object to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})

In [4]:
len(dataset["train"])

250

In [5]:
range(len(dataset["train"]))

range(0, 250)

In [6]:
# Visualizing the data

import random

# Draw five distinct row indices at random, then fetch those rows in one lookup.
# Indexing the split with a list of indices gives the selected values per column.
random_indexs = random.sample(range(len(dataset["train"])), k=5)
random_samples = dataset["train"][random_indexs]

for text, label in zip(random_samples["text"], random_samples["label"]):
    print(f"Text: {text} | Label: {label}")

Text: Rich and spicy lamb rogan josh with yogurt garnish, featuring tender lamb pieces in a bold sauce with spices, finished with creamy yogurt. | Label: food
Text: Guitar leaning casually against a couch | Label: not_food
Text: Washing machine and dryer side by side in a laundry room | Label: not_food
Text: Pizza with a seasonal theme, featuring toppings like butternut squash and kale | Label: food
Text: A bowl of sliced kiwi with a sprinkle of sugar and a side of yogurt | Label: food


In [7]:
# Check number of each label
from collections import Counter

Counter(dataset["train"]["label"])

Counter({'food': 125, 'not_food': 125})

In [8]:
data_df = pd.DataFrame(dataset['train'])
data_df.sample(5)

Unnamed: 0,text,label
49,"Pizza with a white sauce base, topped with spi...",food
82,"Cooking dinner in the kitchen, a man has a dog...",not_food
20,White bathtub with a shower curtain ready for ...,not_food
178,Crunchy sushi roll with tempura flakes or pank...,food
188,A basket of fresh strawberries with a sprinkle...,food


Tokenization 

In [9]:
# Create mappings programmatically from dataset.
# Sort the label names rather than relying on the order in which `unique` returns them,
# so the id assignment does not change if the rows are reordered.
# Reverse-alphabetical order keeps 'not_food' -> 0 and 'food' -> 1.
label_names = sorted(dataset["train"].unique("label"), reverse=True)
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

print(f"Label to ID mapping: {label2id}")
print(f"ID to Label mapping: {id2label}")

Label to ID mapping: {'not_food': 0, 'food': 1}
ID to Label mapping: {0: 'not_food', 1: 'food'}


In [10]:
def map_label_to_number(example):
    """
    Turn the string label of a single example into its integer id.

    Args:
        example: Mapping with at least a "label" key whose value is a key of the
            notebook-level `label2id` dictionary.

    Returns:
        A new dict with the same items as `example`, except that "label" holds the
        integer id. The input is left unmodified, so calling the function twice on
        the same object gives the same result instead of failing on the second call.

    Raises:
        KeyError: If "label" is missing or its value is not a key of `label2id`.
    """
    # Build a copy instead of assigning into `example`, so the caller's object keeps
    # its original string label.
    return {**example, "label": label2id[example["label"]]}

example_sample = {"text": "This is a sentence about my favourite food: Biriyani.", "label": "food"}

# Test the function
map_label_to_number(example_sample)

{'text': 'This is a sentence about my favourite food: Biriyani.', 'label': 1}

In [11]:
# we map the labels to numbers
# NOTE(review): this rebinds `dataset` from the DatasetDict loaded earlier to its "train"
# split only, so the cell is not safe to re-run on its own; after it has executed,
# `dataset["train"]` no longer refers to a split. Restart and run top to bottom instead.

dataset = dataset["train"].map(map_label_to_number)
dataset[:5]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

Creating a test dataset to evaluate the performance

In [12]:
# Create train/test splits
# Hold out 20% of the rows for evaluation; the seed is fixed at 42.
# NOTE(review): `dataset` is rebound once more, now to the object returned by the split.
dataset = dataset.train_test_split(test_size=0.2, seed=42) 
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [15]:
from transformers import AutoTokenizer

# Load the tokenizer for the same DistilBERT checkpoint the model is created from.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased",
    use_fast=True,
)


In [16]:
tokenizer("I love pizza")

{'input_ids': [101, 1045, 2293, 10733, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [26]:
tokenizer("Sreedeep")

{'input_ids': [101, 5034, 13089, 4402, 2361, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [27]:

tokenizer.convert_ids_to_tokens(tokenizer("sreedeep").input_ids)

['[CLS]', 'sr', '##eed', '##ee', '##p', '[SEP]']

In [28]:
def tokenize_text(examples):
    """
    Run the notebook-level `tokenizer` over the "text" field of `examples`.

    Args:
        examples: Mapping with a "text" entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts, with padding and truncation
        switched on.
    """
    texts = examples["text"]
    # padding=True pads to the longest sequence passed in this call.
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [29]:
# Tokenize every split. With batched=True, tokenize_text receives up to batch_size
# rows per call instead of a single row.
tokenized_dataset = dataset.map(tokenize_text, batched=True, batch_size=1000)

tokenized_dataset

Map: 100%|██████████| 200/200 [00:00<00:00, 2776.01 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3797.19 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [35]:
import evaluate
import numpy as np 
from typing import Tuple

accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels: Tuple[np.ndarray, np.ndarray]):
    """
    Compute accuracy for a set of model outputs.

    Args:
        predictions_and_labels: A pair `(predictions, labels)`. `predictions` may be
            raw per-class scores with two or more dimensions (the class axis is
            axis 1) or already-decoded 1-D class ids. `labels` holds the reference ids.

    Returns:
        The result of `accuracy_metric.compute(...)` for the decoded predictions.
    """
    predictions, labels = predictions_and_labels
    predictions = np.asarray(predictions)

    # Raw scores still need to be reduced to one class id per example.
    if predictions.ndim >= 2:
        predictions = np.argmax(predictions, axis=1)

    # The return must sit outside the `if`: 1-D predictions would otherwise fall
    # through and the function would yield None.
    return accuracy_metric.compute(predictions=predictions, references=labels)
    

Creating the model 

In [37]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the label mapping rather than a hard-coded 2,
# so the head and the id2label / label2id tables cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
                                                           num_labels=len(id2label),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [40]:

from pathlib import Path

# Parent folder for saved models; created on first run, left alone if it already exists.
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

# The model for this notebook lives in models/<model_save_name>.
model_save_name = "food-not-food-model"
model_save_dir = models_dir / model_save_name
model_save_dir

WindowsPath('models/food-not-food-model')

In [42]:
from transformers import TrainingArguments

print(f"[INFO] Saving model checkpoints to: {model_save_dir}")

# Create training arguments
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir=model_save_dir,
    # Optimisation settings
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=42,
    use_cpu=False,
    # Evaluate, log and checkpoint on the same per-epoch schedule
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Reporting / Hub settings
    report_to="none",
    hub_private_repo=False,
)

[INFO] Saving model checkpoints to: models\food-not-food-model


In [43]:
from transformers import Trainer

# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated in favour of `processing_class=`, which is available
    # in the transformers 4.49.0 this notebook reports using.
    processing_class=tokenizer,
    compute_metrics=compute_accuracy
)

In [44]:
results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4389,0.07816,1.0
2,0.0317,0.006274,1.0
3,0.0041,0.002003,1.0
4,0.0016,0.00107,1.0
5,0.0011,0.000738,1.0
6,0.0008,0.000596,1.0
7,0.0007,0.000525,1.0
8,0.0006,0.000487,1.0
9,0.0006,0.000467,1.0
10,0.0006,0.000461,1.0


In [45]:
# Print each metric stored on the training result, one per line.
for metric_name, metric_value in results.metrics.items():
    print(f"{metric_name}: {metric_value}")

train_runtime: 40.2015
train_samples_per_second: 49.749
train_steps_per_second: 1.741
total_flos: 18110777160000.0
train_loss: 0.04805797283936824
epoch: 10.0


In [46]:
# Write the trainer's model to model_save_dir (the same directory used for checkpoints).
print(f"Saving model to {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)

Saving model to models\food-not-food-model
