In [12]:
!pip install --upgrade transformers datasets

Collecting datasets
  Using cached datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting httpx<1.0.0 (from datasets)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Using cached multiprocess-0.70.18-py313-none-any.whl.metadata (7.2 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Using cached aiohttp-3.13.2-cp313-cp313-win_amd64.whl.metadata (8.4 kB)
Collecting anyio (from httpx<1.0.0->datasets)
  Using cached anyio-4.11.0-py3-none-any.whl.metadata (4.1 kB)
Collecting httpcore==1.* (from httpx<1.0.0->datasets)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<1.0.0->datasets)
  Using cached h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->f

In [13]:
import pandas as pd
# Read the labelled expense sentences from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="expense_dataset_10000.csv")
print(df.head(n=5))


                                                Text  Amount Category
0               I bought lunch and paid $111 at work     111     Food
1  I grabbed a quick bite at sandwich shop and pa...      37     Food
2  I purchased ingredients and spent $138 for coo...     138     Food
3      I had takeout and paid $115 at the McDonald's     115     Food
4                 I spent $128 on food for the party     128     Food


In [14]:
# Normalise the sentences in one chained pass: lower-case first, then drop
# every character that is not a letter, digit, whitespace, '$', apostrophe
# or hyphen.
df['Text'] = (
    df['Text']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s$\'\-]', '', regex=True)
)



In [15]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the category names, then map every row to its integer id.
le = LabelEncoder().fit(df['Category'])
df['Category_encoded'] = le.transform(df['Category'])
# Show the learned class names (the encoder's id -> name lookup table).
print(le.classes_)


['Clothing' 'Entertainment' 'Food' 'Health' 'Transport']


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the encoded label keeps
# the class proportions the same in both splits; the fixed seed makes the
# split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'],
    df['Category_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['Category_encoded'],
)


In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast


  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Load the pretrained fast tokenizer published as "distilbert-base-uncased";
# the same checkpoint name is used for the classification model further down.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [19]:
#This converts text and labels into tensors that the model can train on

class ExpenseDataset(Dataset):
    """Map-style dataset that tokenizes one expense sentence per item.

    Args:
        texts: Sentences to classify. May be any object exposing ``.tolist()``
            (e.g. a pandas Series) or a plain iterable of strings.
        labels: Integer class ids aligned with ``texts``; same accepted types.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors="pt")``
            whose result is indexable by ``"input_ids"`` and
            ``"attention_mask"``.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts = self._as_list(texts)
        self.labels = self._as_list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a list, using ``.tolist()`` when available."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return the tensors the model consumes.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis removed)
            and ``labels`` (0-d ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Drop only the leading batch axis. A bare `.squeeze()` would also
        # collapse the sequence axis whenever it happens to have length 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [20]:
#train/test datasets
# Wrap each split in the tokenizing dataset; both share the same tokenizer
# and the default maximum sequence length.
train_dataset = ExpenseDataset(train_texts, train_labels, tokenizer=tokenizer)
test_dataset = ExpenseDataset(test_texts, test_labels, tokenizer=tokenizer)


In [21]:
#loading distil bert pretrained model for classification
from transformers import DistilBertForSequenceClassification

# One output logit per category known to the fitted label encoder.
num_labels = len(le.classes_)

# Store the id <-> name mapping in the model config so it is written out by
# `save_pretrained`; otherwise the class names live only in the in-memory
# LabelEncoder and a reloaded model cannot decode its own predictions.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments



In [30]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./results",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for the training and evaluation loops.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once at the end of every epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:
# Assemble the training loop. The tokenizer is handed over as
# `processing_class`: recent transformers releases deprecate the `tokenizer=`
# keyword on Trainer, which is what triggers the warning on construction.
# NOTE(review): the test split doubles as the per-epoch evaluation set here,
# so there is no untouched hold-out set — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the configured arguments; the returned TrainOutput is
# displayed as the cell result.
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.1366,0.001741
2,0.0013,0.000526
3,0.0011,0.000289
4,0.0004,0.000202
5,0.0003,0.000179


TrainOutput(global_step=2500, training_loss=0.027926406210660934, metrics={'train_runtime': 290.3005, 'train_samples_per_second': 137.788, 'train_steps_per_second': 8.612, 'total_flos': 662372428800000.0, 'train_loss': 0.027926406210660934, 'epoch': 5.0})

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with and display
# the metrics dict as the cell result.
trainer.evaluate()

{'eval_loss': 0.00017930757894646376,
 'eval_runtime': 3.8025,
 'eval_samples_per_second': 525.968,
 'eval_steps_per_second': 32.873,
 'epoch': 5.0}

In [None]:
# Score the held-out split explicitly and print the resulting metrics dict.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)


{'eval_loss': 0.00017930757894646376, 'eval_runtime': 3.7499, 'eval_samples_per_second': 533.349, 'eval_steps_per_second': 33.334, 'epoch': 5.0}


In [None]:
import re

# Same character whitelist that is applied to the training sentences, so the
# classifier sees inference text in the form it was trained on.
_TEXT_CLEANUP = re.compile(r'[^a-zA-Z0-9\s$\'\-]')

# A number with optional thousands separators and optional decimals.
_NUMBER = r'(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?'
# A number directly attached to a dollar sign, on either side: $10 or 10$.
_CURRENCY_AMOUNT = re.compile(rf'\$({_NUMBER})|({_NUMBER})\$')
_ANY_NUMBER = re.compile(_NUMBER)


def _extract_amount(text):
    """Return the amount mentioned in ``text`` as a float, or None if absent.

    A number attached to a ``$`` wins over any other number, so quantities
    such as "2 coffees for $10" yield 10.0. Without a ``$`` the first number
    in the text is used. Thousands separators are removed before conversion.
    """
    match = _CURRENCY_AMOUNT.search(text)
    if match is not None:
        raw = match.group(1) or match.group(2)
    else:
        match = _ANY_NUMBER.search(text)
        if match is None:
            return None
        raw = match.group(0)
    return float(raw.replace(',', ''))


def predict_expense_with_amount(text):
    """Classify an expense sentence and pull out the amount it mentions.

    Uses the notebook-level ``model``, ``tokenizer`` and ``le`` objects.

    Args:
        text: Free-form expense description, e.g. "Bought vitamins for $20".

    Returns:
        ``(category, amount)`` where ``category`` is the decoded class name
        and ``amount`` is a float, or None when the text contains no number.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # Disable dropout so predictions do not depend on which cell ran last.
    model.eval()

    # Apply the training-time normalisation before tokenizing.
    cleaned = _TEXT_CLEANUP.sub('', text.lower())
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # predict category
    with torch.no_grad():
        outputs = model(**inputs)
    pred_class = torch.argmax(outputs.logits, dim=1).item()
    category = le.inverse_transform([pred_class])[0]

    # The amount is parsed from the raw text: the cleaned copy has lost the
    # '.' and ',' characters that decimals and thousands separators need.
    amount = _extract_amount(text)

    return category, amount




In [None]:
# Hand-written sentences used as a quick smoke test of the prediction helper.
test_sentences = [
    "i bought a coffee for 10$",
    "Grabbed lunch at McDonald's for $12",
    "Took an Uber to the airport and paid $25",
    "Went to the cinema and paid $15 for tickets",
    "Bought vitamins for $20",
    "Bought a birthday gift for $40",
]

# `map` is lazy, so each sentence is still classified right before its line
# is printed, exactly one at a time.
for sentence, (category, amount) in zip(
    test_sentences, map(predict_expense_with_amount, test_sentences)
):
    print(f"Sentence: '{sentence}' -> Category: {category}, Amount: ${amount}")


Sentence: 'i bought a coffee for 10$' -> Category: Food, Amount: $10.0
Sentence: 'Grabbed lunch at McDonald's for $12' -> Category: Food, Amount: $12.0
Sentence: 'Took an Uber to the airport and paid $25' -> Category: Transport, Amount: $25.0
Sentence: 'Went to the cinema and paid $15 for tickets' -> Category: Entertainment, Amount: $15.0
Sentence: 'Bought vitamins for $20' -> Category: Health, Amount: $20.0
Sentence: 'Bought a birthday gift for $40' -> Category: Entertainment, Amount: $40.0


In [None]:
from sklearn.metrics import classification_report, accuracy_score
import torch

# Make sure the model is on the correct device and in inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

all_preds = []
all_labels = []

# ExpenseDataset pads every item to the same length, so the default collate
# can stack items into batches; this replaces one forward pass per sample.
eval_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

with torch.no_grad():
    for batch in eval_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        all_preds.extend(torch.argmax(logits, dim=1).tolist())
        all_labels.extend(batch["labels"].tolist())

# Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print("Overall Accuracy:", accuracy)

# Precision, Recall, F1-score per class. Passing `labels` explicitly keeps
# `target_names` aligned even if some class never appears in this split.
class_ids = list(range(len(le.classes_)))
report = classification_report(
    all_labels, all_preds, labels=class_ids, target_names=le.classes_
)
print("\nClassification Report:\n", report)
# NOTE(review): a perfect score may indicate near-duplicate sentences shared
# between the train and test splits — verify before trusting these numbers.


Overall Accuracy: 1.0

Classification Report:
                precision    recall  f1-score   support

     Clothing       1.00      1.00      1.00       400
Entertainment       1.00      1.00      1.00       400
         Food       1.00      1.00      1.00       400
       Health       1.00      1.00      1.00       400
    Transport       1.00      1.00      1.00       400

     accuracy                           1.00      2000
    macro avg       1.00      1.00      1.00      2000
 weighted avg       1.00      1.00      1.00      2000



In [None]:
# Write the fine-tuned model into the "expense_classifier_model" directory.
model.save_pretrained("expense_classifier_model")

# Write the tokenizer files into "expense_classifier_tokenizer". As the last
# expression in the cell, the returned tuple of file paths is what the cell
# displays.
tokenizer.save_pretrained("expense_classifier_tokenizer")


('expense_classifier_tokenizer/tokenizer_config.json',
 'expense_classifier_tokenizer/special_tokens_map.json',
 'expense_classifier_tokenizer/vocab.txt',
 'expense_classifier_tokenizer/added_tokens.json',
 'expense_classifier_tokenizer/tokenizer.json')