In [3]:
# STEP 0 — Imports & paths
import os, json, math, itertools
import numpy as np
import pandas as pd
from datasets import load_dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report
)
from scipy.sparse import hstack

import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
# STEP 1 — Load BANKING77 and prepare splits/labels
ds = load_dataset("PolyAI/banking77")
label_names = ds["train"].features["label"].names

In [5]:
X_train = list(ds["train"]["text"])
y_train = np.array(ds["train"]["label"])
X_test  = list(ds["test"]["text"])
y_test  = np.array(ds["test"]["label"])

In [6]:
print(f"Train: {len(X_train)} | Test: {len(X_test)} | #Classes: {len(label_names)}")

Train: 10003 | Test: 3080 | #Classes: 77


In [7]:
# STEP 2 — Build Word/Char TF-IDF features and train LinearSVC baseline

In [8]:
# Word n-grams (1–2)
word_vec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=150_000, lowercase=True)
Xtr_word = word_vec.fit_transform(X_train)
Xte_word = word_vec.transform(X_test)


In [9]:
# Char n-grams (3–5, word boundary mode)
char_vec = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2, max_features=150_000, lowercase=True)
Xtr_char = char_vec.fit_transform(X_train)
Xte_char = char_vec.transform(X_test)


In [10]:
# Combine (Word + Char)
Xtr_combo = hstack([Xtr_word, Xtr_char])
Xte_combo = hstack([Xte_word, Xte_char])

In [11]:
# Linear SVM baseline
svm = LinearSVC(C=1.0, random_state=42)
svm.fit(Xtr_combo, y_train)
y_pred_svm = svm.predict(Xte_combo)

In [12]:
print("Baseline (Word+Char+LinearSVC) — quick report")
print(classification_report(y_test, y_pred_svm, target_names=label_names, digits=4))

Baseline (Word+Char+LinearSVC) — quick report
                                                  precision    recall  f1-score   support

                                activate_my_card     0.9750    0.9750    0.9750        40
                                       age_limit     0.9524    1.0000    0.9756        40
                         apple_pay_or_google_pay     1.0000    1.0000    1.0000        40
                                     atm_support     0.9500    0.9500    0.9500        40
                                automatic_top_up     0.9750    0.9750    0.9750        40
         balance_not_updated_after_bank_transfer     0.7619    0.8000    0.7805        40
balance_not_updated_after_cheque_or_cash_deposit     0.9744    0.9500    0.9620        40
                         beneficiary_not_allowed     0.9487    0.9250    0.9367        40
                                 cancel_transfer     1.0000    0.9500    0.9744        40
                            card_about_to_expire     

In [13]:
# STEP 3 — Load fine-tuned DistilBERT and run batch inference on test

In [15]:
import glob, os
sorted(glob.glob("./**/checkpoint-*/", recursive=True))[:10]

[]

In [14]:
best_dir = "./out_distilbert/best_model"   # adjust if different
model = AutoModelForSequenceClassification.from_pretrained(best_dir)
tok   = AutoTokenizer.from_pretrained(best_dir)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './out_distilbert/best_model'. Use `repo_type` argument if needed.