In [None]:
!pip install transformers datasets scikit-learn
!pip uninstall -y transformers
!pip install transformers
!pip install datasets --upgrade
!pip install openpyxl

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
import io
import pandas as pd
from datasets import Dataset
from google.colab import files
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer,TrainingArguments
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
# Ask the user for a file; the result is a dictionary of filename -> byte content.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on an empty upload.
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV or Excel file.")

# Only the first uploaded file is used.
uploaded_filename = next(iter(uploaded))
uploaded_file_content = uploaded[uploaded_filename]

# Choose the reader from the file extension (case-insensitive, so ".CSV" works too).
lowered_filename = uploaded_filename.lower()
if lowered_filename.endswith(".csv"):
    # io.BytesIO wraps the raw bytes in the file-like object pandas expects.
    updated_data = pd.read_csv(io.BytesIO(uploaded_file_content))
elif lowered_filename.endswith((".xls", ".xlsx")):
    try:
        # No engine is forced: pandas inspects the content and picks the
        # matching reader. openpyxl only understands the newer .xlsx format,
        # so forcing it can never read a genuine legacy .xls workbook.
        updated_data = pd.read_excel(io.BytesIO(uploaded_file_content))
    except Exception as excel_error:
        # Best-effort fallback for CSV files saved with an Excel extension.
        # `Exception` (not a bare except) lets KeyboardInterrupt through, and
        # the original error is reported instead of being silently swallowed.
        print(f"Could not read {uploaded_filename} as Excel ({excel_error!r}); retrying as CSV.")
        updated_data = pd.read_csv(io.BytesIO(uploaded_file_content))
else:
    raise ValueError(f"Unsupported file type: {uploaded_filename}")

print(updated_data.columns)
# A bare `.head()` in the middle of a cell is discarded; display it explicitly.
display(updated_data.head())
updated_data.tail()




Saving augmented_data.xls to augmented_data.xls
Index(['Query', 'Intent'], dtype='object')


Unnamed: 0,Query,Intent
10095,Can I have my money refunded for an item,Banking
10096,Where is my card PIN?,Banking
10097,Are you an AI digital assistant?,Non-Banking
10098,My payment shows that it is pending will I be ...,Banking
10099,Access the course details for Business Valuation.,Non-Banking


In [None]:
# Split the data into train / validation / test. 20% is held out for testing,
# then 10% of the remainder for validation (72% / 8% / 20% overall). Both
# splits are stratified on "Intent" so each part keeps the same class balance.
train_valid_df, test_df = train_test_split(
    updated_data, test_size=0.2, random_state=42, stratify=updated_data["Intent"]
)
train_df, valid_df = train_test_split(
    train_valid_df, test_size=0.1, random_state=42, stratify=train_valid_df["Intent"]
)

# Fit the encoder on the full label column so every split shares one mapping
# from intent string to integer id.
le = LabelEncoder()
le.fit(updated_data["Intent"])

# `.assign` builds new frames instead of writing into the slices returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(Intent=le.transform(train_df["Intent"]))
valid_df = valid_df.assign(Intent=le.transform(valid_df["Intent"]))
test_df = test_df.assign(Intent=le.transform(test_df["Intent"]))

# Wrap each pandas split in a Hugging Face Dataset (same order: train, valid, test).
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, valid_df, test_df)
)

# initialize tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the "Query" texts of one mapped batch.

    Over-long texts are truncated, but no padding is applied here. Padding
    inside a batched `map` pads every row to the longest text of an arbitrary
    chunk of rows, which wastes compute. The Trainer created later in this
    notebook receives the tokenizer and is expected to pad each training batch
    to its own longest sequence at collation time.

    Args:
        batch: mapping with a "Query" entry holding the texts to tokenize.

    Returns:
        The tokenizer's output for those texts.
    """
    return tokenizer(batch["Query"], truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# The label column is renamed from "Intent" to "labels" on every split.
train_dataset = train_dataset.rename_column("Intent", "labels")
valid_dataset = valid_dataset.rename_column("Intent", "labels")
test_dataset = test_dataset.rename_column("Intent", "labels")

# Expose only the model inputs and the label, as PyTorch tensors.
# set_format changes each dataset in place, so a plain loop is enough.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for formatted_split in (train_dataset, valid_dataset, test_dataset):
    formatted_split.set_format("torch", columns=tensor_columns)

# initialize the model
num_labels = len(le.classes_)

# Store the human-readable intent names in the model config, so the saved
# model reports the real class names rather than generic LABEL_0 / LABEL_1.
# `str(...)` turns the encoder's numpy strings into plain Python strings so
# the config serializes cleanly.
id2label = {class_id: str(class_name) for class_id, class_name in enumerate(le.classes_)}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Training configuration, grouped by concern and merged into TrainingArguments.

# Optimisation: six epochs, batches of 8 for both training and evaluation.
optimisation_settings = dict(
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
)

# Evaluate and checkpoint once per epoch; reload the checkpoint with the best
# "accuracy" when training ends.
checkpoint_settings = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

training_args = TrainingArguments(
    output_dir="./bert_intent_output",
    logging_dir="./logs",
    report_to="none",
    **optimisation_settings,
    **checkpoint_settings,
)

# evaluation metrics
def compute_metrics(p):
    """Turn one evaluation pass into a dictionary of scores.

    Args:
        p: object exposing `predictions` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and `label_ids`
            (the true class ids).

    Returns:
        dict with "accuracy", "f1", "precision" and "recall"; the last three
        are computed with `average="weighted"`.
    """
    true_ids = p.label_ids
    predicted_ids = p.predictions.argmax(-1)

    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        true_ids, predicted_ids, average="weighted"
    )

    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        "f1": weighted_f1,
        "precision": weighted_precision,
        "recall": weighted_recall,
    }
# Wire the model, the training arguments, the train/validation splits and the
# metric function into a Trainer, then fine-tune.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # `processing_class` replaces the `tokenizer` keyword, which is deprecated
    # on Trainer.__init__ and produced a warning when this cell ran.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
# NOTE(review): this only sets an attribute on the Trainer object. The run name
# is presumably read from TrainingArguments(run_name=...), so confirm whether
# this line has any effect (reporting is disabled via report_to="none").
trainer.run_name = "my_custom_run_name"
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/7272 [00:00<?, ? examples/s]

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

Map:   0%|          | 0/2020 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0878,0.030935,0.993812,0.993812,0.993888,0.993812
2,0.0096,0.017689,0.997525,0.997525,0.997525,0.997525
3,0.007,0.014716,0.996287,0.996287,0.99629,0.996287
4,0.0024,0.015417,0.997525,0.997525,0.997537,0.997525
5,0.0009,0.023606,0.997525,0.997525,0.997537,0.997525


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0878,0.030935,0.993812,0.993812,0.993888,0.993812
2,0.0096,0.017689,0.997525,0.997525,0.997525,0.997525
3,0.007,0.014716,0.996287,0.996287,0.99629,0.996287
4,0.0024,0.015417,0.997525,0.997525,0.997537,0.997525
5,0.0009,0.023606,0.997525,0.997525,0.997537,0.997525
6,0.0002,0.017333,0.997525,0.997525,0.997537,0.997525


TrainOutput(global_step=5454, training_loss=0.014533928976277095, metrics={'train_runtime': 828.4801, 'train_samples_per_second': 52.665, 'train_steps_per_second': 6.583, 'total_flos': 1755661961322720.0, 'train_loss': 0.014533928976277095, 'epoch': 6.0})

In [None]:
# print the checkpoints
import os
ckpt_root = "./bert_intent_output"


def list_checkpoints(root):
    """Return the checkpoint directory names under `root`, oldest first.

    os.listdir returns entries in arbitrary order, so the names are sorted by
    the numeric step after the last "-" (e.g. "checkpoint-909" comes before
    "checkpoint-1818"). Entries starting with "checkpoint" that have no numeric
    suffix are kept and placed last, in alphabetical order.

    Args:
        root: directory that holds the checkpoint sub-directories.

    Returns:
        A list of entry names; empty when `root` does not exist.
    """
    if not os.path.isdir(root):
        return []

    def step_order(entry_name):
        suffix = entry_name.rpartition("-")[2]
        if suffix.isdigit():
            return (0, int(suffix), entry_name)
        return (1, 0, entry_name)

    candidates = [d for d in os.listdir(root) if d.startswith("checkpoint")]
    return sorted(candidates, key=step_order)


# One checkpoint is saved per epoch, so position in step order equals the epoch number.
all_ckpts = list_checkpoints(ckpt_root)
print("Epoch-wise checkpoints:")
if not all_ckpts:
    print(f"  (no checkpoints found under {ckpt_root})")
for idx, ckpt in enumerate(all_ckpts, start=1):
    print(f"Epoch {idx}")
    print(f"  {ckpt}")

# Score the trained model on the held-out test split and show the metrics.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)

# Report which checkpoint the trainer recorded as the best one.
best_checkpoint = trainer.state.best_model_checkpoint
print("Best checkpoint:", best_checkpoint)

Epoch-wise checkpoints:
Epoch 1
  checkpoint-1818
Epoch 2
  checkpoint-2727
Epoch 3
  checkpoint-5454
Epoch 4
  checkpoint-3636
Epoch 5
  checkpoint-4545
Epoch 6
  checkpoint-909


{'eval_loss': 0.018503427505493164, 'eval_accuracy': 0.996039603960396, 'eval_f1': 0.996039603960396, 'eval_precision': 0.996039603960396, 'eval_recall': 0.996039603960396, 'eval_runtime': 11.3379, 'eval_samples_per_second': 178.164, 'eval_steps_per_second': 22.315, 'epoch': 6.0}
Best checkpoint: ./bert_intent_output/checkpoint-1818


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Row counts of the three splits.
print("Total number of training data : " ,train_df.shape[0])
print("Total number of validation data : " ,valid_df.shape[0])
print("Total number of test data : " ,test_df.shape[0])

# Class balance of the training split: encoded label 0 is reported as banking,
# encoded label 1 as non banking.
train_labels = train_df['Intent']
print("Total number of banking data in training set",int((train_labels == 0).sum()))
print("Total number of non banking in training set",int((train_labels == 1).sum()))

# Run the model over the test split.
predictions_output = trainer.predict(test_dataset)

# True labels, and the predicted class for each row (argmax over axis 1).
y_true = predictions_output.label_ids
y_pred = predictions_output.predictions.argmax(axis=1)

# Confusion matrix of true labels against predicted labels.
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Total number of training data :  7272
Total number of validation data :  808
Total number of test data :  2020
Total number of banking data in training set 3629
Total number of non banking in training set 3643


Confusion Matrix:
 [[1004    4]
 [   4 1008]]


In [None]:
# Mount Google Drive at /content/drive so later cells can write under that path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import joblib
# Everything needed to reuse the classifier goes into one Drive folder.
MODEL_DIR = "/content/drive/MyDrive/intent_model"
label_encoder_path = f"{MODEL_DIR}/label_encoder.pkl"

# Model weights/config first, then the tokenizer files, into the same folder.
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# The fitted label encoder is stored next to the model. Kept as the last
# expression so the cell displays the written path.
joblib.dump(le, label_encoder_path)

['/content/drive/MyDrive/intent-model/label_encoder.pkl']

In [None]:
# Show what ended up in the export folder.
saved_files = os.listdir(MODEL_DIR)
print(saved_files)

['config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json', 'training_args.bin', 'label_encoder.pkl']
