In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Open Colab's upload prompt and keep whatever it returns in `uploaded`.
# NOTE(review): presumably used to bring in train.csv / test.csv, which are
# read from the working directory further down — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Second upload prompt. This repeats the previous cell's code and rebinds
# `uploaded`, so the earlier return value is no longer reachable by that name.
# NOTE(review): presumably one prompt per CSV file — confirm, or select both
# files in a single prompt and drop this cell.
from google.colab import files
uploaded = files.upload()

In [None]:
!pip install sentence-transformers

In [None]:
!pip install datasets

In [None]:
!pip install setfit

In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

# Put nb_path at the front of the module search path.
# The symlink that would point nb_path at the Drive "Colab Notebooks" folder
# is disabled, so nothing in this cell creates that directory.
nb_path = '/content/notebooks'
# os.symlink('/content/drive/My Drive/Colab Notebooks', nb_path)
sys.path.insert(0, nb_path)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from datasets import DatasetDict, Dataset
from setfit import SetFitModel,Trainer,TrainingArguments
import os

In [None]:
# Load both splits from CSV files in the current working directory.
# Later cells group by a 'label' column and read a 'text' column from both frames.
train_df = pd.read_csv("train.csv")  # Contains 'text' and 'label' (string)
test_df = pd.read_csv("test.csv")

In [None]:
def _sample_per_label(df, n, seed=42):
    """Return up to ``n`` randomly chosen rows for every distinct ``label`` in ``df``.

    Each label group is sampled independently with the same ``seed``, which
    reproduces the rows a per-group ``x.sample(n=n, random_state=seed)`` selects.
    The groups are iterated explicitly instead of going through
    ``groupby(...).apply``: pandas 2.2 deprecates applying over the grouping
    column and later releases exclude it, which would drop ``label`` from the
    result.

    Args:
        df: Frame with a ``label`` column.
        n: Maximum number of rows to draw per label. Labels with fewer than
            ``n`` rows contribute all of their rows instead of raising.
        seed: ``random_state`` used for every group.

    Returns:
        A new frame with a fresh 0..k-1 index, ordered by sorted label value.
    """
    parts = [
        group.sample(n=min(n, len(group)), random_state=seed)
        for _, group in df.groupby("label")
    ]
    if not parts:
        # No groups (empty input): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(parts).reset_index(drop=True)


# Few-shot training set: 7 utterances per label.
sampled_train_df = _sample_per_label(train_df, n=7)

# Small evaluation set: 5 utterances per label.
sampled_test_df = _sample_per_label(test_df, n=5)

In [None]:
# (rows, columns) of the sampled train frame, then of the sampled test frame.
print(sampled_train_df.shape, sampled_test_df.shape, sep="\n")

In [None]:
# Wrap the sampled pandas frames as `datasets.Dataset` objects for the trainer.
train_dataset = Dataset.from_pandas(sampled_train_df)
test_dataset = Dataset.from_pandas(sampled_test_df)

In [None]:
# Distinct label values in the sampled training set, sorted for a stable order.
unique_labels = sorted(train_dataset.unique("label"))

In [None]:
# Load the pretrained sentence-transformer checkpoint as a SetFit model and
# register the label names found in the training set.
model = SetFitModel.from_pretrained(
    "sentence-transformers/all-MiniLM-L12-v2",
    labels=unique_labels,
)

In [None]:
# Training configuration handed to the SetFit Trainer below.
# Evaluation and checkpoint saving are both set to the "epoch" strategy, and
# load_best_model_at_end is enabled.
args = TrainingArguments(
    batch_size=32,
    num_epochs=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Build the SetFit trainer from the model, the arguments and the two sampled splits.
# NOTE(review): eval_dataset is sampled from test_df, so the split used for
# evaluation during training overlaps with the data scored as "unseen" later —
# confirm this is intended.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    metric="accuracy",
    column_mapping={"text": "text", "label": "label"}  # Identity mapping: the dataset columns are already named text/label
)

In [None]:
# Run training with the configuration above.
trainer.train()

In [None]:
# Score the trained model on the evaluation split that was given to the
# trainer (`test_dataset`). The notebook never defines a `dataset` mapping, so
# indexing `dataset["validation"]` raised NameError on a fresh run.
val_preds = trainer.model.predict(test_dataset["text"])
val_labels = test_dataset["label"]

val_acc = accuracy_score(val_labels, val_preds)
# Macro average: every label contributes equally regardless of its frequency.
val_f1 = f1_score(val_labels, val_preds, average="macro")

print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")

In [None]:
# Evaluate on the complete test frame (not only the per-label sample).
unseen_dataset = Dataset.from_pandas(test_df)
test_labels = unseen_dataset["label"]
test_preds = trainer.model.predict(unseen_dataset["text"])

# Accuracy and macro-averaged F1 of the predictions against the reference labels.
test_acc, test_f1 = (
    accuracy_score(test_labels, test_preds),
    f1_score(test_labels, test_preds, average="macro"),
)

print(f"Test Accuracy: {test_acc:.4f}\nTest F1 Score: {test_f1:.4f}")

In [None]:
# Inspect the container type returned by predict(); the next cell calls
# .tolist() on it.
type(test_preds)

In [None]:
# predict() may return a tensor, a NumPy array or a plain list depending on
# how labels are configured; only the first two provide .tolist().
pred_label_values = test_preds.tolist() if hasattr(test_preds, "tolist") else list(test_preds)

# One row per test utterance: input text, reference label, predicted label.
df_dict = {'text': unseen_dataset['text'], 'label': unseen_dataset['label'], 'pred_label': pred_label_values}

predicted_unseen_df = pd.DataFrame(df_dict)

In [None]:
# Preview the first five rows (head() defaults to n=5).
predicted_unseen_df.head()

In [None]:
# Destination workbook on the mounted Google Drive.
# NOTE(review): this path uses 'My Drive' while the final save cell uses
# 'MyDrive' — confirm both resolve on the current Colab mount.
filepath = '/content/drive/My Drive/output_folder/predicted_unseen_df_banking.xlsx'
# to_excel does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
predicted_unseen_df.to_excel(filepath, index=False)

print(f"DataFrame saved to: {filepath}")

In [None]:
# Save the trainer's model to a directory relative to the current working directory.
trainer.model.save_pretrained("./fine-tuned-banking-model")

In [None]:
# Save a copy under the mounted Google Drive path.
# NOTE(review): this saves `model`, the object passed to Trainer, whereas the
# previous cell saves `trainer.model` — presumably the same instance; confirm.
model.save_pretrained("/content/drive/MyDrive/banking_setfit_model")