## Load and Inspect Dataset

In [24]:
from datasets import load_dataset
# Fetch the banking77 dataset and show its overall structure.
ds = load_dataset("banking77")
print(ds)

# Pull out both splits and report how many rows each one holds.
train, test = ds["train"], ds["test"]
for split_name, split in (("train", train), ("test", test)):
    print(f"{split_name} size: {len(split)}")

# Column names plus the first training record.
print(f"columns: {train.column_names}")
print(f"sample: {train[0]}")

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})
train size: 10003
test size: 3080
columns: ['text', 'label']
sample: {'text': 'I am still waiting on my card?', 'label': 11}


## Check Label Names and Confirm Intents

In [2]:
# The "label" feature of the train split exposes the intent names via `.names`.
label_features = train.features["label"]
label_names = label_features.names
n_labels = len(label_names)
print(f"# of labels:  {n_labels}")
print(f"Example of labels:  {label_names[:10]}")

# of labels:  77
Example of labels:  ['activate_my_card', 'age_limit', 'apple_pay_or_google_pay', 'atm_support', 'automatic_top_up', 'balance_not_updated_after_bank_transfer', 'balance_not_updated_after_cheque_or_cash_deposit', 'beneficiary_not_allowed', 'cancel_transfer', 'card_about_to_expire']


## Class Distributions

In [5]:
import numpy as np

labels = np.array(train["label"])
# Distinct label ids and the number of occurrences of each.
label, count = np.unique(labels, return_counts=True)


def _print_class_counts(heading, class_ids, class_counts):
    """Print `heading`, then one "<label name> <count>" entry per class."""
    print(heading)
    for class_id, n in zip(class_ids, class_counts):
        print(label_names[class_id], n, "\n")


# Ascending order of frequency; reversing it gives the descending order.
sorted_asc = np.argsort(count)
sorted_desc = sorted_asc[::-1]

# Five most frequent classes, most frequent first.
top5_labels, top5_counts = label[sorted_desc][:5], count[sorted_desc][:5]
_print_class_counts("Top 5 classes: \n", top5_labels, top5_counts)

# Five least frequent classes, rarest first.
last5_labels, last5_counts = label[sorted_asc][:5], count[sorted_asc][:5]
_print_class_counts("Last 5 classes: \n", last5_labels, last5_counts)


Top 5 classes: 

card_payment_fee_charged 187 

direct_debit_payment_not_recognised 182 

balance_not_updated_after_cheque_or_cash_deposit 181 

wrong_amount_of_cash_received 180 

cash_withdrawal_charge 177 

Last 5 classes: 

contactless_not_working 35 

virtual_card_not_working 41 

card_acceptance 59 

card_swallowed 61 

lost_or_stolen_card 82 



## Tokenizer Setup

In [6]:
from transformers import AutoTokenizer
# Checkpoint name kept in one place; calling the loaded tokenizer on text
# yields an encoding that includes "input_ids".
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

## Token Length Distribution (w/o truncation)

In [22]:
def token_lengths(dataset, text_col="text", tok=None):
    """Return the number of tokens in each example of ``dataset``.

    Texts are encoded with special tokens included and truncation disabled,
    so the result reflects the full, untruncated sequence lengths.

    Args:
        dataset: Object supporting ``dataset[text_col]`` that yields the texts.
        text_col: Name of the column holding the raw text.
        tok: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` is used, which keeps existing calls working.

    Returns:
        Integer NumPy array with one token count per example.
    """
    # Resolve the global lazily so an explicitly passed tokenizer never needs it.
    tok = tokenizer if tok is None else tok
    enc = tok(
        list(dataset[text_col]),
        add_special_tokens=True,
        truncation=False,
    )
    # dtype=int keeps the result integer-typed even for an empty dataset
    # (np.array([]) would otherwise default to float64).
    return np.array([len(ids) for ids in enc["input_ids"]], dtype=int)



# Untruncated token counts per example, one array per split.
train_len, test_len = (token_lengths(split, text_col="text") for split in (train, test))

def summarize(lens, name):
    """Print percentile, mean and max statistics for an array of lengths.

    Args:
        lens: NumPy array of lengths.
        name: Heading printed (after a blank line) above the statistics.
    """
    report = [f"\n{name}"]
    # Percentiles are truncated to whole numbers via int().
    report.extend(f"p{q}: {int(np.percentile(lens, q))}" for q in (50, 90, 95, 99))
    report.append(f"mean: {float(lens.mean())}")
    report.append(f"max: {int(lens.max())}")
    print("\n".join(report))

summarize(train_len, "train")
summarize(test_len, "test")

# Candidate truncation length. The printed labels are derived from it so the
# report cannot drift out of sync with the threshold actually compared against.
max_len = 64
pct_train_over = float((train_len > max_len).mean()) * 100
pct_test_over = float((test_len > max_len).mean()) * 100
print(f"\n% train > {max_len}:", pct_train_over)
print(f"% test  > {max_len}:", pct_test_over)


train
p50: 13
p90: 28
p95: 37
p99: 53
mean: 16.21373587923623
max: 98

test
p50: 13
p90: 23
p95: 31
p99: 47
mean: 15.06948051948052
max: 82

% train > 64: 0.29991002699190245
% test  > 64: 0.16233766233766234


In [20]:
# Show up to five test examples whose untruncated length exceeds max_len,
# i.e. the ones that would be cut at that limit. Reusing max_len (instead of
# repeating the literal) keeps this cell consistent with the length report.
idxs = np.where(test_len > max_len)[0][:5]
print("showing", len(idxs), f"examples with length > {max_len}\n")

for i in idxs:
    row = int(i)  # NumPy integer -> plain int before indexing the dataset
    ex = test[row]
    print("len:", int(test_len[row]))
    print("label:", label_names[ex["label"]])
    print("text:", ex["text"])
    print("-" * 60)

showing 5 examples with length > 64

len: 69
label: pending_top_up
text: Is there something wrong with your website? I tried topping up my account and it's been close to two hours now and it's still at "pending" for some reason. I just joined with you guys and this is the first attempt at this, so maybe I'm wrong, but shouldn't this be instant?
------------------------------------------------------------
len: 76
label: pending_top_up
text: Hi. I'm a new customer to your system and I think something isn't working right - or maybe it is. Maybe you can confirm. I tried to top up today (my first time ever) and it's been stuck at "pending" for over an hour now. Is it supposed to do this or is there something wrong in the system?
------------------------------------------------------------
len: 81
label: reverted_card_payment?
text: Hi, I would like to file a claim for an inquiry. I am a frequent customer of the company in question and have never had any issue with my purchases, payments, or

## Conclusion:

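- Use macro F1 as the evaluation metric because the classes are imbalanced (35–187 training examples per class).
- Set max_len=64: only about 0.3% of train and 0.16% of test examples exceed it and would be truncated.
- In the long examples inspected, the intent keywords appear early in the text, so truncating at 64 tokens should preserve them.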