<a href="https://colab.research.google.com/github/saribasmetehan/bank_administrative_assistant/blob/main/BERT_BankingClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch] -U -q
!pip install accelerate -U -q
!pip install -q datasets
!pip install -q evaluate

In [2]:
# import pandas as pd
# import numpy as np
# from datasets import load_dataset, concatenate_datasets
# import torch
# from transformers import (
#     AutoTokenizer, 
#     AutoModelForSequenceClassification,
#     DataCollatorWithPadding,
#     Trainer,
#     TrainingArguments
# )
# import evaluate
# from huggingface_hub import notebook_login

In [3]:
dataset = load_dataset("intent")

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 313
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 192
    })
})

In [13]:
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

In [14]:
train_df.nunique().sort_values(ascending = False)

text      282
intent      4
dtype: int64

In [17]:
def preprocess_data(batch):
    batch["labels"] = batch["intent"]  # Map `intent` to `labels`
    return batch

dataset_encoded = dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

In [18]:
intent_label_map = {intent: i for i, intent in enumerate(dataset["train"].unique("intent"))}

def preprocess_data(batch):
    batch["labels"] = [intent_label_map[intent] for intent in batch["intent"]]  # Convert intents to numerical labels
    return batch


In [19]:
dataset_encoded = dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

In [24]:
print(dataset_encoded["train"][260])


{'text': 'Give me some budgeting advice for my ₹5000', 'intent': 'financial_advice', 'labels': 3}


In [25]:
model_name = "google-bert/bert-base-uncased"

In [26]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [27]:
tokenizer.vocab_size

30522

In [28]:
tokenizer.model_max_length

512

In [29]:
def tokenize(batch):
  return tokenizer(batch["text"], truncation = True)

In [32]:
tokenize(dataset["train"][:3])

{'input_ids': [[101, 4931, 1010, 2064, 2017, 2425, 2033, 2026, 5703, 1029, 102], [101, 2129, 2172, 2769, 2079, 1045, 2031, 1999, 2026, 4070, 1029, 102], [101, 1045, 2342, 2000, 2113, 2026, 5703, 1010, 3531, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [34]:
print(dataset_encoded["train"][0])


{'text': 'Hey, can you tell me my balance?', 'intent': 'check_balance', 'labels': 0}


In [42]:
# Get unique intents and create a mapping
unique_intents = dataset["train"].unique("intent")
intent_label_map = {intent: i for i, intent in enumerate(unique_intents)}

print("Intent to Label Mapping:", intent_label_map)


Intent to Label Mapping: {'check_balance': 0, 'transfer_money': 1, 'pay_bills': 2, 'financial_advice': 3}


In [43]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [44]:
num_labels = 4

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [47]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
accuracy = evaluate.load("accuracy")

In [40]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis = 1)
  return accuracy.compute(predictions = predictions, references = labels)

In [72]:
training_args = TrainingArguments(output_dir = "FinSee",
                                   num_train_epochs = 4,
                                   per_device_train_batch_size = 16,
                                   per_device_eval_batch_size = 16,
                                   weight_decay = 0.01,
                                   eval_strategy = "epoch",
                                   save_strategy="epoch",
                                   load_best_model_at_end = True,
                                   report_to = "none",
                                   learning_rate=5e-5,
                                   push_to_hub = True,
                                  hub_token="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",
                                   )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [50]:
from datasets import DatasetDict, concatenate_datasets

# Shuffle the test dataset
shuffled_eval_dataset = dataset_encoded["test"].shuffle(seed=42)

# Split test dataset into additional train data and remaining eval data
additional_train_data = shuffled_eval_dataset.select(range(100))  # Use first 100 examples for training
remaining_eval_data = shuffled_eval_dataset.select(range(100, len(shuffled_eval_dataset)))  # Rest for eval

# Augment train dataset
train_dataset = concatenate_datasets([dataset_encoded["train"], additional_train_data])

# Final evaluation dataset
eval_dataset = remaining_eval_data


In [51]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [52]:
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
)

NameError: name 'training_args' is not defined

In [76]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [30]:
unique_intents = dataset["train"].unique("intent")
intent_label_map = {intent: i for i, intent in enumerate(unique_intents)}

In [31]:
def preprocess_data(batch):
    # Convert intents to numerical labels
    batch["labels"] = [intent_label_map[intent] for intent in batch["intent"]]
    return batch

In [32]:
def preprocess_function(examples):
    # Tokenize the texts
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

In [33]:
# Apply preprocessing
dataset_encoded = dataset.map(preprocess_data, batched=True)


In [34]:
def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        padding=True,
        truncation=True,
        max_length=128  # Adjust based on your text length
    )


In [35]:
tokenized_datasets = dataset_encoded.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset_encoded["train"].column_names
)

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [37]:
num_labels = len(intent_label_map)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [39]:
training_args = TrainingArguments(
    output_dir="FinSee",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    report_to="none",
    push_to_hub=True,
    hub_token="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"  # Replace with your token
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
shuffled_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)
split_point = int(len(shuffled_eval_dataset) * 0.1)
additional_train_data = shuffled_eval_dataset.select(range(split_point))
remaining_eval_data = shuffled_eval_dataset.select(range(split_point, len(shuffled_eval_dataset)))

In [41]:
train_dataset = concatenate_datasets([
    tokenized_datasets["train"],
    additional_train_data
])
eval_dataset = remaining_eval_data

In [18]:
.login()

NameError: name 'notebook' is not defined

In [42]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 19.19 MiB is free. Including non-PyTorch memory, this process has 744.00 MiB memory in use. Process 14177 has 2.88 GiB memory in use. Of the allocated memory 606.02 MiB is allocated by PyTorch, and 69.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [23]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.