<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/GenAI_TransferLearning_HF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Problem Statement and Dataset

### https://www.kaggle.com/datasets/adhamelkomy/bank-customer-complaint-analysis/data

### The Customer Financial Protection Bureau (CFPB) acts as a mediator between financial institutions and cosumers, facilitating dispute resolution when complaints arise.

### To improve Efficiency and accuracy in handling customer complaints, they would like to automatically classify and route complaints to the appropriate teams based on their content and associated financial products.

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('complaints.csv')
df.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162421 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [27]:
df['product'].value_counts()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
credit_reporting,91179
debt_collection,23150
mortgages_and_loans,18990
credit_card,15566
retail_banking,13536


## For now dividing into two classes


1.   Credit Card --> 1
2.   Other --> 0



In [6]:
df['label'] = df['product'].apply(lambda x: 1 if x == 'credit_card' else 0)
df.head()

Unnamed: 0,product,narrative,label
0,credit_card,purchase order day shipping amount receive pro...,1
1,credit_card,forwarded message date tue subject please inve...,1
2,retail_banking,forwarded message cc sent friday pdt subject f...,0
3,credit_reporting,payment history missing credit report speciali...,0
4,credit_reporting,payment history missing credit report made mis...,0


In [7]:
!pip -q install accelerate -U
!pip -q install transformers[torch]
!pip -q install datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, ClassLabel, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [9]:
sample_data = df[df['label']==1]
sample_data.shape

(15566, 3)

In [10]:
sample_data = sample_data.sample(n=500, random_state=42)
sample_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,500


In [11]:
label_0 = df[df['label']==0]
label_0 = label_0.sample(n=500, random_state=42)
label_0['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,500


In [12]:
sample_data = pd.concat([sample_data, label_0])
sample_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,500
0,500


In [13]:
sample_data.head()

Unnamed: 0,product,narrative,label
123262,credit_card,saturday make sevice appointment mother give c...,1
88030,credit_card,early opened dispute transaction citibank cred...,1
70370,credit_card,received call pandemic started bank america re...,1
135356,credit_card,cancelled account based response email problem...,1
141395,credit_card,tjmaxx refusing give card middle initial middl...,1


In [14]:
sample_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,500
0,500


In [15]:
sample_data = Dataset.from_pandas(sample_data)

In [16]:
sample_data

Dataset({
    features: ['product', 'narrative', 'label', '__index_level_0__'],
    num_rows: 1000
})

In [17]:
# divide the sample_data into 80% training and 20% test and convert into format of Dataset
train_test_data = sample_data.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': train_test_data['train'],
    'test': train_test_data['test']
})
dataset

DatasetDict({
    train: Dataset({
        features: ['product', 'narrative', 'label', '__index_level_0__'],
        num_rows: 800
    })
    test: Dataset({
        features: ['product', 'narrative', 'label', '__index_level_0__'],
        num_rows: 200
    })
})

In [20]:
# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Padding
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'} )

def tokenize_function(examples):
    return tokenizer(examples["narrative"], padding="max_length", truncation=True, max_length=512)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['product', 'narrative', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['product', 'narrative', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [21]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                            num_labels=2,
                                                            pad_token_id=tokenizer.eos_token_id)
model

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [22]:
training_args = TrainingArguments(
    output_dir="./results_bert_custom",
    num_train_epochs=2,
    logging_dir="./logs_bert_custom",
    report_to="none"  # Disable wandb logging
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Start training
trainer.train()

Step,Training Loss


TrainOutput(global_step=200, training_loss=0.3578628158569336, metrics={'train_runtime': 80.5947, 'train_samples_per_second': 19.852, 'train_steps_per_second': 2.482, 'total_flos': 211947837849600.0, 'train_loss': 0.3578628158569336, 'epoch': 2.0})

In [23]:
model_dir = "./distilbert_fine_tuned"

In [24]:
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('./distilbert_fine_tuned/tokenizer_config.json',
 './distilbert_fine_tuned/special_tokens_map.json',
 './distilbert_fine_tuned/vocab.txt',
 './distilbert_fine_tuned/added_tokens.json')

In [25]:
trainer.save_model('Distilbert_CustomModel')

In [26]:
#Code to donwloading the distilbert model
!gdown --id 1785J3ir19RaZP3ebbFvWUX88PMaBouro -O distilbert_finetuned_V1.zip

Downloading...
From (original): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro
From (redirected): https://drive.google.com/uc?id=1785J3ir19RaZP3ebbFvWUX88PMaBouro&confirm=t&uuid=c46e2bc0-f2ea-4324-b8de-ec14664b2bca
To: /content/distilbert_finetuned_V1.zip
100% 247M/247M [00:04<00:00, 51.5MB/s]


In [27]:
!unzip -o -j distilbert_finetuned_V1.zip -d distilbert_finetuned_V1

model_v1 = DistilBertForSequenceClassification.from_pretrained('/content/distilbert_finetuned_V1')
model_v1.to("cuda:0")

Archive:  distilbert_finetuned_V1.zip
  inflating: distilbert_finetuned_V1/config.json  
  inflating: distilbert_finetuned_V1/model.safetensors  
  inflating: distilbert_finetuned_V1/special_tokens_map.json  
  inflating: distilbert_finetuned_V1/tokenizer_config.json  
  inflating: distilbert_finetuned_V1/vocab.txt  


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (li

In [28]:
def make_prediction(text):
  new_complaint = text
  inputs = tokenizer(new_complaint,return_tensors="pt")
  inputs = inputs.to(torch.device("cuda:0"))
  outputs = model(**inputs)
  predictions = outputs.logits.argmax(-1)
  predictions = predictions.detach().cpu().numpy()
  return predictions

In [30]:
sample_data_large=df.sample(n=1000, random_state=55)
sample_data_large["finetuned_predicted"]=sample_data_large["narrative"].apply(lambda x: make_prediction(str(x)[:350])[0])

In [31]:
sample_data_large["finetuned_predicted"]

Unnamed: 0,finetuned_predicted
36949,0
27628,0
138979,0
60466,0
98334,1
...,...
66079,0
122182,0
37186,1
121726,0


In [32]:
from sklearn.metrics import confusion_matrix
# Create the confusion matrix
cm1 = confusion_matrix(sample_data_large["label"], sample_data_large["finetuned_predicted"])
print(cm1)
accuracy1=cm1.diagonal().sum()/cm1.sum()
print(accuracy1)

[[741 161]
 [ 11  87]]
0.828


## Saving the model to hugging face

In [33]:
!pip install transformers
!pip install huggingface_hub
!pip install -U ipykernel #for executing the commands

Collecting ipykernel
  Downloading ipykernel-6.29.5-py3-none-any.whl.metadata (6.3 kB)
Collecting comm>=0.1.1 (from ipykernel)
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipykernel-6.29.5-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading comm-0.2.2-py3-none-any.whl (7.2 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, comm, ipykernel
  Attempting uninstall: ipykernel
    Found existing installation: ipykernel 6.17.1
    Uninstalling ipykernel-6.17.1:
      Successfully uninstalled ipykernel-6.17.1
[31mERROR: pip's dependency resolver does not currently take into

In [34]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
model_v1.push_to_hub("saddarudin/Bank_distil_bert_Custom")

Uploading...:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saddarudin/Bank_distil_bert_Custom/commit/c9612c6745106f0ad6c793b563f7d212fb06aef5', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='c9612c6745106f0ad6c793b563f7d212fb06aef5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saddarudin/Bank_distil_bert_Custom', endpoint='https://huggingface.co', repo_type='model', repo_id='saddarudin/Bank_distil_bert_Custom'), pr_revision=None, pr_num=None)