In [None]:
# install only once
# ! pip install kaggle
# ! pip install -q --upgrade keras-nlp
! pip install tensorflow --upgrade



# Download the dataset from kaggle
# ! kaggle datasets download -d shashwatwork/consume-complaints-dataset-fo-nlp
# unzip it
# !tar -zxf consume-complaints-dataset-fo-nlp.zip

In [148]:
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures)

# from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report
import tensorflow as tf

# import keras_nlp
# import keras_core as keras
# from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

# from tensorflow import keras
# from tensorflow.keras import layers

# Set seed for reproducibility.
set_seed(58)

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# EDA

In [18]:
df = pd.read_csv('complaints_processed.csv', index_col=False)
# remove rows with null values
df = df.dropna()
print("Discovery of null values: \n\n", df.isnull().sum())
print("\nDistribution of products: \n\n", df['product'].value_counts())
print("\nUnique product values: \n", df['product'].unique().tolist())
print("\nNarative sentence word count describe:\n ", df['narrative'].apply(lambda x: len(str(x).split())).describe())


Discovery of null values: 

 Unnamed: 0    0
product       0
narrative     0
dtype: int64

Distribution of products: 

 product
credit_reporting       91172
debt_collection        23148
mortgages_and_loans    18990
credit_card            15566
retail_banking         13535
Name: count, dtype: int64

Unique product values: 
 ['credit_card', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection']

Narative sentence word count describe:
  count    162411.000000
mean         80.232798
std         108.872213
min           1.000000
25%          27.000000
50%          50.000000
75%          95.000000
max        2685.000000
Name: narrative, dtype: float64


# Data Preprocessing

In [118]:
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)


def label_to_int(label):
    """ Convert label to int.
    Returns: List of converted labels.
    """
    global label_dict
    label_dict = {'credit_card': 0, 'retail_banking': 1, 'credit_reporting': 2, 'mortgages_and_loans': 3, 'debt_collection': 4}  
    # Convert labels to integers
    return label_dict[label]

df['label'] = df['product'].apply(label_to_int)
# calculate the number of words in each narrative
# Notice that type in each row didn't defautl to string, so i had to convert to string prior to splitting
df['num_words'] = df['narrative'].apply(lambda x: len(str(x).split(" ")))
product_labels = df['product'].unique().tolist()

for label in product_labels:
    df[label] = df['product'].apply(lambda x: 1 if x == label else 0)

# create columns 
display(df.head(5))

Unnamed: 0,product,narrative,num_words,credit_card,retail_banking,credit_reporting,mortgages_and_loans,debt_collection,label
0,credit_card,purchase order day shipping amount receive pro...,230,1,0,0,0,0,0
1,credit_card,forwarded message date tue subject please inve...,132,1,0,0,0,0,0
2,retail_banking,forwarded message cc sent friday pdt subject f...,173,0,1,0,0,0,1
3,credit_reporting,payment history missing credit report speciali...,131,0,0,1,0,0,2
4,credit_reporting,payment history missing credit report made mis...,123,0,0,1,0,0,2


In [142]:
label_vals = [0,1,2,3,4]
labels = ['credit_card', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection']
num_labels = len(labels)

# create traing and test sets
X = df['narrative'].values
# y = df['label'].values
y = df[labels].values

batch_size = 128
# Choose a max_length that suits your data
max_length = 512  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=58)

# create 5 rows of data to test the model
test_data = X_test[:10]
test_labels = y_test[:10]
print(test_labels)

['advised get reverse mortgage one reverse mortgage funded original principal limit one lump sum could invest receive monthly income idea current principal limit could grow monthly interest plus service fee mip compounding monthly husband getting married living together home since since husband added name deed name still maiden name time lender told u spouse could mortgage age said worry added loan turn signed since turning attempting get help immediately called xxxxnd birthday knew important afraid something happened husband would leave home lived past year feel lender violated law loan process telling u never would signed otherwise counseled risk option lump sum etc predatory constantly calling u afterward nowhere found question reverse mortgage many company final contacted many time tried six year finally wish get resolved behind u happen called written enclosed original adviser went california left florida loan made home date contact plus many many telephone call large file informa

## Model tokenization and conversion of input features

In [123]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [143]:
from transformers import InputExample, InputFeatures

# Convert texts to InputExamples
train_examples = [InputExample(guid=str(i), text_a=test_data[i], label=test_labels[i]) for i in range(len(test_data))]

print(train_examples[0])

InputExample(guid='0', text_a='advised get reverse mortgage one reverse mortgage funded original principal limit one lump sum could invest receive monthly income idea current principal limit could grow monthly interest plus service fee mip compounding monthly husband getting married living together home since since husband added name deed name still maiden name time lender told u spouse could mortgage age said worry added loan turn signed since turning attempting get help immediately called xxxxnd birthday knew important afraid something happened husband would leave home lived past year feel lender violated law loan process telling u never would signed otherwise counseled risk option lump sum etc predatory constantly calling u afterward nowhere found question reverse mortgage many company final contacted many time tried six year finally wish get resolved behind u happen called written enclosed original adviser went california left florida loan made home date contact plus many many tele

In [144]:
# Convert InputExamples to InputFeatures
def convert_examples_to_features(examples, tokenizer, max_length=max_length, label_list=labels, output_mode="classification"):
    features = []
    for example in examples:
        inputs = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True, 
            padding='max_length',
            truncation=True,
            return_attention_mask = True,
            return_token_type_ids=True,
            max_length=max_length, 
        )
        features.append(InputFeatures(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], label=example.label))
    return features

train_features = convert_examples_to_features(train_examples, tokenizer, max_length=max_length, label_list=test_labels)
print("train feature: ",train_features[0])
print("input ids: ",train_features[0].input_ids)
print("attention mask: ",train_features[0].attention_mask)



train feature:  InputFeatures(input_ids=[101, 9449, 2131, 7901, 14344, 2028, 7901, 14344, 6787, 2434, 4054, 5787, 2028, 15116, 7680, 2071, 15697, 4374, 7058, 3318, 2801, 2783, 4054, 5787, 2071, 4982, 7058, 3037, 4606, 2326, 7408, 2771, 2361, 7328, 2075, 7058, 3129, 2893, 2496, 2542, 2362, 2188, 2144, 2144, 3129, 2794, 2171, 15046, 2171, 2145, 10494, 2171, 2051, 18496, 2121, 2409, 1057, 18591, 2071, 14344, 2287, 2056, 4737, 2794, 5414, 2735, 2772, 2144, 3810, 7161, 2131, 2393, 3202, 2170, 22038, 20348, 4859, 5798, 2354, 2590, 4452, 2242, 3047, 3129, 2052, 2681, 2188, 2973, 2627, 2095, 2514, 18496, 2121, 14424, 2375, 5414, 2832, 4129, 1057, 2196, 2052, 2772, 4728, 9517, 2098, 3891, 5724, 15116, 7680, 4385, 21659, 7887, 4214, 1057, 9707, 7880, 2179, 3160, 7901, 14344, 2116, 2194, 2345, 11925, 2116, 2051, 2699, 2416, 2095, 2633, 4299, 2131, 10395, 2369, 1057, 4148, 2170, 2517, 10837, 2434, 11747, 2253, 2662, 2187, 3516, 5414, 2081, 2188, 3058, 3967, 4606, 2116, 2116, 7026, 2655, 2312, 5371

In [162]:
# Convert InputFeatures to TensorFlow datasets
def convert_features_to_tf_dataset(features):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_attention_masks.append(feature.attention_mask)
        all_labels.append(feature.label)

    # Convert lists to TF tensors
    tf_ds = tf.data.Dataset.from_tensor_slices(({"input_ids": all_input_ids, "attention_mask": all_attention_masks}, all_labels))
    return tf_ds

train_dataset = convert_features_to_tf_dataset(train_features)
print(type(train_dataset))
print(train_dataset)

batch_dataset = train_dataset.shuffle(10000).batch(3)
print(type(batch_dataset))
print(batch_dataset)

<class 'tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset'>
<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(5,), dtype=tf.int64, name=None))>
<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>
<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>


# Model definition, training, and evaluation

## Model definition and intstantiation

In [179]:
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=5)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-05, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits = True),
              metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy'),
                       tf.keras.metrics.Recall(thresholds=0)])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model training

In [180]:
train_history = model.fit(batch_dataset, epochs=3, verbose=1)

Epoch 1/3


Epoch 2/3
Epoch 3/3


In [177]:
model.summary()

Model: "tf_bert_for_sequence_classification_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_455 (Dropout)       multiple                  0         


                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 109486085 (417.66 MB)
Trainable params: 109486085 (417.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Model evaluation

In [None]:
predicted_raw = model.predict()

In [118]:
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    # print(classification_report(y_test,nb.predict(X_test)))
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Predict

In [175]:
test_sentence = test_data[4]
test_input = tokenizer.encode_plus(test_sentence, add_special_tokens=True, return_tensors="tf")
outputs = model(test_input["input_ids"], attention_mask=test_input["attention_mask"])
probs = tf.nn.softmax(outputs[0], axis=-1)
predicted_label = tf.argmax(probs, axis=1).numpy()[0]
print(f"Predicted label: {predicted_label}")

Predicted label: 2
