In [100]:
# install only once
# ! pip install kaggle
# ! pip install -q --upgrade keras-nlp


# Download the dataset from kaggle
# ! kaggle datasets download -d shashwatwork/consume-complaints-dataset-fo-nlp
# unzip it
# !tar -zxf consume-complaints-dataset-fo-nlp.zip



In [17]:
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          BertTokenizer, TFBertForSequenceClassification, InputExample, InputFeatures)

from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report
import tensorflow as tf

import keras_nlp
import keras_core as keras

import tensorflow as tf


# from tensorflow import keras
# from tensorflow.keras import layers

# Set seed for reproducibility.
set_seed(58)

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# EDA

In [18]:
df = pd.read_csv('complaints_processed.csv', index_col=False)
# remove rows with null values
df = df.dropna()
print("Discovery of null values: \n\n", df.isnull().sum())
print("\nDistribution of products: \n\n", df['product'].value_counts())
print("\nUnique product values: \n", df['product'].unique().tolist())
print("\nNarative sentence word count describe:\n ", df['narrative'].apply(lambda x: len(str(x).split())).describe())


Discovery of null values: 

 Unnamed: 0    0
product       0
narrative     0
dtype: int64

Distribution of products: 

 product
credit_reporting       91172
debt_collection        23148
mortgages_and_loans    18990
credit_card            15566
retail_banking         13535
Name: count, dtype: int64

Unique product values: 
 ['credit_card', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection']

Narative sentence word count describe:
  count    162411.000000
mean         80.232798
std         108.872213
min           1.000000
25%          27.000000
50%          50.000000
75%          95.000000
max        2685.000000
Name: narrative, dtype: float64


# Data Preprocessing

In [25]:
df = df.drop('Unnamed: 0', axis=1)
# calculate the number of words in each narrative
# Notice that type in each row didn't defautl to string, so i had to convert to string prior to splitting
df['num_words'] = df['narrative'].apply(lambda x: len(str(x).split(" ")))

product_labels = df['product'].unique().tolist()

for label in product_labels:
    df[label] = df['product'].apply(lambda x: 1 if x == label else 0)

# create columns 
display(df.head(5))

Unnamed: 0,product,narrative,num_words,credit_card,retail_banking,credit_reporting,mortgages_and_loans,debt_collection
0,credit_card,purchase order day shipping amount receive pro...,230,1,0,0,0,0
1,credit_card,forwarded message date tue subject please inve...,132,1,0,0,0,0
2,retail_banking,forwarded message cc sent friday pdt subject f...,173,0,1,0,0,0
3,credit_reporting,payment history missing credit report speciali...,131,0,0,1,0,0
4,credit_reporting,payment history missing credit report made mis...,123,0,0,1,0,0


In [81]:
# create traing and test sets
X = df['narrative'].values
y = df[['credit_card', 'retail_banking', 'credit_reporting', 'mortgages_and_loans', 'debt_collection']].values

# Convert X to a NumPy array of shape (n_samples, 1) to match y's shape of (n_samples, n_labels)
# y.shape (2224, 5)
X = np.array(X).reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=58, stratify=y)

# Reshape to reshape nparrays from 2d to 1d for vectorization
X_train = X_train.reshape(-1)
X_test = X_test.reshape(-1)

# create 5 rows of data to test the model
test_data = X_test[:10]
test_labels = y_test[:10]
display(test_data)
display(test_labels)

array(['please reference complaint retailed well fargo filing complaint agency please read email attached received well fargo today requesting information already provided asking house vacant instructed vacate premise well fargo counter offer house listed year ago countering offer settle bottom email said correspondence particular loan guideline outlined specific loan interpreted well fargo home mortgage loan blatant retaliation filing complaint cfpb asking assistance senator assistance email senator office attached well fargo stop',
       'authorized user purchased jewelry cost user receive final product never received final product made dispute citi found merchant favor wrote letter citi appeal heard back willing take merchant court citi employee witness',
       'short summary information account showing report still inaccurate sent letter acknowledging inaccurate changed account open past due year charged activity adversely impacted credit score payment history moved bad credit ra

array([[0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]])

## Model tokenization and conversion of input features

In [82]:

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [83]:
# Choose a max_length that suits your data
max_length = 512  
batch_size = 64
labels = [0,1]
labels_dict = {'credit_card': 0, 'retail_banking': 1, 'credit_reporting': 2, 'mortgages_and_loans': 3, 'debt_collection': 4}  

def labels_to_int(labels, label_dict):
    """ Convert label to int.
    Returns: List of converted labels.
    """
    # Convert labels to integers
    label_ids = [label_dict[label] for label in labels]
    return label_ids

def index_labels(labels, label_dict):
    """ Convert label to int.
    Returns: List of converted labels.
    """
    # Convert labels to integers
    label_ids = [label_dict[label] for label in labels]
    return label_ids
# int_labels = labels_to_int(test_labels, labels_dict)
# print(int_labels)

In [84]:
from transformers import InputExample, InputFeatures

# Convert texts to InputExamples
train_examples = [InputExample(guid=str(i), text_a=test_data[i], label=test_labels[i]) for i in range(len(test_data))]

print(train_examples[0])

InputExample(guid='0', text_a='please reference complaint retailed well fargo filing complaint agency please read email attached received well fargo today requesting information already provided asking house vacant instructed vacate premise well fargo counter offer house listed year ago countering offer settle bottom email said correspondence particular loan guideline outlined specific loan interpreted well fargo home mortgage loan blatant retaliation filing complaint cfpb asking assistance senator assistance email senator office attached well fargo stop', text_b=None, label=array([0, 0, 0, 1, 0]))


In [85]:
# Convert InputExamples to InputFeatures
def convert_examples_to_features(examples, tokenizer, max_length=max_length, label_list=labels, output_mode="classification"):
    features = []
    for example in examples:
        inputs = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True, 
            padding='max_length',
            truncation=True,
            return_attention_mask = True,
            return_token_type_ids=True,
            max_length=max_length, 
        )
        

        features.append(InputFeatures(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], label=example.label))
    return features

train_features = convert_examples_to_features(train_examples, tokenizer, max_length=max_length, label_list=test_labels)
print("train feature: ",train_features[0])
print("input ids: ",train_features[0].input_ids)
print("attention mask: ",train_features[0].attention_mask)

train feature:  InputFeatures(input_ids=[101, 3531, 4431, 12087, 7027, 2098, 2092, 23054, 15242, 12087, 4034, 3531, 3191, 10373, 4987, 2363, 2092, 23054, 2651, 17942, 2592, 2525, 3024, 4851, 2160, 10030, 10290, 12436, 16280, 18458, 2092, 23054, 4675, 3749, 2160, 3205, 2095, 3283, 4675, 2075, 3749, 7392, 3953, 10373, 2056, 11061, 3327, 5414, 5009, 4179, 14801, 3563, 5414, 10009, 2092, 23054, 2188, 14344, 5414, 1038, 20051, 4630, 18695, 15242, 12087, 12935, 2361, 2497, 4851, 5375, 5205, 5375, 10373, 5205, 2436, 4987, 2092, 23054, 2644, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [89]:
# Convert InputFeatures to TensorFlow datasets
def convert_features_to_tf_dataset(features):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_attention_masks.append(feature.attention_mask)
        all_labels.append(feature.label)

    # Convert lists to TF tensors
    tf_ds = tf.data.Dataset.from_tensor_slices(({"input_ids": all_input_ids, "attention_mask": all_attention_masks}, all_labels))
    return tf_ds

train_dataset = convert_features_to_tf_dataset(train_features)
print(type(train_dataset))
print(train_dataset)

batch_dataset = train_dataset.shuffle(3).batch(3)
print(type(batch_dataset))
print(batch_dataset)


<class 'tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset'>
<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(5,), dtype=tf.int64, name=None))>
<class 'tensorflow.python.data.ops.batch_op._BatchDataset'>
<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>


In [95]:
# Iterate through the batched dataset
print(len(batch_dataset))
for batch in batch_dataset:
    input_ids = batch[0]["input_ids"]
    attention_mask = batch[0]["attention_mask"]
    print("ids:", input_ids)
    print("mask:", attention_mask)
    print("labels:", batch[1])

4
ids: tf.Tensor(
[[  101  2460 12654 ...     0     0     0]
 [  101  9362  5310 ...     0     0     0]
 [  101  4156  2034 ...     0     0     0]], shape=(3, 512), dtype=int32)
mask: tf.Tensor(
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]], shape=(3, 512), dtype=int32)
labels: tf.Tensor(
[[0 0 1 0 0]
 [1 0 0 0 0]
 [0 0 0 0 1]], shape=(3, 5), dtype=int64)
ids: tf.Tensor(
[[  101  3531  4431 ...     0     0     0]
 [  101  2741 28314 ...     0     0     0]
 [  101  6016  7909 ...     0     0     0]], shape=(3, 512), dtype=int32)
mask: tf.Tensor(
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]], shape=(3, 512), dtype=int32)
labels: tf.Tensor(
[[0 0 0 1 0]
 [0 0 0 0 1]
 [0 0 0 1 0]], shape=(3, 5), dtype=int64)
ids: tf.Tensor(
[[ 101 2089 5142 ...    0    0    0]
 [ 101 2927 2095 ...    0    0    0]
 [ 101 4015 3076 ...    0    0    0]], shape=(3, 512), dtype=int32)
mask: tf.Tensor(
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]], shape=(3, 512), dtype=int32)

In [70]:
split = 0.8 
print(batch_size)
size = int((len(X) / batch_size) * split)
print(size)

train = train_dataset.take(1)
val = train_dataset.skip(1)

10
64
2030


TypeError: '_RepeatDataset' object is not subscriptable

# Model definition, training, and evaluation

## Model definition and intstantiation

In [97]:
model = TFBertForSequenceClassification.from_pretrained(model_name)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-06, clipnorm=1.0),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy')])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model training

In [98]:
model.fit(batch_dataset , epochs=3)

Epoch 1/3


ValueError: in user code:

    File "/usr/local/lib/python3.11/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.11/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.11/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1672, in train_step
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/usr/local/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.11/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.11/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.11/site-packages/keras/src/losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/usr/local/lib/python3.11/site-packages/keras/src/backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 5) and (None, 2) are incompatible


## Model evaluation

In [118]:
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    # print(classification_report(y_test,nb.predict(X_test)))
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Predict

In [None]:
test_sentence = "I think this is amazing!"
test_input = tf_tokenizer.encode_plus(test_sentence, add_special_tokens=True, return_tensors="tf")
outputs = model(test_input["input_ids"], attention_mask=test_input["attention_mask"])
probs = tf.nn.softmax(outputs[0], axis=-1)
predicted_label = tf.argmax(probs, axis=1).numpy()[0]
print(f"Predicted label: {predicted_label}")