In [1]:
!pip install transformers



In [2]:
#Import libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import random
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
#Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda")
torch.cuda.empty_cache()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Model Selection and Implementation

In [4]:
#Text preprocessing
data = pd.read_csv('/content/drive/MyDrive/AAI520/MODULE6/complaints_processed.csv')
data = data.sample(100000)
print(data.shape)
data.sample(6)

(100000, 3)


Unnamed: 0.1,Unnamed: 0,product,narrative
51448,51448,mortgages_and_loans,dealing first pacific funding guy dealing thou...
94181,94181,credit_reporting,submitted proper document equifax showing vict...
84405,84405,credit_reporting,trying trade vehicle newer model told salesman...
85732,85732,credit_reporting,would like make complaint chase action led inc...
26181,26181,debt_collection,hi actually talking representative portfolio r...
25029,25029,retail_banking,hello withdrew euro coinbase account bank acco...


In [5]:
print(data.info())
data['product'].value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 30425 to 88740
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  100000 non-null  int64 
 1   product     100000 non-null  object
 2   narrative   99992 non-null   object
dtypes: int64(1), object(2)
memory usage: 3.1+ MB
None


credit_reporting       56103
debt_collection        14237
mortgages_and_loans    11657
credit_card             9628
retail_banking          8375
Name: product, dtype: int64

In [6]:
#Label 0 = credit_reporting, 1 = debt_collection, 2=mortgages_and_loans, 3=credit_card, 4=retail_banking
data['product'] = data['product'].replace(0,'credit_reporting')
data['product'] = data['product'].replace(1,'debt_collection')
data['product'] = data['product'].replace(2,'mortgages_and_loans')
data['product'] = data['product'].replace(3,'credit_card')
data['product'] = data['product'].replace(4,'retail_banking')

label_encoder = LabelEncoder()
data['product'] = label_encoder.fit_transform(data['product'])

#Get list of sentences and labels
sentences = data.narrative.values
sentences = sentences.astype(str)
labels = data['product'].values

#Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 5, output_attentions = False, output_hidden_states = False, )

#Complete tokenization and IDs for all sentences
input_ids = []
for sent in sentences:
  # sent = [str(sent) for l in sent]
  id = tokenizer.encode(sent)
  input_ids.append(id)

#Pad the input to the max length
max_len = 128
input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", value=0, truncating="post", padding="post")

#Create attention masks
attention_mask = []
for sent in input_ids:
  att_mask = [int(token_id > 0) for token_id in sent]
  attention_mask.append(att_mask)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (851 > 512). Running this sequence through the model will result in indexing errors


##Training

In [7]:
#Split dataset into training and testing and validation
X_train, X_test, y_train, y_test = train_test_split(input_ids, labels, random_state=10, test_size=0.5)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=10, test_size=0.2)

#Split attention masks
train_masks, test_masks, train_label, _ = train_test_split(attention_mask, labels,  random_state=10, test_size=0.5)
train_masks, val_masks, _, _ = train_test_split(train_masks, train_label,  random_state=10, test_size=0.2)

#Convert input data to tensors
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)
X_val = torch.tensor(X_val)

y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)
y_val = torch.tensor(y_val)

train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)
val_masks = torch.tensor(val_masks)

#Load the optimizer
optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)
model.cuda()

batch_size = 16

#Create the DataLoader for training
train_data = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

#Create the DataLoader for validation
validation_data = TensorDataset(X_val, val_masks, y_val)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

epochs = 2
total_steps = len(train_dataloader)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = total_steps)

In [8]:
#Function to calculate the accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [9]:
#Train model
seed_val = 10
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

loss_values = []
for i in range(0, epochs):
  print('Epoch {:} / {:}'.format(i + 1, epochs))
  total_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    model.zero_grad()
    outputs = model(b_input_ids,token_type_ids=None,attention_mask=b_input_mask,labels=b_labels)
    loss = outputs[0]
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()
  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)
  print('Average training loss: {0:.2f}'.format(avg_train_loss))

  model.eval()
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
      outputs = model(b_input_ids,token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  print('Accuracy: {0:.2f}'.format(eval_accuracy/nb_eval_steps))

Epoch 1 / 2
Average training loss: 0.51
Accuracy: 0.86
Epoch 2 / 2
Average training loss: 0.32
Accuracy: 0.87


##Evaluation

In [10]:
#Create the DataLoader for testing
prediction_data = TensorDataset(X_test, test_masks, y_test)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [11]:
#Evaluate model on the test set
model.eval()
predictions , true_labels = [], []

for batch in prediction_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  predictions.append(logits)
  true_labels.append(label_ids)

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

In [12]:
#Performance metrics
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision = precision_score(flat_true_labels, flat_predictions, average='weighted')
recall= recall_score(flat_true_labels, flat_predictions, average='weighted')
f1_score = metrics.f1_score(flat_true_labels, flat_predictions, average='weighted')
print('Performance Evaluation:')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Performance Evaluation:
Accuracy: 0.87896
Precision: 0.8783452391013687
Recall: 0.87896
F1 Score: 0.8785445630193242
