In [None]:
!pip install transformers



#### Import Libraries

In [None]:
# Import Libraries
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import torch
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive (forcing a remount) and load the test-data CSV into a DataFrame.

drive.mount('/content/drive/', force_remount=True)
testPath = '/content/drive/My Drive/NLP/NLP_Project/complaints_test_data.csv'
testData = pd.read_csv(testPath)

Mounted at /content/drive/


In [None]:
# Preview the first rows to check the available columns.
testData.head()

Unnamed: 0.1,Unnamed: 0,Date received,narrative,product_category
0,0,2023-08-28,home improvement loan thru well fargo interest...,debt_collection
1,1,2023-04-07,someone attempted open two different credit ca...,credit_card
2,2,2023-02-17,santander consumer closed account auto loan wr...,debt_collection
3,3,2023-08-10,made transfers involved bank accounts without ...,retail_banking
4,4,2023-04-22,informed credit bureau fraudulent activity mul...,credit_reporting


In [None]:
# Drop the 'Unnamed: *' columns, which appear to be leftover row indices and are not
# used for evaluation. errors='ignore' keeps the cell re-runnable: the previous
# in-place drop raised KeyError when executed a second time.
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0']
testData = testData.drop(columns=columns_to_drop, errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Category names used as class labels.
original_labels = [
    'credit_card',
    'credit_reporting',
    'debt_collection',
    'mortgages_and_loans',
    'retail_banking',
]

# Fit the encoder on the category names, then map those same names to integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(original_labels)
numeric_labels = label_encoder.transform(original_labels)

In [None]:
# Evaluate on a reproducible random subset of the test set. This is plain random
# sampling, not stratified by category. The size is capped at the number of
# available rows so the cell does not raise when fewer than 2000 rows are present.
n_test_samples = min(2000, len(testData))
testData = testData.sample(n_test_samples, random_state=42).reset_index(drop=True)

# Show how many sampled complaints fall into each category.
print(testData["product_category"].value_counts())

retail_banking         417
credit_card            408
mortgages_and_loans    393
credit_reporting       392
debt_collection        390
Name: product_category, dtype: int64


In [None]:
import numpy as np

# Pull the complaint texts and their category names out of the frame as NumPy arrays.
xTest, yTest = (
    np.array(testData[column]) for column in ("narrative", "product_category")
)

In [None]:
# Display the label array to inspect its contents.
yTest

array(['retail_banking', 'credit_reporting', 'credit_card', ...,
       'retail_banking', 'mortgages_and_loans', 'mortgages_and_loans'],
      dtype=object)

In [None]:
# Check how many labels the array holds.
yTest.shape

(2000,)

#### Pre-trained BERT Model





In [None]:
# Build the tokenizer and a 5-label sequence-classification model from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Freeze every parameter first ...
model.requires_grad_(False)

# ... then re-enable gradients for the last two encoder layers only.
for encoder_layer in model.base_model.encoder.layer[-2:]:
    encoder_layer.requires_grad_(True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the test narratives and wrap them, together with their integer labels,
# in a DataLoader for batched inference.

# Coerce every entry to str before handing the list to the tokenizer.
xTest_list = [str(entry) for entry in xTest]

# Truncate to at most 256 tokens and pad so all sequences form one rectangular tensor.
test_encodings = tokenizer(xTest_list, truncation=True, padding=True, return_tensors='pt', max_length=256)

# Encode the labels from the DataFrame column rather than from yTest itself:
# the previous `yTest = ...transform(yTest)` failed when the cell was re-run,
# because yTest already held integer ids instead of category names.
yTest = torch.tensor(label_encoder.transform(testData["product_category"]))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], yTest)

# shuffle=False keeps predictions in the same order as the rows of testData.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
# Move the model to the selected device; as the cell's last expression,
# the returned model is also displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

#### Model 1

This model was trained on a dataset of 2,500 entries.

In [None]:
# Restore the saved weights of Model 1 into the existing model.
# Tensors are mapped to the CPU while the checkpoint is read.
checkpoint_path = '/content/drive/My Drive/NLP/Assignment1/Filtered_ComplaintsData_bert_model_2500_epoch1.pth'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Run inference over the test set and collect the predicted class ids.
model.eval()  # switch layers such as dropout to inference behaviour
predicted_labels = []

# Inputs must be on the same device as the model. The previous loop left them on
# the CPU, which raises a device-mismatch error once the model sits on a GPU.
model_device = next(model.parameters()).device

with torch.no_grad():  # gradients are not needed for inference
    for input_ids, attention_mask, _labels in test_loader:  # labels are unused here
        outputs = model(
            input_ids.to(model_device),
            attention_mask=attention_mask.to(model_device),
        )
        # Highest-scoring class per sample, moved back to the CPU for scikit-learn.
        predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

In [None]:
# Map predicted class ids back to category names so they can be compared with the ground truth.
decoded_predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Share of test complaints whose predicted category equals the true category.
true_labels = testData["product_category"]
accuracy = accuracy_score(true_labels, decoded_predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5475


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Ground-truth and predicted category names for the sampled test set.
y_true = testData["product_category"]
y_pred = decoded_predicted_labels

# Weighted F1, confusion matrix and the per-class precision/recall report.
f1 = f1_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(
    f"F1 Score: {f1}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

F1 Score: 0.5368428615649642
Confusion Matrix:
[[125  72  90   4 117]
 [ 10 304  73   2   3]
 [ 13  90 254   9  24]
 [ 15  26 136 145  71]
 [ 55  30  63   2 267]]
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.57      0.31      0.40       408
   credit_reporting       0.58      0.78      0.67       392
    debt_collection       0.41      0.65      0.50       390
mortgages_and_loans       0.90      0.37      0.52       393
     retail_banking       0.55      0.64      0.59       417

           accuracy                           0.55      2000
          macro avg       0.60      0.55      0.54      2000
       weighted avg       0.60      0.55      0.54      2000



#### Model 2

This model was trained on a dataset of 10,000 entries.




In [None]:
# Restore the saved weights of Model 2 into the existing model.
# Tensors are mapped to the CPU while the checkpoint is read.
checkpoint_path = '/content/drive/My Drive/NLP/Assignment1/Filtered_ComplaintsData_bert_model_10000_epoch1.pth'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Run inference over the test set and collect the predicted class ids.
model.eval()  # switch layers such as dropout to inference behaviour
predicted_labels = []

# Inputs must be on the same device as the model. The previous loop left them on
# the CPU, which raises a device-mismatch error once the model sits on a GPU.
model_device = next(model.parameters()).device

with torch.no_grad():  # gradients are not needed for inference
    for input_ids, attention_mask, _labels in test_loader:  # labels are unused here
        outputs = model(
            input_ids.to(model_device),
            attention_mask=attention_mask.to(model_device),
        )
        # Highest-scoring class per sample, moved back to the CPU for scikit-learn.
        predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

In [None]:
# Map predicted class ids back to category names so they can be compared with the ground truth.
decoded_predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Share of test complaints whose predicted category equals the true category.
true_labels = testData["product_category"]
accuracy = accuracy_score(true_labels, decoded_predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7645


In [None]:
# Weighted F1, confusion matrix and per-class report for Model 2.
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Ground-truth and predicted category names for the sampled test set.
y_true = testData["product_category"]
y_pred = decoded_predicted_labels

f1 = f1_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(
    f"F1 Score: {f1}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

F1 Score: 0.7634569175063995
Confusion Matrix:
[[270  41  31  14  52]
 [ 11 343  23  13   2]
 [ 12  90 261  20   7]
 [  8  28  25 318  14]
 [ 47   3  16  14 337]]
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.78      0.66      0.71       408
   credit_reporting       0.68      0.88      0.76       392
    debt_collection       0.73      0.67      0.70       390
mortgages_and_loans       0.84      0.81      0.82       393
     retail_banking       0.82      0.81      0.81       417

           accuracy                           0.76      2000
          macro avg       0.77      0.76      0.76      2000
       weighted avg       0.77      0.76      0.76      2000



#### Model 3

This model was trained on a dataset of 12,500 entries.

In [None]:
# Restore the saved weights of Model 3 into the existing model.
# Tensors are mapped to the CPU while the checkpoint is read.
checkpoint_path = '/content/drive/My Drive/NLP/Assignment1/Filtered_ComplaintsData_bert_model_12500_epoch1.pth'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Run inference over the test set and collect the predicted class ids.
model.eval()  # switch layers such as dropout to inference behaviour
predicted_labels = []

# Inputs must be on the same device as the model. The previous loop left them on
# the CPU, which raises a device-mismatch error once the model sits on a GPU.
model_device = next(model.parameters()).device

with torch.no_grad():  # gradients are not needed for inference
    for input_ids, attention_mask, _labels in test_loader:  # labels are unused here
        outputs = model(
            input_ids.to(model_device),
            attention_mask=attention_mask.to(model_device),
        )
        # Highest-scoring class per sample, moved back to the CPU for scikit-learn.
        predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

In [None]:
# Map predicted class ids back to category names so they can be compared with the ground truth.
decoded_predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Share of test complaints whose predicted category equals the true category.
true_labels = testData["product_category"]
accuracy = accuracy_score(true_labels, decoded_predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.774


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Ground-truth and predicted category names for the sampled test set.
y_true = testData["product_category"]
y_pred = decoded_predicted_labels

# Weighted F1, confusion matrix and the per-class precision/recall report.
f1 = f1_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(
    f"F1 Score: {f1}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

F1 Score: 0.7719930680730043
Confusion Matrix:
[[266  48  18  18  58]
 [ 10 349  18  12   3]
 [ 16  83 260  19  12]
 [  8  29  17 318  21]
 [ 38   1  11  12 355]]
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.79      0.65      0.71       408
   credit_reporting       0.68      0.89      0.77       392
    debt_collection       0.80      0.67      0.73       390
mortgages_and_loans       0.84      0.81      0.82       393
     retail_banking       0.79      0.85      0.82       417

           accuracy                           0.77      2000
          macro avg       0.78      0.77      0.77      2000
       weighted avg       0.78      0.77      0.77      2000



#### Model 4

This model was trained on a dataset of 15,000 entries.

In [None]:
# Restore the saved weights of Model 4 into the existing model.
# Tensors are mapped to the CPU while the checkpoint is read.
checkpoint_path = '/content/drive/My Drive/NLP/Assignment1/Filtered_ComplaintsData_bert_model_15000_epoch1.pth'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Run inference over the test set and collect the predicted class ids.
model.eval()  # switch layers such as dropout to inference behaviour
predicted_labels = []

# Inputs must be on the same device as the model. The previous loop left them on
# the CPU, which raises a device-mismatch error once the model sits on a GPU.
model_device = next(model.parameters()).device

with torch.no_grad():  # gradients are not needed for inference
    for input_ids, attention_mask, _labels in test_loader:  # labels are unused here
        outputs = model(
            input_ids.to(model_device),
            attention_mask=attention_mask.to(model_device),
        )
        # Highest-scoring class per sample, moved back to the CPU for scikit-learn.
        predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

In [None]:
# Map predicted class ids back to category names so they can be compared with the ground truth.
decoded_predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Share of test complaints whose predicted category equals the true category.
true_labels = testData["product_category"]
accuracy = accuracy_score(true_labels, decoded_predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7815


In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Ground-truth and predicted category names for the sampled test set.
y_true = testData["product_category"]
y_pred = decoded_predicted_labels

# Weighted F1, confusion matrix and the per-class precision/recall report.
f1 = f1_score(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)

print(
    f"F1 Score: {f1}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{class_report}",
    sep="\n",
)

F1 Score: 0.7773391700297456
Confusion Matrix:
[[252  37  21  28  70]
 [  8 348  18  16   2]
 [ 11  73 259  37  10]
 [  7  18  16 335  17]
 [ 20   1  10  17 369]]
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.85      0.62      0.71       408
   credit_reporting       0.73      0.89      0.80       392
    debt_collection       0.80      0.66      0.73       390
mortgages_and_loans       0.77      0.85      0.81       393
     retail_banking       0.79      0.88      0.83       417

           accuracy                           0.78      2000
          macro avg       0.79      0.78      0.78      2000
       weighted avg       0.79      0.78      0.78      2000

