In [None]:
!pip install transformers



#### Import libraries

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW

In [None]:
# Locations of the pre-filtered complaint CSVs inside the mounted Google Drive.
trainPath = '/content/drive/My Drive/NLP/NLP_Project/filtered_complaints_train_data.csv'
testPath = '/content/drive/My Drive/NLP/NLP_Project/filtered_complaints_test_data.csv'

# Mount Google Drive so the paths above resolve; force_remount=True remounts
# even when a mount from a previous run is still present.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
# Load the train and test complaint tables from Drive into DataFrames
# (same reader, same order: train first, then test).
trainData, testData = (pd.read_csv(csv_path) for csv_path in (trainPath, testPath))

In [None]:
# Preview the first five rows of the freshly loaded training frame.
trainData.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date received,narrative,product_category
0,0,2019-04-23,saw debt report mine call portfolio recovery a...,debt_collection
1,1,2023-03-31,located ca contacted stating owed credit card ...,debt_collection
2,2,2022-12-10,got side track cause inflation went damaged fi...,mortgages_and_loans
3,3,2021-01-29,noticed derogatory account listed credit repor...,debt_collection
4,4,2018-08-14,went branch withdraw funds form certified chec...,retail_banking


In [None]:
# Drop the leftover column(s) that carry no information for the model.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
columns_to_drop = ['Unnamed: 0']
trainData = trainData.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Show the (rows, columns) of the training frame after the column drop.
trainData.shape

(50000, 3)

In [None]:
# Re-inspect the first five rows now that the unused column has been dropped.
trainData.head(n=5)

Unnamed: 0,Date received,narrative,product_category
0,2019-04-23,saw debt report mine call portfolio recovery a...,debt_collection
1,2023-03-31,located ca contacted stating owed credit card ...,debt_collection
2,2022-12-10,got side track cause inflation went damaged fi...,mortgages_and_loans
3,2021-01-29,noticed derogatory account listed credit repor...,debt_collection
4,2018-08-14,went branch withdraw funds form certified chec...,retail_banking


In [None]:
# Check which container type we get when the narrative column is converted
# with np.array (this is the form the text is later passed around in).
narrative_array = np.array(trainData['narrative'])
type(narrative_array)

numpy.ndarray

#### Convert categories into numeric form

In [None]:
from sklearn.preprocessing import LabelEncoder

# The five product categories the classifier has to distinguish.
original_labels = [
    'credit_card',
    'credit_reporting',
    'debt_collection',
    'mortgages_and_loans',
    'retail_banking',
]

# Fit the encoder on the category names, then encode that same list so the
# resulting integer ids can be inspected.
label_encoder = LabelEncoder()
label_encoder.fit(original_labels)
numeric_labels = label_encoder.transform(original_labels)

In [None]:
# Display the integer ids produced by encoding `original_labels`.
numeric_labels

array([0, 1, 2, 3, 4])

In [None]:
from sklearn.model_selection import train_test_split

# Partition the training frame in two: `subset1` receives the 30% that is not
# held out (it is what the model is trained on below), `subset2` the other 70%.
# The fixed random_state makes the partition repeatable.
subset1, subset2 = train_test_split(
    trainData,
    test_size=0.70,
    random_state=64,
)

In [None]:
# Count how many rows of the training subset fall into each product category
# to check the class balance.
subset1["product_category"].value_counts()

credit_reporting       3085
credit_card            3043
debt_collection        3032
mortgages_and_loans    2935
retail_banking         2905
Name: product_category, dtype: int64

In [None]:
# Pull the text column and the label column out as NumPy arrays:
# training arrays come from the sampled subset, test arrays from the test frame.
xTrain, yTrain = (np.array(subset1[column]) for column in ("narrative", "product_category"))
xTest, yTest = (np.array(testData[column]) for column in ("narrative", "product_category"))

In [None]:
# Load the pre-trained BERT tokenizer and a BERT sequence classifier with a
# 5-output head (one output per product category).
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)

# Start from a fully frozen network ...
for param in model.parameters():
    param.requires_grad = False

# ... then re-enable gradients for the last two encoder layers AND the
# classification head. The head ('classifier.weight' / 'classifier.bias') is
# newly initialised when the checkpoint is loaded, so leaving it frozen would
# force every prediction through random weights that can never be trained.
# NOTE(review): the pooler between encoder and head stays frozen at its
# pre-trained weights - confirm that is the intended setup.
trainable_modules = [*model.base_model.encoder.layer[-2:], model.classifier]
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Confirm the container type of the training texts before tokenization.
type(xTrain)

numpy.ndarray

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# The tokenizer is given a plain list of Python strings, so every entry of the
# NumPy array is passed through str() first.
xTrain_list = list(map(str, xTrain))

# Tokenize the training narratives only: sequences longer than 256 tokens are
# truncated, shorter ones are padded, and the result is returned as PyTorch tensors.
train_encodings = tokenizer(
    xTrain_list,
    max_length=256,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [None]:
# Inspect the training labels (still category-name strings at this point).
yTrain

array(['debt_collection', 'debt_collection', 'mortgages_and_loans', ...,
       'mortgages_and_loans', 'credit_card', 'credit_reporting'],
      dtype=object)

In [None]:
# Map the category names to their integer ids and wrap them in a tensor.
# Note that `yTrain` is rebound here from the string array to the id tensor.
encoded_labels = label_encoder.transform(yTrain)
yTrain = torch.tensor(encoded_labels)

# Bundle token ids, attention masks and labels into one dataset for training.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    yTrain,
)

In [None]:
# Serve the training set in reshuffled mini-batches of 32 examples.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Move the model's parameters and buffers onto the selected device.
# As the last expression of the cell, the returned module is also echoed.
model.to(device)

In [None]:
# Fine-tune the model for a single epoch over `train_loader`.
from tqdm import tqdm

# Hand the optimizer only the parameters that are actually trainable; frozen
# ones never receive gradients. torch.optim.AdamW replaces the deprecated
# transformers.AdamW; eps and weight_decay are set explicitly so the
# hyper-parameters stay at the values that optimizer used by default
# (NOTE(review): verify against the transformers version previously in use).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5, eps=1e-6, weight_decay=0.0)
model.train()

progress_bar = tqdm(train_loader, desc='Epoch 1', leave=False)

for batch in progress_bar:
    # The dataset tensors live on the CPU, so each batch must be moved to the
    # model's device; otherwise the forward pass fails on a GPU runtime with a
    # device-mismatch error.
    input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

    optimizer.zero_grad()
    # Passing `labels` makes the model compute the classification loss itself.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    # Update the progress bar with the current loss
    progress_bar.set_postfix({'Loss': loss.item()}, refresh=True)

print('Training for Epoch 1 completed.')

In [None]:
# Persist the fine-tuned weights (the state_dict only, not the whole module)
# to Google Drive.
# NOTE(review): this writes under NLP/Assignment1 while the data is read from
# NLP/NLP_Project - confirm the target folder is intended.
checkpoint_path = '/content/drive/My Drive/NLP/Assignment1/Filtered_ComplaintsData_bert_model_15000_epoch1.pth'
model_weights = model.state_dict()
torch.save(model_weights, checkpoint_path)
print("Model save on drive")