In [11]:
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from transformers import BertModel
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [12]:
lr = 1e-3
seq_len = 20
dropout = 0.5
num_epochs = 10
label_col = "Product"
tokens_path = "Output/tokens.pkl"
labels_path = "Output/labels.pkl"
data_path = "/Users/Downloads/modelstatedict"
text_col_name = "Consumer complaint narrative"
label_encoder_path = "Output/label_encoder.pkl"
product_map = {'Vehicle loan or lease': 'vehicle_loan',
               'Credit reporting, credit repair services, or other personal consumer reports': 'credit_report',
               'Credit card or prepaid card': 'card',
               'Money transfer, virtual currency, or money service': 'money_transfer',
               'virtual currency': 'money_transfer',
               'Mortgage': 'mortgage',
               'Payday loan, title loan, or personal loan': 'loan',
               'Debt collection': 'debt_collection',
               'Checking or savings account': 'savings_account',
               'Credit card': 'card',
               'Bank account or service': 'savings_account',
               'Credit reporting': 'credit_report',
               'Prepaid card': 'card',
               'Payday loan': 'loan',
               'Other financial service': 'others',
               'Virtual currency': 'money_transfer',
               'Student loan': 'loan',
               'Consumer Loan': 'loan',
               'Money transfers': 'money_transfer'}

In [13]:
def save_file(name, obj):
    """
    Function to save an object as pickle file
    """
    with open(name, 'wb') as f:
        pickle.dump(obj, f)


def load_file(name):
    """
    Function to load a pickle object
    """
    return pickle.load(open(name, "rb"))

## Process text data
---

In [14]:
data = pd.read_csv("complaints.csv")
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-06-13,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,CAPITAL ONE FINANCIAL CORPORATION,PA,186XX,,Consent not provided,Web,2019-06-13,Closed with explanation,Yes,,3274605.0
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257.0
2,2019-04-01,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,19067,,Consent not provided,Web,2019-04-01,Closed with explanation,Yes,,3198225.0
3,2021-11-01,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",GA,31707,,,Web,2021-11-01,In progress,Yes,,4863965.0
4,2021-11-02,Debt collection,Medical debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Medical Data Systems, Inc.",VA,22033,,,Web,2021-11-02,In progress,Yes,,4866449.0


In [15]:
data.dropna(subset=["Consumer complaint narrative"], inplace=True) #inplace = True changes the original dataset instead of defining a new one
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,2019-11-01,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257.0
7,2019-07-08,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,Hello This complaint is against the three cred...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,109XX,,Consent provided,Web,2019-07-08,Closed with explanation,Yes,,3299394.0
8,2020-06-10,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,I am a victim of Identity Theft & currently ha...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,MT,,Servicemember,Consent provided,Web,2020-06-10,Closed with explanation,Yes,,3692762.0
10,2019-07-03,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,Two accounts are still on my credit history af...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,328XX,,Consent provided,Web,2019-07-03,Closed with non-monetary relief,Yes,,3294745.0
13,2019-03-21,"Credit reporting, credit repair services, or o...",Other personal consumer report,Identity theft protection or other monitoring ...,Received unwanted marketing or advertising,Receiving daily telephone call ( s ) from XXXX...,Company has responded to the consumer and the ...,"NRA Group, LLC",MA,,,Consent provided,Web,2019-03-27,Closed with explanation,Yes,,3186954.0


In [16]:
data.replace({"Product": product_map}, inplace=True)
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
1,2019-11-01,vehicle_loan,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,2019-11-01,Closed with explanation,Yes,,3425257.0
7,2019-07-08,credit_report,Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,Hello This complaint is against the three cred...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,109XX,,Consent provided,Web,2019-07-08,Closed with explanation,Yes,,3299394.0
8,2020-06-10,credit_report,Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,I am a victim of Identity Theft & currently ha...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,MT,,Servicemember,Consent provided,Web,2020-06-10,Closed with explanation,Yes,,3692762.0
10,2019-07-03,credit_report,Credit reporting,Incorrect information on your report,Account information incorrect,Two accounts are still on my credit history af...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,328XX,,Consent provided,Web,2019-07-03,Closed with non-monetary relief,Yes,,3294745.0
13,2019-03-21,credit_report,Other personal consumer report,Identity theft protection or other monitoring ...,Received unwanted marketing or advertising,Receiving daily telephone call ( s ) from XXXX...,Company has responded to the consumer and the ...,"NRA Group, LLC",MA,,,Consent provided,Web,2019-03-27,Closed with explanation,Yes,,3186954.0


### Encode labels

In [17]:
label_encoder = LabelEncoder()
label_encoder.fit(data["Product"]) #looks through the column 'Product' and assigns a factor to each category. 

labels = label_encoder.transform(data[label_col]) #replace the categories with their corresponding integers assigned. 
labels

array([8, 1, 1, ..., 1, 2, 8])

In [18]:
labels = labels[:100]

### Process the text column

In [19]:
x = data['Consumer complaint narrative']
print(x.head())
print(f"\nNumber of samples: {len(x)}")
type(x)

1     I contacted Ally on Friday XX/XX/XXXX after fa...
7     Hello This complaint is against the three cred...
8     I am a victim of Identity Theft & currently ha...
10    Two accounts are still on my credit history af...
13    Receiving daily telephone call ( s ) from XXXX...
Name: Consumer complaint narrative, dtype: object

Number of samples: 162985


pandas.core.series.Series

In [20]:
# Convert the input to a list: 
input_text = list(x)
input_text[0]

'I contacted Ally on Friday XX/XX/XXXX after falling behind on payments due to being out of work for a short period of time due to an illness. I chated with a representative after logging into my account regarding my opitions to ensure I protect my credit and bring my account current. \n\nShe advised me that before an extenstion could be done, I had to make a payment in the amount of {$270.00}. I reviewed my finances, as I am playing catch up on all my bills and made this payment on Monday XX/XX/XXXX. This rep advised me, once this payment posts to my account to contact Ally back for an extention or to have a payment deffered to the end of my loan. \n\nWith this in mind, I contacted Ally again today and chatted with XXXX. I explained all of the above and the information I was provided when I chatted with the rep last week. She asked several questions and advised me that a one or two month  extension/deffered payment could be done however partial payment is needed! WHAT? She advised me 

In [21]:
def edit_text(x,m): 
    """
    Input 
    
    X       : The input sentences into the model. Note that X must be in the format of a list. 
    m       : The fixed length of the sequences. 

    Output
    
    Tokens : This is a list of dictionaries. Each dictionary corresponds to a sampel sequence. 
             The dictionary contains a vector of integers correponding to the words in the sequence 
             and an attention mask. 
    """

    # Convert the text into lower case: 
    input_text = [i.lower() for i in x]
    
    # Remove the punctuations: 
    input_text = [re.sub(r"[^\w\d'\s]+", " ", i) for i in input_text]
    
    # Remove digits from the text: 
    input_text = [re.sub("\d+", "", i) for i in input_text]

    # Once digits are removed, there will be double spaces between the words. Remove them: 
    input_text = [re.sub(' +', ' ', i) for i in input_text]
    
    # Remove more than one instance of 'x': 
    input_text = [re.sub(r'[x]{2,}', "", i) for i in input_text]
    
    seq_len = 20 #limiting the length of the sequence. 
    tokens = [tokenizer(i, padding="max_length", max_length=seq_len, 
                    truncation=True, return_tensors="pt")  for i in tqdm(input_text)]
    
    return(tokens)

The BertTokenizer splits the words and mapps them to a Bert Dictionary containing 30,000 tokens. Notice that the starting and ending integer tokens are the same for all the sample complaints; this is because the first and last indices refer to the $<SOS>$ and $<EOS>$ tokens. Furthermore, the unknown tokens are split into the most known subwords or characters. This allows the model to handle out-of-vocabulary words gracefully. In addition to mapping the words to a vector of integers, Tokenizer can also pad or truncate the sequences and define an attention mask along with the samples. 

In [22]:
# Load the tokenizer to use: 
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")



In [23]:
input_text[0]

'I contacted Ally on Friday XX/XX/XXXX after falling behind on payments due to being out of work for a short period of time due to an illness. I chated with a representative after logging into my account regarding my opitions to ensure I protect my credit and bring my account current. \n\nShe advised me that before an extenstion could be done, I had to make a payment in the amount of {$270.00}. I reviewed my finances, as I am playing catch up on all my bills and made this payment on Monday XX/XX/XXXX. This rep advised me, once this payment posts to my account to contact Ally back for an extention or to have a payment deffered to the end of my loan. \n\nWith this in mind, I contacted Ally again today and chatted with XXXX. I explained all of the above and the information I was provided when I chatted with the rep last week. She asked several questions and advised me that a one or two month  extension/deffered payment could be done however partial payment is needed! WHAT? She advised me 

In [24]:
tokens_samples = edit_text(input_text[0:2],20)

100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 209.93it/s]


In [188]:
print(tokens_samples[0].input_ids)
tokens_samples[0].attention_mask


tensor([[  101,   178, 12017, 11989,  1113,   175, 22977,  1183,  1170,  4058,
          1481,  1113, 10772,  1496,  1106,  1217,  1149,  1104,  1250,   102]])


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [189]:
print(tokens_samples[1].input_ids)
tokens_samples[1].attention_mask

tensor([[  101, 19082,  1142, 12522,  1110,  1222,  1103,  1210,  4755,  7516,
          2557, 14715,  3779,  1105,   178,  3535,  1199,  6187,  1874,   102]])


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [25]:
# Edit and Tokenize all samples: 
tokens = edit_text(input_text[:100],20)

100%|███████████████████████████████████████| 100/100 [00:00<00:00, 1001.43it/s]


Note that the input tokens cannot be used directly within tokens we must first extract it from the dictionary. 

## Create PyTorch Dataset
---

In [26]:
class TextDataset(torch.utils.data.Dataset):
    """
    Input
    
    tokens: This is a list of dictionaries including the word bags and their corresponding attention_mask. 
    labels: This is a list of labels ranging from 0 - 8 corresponding to the product the complaint is against. 

    Output 
    
    Python obj self with two attributes: Tokens and labels. 
    __len__() outputs the number of samples. 
    __getitem__ will output a specific sample's tokens and its label. 
    
    """
    
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels
        
    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, idx):
        return self.labels[idx], self.tokens[idx]

### Split data into train, validation and test sets

The dataset is first split between the training (80%) and testing sets (20%). The training set is then split further into training (75% of 80%) and validation sets (25% of 80%). The model first takes the training set and learns the patterns by updating its parameters. We then take the validation dataset to update the hyperparameters. First the model is exposed to the training dataset and makes a prediction then by using back propagation, the model updates the parameters. once the parameters are updated we make a prediction on the validation set where we find out how the model performs on unseen data. Using the results the model produces with the validatino dataset, we can tune the hyperparameters. The model then goes through anther iteration of back propagation. Finally, when the loss is small enoguh or when the max-iteration number is reached, the model will run a prediction on the testing data. 

#### Come back to the explanation above 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(tokens, labels,
                                                   test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, 
                                                      y_train,
                                                     test_size=0.25)

In [28]:
train_dataset = TextDataset(X_train, y_train)
valid_dataset = TextDataset(X_valid, y_valid)
test_dataset = TextDataset(X_test, y_test)

In [29]:
train_dataset

<__main__.TextDataset at 0x2a616fc50>

In [30]:
train_dataset.labels[0:100] # the labels corresponding to the first 100 training samples. 

array([0, 2, 1, 8, 1, 1, 1, 7, 3, 7, 1, 1, 3, 2, 2, 1, 1, 1, 0, 1, 8, 1,
       5, 0, 2, 1, 2, 5, 2, 5, 1, 5, 4, 3, 1, 2, 0, 2, 1, 7, 2, 1, 1, 5,
       1, 0, 0, 7, 5, 1, 1, 0, 5, 1, 2, 4, 1, 1, 5, 7])

In [62]:
train_dataset.tokens[0] # The bag of words and its corresponding attentino mask for the first training sample. 

{'input_ids': tensor([[ 101,  178,  112,  182, 1770, 1106, 1129, 1107, 1103, 2319, 1106, 4779,
          170, 1313, 1114, 1115, 1217, 1163,  178,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [66]:
train_dataset.__len__()

97791

In [71]:
train_dataset.__getitem__(0) #first the label and then the tokens are returned. 

(1,
 {'input_ids': tensor([[ 101,  178,  112,  182, 1770, 1106, 1129, 1107, 1103, 2319, 1106, 4779,
           170, 1313, 1114, 1115, 1217, 1163,  178,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])})

Note that the tokens in each dataset must be further processed to be extracted from the dictionary tokens. 

## torch.utils.data.DataLoader: 
Instead of feeding all the training dataset at once, we use a stochastic approach by feeding into the model one batch at a time. To create the batches we use **'torch.utils.data.DataLoader'** which does the following: 

- Batch Processing:

DataLoader allows you to load data in batches, which is crucial for training models efficiently. Instead of loading the entire dataset into memory at once, it loads data in smaller chunks (batches).
- Shuffling:

It can shuffle the data at the beginning of each epoch to ensure that the model does not learn the order of the data, which helps in improving the generalization of the model.
- Parallel Data Loading:

DataLoader supports multi-threaded data loading, which means it can use multiple worker processes to load data in parallel. This can significantly speed up the data loading process.

- drop_last: 

A boolean flag that indicates whether to drop the last incomplete batch if the dataset size is not divisible by the batch size. If True, the last batch will be dropped if it is smaller than batch_size.


In [31]:
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=16,
                                           shuffle=True,
                                           drop_last=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                         batch_size=16)

## Create Bert model
---

The objective here is to create a Bert classifier class in which we input the sequences and the pre-trained Bert model will attend to the sentences. The output is then run through an activation then a dropout layer is added and finally the output is mapped to another linear layer.


### Rewrite: shorten: Understanding pooler_output in BERT Models
BERT Model Overview
BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based model used for a variety of natural language processing (NLP) tasks. It processes input sequences and generates embeddings for each token in the sequence.

The [CLS] Token
Special Token: [CLS] stands for "classification" and is a special token added at the beginning of the input sequence.
Purpose: This token is used to aggregate information from the entire sequence. During training, its embedding is often utilized for sequence-level classification tasks.
How BERT Processes the [CLS] Token
Token Embeddings:

Each token in the input sequence, including [CLS], is converted into an embedding by the model's embedding layer.
Transformer Layers:

These embeddings are processed through multiple transformer layers, refining the embeddings based on both left and right context due to bidirectional attention.
Pooling:

After the sequence has been processed through the transformer layers, the [CLS] token's embedding is pooled or extracted. This pooling typically involves a simple linear transformation to produce a fixed-size vector.
The pooler_output
Definition: pooler_output refers to the embedding of the [CLS] token after pooling.

Pooling Process: After processing through the final transformer layer, the [CLS] token's embedding is passed through a pooling layer to obtain a vector of size hidden_size.
Shape: [batch_size, hidden_size]

batch_size: The number of sequences processed in one forward pass.
hidden_size: The size of the hidden layer in the BERT model, typically 768 for BERT base models.
Usage in Classification Tasks
Feature Representation: The embedding of the [CLS] token acts as a summary representation of the entire input sequence.
Classification Layer: This embedding is fed into a classification head (usually a fully connected layer) to produce class scores.
Example Workflow
Input Sequence:

Tokenize the sequence and add [CLS] at the start.
Forward Pass:

Pass the sequence through BERT to generate embeddings for each token, including [CLS].
Pooling:

Extract the [CLS] token’s embedding from the final hidden states, which is the pooler_output.
Classification:

Use the pooler_output in a classification head to predict class labels.


In [123]:
class BertClassifier(nn.Module): #nn.Module specifies that we are building the neural network in pytorch and it should inherit all properties of nn in pytorch. 
    
    def __init__(self, dropout, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        #Make sure not all parameters during training will be used: 
        for param in self.bert.parameters(): #iterating through all the parameters of the model
            param.required_grad = False #this parameter will not require gradient updates. 
        #self.dropout1 = nn.Dropout(dropout_rate) #this is no longer tenserflow so dropout must be nn from torch. 
        self.dropout = nn.Dropout(dropout)
        #Define the linear layer 
        self.linear = nn.Linear(768, num_classes) # no use of units here anymore. first arg input size and second arg is output shape. 
        self.activation = nn.ReLU() 
        # Add LayerNorms 
        self.layernorm1 = nn.LayerNorm(normalized_shape=768)
    

    def forward(self,input_seqs, attention_mask): #in context of Pytorch use forward instead of call command. 

        """
        Inputs 
        
        input_seqs     (batch_size, len_seq) : An integer vector corresponding to the index of words used in the dictionary. 
        attention_mask (batch_size,len_seq)  : A binary vector indicating the padding mask

        Output
        final_output:  (batch_size, num_classes): a list of logits, each representing the score of the sequence belonging to a class.

        """
        
        #First you get the input run it through a dropout layer: 
        #x_dropped = self.dropout1(input_seqs)
        
        
        # Run through the Bert Model: 
        _, output_bert = self.bert(input_ids = input_seqs, 
                                   attention_mask = attention_mask, 
                                   return_dict = False) # Shape: (batch_size, len_seq, num_hidden_neurons = 768) 
        
        dropout_bert = self.layernorm1(self.dropout(output_bert))
      
        final_output = self.linear(dropout_bert)
        return final_output 
        

In [155]:
class BertClassifier(nn.Module):
    
    def __init__(self, dropout, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        for param in self.bert.parameters():
            param.required_grad = False
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)
        self.activation = nn.ReLU()
    
    def forward(self, input_ids, attention_mask):
        _, bert_output = self.bert(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  return_dict=False)
        dropout_output = self.activation(self.dropout(bert_output))
        final_output = self.linear(dropout_output)
        return final_output

## Example: 

In [112]:
for batch_labels, batch_data in tqdm(train_loader): 
    input_ids = batch_data["input_ids"] #extract the embeddings 
    attention_mask = batch_data["attention_mask"] #extract the attention_masks 
    print(batch_labels)

100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 722.37it/s]

tensor([1, 2, 1, 1, 1, 2, 1, 2, 3, 7, 1, 7, 1, 1, 7, 5])
tensor([5, 0, 7, 5, 5, 0, 2, 1, 3, 4, 2, 1, 1, 2, 8, 1])
tensor([5, 5, 2, 2, 4, 0, 1, 2, 7, 1, 1, 1, 1, 0, 5, 0])





In [113]:
input_ids.shape

torch.Size([16, 1, 20])

In [114]:
attention_mask.shape

torch.Size([16, 1, 20])

In [115]:
input_ids = torch.squeeze(input_ids, 1)
input_ids.shape #this is the expected input size to the BertClassifier model 

torch.Size([16, 20])

In [107]:
input_ids = input_ids.to(torch.float32)
layernorm = nn.LayerNorm(normalized_shape=20)
layernorm(input_ids)

tensor([[[-0.9036, -0.6614, -0.5998,  0.9039, -0.7093,  1.8036,  0.2790,
          -0.2847, -0.7064, -0.8898,  2.3714,  0.7420, -0.7087,  0.6164,
          -0.7078, -0.7082,  1.7924, -0.4410, -0.2847, -0.9034]],

        [[-1.0095,  0.0187, -0.9693,  1.6986,  0.4400,  0.2456, -0.4206,
          -0.4212, -0.9647, -0.4061, -0.3985, -0.0884, -0.1012, -0.4247,
           2.9753, -0.4055,  1.6986, -0.4160, -0.0424, -1.0089]],

        [[-0.5410, -0.5273, -0.3471, -0.5287,  0.2925, -0.3592, -0.3610,
          -0.5287,  2.1348, -0.3607, -0.5273, -0.0405, -0.3605, -0.3584,
           0.0962,  0.0279, -0.3619,  3.5519, -0.3603, -0.5408]],

        [[-0.5991, -0.5722, -0.1255, -0.5750,  0.0070, -0.2436, -0.5722,
          -0.5694,  0.1439, -0.2485, -0.2495, -0.2286, -0.5750,  1.6988,
           3.6834, -0.2471, -0.2419,  0.6854, -0.5722, -0.5987]],

        [[-0.4106, -0.3971,  0.1519, -0.2287, -0.1982, -0.2343, -0.3971,
          -0.2288, -0.2311, -0.1268, -0.2297, -0.2278,  4.3241, -0.2327,
  

In [116]:
attention_mask = torch.squeeze(attention_mask, 1)
attention_mask.shape #This is the expected shape for the attention_mask in the model. 

torch.Size([16, 20])

#### Try the BERT model on its own: 

In [121]:
bert = BertModel.from_pretrained('bert-base-cased')
output_bert  = bert(input_ids, attention_mask)



In [39]:
output_bert.last_hidden_state.shape #this is the output of the last hidden state 

torch.Size([16, 20, 768])

**Pooler_output**:

The hidden state representation of the [CLS] token after pooling (typically used for classification tasks).
Shape: (batch_size, hidden_size = 768)
Use: This tensor is used as the representation for the entire sequence. It can be used for classification or other downstream tasks where a single vector representation of the input is required.

In [40]:
output_bert.pooler_output.shape   #this is the pooler output 

torch.Size([16, 768])

#### Try BertClassifier: 

In [124]:
model = BertClassifier(0.1, 9)
output = model(input_ids, attention_mask)



In [168]:
output.shape #outputting a logit for every possible category. 

torch.Size([16, 9])

In [99]:
output[0] #scores corresponding how likely it is for the first sample in the batch to belong to each category. 

tensor([-0.1679, -0.3284, -0.0554,  0.2516, -0.1639,  0.0882, -0.1001, -0.0939,
         0.1223], grad_fn=<SelectBackward0>)

In [110]:
torch.nn.functional.softmax(output, dim = -1)[0]

tensor([0.0974, 0.0829, 0.1090, 0.1481, 0.0977, 0.1258, 0.1042, 0.1048, 0.1301],
       grad_fn=<SelectBackward0>)

### Create model object

In [113]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                     else "cpu")

### Define loss function and optimizer

<hr style="border: 2px solid red;">Come back to this; I don't understand why the log function applied to the probability can be a good measurement of the loss. 

When you have a batch of logits and corresponding labels for a classification task, and you apply the Cross Entropy Loss function in PyTorch, the process is as follows:

#### Scenario

- **`batch_output`**: This tensor represents the logits for a batch of sequences. For a batch of size 16 with 9 classes, it has a shape of `(16, 9)`.
- **`labels`**: This tensor contains the correct class labels for each sequence in the batch. For a batch of size 16, it has a shape of `(16,)`.

#### Cross Entropy Loss Calculation

1. **Apply Softmax Internally**:

   The `CrossEntropyLoss` function applies the Softmax function to the logits (raw scores) to convert them into probabilities. This operation is done internally by the function.

   For a given sequence \( i \), the probability for class \( j \) is computed as:

   $$
   p_{i,j} = \frac{\exp(\text{logit}_{i,j})}{\sum_{k} \exp(\text{logit}_{i,k})}
   $$

   where $( \text{logit}_{i,j} )$ is the score for class \( j \) for sequence \( i \), and \( \sum_{k} \exp(\text{logit}_{i,k}) \) is the sum of exponentials of logits for all classes for sequence \( i \).

2. **Compute Log Probabilities**:

   The function then extracts the probability for the correct class for each sequence. If the correct class for sequence \( i \) is \( c_i \), it calculates the log probability:

   $$
   \log(p_{i,c_i})
   $$

3. **Calculate the Negative Log Probability**:

   The loss for each sequence is computed as the negative log of the probability of the correct class:

   $$
   \text{Loss}_i = -\log(p_{i,c_i})
   $$

4. **Compute the Mean Loss**:

   Finally, the loss function averages the individual losses over all sequences in the batch to obtain the final loss:

   $$
   \text{Loss} = \frac{1}{N} \sum_{i=1}^{N} \text{Loss}_i
   $$

   where \( N \) is the batch size (16 in this case).


In [42]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [169]:
loss = criterion(output, batch_labels)

In [170]:
loss 

tensor(2.2441, grad_fn=<NllLossBackward0>)

In [118]:
output[0]

tensor([ 0.1807,  0.7302,  1.1485,  0.4753, -0.5546,  0.0336,  1.1944, -0.2405,
        -0.0737], grad_fn=<SelectBackward0>)

In [119]:
batch_labels[0]

tensor(5)

In [120]:
loss.item()

2.253777265548706

### Move the model to GPU if available

In [121]:
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

## Validation and Training Modes of a Model

In machine learning, particularly in frameworks like PyTorch, models can operate in different modes depending on the phase of the workflow. The two primary modes are **Training** and **Validation**

### Training Mode (`model.train()`)

In machine learning frameworks like PyTorch, models operate in two primary modes: **Training** and **Validation (Evaluation)**. **Training Mode** (`model.train()`) is used during the model training process where dropout layers are active to prevent overfitting, batch normalization uses the current batch statistics and updates running statistics, and gradients are computed for backpropagation. In contrast, **Validation Mode** (`model.eval()`) is used for evaluating the model's performance. During this mode, dropout layers are turned off, batch normalization uses accumulated running statistics, and gradient calculations are disabled to ensure consistent and efficient evaluation of the model’s performance.


In machine learning, particularly in frameworks like PyTorch, models can operate in different modes depending on the phase of the workflow. The two primary modes are **Training** and **Validation (Evaluation)**.

### Training Mode (`model.train()`)

In **Validation Mode** (`model.eval()`), the model is set for evaluation, which includes both validation and testing phases. The primary purpose of this mode is to assess the model's performance on unseen data. During validation mode, dropout layers are inactive; this means that dropout is turned off, ensuring that all units are used and no randomness is introduced during the forward pass. Additionally, batch normalization layers use the running statistics (mean and variance) that were accumulated during training for normalization purposes, and these statistics are not updated during evaluation. Furthermore, gradient calculations are disabled, which makes the evaluation process faster and more memory-efficient. This mode ensures that the model's performance is measured consistently and accurately without the interference of training-specific behaviors.


Note: PyTorch accumulates gradients by default. This means that each time backward() is called, the gradients are added to the existing gradients for each parameter. This is useful for scenarios where you might want to accumulate gradients over multiple iterations (e.g., in cases of gradient accumulation across mini-batches) but you also need to make sure they are initialized to zero when training for a new batch. 


In [125]:
def train(train_loader, valid_loader, num_iter, model, criterion,optimizer):
    """
    This function will run predictions and then trains the model
    
    Input 
    
    train_loader: Contains the training dataset split into batches. 
    valid_loader: Contains the validation dataset split into batches. 
    model       : BertClassifier model 
    criterion   : Loss function
    optimizer   : Optimizer usually Adam is used. 
    num_iter    : Number of bachpropagation iterations 

    """ 

    #Set an initial loss: 
    best_loss = 1e8 
    
    train_loss, valid_loss = [] , []
    # Run through the iterations: 
    for i in range(num_iter): 
        print(f"Epoch {i+1} of {num_iter}")
        # Set the model into training mode 
        model.train() 
        
        # Iterate through each training batch
        for batch_labels, batch_tokens in train_loader: 
            
            # Extract the integer vectors input_ids in each batch_token
            input_seqs = batch_tokens["input_ids"] 
            # Extract the attention_masks for the current batch
            attention_mask = batch_tokens["attention_mask"]
            # Save their respective labels
            labels = batch_labels 
    
            # Reshape the input_seqs and attention_mask
            input_seqs = torch.squeeze(input_seqs, 1) # shape = (batch_size, len_seq)
            attention_mask = torch.squeeze(attention_mask,1) # shape = (batch_size, len_seq) 
            
            # Make a prediction using the BertClassifier
            output_bert = model(input_seqs, attention_mask)
            
            # Calculate the loss and save it 
            loss = criterion(output_bert, labels)
            train_loss.append(loss.item()) #this will be a list of all training losses 
            
            # Initialize the gradients as 0, crucial when training each batch 
            optimizer.zero_grad()
            
            # Start the backpropagation 
            loss.backward()
            
            # Update the parameters 
            optimizer.step() 
    
       
        # Set the model for evaluation
        model.eval() 
        for batch_labels, batch_tokens in valid_loader: 
            # Extract the integer tokens 
            input_seqs = batch_tokens["input_ids"]
            #Extract the attention_masks
            attention_mask = batch_tokens["attention_mask"]
            # Save the labels 
            labels = batch_labels 
            # Reshape the input_seqs and attention_masks 
            input_seqs = torch.squeeze(input_seqs, 1) #remove a dim of size 1 
            attention_mask = torch.squeeze(attention_mask,1) 
            # Make a prediction 
            bert_output = model(input_seqs, attention_mask) 
            # Calculate the loss 
            loss = criterion(bert_output,labels)
            # Save the loss 
            valid_loss.append(loss.item())
    
        # Take the mean of all losses in the training set and validation set: 
        loss_tmean = np.mean(train_loss)
        loss_vmean = np.mean(valid_loss)
        # Print the result 
        print(f"Mean train Loss: {loss_tmean}, Mean validation Loss: {loss_vmean}")
        # If the model performs well across all validation batches
        if loss_vmean < best_loss: 
            best_loss = loss_vmean
            # Save the model 
            #torch.save(model.state_dict(), model_path)
            print(f"Best Validation Loss: {best_loss}")

In [126]:
dropout_rate = 0.1
model = BertClassifier(dropout_rate ,9)
num_iter = 10
train(train_loader, valid_loader, num_iter, model, criterion,optimizer)




Epoch 1 of 10
Mean train Loss: 2.0757266680399575, Mean validation Loss: 2.130387544631958
Best Validation Loss: 2.130387544631958
Epoch 2 of 10
Mean train Loss: 2.03116238117218, Mean validation Loss: 2.130387544631958
Epoch 3 of 10
Mean train Loss: 2.0559387074576483, Mean validation Loss: 2.130387544631958
Epoch 4 of 10
Mean train Loss: 2.082428882519404, Mean validation Loss: 2.130387544631958
Epoch 5 of 10
Mean train Loss: 2.0759719451268515, Mean validation Loss: 2.130387544631958
Epoch 6 of 10
Mean train Loss: 2.0681277778413563, Mean validation Loss: 2.130387544631958
Epoch 7 of 10
Mean train Loss: 2.0751718225933256, Mean validation Loss: 2.130387544631958
Epoch 8 of 10
Mean train Loss: 2.073319981495539, Mean validation Loss: 2.130387544631958
Epoch 9 of 10
Mean train Loss: 2.0702677170435586, Mean validation Loss: 2.130387544631958
Epoch 10 of 10
Mean train Loss: 2.0746772567431133, Mean validation Loss: 2.130387544631958


Note that the parameters of the model are internally updating. Now once the model is finished, with the training process, let's save the model. 

In [255]:
model_path = '/Users/apple/Documents/GitHub/Transformers/model.statedict'
torch.save(model.state_dict(), model_path)

In [127]:
def test(test_loader, model, criterion ): 
    """
    Function to test the model 
    Input: 
    test_loader : to load the testing data in  batches 
    model       : the BertClassifier model 
    criterion   : the loss function (usually Cross Entropy Loss function) 
    
    """
    # Set the model into evaluation mode
    model.eval()
    test_loss = [] 
    test_acc = []
    
    # Iterate through each batch
    for batch_labels, batch_tokens in test_loader: 
        
        # Extract the input_ids and the attention masks 
        input_ids = batch_tokens["input_ids"] 
        attention_mask = batch_tokens["attention_mask"]
        
        # Squeeze the last dimension
        input_ids = torch.squeeze(input_ids, 1)
        attention_mask =  torch.squeeze(attention_mask, 1) 
        # Make a prediction using the model 
        bert_output = model(input_ids, attention_mask)

        # Calculate and save the loss 
        loss = criterion(bert_output, batch_labels) 
        test_loss.append(loss.item())
        
        # Find the predicted class and compute the accuracy 
        model_preds = torch.argmax(bert_output, axis = 1)
        # Compute accuracy
        test_acc.append(accuracy_score(batch_labels.detach().
                                        numpy(),
                                        model_preds.detach().
                                        numpy()))
    
    # Take the mean loss across all batches 
    mean_loss = np.mean(test_loss)
    mean_acc = np.mean(test_acc) * 100
    print(f"Mean Test Loss: {mean_loss}, Mean Test Accuracy: {mean_acc}")
    return(mean_loss,mean_acc)
        

In [128]:
test(test_loader, model, criterion)

Mean Test Loss: 1.876768946647644, Mean Test Accuracy: 53.125


(1.876768946647644, 53.125)

## Predict on new text
---

In [129]:
input_text = '''I am a victim of Identity Theft & currently have an Experian account that 
I can view my Experian Credit Report and getting notified when there is activity on 
my Experian Credit Report. For the past 3 days I've spent a total of approximately 9 
hours on the phone with Experian. Every time I call I get transferred repeatedly and 
then my last transfer and automated message states to press 1 and leave a message and 
someone would call me. Every time I press 1 I get an automatic message stating than you 
before I even leave a message and get disconnected. I call Experian again, explain what 
is happening and the process begins again with the same end result. I was trying to have 
this issue attended and resolved informally but I give up after 9 hours. There are hard 
hit inquiries on my Experian Credit Report that are fraud, I didn't authorize, or recall 
and I respectfully request that Experian remove the hard hit inquiries immediately just 
like they've done in the past when I was able to speak to a live Experian representative 
in the United States. The following are the hard hit inquiries : BK OF XXXX XX/XX/XXXX 
XXXX XXXX XXXX  XX/XX/XXXX XXXX  XXXX XXXX  XX/XX/XXXX XXXX  XX/XX/XXXX XXXX  XXXX 
XX/XX/XXXX'''

In [130]:
input_text = input_text.lower()
input_text = re.sub(r"[^\w\d'\s]+", " ", input_text)
input_text = re.sub("\d+", "", input_text)
input_text = re.sub(r'[x]{2,}', "", input_text)
input_text = re.sub(' +', ' ', input_text)

In [131]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")



In [132]:
tokens = tokenizer(input_text, padding="max_length",
                 max_length=seq_len, truncation=True,
                 return_tensors="pt")

In [133]:
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]

In [47]:
device = torch.device("cuda:0" if torch.cuda.is_available()
                     else "cpu")

In [48]:
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

In [134]:
input_ids = torch.squeeze(input_ids, 1)

In [138]:
# Create model object
model = BertClassifier(dropout, 9)

# Move the model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()
    
# Forward pass
out = torch.squeeze(model(input_ids, attention_mask))

# Find predicted class
prediction = label_encoder.classes_[torch.argmax(out)]
print(f"Predicted Class: {prediction}")



Predicted Class: vehicle_loan
