In [1]:
#######################################
### -------- Load libraries ------- ###

# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# And pandas for data import + sklearn because you allways need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split


In [3]:
#######################################
### --------- Import data --------- ###

# Import data from csv
data = pd.read_csv(r'C:\Users\Pritam\Documents\PythonScripts\Text Data Processing\Consumer_Complaints.csv')

In [4]:
data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,3/17/2014,Closed with explanation,Yes,No,759217
1,10/1/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/5/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,6/8/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,6/10/2014,Closed with explanation,Yes,Yes,885638
4,9/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,9/13/2014,Closed with explanation,Yes,Yes,1027760


In [5]:
# Select required columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]
data.head()

Unnamed: 0,Consumer complaint narrative,Product,Issue
0,,Mortgage,"Loan modification,collection,foreclosure"
1,I have outdated information on my credit repor...,Credit reporting,Incorrect information on credit report
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan,Managing the loan or lease
3,,Credit card,Bankruptcy
4,,Debt collection,Communication tactics


In [7]:
# Remove a row if any of the three remaining columns are missing
data = data.dropna()
data.head()

Unnamed: 0,Consumer complaint narrative,Product,Issue
1,I have outdated information on my credit repor...,Credit reporting,Incorrect information on credit report
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan,Managing the loan or lease
7,An account on my credit report has a mistaken ...,Credit reporting,Credit reporting company's investigation
12,This company refuses to provide me verificatio...,Debt collection,Disclosure verification of debt
16,This complaint is in regards to Square Two Fin...,Debt collection,Improper contact or sharing of info


In [8]:
# Remove rows, where the label is present only ones (can't be split)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)
data = data.groupby('Product').filter(lambda x : len(x) > 1)

In [10]:
# Remove rows, where the label is present only ones (can't be split)
data = data.groupby('Issue').filter(lambda x : len(x) > 1)
data = data.groupby('Product').filter(lambda x : len(x) > 1)

In [17]:
# Transform your output to numeric

from sklearn import preprocessing
le1 = preprocessing.LabelEncoder()
data['Issue_label'] =le1.fit_transform(data['Issue'])
le2 = preprocessing.LabelEncoder()
data['Product_label'] =le2.fit_transform(data['Product'])

In [18]:
data.head()

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
1,I have outdated information on my credit repor...,Credit reporting,Incorrect information on credit report,72,5
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan,Managing the loan or lease,87,2
7,An account on my credit report has a mistaken ...,Credit reporting,Credit reporting company's investigation,46,5
12,This company refuses to provide me verificatio...,Debt collection,Disclosure verification of debt,53,7
16,This complaint is in regards to Square Two Fin...,Debt collection,Improper contact or sharing of info,68,7


In [38]:
# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['Issue_label']])
data.head()

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
91064,XXXX has called me to sue me over a debt from ...,Debt collection,Taking/threatening an illegal action,134,7
614550,They have collected for over 10 years and neve...,Debt collection,False statements or representation,56,7
27300,i had loans that were with navient yhat were r...,Debt collection,Disclosure verification of debt,53,7
73415,"I deposited XXXX checks in an ATM in XXXX, New...",Bank account or service,Deposits and withdrawals,52,0
248216,I contacted Equifax via phone on XXXX / XXXX ...,"Credit reporting, credit repair services, or o...",Incorrect information on your report,73,6


In [20]:
#######################################
### --------- Setup BERT ---------- ###

# Name of the BERT model to use
model_name = 'bert-base-uncased'

# Max length of tokens
max_length = 100

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)

# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [24]:
# Load the MainLayer
bert = transformer_model.layers[0]

In [25]:
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}


In [26]:
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)


In [27]:
# Then build your model output

issue = Dense(units=len(data.Issue_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
product = Dense(units=len(data.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'issue': issue, 'product': product}

In [28]:
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

In [29]:
# Take a look at the model
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          ((None, 100, 768), ( 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
pooled_output (Dropout)         (None, 768)          0           bert[0][1]                       
__________________________________________________________________________________________________
issue (Dense)                   (None, 154)          118426      pooled_output[0][0]              
_________________________________________________________________________

In [30]:
#######################################
### ------- Train the model ------- ###

# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

In [31]:
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}

In [33]:
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Ready output data for the model
y_issue = to_categorical(data['Issue_label'])
y_product = to_categorical(data['Product_label'])

In [34]:
# Tokenize the input (takes some time)
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [36]:
# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=1)

ResourceExhaustedError:  OOM when allocating tensor with shape[64,100,768] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node BERT_MultiLabel_MultiClass/bert/encoder/layer_._2/output/LayerNorm/batchnorm/mul_2 (defined at C:\Users\Pritam\anaconda3\envs\tensorflow\lib\site-packages\transformers\modeling_tf_bert.py:353) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_37401]

Function call stack:
train_function


In [None]:
#######################################
### ----- Evaluate the model ------ ###

# Ready test data
test_y_issue = to_categorical(data_test['Issue'])
test_y_product = to_categorical(data_test['Product'])
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)