In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


In [2]:
import itertools
import os

%matplotlib inline
#import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence

from keras import utils
  
print("tensorflow version",tf.__version__)

tensorflow version 2.13.0


In [3]:
df = pd.read_csv('Consumer_Complaints.csv', encoding='latin-1')

df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer Complaint,Company Public Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date Sent to Company,Company Response to Consumer,Timely response?,Consumer disputed?,Complaint ID,Unnamed: 18
0,03-12-2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217,
1,10-01-2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10-05-2016,Closed with explanation,Yes,No,2141773,
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100,
3,06-08-2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06-10-2014,Closed with explanation,Yes,Yes,885638,
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760,


In [4]:
col = ['Consumer Complaint','Product']
df = df[col]
df = df[pd.notnull(df['Consumer Complaint'])]
df.head()

Unnamed: 0,Consumer Complaint,Product
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
7,An account on my credit report has a mistaken ...,Credit reporting
12,This company refuses to provide me verificatio...,Debt collection
16,This complaint is in regards to Square Two Fin...,Debt collection


In [5]:
df.isnull().sum()
df['Product'].value_counts()

Product
Debt collection                                                                 63268
Credit reporting, credit repair services, or other personal consumer reports    49006
Mortgage                                                                        43837
Credit reporting                                                                31593
Credit card                                                                     18842
Student loan                                                                    16689
Bank account or service                                                         14887
Credit card or prepaid card                                                     10659
Consumer Loan                                                                    9474
Checking or savings account                                                      6489
Money transfer, virtual currency, or money service                               3089
Vehicle loan or lease                         

In [6]:
train_size = int(len(df) *0.3)
print("Train size: %d "% train_size)
print("Test size: %d" %(len(df) - train_size))

Train size: 83344 
Test size: 194470


In [7]:
train_narrative = df["Consumer Complaint"][:train_size]
train_product = df["Product"][:train_size]

test_narrative = df["Consumer Complaint"][:train_size]
test_product = df["Product"][:train_size]

In [8]:
max_words = 1000
tokenize= text.Tokenizer(num_words = max_words, char_level=False)

tokenize.fit_on_texts(train_narrative)
x_train = tokenize.texts_to_matrix(train_narrative)
x_test = tokenize.texts_to_matrix(test_narrative)

In [None]:
#Use sklearn utility to convert label strings to numbered index

encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)
y_test = encoder.transform(test_product)

In [None]:
num_classes = np.max(y_train)+1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [None]:
print("x_train shape" , x_train.shape)
print("x_test shape" , x_test.shape)
print("y_train shape" , y_train.shape)
print("y_test shape" , y_test.shape)


x_train shape (83344, 1000)
x_test shape (83344, 1000)
y_train shape (83344, 18)
y_test shape (83344, 18)


In [None]:
model = Sequential()
model.add(Dense(512,input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              


In [None]:
batch_size = 32
epochs = 5

history = model.fit(x_train,y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose = 1,
                    validation_split=0.1
                    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#Evaluate the accuracy of our trained model
score = model.evaluate(x_test,y_test,
                       batch_size=batch_size, verbose=1
                       )
print(score)
print('test score ',score[0])
print('test accuracy',score[1])

[0.33914506435394287, 0.8966332077980042]
test score  0.33914506435394287
test accuracy 0.8966332077980042


In [None]:
# Gere's how to generate a prediction on individual examples
text_labels = encoder.classes_

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_narrative.iloc[i][:50],"...")
    print('Actual label:' + test_product.iloc[i])
    print('predicted label:' + predicted_label+ '\n')

I have outdated information on my credit report th ...
Actual label:Credit reporting
predicted label:Credit reporting

I purchased a new car on XXXX XXXX. The car dealer ...
Actual label:Consumer Loan
predicted label:Consumer Loan

An account on my credit report has a mistaken date ...
Actual label:Credit reporting
predicted label:Credit reporting

This company refuses to provide me verification an ...
Actual label:Debt collection
predicted label:Debt collection

This complaint is in regards to Square Two Financi ...
Actual label:Debt collection
predicted label:Debt collection

Started the refinance of home mortgage process wit ...
Actual label:Mortgage
predicted label:Mortgage

In XXXX, I and my ex-husband applied for a refinan ...
Actual label:Mortgage
predicted label:Mortgage

I have disputed several accounts on my credit repo ...
Actual label:Credit reporting
predicted label:Credit reporting

Mortgage was transferred to Nationstar as of XXXX/ ...
Actual label:Mortgage
predicted lab