In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
%matplotlib inline

In [None]:
# Load the consumer-complaints dataset.
# The CSV location can be overridden with the COMPLAINT_DATA_CSV environment
# variable so the notebook is runnable on other machines; when the variable
# is not set, the original local path is used unchanged.
import os

COMPLAINT_DATA_CSV = os.environ.get(
    "COMPLAINT_DATA_CSV",
    "C:\\Users\\hugoo\\complaint_data.csv",
)
dataset_CC = pd.read_csv(COMPLAINT_DATA_CSV)

In [None]:
# Non-null entries in the 'Product' column
print(dataset_CC['Product'].count())

# Complaints whose narrative is missing
print(dataset_CC['Consumer complaint narrative'].isna().sum())

# Complaints that do carry a narrative
print(dataset_CC['Consumer complaint narrative'].notna().sum())


In [None]:
# Bar chart: complaints per product, before any pre-processing

import cufflinks as cf

# cufflinks configuration used by the .iplot() call below
cf.go_offline()
cf.set_config_file(world_readable=True, offline=False)

(
    dataset_CC['Product']
    .value_counts()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Number of Complaints',
        title='Number complaints in each product',
    )
)

In [None]:
# Percentage of missing values per column, largest first (top 20 shown).
# The four category columns of interest are 'Product', 'Sub-product',
# 'Issue' and 'Sub-issue'.
percent_missing = dataset_CC.isna().sum() * 100 / dataset_CC.shape[0]
print(
    percent_missing
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Keep only the four category columns plus the free-text narrative
dataset_CC = dataset_CC.loc[
    :,
    ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative'],
].copy()

# Non-null Product count before discarding rows without a narrative
print(dataset_CC['Product'].count())

# Discard rows whose narrative is missing (NaN)
dataset_CC = dataset_CC[dataset_CC['Consumer complaint narrative'].notna()]

# Non-null Product count after discarding them
print(dataset_CC['Product'].count())

# Give the narrative column a shorter, space-free name
dataset_CC = dataset_CC.rename(
    columns={'Consumer complaint narrative': 'Consumer_complaint'}
)

dataset_CC.shape

In [None]:
# Drop every row that has no consumer narrative. The narrative is the text
# of the ticket and is what the categories are later inferred from.
dataset_CC = dataset_CC.dropna(subset=['Consumer_complaint'])

# Show the first remaining row and the non-null Product count
print(dataset_CC.iloc[:1])
print(dataset_CC['Product'].count())

In [None]:
# Distinct Product categories, shown as a one-column array
pd.DataFrame(dataset_CC['Product'].unique()).to_numpy()

In [None]:
# Merge overlapping Product labels into a single category name each
dataset_CC = dataset_CC.replace(
    {
        'Product': {
            'Credit reporting, credit repair services, or other personal consumer reports':
                'Credit reporting, repair, or other',
            'Credit reporting': 'Credit reporting, repair, or other',
            'Credit card': 'Credit card or prepaid card',
            'Prepaid card': 'Credit card or prepaid card',
            'Payday loan': 'Payday loan, title loan, or personal loan',
            'Money transfer': 'Money transfer, virtual currency, or money service',
            'Virtual currency': 'Money transfer, virtual currency, or money service',
        }
    }
)

In [None]:
# Distinct Product categories after the merge, shown as a DataFrame
pd.DataFrame(dataset_CC['Product'].unique())

In [None]:
# Number of complaints per Product category
dataset_CC['Product'].value_counts()

In [None]:
# Bar chart: complaints per product
import cufflinks as cf

# cufflinks configuration used by the .iplot() call below
cf.go_offline()
cf.set_config_file(world_readable=True, offline=False)

(
    dataset_CC['Product']
    .value_counts()
    .sort_values(ascending=False)
    .iplot(
        kind='bar',
        yTitle='Number of Complaints',
        title='Number complaints in each product',
    )
)

In [None]:
# Inspect one raw complaint: it contains noise such as "xxxxx" masks and punctuation
print(dataset_CC['Consumer_complaint'].iat[12])

In [None]:
from nltk.corpus import stopwords
import re

# Restart the row index from zero (rows were dropped in earlier cells)
dataset_CC = dataset_CC.reset_index(drop=True)

# Separator-like symbols that are replaced by a space.
# Raw string: the pattern contains backslash escapes (\[ \] \|) that are not
# valid Python string escapes and would otherwise raise a warning.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

# Any character that is NOT a digit, a lowercase letter, a space or one of
# '#', '+', '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# English stop words to strip from the complaints
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one complaint narrative before tokenisation.

    Steps, in order: lowercase; replace the symbols matched by
    REPLACE_BY_SPACE_RE with a space; delete the characters matched by
    BAD_SYMBOLS_RE; delete "xx…" redaction masks; drop the words found in
    STOPWORDS.

    Args:
        text: the raw complaint text (a string).

    Returns:
        The cleaned text, with words separated by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Remove only runs of two or more "x" that are not attached to other
    # letters (e.g. "xxxx", "xx"). Deleting every "x" would corrupt real
    # words such as "tax" or "experian".
    text = re.sub(r'(?<![a-z])x{2,}(?![a-z])', '', text)
    # split()/join also collapses any repeated whitespace left behind
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Clean every complaint narrative
dataset_CC['Consumer_complaint'] = dataset_CC['Consumer_complaint'].apply(clean_text)

# Strip all digit runs. regex=True is required: without it recent pandas
# versions treat the pattern as a literal string and nothing is removed.
dataset_CC['Consumer_complaint'] = dataset_CC['Consumer_complaint'].str.replace(r'\d+', '', regex=True)

In [None]:
# Inspect the same complaint after cleaning: stop words, special symbols,
# non-letter characters and the "xxxx" masks should be gone
print(dataset_CC['Consumer_complaint'].iat[12])

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Maximum number of words kept in the vocabulary
MAX_WORDS = 50000
# Maximum number of words used per complaint
MAX_WORDS_COMPLAINT = 2500
# Size of the vector that represents each word
VECTOR_SIZE = 100

# Words outside the vocabulary are mapped to the oov_token
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=MAX_WORDS,
    lower=True,
)
tokenizer.fit_on_texts(dataset_CC['Consumer_complaint'].to_numpy())

# Word -> index mapping built by the tokenizer; its length is the number of unique tokens
word_index = tokenizer.word_index

print('Encontrados %s tokens únicos.' % len(word_index))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Turn each complaint into a list of word indices (lower index = more
# frequent word), then bring every sequence to the same length:
# shorter ones are padded with zeros, longer ones are cut.
X = pad_sequences(
    tokenizer.texts_to_sequences(dataset_CC['Consumer_complaint'].to_numpy()),
    maxlen=MAX_WORDS_COMPLAINT,
)

# Expected shape: (number of complaints, MAX_WORDS_COMPLAINT)
print('Shape of data tensor:', X.shape)

In [None]:
# One-hot encode the Product label: one column per category
Y = pd.get_dummies(dataset_CC['Product']).to_numpy()

# Expected shape: (number of complaints, number of Product categories)
print('Shape of label tensor:', Y.shape)

In [None]:
# Hold out 10% of the samples for testing.
# The fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.10,
)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
from tensorflow.keras.models import Sequential

# Sequential model: the layers below are stacked one after another
model = Sequential()

# Embedding: vocabulary size, vector size per word, and words per complaint
model.add(Embedding(MAX_WORDS, VECTOR_SIZE, input_length=MAX_WORDS_COMPLAINT))

# LSTM layer. dropout (on the inputs) and recurrent_dropout (on the
# recurrent connections) are hyperparameters used to limit overfitting.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

# Output layer: one neuron per Product category. The width is taken from the
# one-hot label matrix so it always matches the labels instead of relying on
# a hard-coded category count. softmax turns the outputs into probabilities;
# the highest one is the predicted category.
model.add(Dense(Y.shape[1], activation='softmax'))

# The loss measures the gap between predictions and the true labels during
# training; the optimizer is the algorithm that adjusts the weights.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Number of passes over the whole training set
epochs = 5

# Number of training samples processed before the weights are updated
batch_size = 10

# validation_split=0.1: 10% of the training data is used for validation.
# EarlyStopping monitors val_loss with patience=3 and min_delta=0.0001.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, verbose=1, min_delta=0.0001)],
)
