In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import time
import re
import string
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

from nltk.corpus import stopwords
stop = stopwords.words('english')

import spacy
from spacy.util import minibatch, compounding

import torch

import warnings
warnings.filterwarnings(action="ignore")

In [106]:
# # read the data
# df = pd.read_csv('consumer_complaints.csv')

In [107]:
# df.head()

In [108]:
# df.info()

In [109]:
# # find out the number of null valuesby column
# df.isnull().sum().sort_values(ascending=False)

In [110]:
# # drop all the rows with null values in the 'consumer_complaint_narrative' column as that is the column we will 
# # be using for our analysis
# df.dropna(subset=['consumer_complaint_narrative'], axis=0, inplace=True)

In [111]:
# # check to make sure all the null values in the required column have been dropped
# df.isnull().sum().sort_values(ascending=False)

In [112]:
# df.info()

### Preprocess

In [113]:
# # create a new dataframe with the only two columns required for our analysis
# df = df[['product', 'consumer_complaint_narrative']]

In [114]:
# # function to clean text
# def clean_text_round1(text):
#     text = text.lower()                                                 # lowercase text
#     text = re.sub('\{.*?\}', '', text)                                  # remove text in curly brackets
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)     # remove punctuations
#     text = re.sub('\w*\d\w*', '', text)                                 # remove numbers like dates
#     text = re.sub('\n', '', text)                                       # remove new line characters
#     return text

In [115]:
# df['clean_text'] = df['consumer_complaint_narrative'].apply(clean_text_round1)

In [116]:
# df.head()

In [117]:
# # function to remove xx's
# def remove_xx(text):
#     words = str(text).split()
#     for word in words:
#         if len(word) >= 2:
#             if word[0] == 'x' and word[1] == 'x':
#                 words.remove(word)
            
#     return ' '.join(words)

# remove_xx('hello, world xxxxxxxxxx')

In [118]:
# df['clean_text'] = df['clean_text'].map(lambda x: remove_xx(x))

In [119]:
# remove stop words
# df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [120]:
# # this is enough preprocessing for now
# df.head()

In [121]:
# # save the cleaned dataset to a csv file
# df.to_csv('cleaned_consumer_complaints.csv', index=False)

In [53]:
# now we can start loading the file from here itself
df = pd.read_csv('cleaned_consumer_complaints.csv')

In [54]:
 # only the first 1000 rows to save time during training
df = df[['product', 'clean_text']][:1500]

In [55]:
df.head()

Unnamed: 0,product,clean_text
0,Debt collection,claimed owe years despite proof payment sent c...
1,Consumer Loan,due inconsistencies amount owed told bank amou...
2,Mortgage,wages earned job decreased almost half knew tr...
3,Mortgage,open current mortgage chase bank chase reporti...
4,Mortgage,submitted time submitted complaint dealt rushm...


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product     1500 non-null   object
 1   clean_text  1500 non-null   object
dtypes: object(2)
memory usage: 23.6+ KB


In [123]:
# # reset index
# df.reset_index()

In [58]:
# check the frequency of each category
100.0*df['product'].value_counts()/len(df)

Debt collection            25.600000
Mortgage                   25.400000
Credit reporting           14.600000
Credit card                12.466667
Bank account or service     8.133333
Consumer Loan               5.666667
Student loan                4.266667
Payday loan                 1.600000
Money transfers             1.400000
Prepaid card                0.666667
Other financial service     0.200000
Name: product, dtype: float64

### Objective
The idea here is to use the text in the 'consumer_complaint_narrative' column to categorise it to the right category 

### Prepare train/test/valid dataset

In [59]:
label_values = list(df['product'].unique())
label_values

['Debt collection',
 'Consumer Loan',
 'Mortgage',
 'Credit card',
 'Credit reporting',
 'Student loan',
 'Bank account or service',
 'Payday loan',
 'Money transfers',
 'Other financial service',
 'Prepaid card']

In [60]:
train_X, test_X, train_y, test_y = train_test_split(df['clean_text'],
                                                   df['product'],
                                                   test_size=0.15,
                                                   stratify=df['product'],
                                                    random_state=36
                                                   )

train_X, valid_X, train_y, valid_y = train_test_split(train_X,
                                                     train_y,
                                                     test_size=0.15,
                                                     stratify=train_y,
                                                      random_state=36
                                                     )

In [122]:
print('Shape of train_X:', train_X.shape)
print('Shape of train_y:', train_y.shape)

print('\nShape of test_X:', test_X.shape)
print('Shape of test_y:', test_y.shape)

print('\nShape of valid_X:', valid_X.shape)
print('Shape of valid_y:', valid_y.shape)

Shape of train_X: (1083,)
Shape of train_y: (1083,)

Shape of test_X: (225,)
Shape of test_y: (225,)

Shape of valid_X: (192,)
Shape of valid_y: (192,)


### Convert dataset to spacy compatible format

In [62]:
# one hot encode all the labels
train_y_df = pd.get_dummies(train_y)
test_y_df = pd.get_dummies(test_y)
valid_y_df = pd.get_dummies(valid_y)

In [63]:
train_y_df.head()

Unnamed: 0,Bank account or service,Consumer Loan,Credit card,Credit reporting,Debt collection,Money transfers,Mortgage,Other financial service,Payday loan,Prepaid card,Student loan
1153,0,0,0,0,0,0,1,0,0,0,0
65,0,0,1,0,0,0,0,0,0,0,0
99,0,0,0,1,0,0,0,0,0,0,0
1256,0,0,0,0,0,0,0,0,0,0,1
973,0,0,0,0,1,0,0,0,0,0,0


In [64]:
# convert data to text list and label dictionaries
train_texts = train_X.tolist()
train_cats = train_y_df.to_dict(orient='records')

test_texts = test_X.tolist()
test_cats = test_y_df.to_dict(orient='records')

valid_texts = valid_X.tolist()
valid_cats = valid_y_df.to_dict(orient='records')

In [65]:
# combine the text and labels to create data in spacy format
train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))
test_data = list(zip(test_texts, [{'cats': cats} for cats in test_cats]))
valid_data = list(zip(valid_texts, [{'cats': cats} for cats in valid_cats]))

In [66]:
# check
train_data[:2]

[('done multiple requests mortgage consider giving help heloc loan fulfilling requirements denied trying add mortgage xxxx deny kind help',
  {'cats': {'Bank account or service': 0,
    'Consumer Loan': 0,
    'Credit card': 0,
    'Credit reporting': 0,
    'Debt collection': 0,
    'Money transfers': 0,
    'Mortgage': 1,
    'Other financial service': 0,
    'Payday loan': 0,
    'Prepaid card': 0,
    'Student loan': 0}}),
 ('complaint regards credit card bank america payment made made insurance company purchased bank america event life changes problem made aware bank america longer conducts business result money paid insurance company well refund sent bank america behalf instead bank america sending check applied credit brought balance problem right none well however bring balance said payment credit said something paid monthly like went store purchased something returned case would credit service provided customers event unable pay bill issue bank america accepted payment charged

In [67]:
# check
test_data[:2]

[('paid back sent letter credit bureaus wife showing account satisfied full problem credit bureaus reporting account remark section settled less full amount asked remove remark account uploaded faxed documentation showing account satisfied full xxxx final process getting xxxx home remark still showing reports need remark removed',
  {'cats': {'Bank account or service': 0,
    'Consumer Loan': 0,
    'Credit card': 0,
    'Credit reporting': 1,
    'Debt collection': 0,
    'Money transfers': 0,
    'Mortgage': 0,
    'Payday loan': 0,
    'Prepaid card': 0,
    'Student loan': 0}}),
 ('mortgage company city mortgage would rather foreclose except monthly payments fighting mortgage company two years believe predatory lending fraud went modification signed modification still foreclosed',
  {'cats': {'Bank account or service': 0,
    'Consumer Loan': 0,
    'Credit card': 0,
    'Credit reporting': 0,
    'Debt collection': 0,
    'Money transfers': 0,
    'Mortgage': 1,
    'Payday loan':

In [68]:
# unpack the text and lables used for evaluation later
train_texts, train_labels = list(zip(*train_data))
test_texts, test_labels = list(zip(*test_data))
valid_texts, valid_labels = list(zip(*valid_data))

### Construct spacy model

In [96]:
def train_spacy(iterations, model_arch, dropout, learn_rate, output_dir):

    nlp = spacy.load('en_core_web_lg')

    textcat = nlp.create_pipe('textcat', config={'exclusive_classes':True, 'architecture':model_arch})
    nlp.add_pipe(textcat)

    for _, label in enumerate(label_values):
        textcat.add_label(label)

    pipe_exceptions = ['textcat']
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):
    #     print(nlp.pipe_names)
        optimizer = nlp.begin_training()
        optimizer.learn_rate = learn_rate
        print('Training the model..')
        total_start_time = time.clock()

    for i in range(iterations):
        print('\nIteration:', str(i+1))
        start_time = time.clock()
        losses = {}
        true_labels = []
        pred_labels = []

        random.shuffle(train_data)
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)

        with textcat.model.use_params(optimizer.averages):
            
            nlp.to_disk(output_dir)

            docs = [nlp.tokenizer(text) for text in valid_texts]

            for j, doc in enumerate(textcat.pipe(docs)):
                true_series = pd.Series(valid_labels[j]['cats'])
                true_label = true_series.idxmax()
                true_labels.append(true_label)

                pred_series = pd.Series(doc.cats)
                pred_label = pred_series.idxmax()
                pred_labels.append(pred_label)

            score_f1 = f1_score(true_labels, pred_labels, average='weighted')
            score_ac = accuracy_score(true_labels, pred_labels)
            
            print(classification_report(true_labels, pred_labels))
            print('\ntextcat_loss: {:.3f}\t f1_score: {:.3f}\t accuracy_score: {:.3f}'.format(losses['textcat'], score_f1, score_ac))

            print('\nElapsed time:', str(round((time.clock() - start_time)/60,2)) + ' minutes')
            
    print('\nTotal time:', str(round((time.clock() - total_start_time)/60,2)) + ' minutes')
            
    return nlp

In [None]:
# function to evaluate results on unseen test data
def evaluate(test_texts, test_labels, model):
    
    nlp = spacy.load(model)
    
    docs = [nlp.tokenizer(text) for text in test_texts]
    
    textcat = nlp.get_pipe('textcat')
    
    true_labels = []
    pred_labels = []

    for j, doc in enumerate(textcat.pipe(docs)):
        true_series = pd.Series(test_labels[j]['cats'])
        true_label = true_series.idxmax()
        true_labels.append(true_label)

        pred_series = pd.Series(doc.cats)
        pred_label = pred_series.idxmax()
        pred_labels.append(pred_label)
        
    print(classification_report(true_labels, pred_labels))

In [97]:
# bag of words model architecture
train_spacy(10, 'bow', 0.2, 4e-3, 'bow_model')

Training the model..

Iteration: 1
                         precision    recall  f1-score   support

Bank account or service       0.61      0.69      0.65        16
          Consumer Loan       0.75      0.27      0.40        11
            Credit card       0.59      0.79      0.68        24
       Credit reporting       0.83      0.86      0.84        28
        Debt collection       0.86      0.86      0.86        49
        Money transfers       1.00      0.33      0.50         3
               Mortgage       0.85      0.94      0.89        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.40      0.25      0.31         8

               accuracy                           0.77       192
              macro avg       0.59      0.50      0.51       192
           weighted avg       0.76      0.77      0.75       192


textcat_loss: 7.950	 f1_score: 0.752	 accuracy_scor

                         precision    recall  f1-score   support

Bank account or service       0.56      0.56      0.56        16
          Consumer Loan       0.60      0.27      0.37        11
            Credit card       0.59      0.71      0.64        24
       Credit reporting       0.76      0.79      0.77        28
        Debt collection       0.76      0.84      0.80        49
        Money transfers       1.00      0.33      0.50         3
               Mortgage       0.89      0.96      0.92        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.75      0.38      0.50         8

               accuracy                           0.74       192
              macro avg       0.59      0.48      0.51       192
           weighted avg       0.73      0.74      0.73       192


textcat_loss: 0.193	 f1_score: 0.728	 accuracy_score: 0.745

Elapsed time: 5.59 minute

<spacy.lang.en.English at 0x7fd635aac710>

In [101]:
# evaluate results on unseen test data
evaluate(test_texts, test_labels, 'bow_model')

                         precision    recall  f1-score   support

Bank account or service       0.70      0.78      0.74        18
          Consumer Loan       0.50      0.31      0.38        13
            Credit card       0.75      0.86      0.80        28
       Credit reporting       0.78      0.76      0.77        33
        Debt collection       0.76      0.90      0.83        58
        Money transfers       0.00      0.00      0.00         3
               Mortgage       0.88      0.89      0.89        57
            Payday loan       1.00      0.25      0.40         4
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.83      0.50      0.62        10

               accuracy                           0.78       225
              macro avg       0.62      0.52      0.54       225
           weighted avg       0.77      0.78      0.77       225



In [102]:
# convolutional neural network as model architecture
train_spacy(10, 'simple_cnn', 0.2, 4e-3, 'simple_cnn_model')

Training the model..

Iteration: 1
                         precision    recall  f1-score   support

Bank account or service       1.00      0.06      0.12        16
          Consumer Loan       0.00      0.00      0.00        11
            Credit card       0.42      0.79      0.55        24
       Credit reporting       0.83      0.68      0.75        28
        Debt collection       0.75      0.80      0.77        49
        Money transfers       0.00      0.00      0.00         3
               Mortgage       0.65      0.94      0.77        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.00      0.00      0.00         8

               accuracy                           0.65       192
              macro avg       0.36      0.33      0.30       192
           weighted avg       0.61      0.65      0.58       192


textcat_loss: 11.246	 f1_score: 0.580	 accuracy_sco

                         precision    recall  f1-score   support

Bank account or service       0.44      0.50      0.47        16
          Consumer Loan       0.40      0.36      0.38        11
            Credit card       0.76      0.67      0.71        24
       Credit reporting       0.75      0.75      0.75        28
        Debt collection       0.84      0.84      0.84        49
        Money transfers       1.00      0.67      0.80         3
               Mortgage       0.80      0.92      0.86        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.67      0.50      0.57         8

               accuracy                           0.73       192
              macro avg       0.57      0.52      0.54       192
           weighted avg       0.73      0.73      0.73       192


textcat_loss: 0.515	 f1_score: 0.728	 accuracy_score: 0.734

Elapsed time: 5.12 minute

<spacy.lang.en.English at 0x7fd635526358>

In [103]:
# evaluate results on unseen test data
evaluate(test_texts, test_labels, 'simple_cnn_model')

                         precision    recall  f1-score   support

Bank account or service       0.65      0.72      0.68        18
          Consumer Loan       0.46      0.46      0.46        13
            Credit card       0.69      0.71      0.70        28
       Credit reporting       0.84      0.79      0.81        33
        Debt collection       0.80      0.81      0.80        58
        Money transfers       0.00      0.00      0.00         3
               Mortgage       0.78      0.88      0.83        57
            Payday loan       0.00      0.00      0.00         4
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.62      0.50      0.56        10

               accuracy                           0.74       225
              macro avg       0.48      0.49      0.48       225
           weighted avg       0.72      0.74      0.73       225



In [104]:
# ensemble model architecture
train_spacy(10, 'ensemble', 0.2, 4e-3, 'ensemble_model')

Training the model..

Iteration: 1
                         precision    recall  f1-score   support

Bank account or service       0.55      0.38      0.44        16
          Consumer Loan       0.00      0.00      0.00        11
            Credit card       0.29      0.21      0.24        24
       Credit reporting       0.76      0.57      0.65        28
        Debt collection       0.49      0.82      0.61        49
        Money transfers       0.00      0.00      0.00         3
               Mortgage       0.75      0.94      0.84        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.00      0.00      0.00         8

               accuracy                           0.59       192
              macro avg       0.28      0.29      0.28       192
           weighted avg       0.51      0.59      0.53       192


textcat_loss: 11.870	 f1_score: 0.532	 accuracy_sco

                         precision    recall  f1-score   support

Bank account or service       0.67      0.88      0.76        16
          Consumer Loan       0.80      0.36      0.50        11
            Credit card       0.75      0.75      0.75        24
       Credit reporting       0.81      0.75      0.78        28
        Debt collection       0.84      0.84      0.84        49
        Money transfers       0.50      0.33      0.40         3
               Mortgage       0.89      1.00      0.94        49
            Payday loan       0.00      0.00      0.00         3
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.56      0.62      0.59         8

               accuracy                           0.80       192
              macro avg       0.58      0.55      0.56       192
           weighted avg       0.78      0.80      0.78       192


textcat_loss: 1.973	 f1_score: 0.784	 accuracy_score: 0.797

Elapsed time: 6.43 minute

<spacy.lang.en.English at 0x7fd53ee04a90>

In [105]:
# evaluate results on unseen test data
evaluate(test_texts, test_labels, 'ensemble_model')

                         precision    recall  f1-score   support

Bank account or service       0.61      0.78      0.68        18
          Consumer Loan       1.00      0.62      0.76        13
            Credit card       0.75      0.96      0.84        28
       Credit reporting       0.80      0.73      0.76        33
        Debt collection       0.85      0.86      0.85        58
        Money transfers       0.00      0.00      0.00         3
               Mortgage       0.93      0.93      0.93        57
            Payday loan       0.67      0.50      0.57         4
           Prepaid card       0.00      0.00      0.00         1
           Student loan       0.56      0.50      0.53        10

               accuracy                           0.81       225
              macro avg       0.62      0.59      0.59       225
           weighted avg       0.81      0.81      0.80       225

