In [1]:
## To work with the data
import pandas as pd
import numpy as np

In [2]:
## Let's read our data
df = pd.read_csv("consumer_compliants.zip", compression = "zip")

In [3]:
## Take a loot at the data
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,4/3/2020,Vehicle loan or lease,Loan,Getting a loan or lease,Fraudulent loan,This auto loan was opened on XX/XX/2020 in XXX...,Company has responded to the consumer and the ...,TRUIST FINANCIAL CORPORATION,PA,,,Consent provided,Web,4/3/2020,Closed with explanation,Yes,,3591341
1,3/12/2020,Debt collection,Payday loan debt,Attempts to collect debt not owed,Debt is not yours,In XXXX of 2019 I noticed a debt for {$620.00}...,,CURO Intermediate Holdings,CO,806XX,,Consent provided,Web,3/12/2020,Closed with explanation,Yes,,3564184
2,2/6/2020,Vehicle loan or lease,Loan,Getting a loan or lease,Credit denial,"As stated from Capital One, XXXX XX/XX/XXXX an...",,CAPITAL ONE FINANCIAL CORPORATION,OH,430XX,,Consent provided,Web,2/6/2020,Closed with explanation,Yes,,3521949
3,3/6/2020,Checking or savings account,Savings account,Managing an account,Banking errors,"Please see CFPB case XXXX. \n\nCapital One, in...",,CAPITAL ONE FINANCIAL CORPORATION,CA,,,Consent provided,Web,3/6/2020,Closed with explanation,Yes,,3556237
4,2/14/2020,Debt collection,Medical debt,Attempts to collect debt not owed,Debt is not yours,This debt was incurred due to medical malpract...,Company believes it acted appropriately as aut...,"Merchants and Professional Bureau, Inc.",OH,432XX,,Consent provided,Web,2/14/2020,Closed with explanation,Yes,,3531704


In [4]:
## To be abe to read whole narrative
# pd.set_option("display.max_colwidth", -1)

ValueError: Value must be a nonnegative integer or None

- there are a lot of attributes which we are not interested in for the task in hand. We'll be working on Customer complaint narrative and try to perform topic modeling on that.

In [10]:
## Useful columns
df = df[['Product', 'Consumer complaint narrative']]
df.columns = ['Product', 'Narrative']

In [11]:
df.head()

Unnamed: 0,Product,Narrative
0,Vehicle loan or lease,This auto loan was opened on XX/XX/2020 in XXX...
1,Debt collection,In XXXX of 2019 I noticed a debt for {$620.00}...
2,Vehicle loan or lease,"As stated from Capital One, XXXX XX/XX/XXXX an..."
3,Checking or savings account,"Please see CFPB case XXXX. \n\nCapital One, in..."
4,Debt collection,This debt was incurred due to medical malpract...


In [12]:
df.shape

(57453, 2)

In [13]:
df['Product'].value_counts()
## Class imbalance

Product
Debt collection                21772
Credit card or prepaid card    13193
Mortgage                        9799
Checking or savings account     7003
Student loan                    2950
Vehicle loan or lease           2736
Name: count, dtype: int64

#### Preprocessing

In [14]:
## performes tokenization.
from nltk import word_tokenize

In [15]:
## Custom Tokenization:
## Those card numbers with Xx in them are of no use. also, I am not interested in amount and other numerical features. 
def tokenize(sentance):
    tokens = word_tokenize(sentance)
    tokens = [word for word in tokens if ( word.isalpha() and len(word)>3 and len(word.strip('Xx/'))>2) ]
    return tokens

In [16]:
## Used for splitting our data into train and test set.
from sklearn.model_selection import train_test_split

X, y = df[['Narrative']], df[['Product']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [18]:
## Resolving class Imbalance
from imblearn.over_sampling import RandomOverSampler

strategy = {
    "Debt collection"              :  21772,
    "Credit card or prepaid card"  :  13193,
    "Mortgage"                     :  11000 ,
    "Checking or savings account"  :  9000 ,
    "Student loan"                 :  7000 ,
    "Vehicle loan or lease"        :  7000 }

ros = RandomOverSampler(sampling_strategy = strategy, random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [19]:
## we need to represent words as numerical values/ vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
## Very helpful indeed. Different libraries may have differet implementation of the same functionality.
## It is useful to take a look at the parameters of the function
TfidfVectorizer?

[1;31mInit signature:[0m
[0mTfidfVectorizer[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0minput[0m[1;33m=[0m[1;34m'content'[0m[1;33m,[0m[1;33m
[0m    [0mencoding[0m[1;33m=[0m[1;34m'utf-8'[0m[1;33m,[0m[1;33m
[0m    [0mdecode_error[0m[1;33m=[0m[1;34m'strict'[0m[1;33m,[0m[1;33m
[0m    [0mstrip_accents[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlowercase[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mpreprocessor[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtokenizer[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0manalyzer[0m[1;33m=[0m[1;34m'word'[0m[1;33m,[0m[1;33m
[0m    [0mstop_words[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtoken_pattern[0m[1;33m=[0m[1;34m'(?u)\\b\\w\\w+\\b'[0m[1;33m,[0m[1;33m
[0m    [0mngram_range[0m[1;33m=[0m[1;33m([0m[1;36m1[0m[1;33m,[0m [1;36m1[0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [0m

In [21]:
vectorizer = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english', 
                            max_df = 0.75, min_df = 50, max_features = 10000)
vectors = vectorizer.fit_transform(X_train['Narrative'])



In [22]:
vectors.shape

(68965, 4943)

In [28]:
## Feature names mapped to the array index.
## 1st feature in vectors is the word at 1 index of the list.
vectorizer.get_feature_names_out()

array(['aadvantage', 'abandoned', 'abide', ..., 'youre', 'zero', 'zone'],
      dtype=object)

### Model Building

In [29]:
from sklearn.decomposition import LatentDirichletAllocation

In [30]:
LatentDirichletAllocation?

[1;31mInit signature:[0m
[0mLatentDirichletAllocation[0m[1;33m([0m[1;33m
[0m    [0mn_components[0m[1;33m=[0m[1;36m10[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mdoc_topic_prior[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtopic_word_prior[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlearning_method[0m[1;33m=[0m[1;34m'batch'[0m[1;33m,[0m[1;33m
[0m    [0mlearning_decay[0m[1;33m=[0m[1;36m0.7[0m[1;33m,[0m[1;33m
[0m    [0mlearning_offset[0m[1;33m=[0m[1;36m10.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_iter[0m[1;33m=[0m[1;36m10[0m[1;33m,[0m[1;33m
[0m    [0mbatch_size[0m[1;33m=[0m[1;36m128[0m[1;33m,[0m[1;33m
[0m    [0mevaluate_every[0m[1;33m=[0m[1;33m-[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mtotal_samples[0m[1;33m=[0m[1;36m1000000.0[0m[1;33m,[0m[1;33m
[0m    [0mperp_tol[0m[1;33m=[0m[1;36m0.1[0m[1;33m,[0m[1;33m
[0m    [0mmean_change_tol[0m[1;33

In [31]:
lda = LatentDirichletAllocation(n_components = 6, learning_method = 'online', max_iter = 100,
                         n_jobs = -1, random_state = 123)

W1 = lda.fit_transform(vectors)

H1 = lda.components_

In [32]:
## H1
W1

array([[0.03826295, 0.01854838, 0.03993724, 0.09456617, 0.01835607,
        0.79032918],
       [0.03284839, 0.03272191, 0.03276249, 0.03349488, 0.0333319 ,
        0.83484044],
       [0.02750916, 0.0478809 , 0.0276351 , 0.02772895, 0.02712774,
        0.84211816],
       ...,
       [0.03619558, 0.03749331, 0.03575796, 0.38915436, 0.03611963,
        0.46527916],
       [0.02123505, 0.02122891, 0.89386232, 0.02122752, 0.02122279,
        0.02122342],
       [0.25284468, 0.03635479, 0.03596977, 0.60240052, 0.0361438 ,
        0.03628645]])

In [33]:
## W1 -> Probability of document being of the topic(i). 6 topics->6 probabilities.
## H1 -> Topic-> Words
print(W1.shape, H1.shape)

(68965, 6) (6, 4943)


In [34]:
vocab = np.array(vectorizer.get_feature_names_out())

## Gives last (numwords-1) numbers in descending order. from n to 0.
## So, for each topic, we get top 10 words that appear in that topic.
def get_top_words(words):
    res = []
    for index in np.argsort(words)[:-10:-1]:
        res.append(vocab[index])
    return res
## top_words = lambda words: [vocab[index] for index in np.argsort(words)[:-top:-1]]

## Get topic words using H1.
topic_words = (get_top_words(words) for words in H1)
topics = [','.join(t) for t in topic_words]

In [37]:
topics
## These are based on my intuition. Can be interpreted entirely differently as well.
## 0 -> Student Loans
## 1 -> Mortgage
## 2 -> Credit card or prepaid card
## 3 -> Debt Collection
## 4 -> Vehicle loan or lease
## 5 -> Checking or savings account

['loan,loans,payments,student,navient,payment,forbearance,rate,income',
 'mortgage,escrow,loan,modification,property,insurance,home,foreclosure,servicing',
 'usaa,bonus,opened,promotion,checks,suntrust,union,credit,relation',
 'debt,credit,collection,company,account,report,information,letter,reporting',
 'identity,theft,victim,belong,report,affidavit,debt,santander,does',
 'account,bank,card,payment,credit,told,called,money,said']

In [38]:
colnames = ["Topic_" + str(i) for i in range(lda.n_components)]
df1 = pd.DataFrame(np.round(W1, 2), columns = colnames, index = X_train.index)
# significant_topic = np.argmax(df1.values, axis = 1)
df1["Dominant Topic"] = np.argmax(df1.values, axis = 1)

In [39]:
df1 = X_train.join(df1)
df1 = df1.join(y_train)

In [40]:
df1.head()

Unnamed: 0,Narrative,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5,Dominant Topic,Product
0,On ( date ) I contacted Bank of America in reg...,0.04,0.02,0.04,0.09,0.02,0.79,5,Checking or savings account
1,I had a issue with capital one buy power card ...,0.03,0.03,0.03,0.03,0.03,0.83,5,Credit card or prepaid card
2,"On XX/XX/2019, 20 fraudulent charges were made...",0.03,0.05,0.03,0.03,0.03,0.84,5,Credit card or prepaid card
3,Started a Credit Card through XXXX XXXX with X...,0.03,0.03,0.03,0.06,0.03,0.82,5,Credit card or prepaid card
4,I went in to a Union Bank located in XXXX XXXX...,0.02,0.02,0.07,0.08,0.02,0.79,5,Checking or savings account


In [41]:
W2 = lda.transform(vectorizer.transform(X_test['Narrative']))

X_test["Dominant Topic"] = np.argmax(W2, axis =1)

In [42]:
X_test.join(y_test).head(10)

Unnamed: 0,Narrative,Dominant Topic,Product
40496,Our mortgage loan is serviced by The Money Sou...,5,Mortgage
17199,To Whom It May Concern : This letter is regard...,3,Debt collection
45428,I started receiving calls from Portfolio Recov...,3,Debt collection
56085,"This company, Credence Resource Management is ...",3,Debt collection
41199,"I shopped at Nordstrom rack on XX/XX/XXXX, and...",5,Credit card or prepaid card
35599,In XXXX I filed complaint number XXXX. I also ...,5,Mortgage
17220,I recently started to look into my credit back...,3,Debt collection
14443,This company ( National credit Adjusters ) whi...,3,Debt collection
27947,I present the complaint of the collection agen...,3,Debt collection
172,I moved out of my old apartment complex before...,3,Debt collection
