In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\66885\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_pickle('consumer_complaint_dataset.data', compression='gzip')

In [4]:
topic_unique = list(df["topic"].unique())
topic_unique

['Debt collection',
 'Credit reporting, credit repair services, or other personal consumer reports',
 'Money transfer, virtual currency, or money service',
 'Mortgage',
 'Student loan',
 'Vehicle loan or lease',
 'Checking or savings account',
 'Credit card or prepaid card',
 'Credit card',
 'Payday loan, title loan, or personal loan',
 'Consumer Loan',
 'Payday loan',
 'Bank account or service',
 'Credit reporting',
 'Other financial service',
 'Prepaid card',
 'Money transfers',
 'Virtual currency']

In [5]:
df["input"]

0         transworld systems inc. \nis trying to collect...
1         I would like to request the suppression of the...
2         Over the past 2 weeks, I have been receiving e...
3         I HAD FILED WITH CFPB ON XX/XX/XXXX19 TO HAVE ...
4         I have several accounts that the balance is in...
                                ...                        
492250    I was on automatic payment for my car loan. In...
492251    I recieved a collections call from an unknown ...
492252    On XXXX XXXX, 2015, I contacted XXXX XXXX, who...
492253    I can not get from chase who services my mortg...
492254    I made a payment to CITI XXXX Credit Card on X...
Name: input, Length: 492255, dtype: object

## Cleaning text

In [6]:
import re

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df['input'] = df['input'].apply(clean_text)

# Remove groups of numerical sequences with whitespace
df['input'] = df['input'].str.replace('\d+', '')

  df['input'] = df['input'].str.replace('\d+', '')


In [7]:
data = df["input"][0:10000]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.5, max_features=2000,
                                 min_df=2,use_idf=True)
X = vectorizer.fit_transform(data)

In [9]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=len(topic_unique), max_iter=100, n_init=55, n_jobs=-1)

In [10]:
km.fit(X)



KMeans(max_iter=100, n_clusters=18, n_init=55, n_jobs=-1)

In [11]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [12]:
km.cluster_centers_

array([[0.00531037, 0.0039725 , 0.00078712, ..., 0.00793773, 0.00038612,
        0.00116101],
       [0.00291574, 0.00742936, 0.00109447, ..., 0.00564301, 0.        ,
        0.00420447],
       [0.00340815, 0.01108831, 0.00192987, ..., 0.00931645, 0.        ,
        0.        ],
       ...,
       [0.00168   , 0.01100783, 0.0025929 , ..., 0.00726653, 0.00206155,
        0.00157032],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00525593, 0.00790671, 0.00174   , ..., 0.00415915, 0.        ,
        0.00028131]])

In [13]:
order_centroids

array([[  14, 1506, 1508, ...,  325, 1731, 1106],
       [1255,  213,  337, ..., 1376,  586, 1257],
       [ 639, 1506,  912, ...,  474, 1306, 1999],
       ...,
       [ 296,  259,   14, ..., 1430, 1446, 1015],
       [1270,  576, 1508, ..., 1328, 1329,    0],
       [ 984, 1275, 1276, ..., 1113, 1111, 1287]], dtype=int64)

In [20]:
for name in range(len(topic_unique)):
    print("Cluster %d :" % name ,topic_unique[name])

Cluster 0 : Debt collection
Cluster 1 : Credit reporting, credit repair services, or other personal consumer reports
Cluster 2 : Money transfer, virtual currency, or money service
Cluster 3 : Mortgage
Cluster 4 : Student loan
Cluster 5 : Vehicle loan or lease
Cluster 6 : Checking or savings account
Cluster 7 : Credit card or prepaid card
Cluster 8 : Credit card
Cluster 9 : Payday loan, title loan, or personal loan
Cluster 10 : Consumer Loan
Cluster 11 : Payday loan
Cluster 12 : Bank account or service
Cluster 13 : Credit reporting
Cluster 14 : Other financial service
Cluster 15 : Prepaid card
Cluster 16 : Money transfers
Cluster 17 : Virtual currency


In [14]:
terms = vectorizer.get_feature_names()
for i in range(len(topic_unique)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: account report reporting opened removed closed balance collection company information
Cluster 1: paid bill collection insurance full account balance amount debt report
Cluster 2: equifa report information account reporting dispute accounts file removed tried
Cluster 3: information reporting report bankruptcy inaccurate bureaus eperian sent letter transunion
Cluster 4: consumer agency shall block section information theft identity reporting subsection
Cluster 5: loan loans student payments navient interest payment would pay told
Cluster 6: report company call inquiries number never phone information called get
Cluster 7: theft identity victim report information fraudulent accounts items account result
Cluster 8: debt collection company letter validation report collect account owe alleged
Cluster 9: bank account money check funds told america paypal would checking
Cluster 10: payment mortgage payments loan would made pay month account escrow
Cluster 11: inquiry hard unauthoriz