In [1]:
import pandas as pd
import gensim
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv('C:/Users/Samarth/Desktop/Mtech AI/NLP/Data/Consumer_Complaints.csv')

In [3]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,3/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,3/17/2014,Closed with explanation,Yes,No,759217
1,10/1/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/5/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,6/8/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,6/10/2014,Closed with explanation,Yes,Yes,885638
4,9/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,9/13/2014,Closed with explanation,Yes,Yes,1027760


In [4]:
df = df[['Consumer complaint narrative','Product']]

In [5]:
df.head()

Unnamed: 0,Consumer complaint narrative,Product
0,,Mortgage
1,I have outdated information on my credit repor...,Credit reporting
2,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
3,,Credit card
4,,Debt collection


In [6]:
df.isna().sum()

Consumer complaint narrative    704013
Product                              0
dtype: int64

In [7]:
df.dropna(inplace=True)
df = df[:1000]

In [8]:
df.shape

(1000, 2)

In [9]:
lem = WordNetLemmatizer()
stop_words = list(set(stopwords.words('english')))+list(punctuation)+['xxxx','xxx','xx','xxxxx','xxxx xxxx','l.']

In [10]:
def cleaning(text):
    text = text.lower()
    word = word_tokenize(text)
    word = [w for w in word if w not in stop_words]
    words = [lem.lemmatize(w,'v') for w in word]
    return words

In [11]:
complaints = list(df['Consumer complaint narrative'].values)

In [12]:
data = [cleaning(text) for text in complaints]

### Creating a dictionary with unique id for each word

In [13]:
dictionary = corpora.Dictionary(data)

In [14]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in data]

### To see how the words are mapped in the dictionary

In [15]:
print ([[(dictionary[id], freq) for id, freq in cp] for cp in doc_term_matrix[:1]])

[[('credit', 2), ('dispute', 1), ('information', 2), ('meet', 1), ('old', 1), ('outdated', 1), ('previously', 1), ('remove', 1), ('report', 2), ('requirements', 1), ('seven', 1), ('years', 1), ('yet', 1)]]


### Here we can interpret that first five topics

In [16]:
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(corpus = doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=5, num_words=7))

[(0, '0.047*"report" + 0.040*"credit" + 0.026*"account" + 0.021*"information" + 0.014*"dispute" + 0.011*"request" + 0.010*"remove"'), (1, '0.036*"loan" + 0.012*"interest" + 0.012*"``" + 0.010*"pay" + 0.008*"make" + 0.008*"payments" + 0.008*"amount"'), (2, '0.020*"bank" + 0.015*"account" + 0.012*"mortgage" + 0.010*"check" + 0.010*"would" + 0.010*"tell" + 0.010*"``"'), (3, '0.032*"debt" + 0.018*"pay" + 0.010*"collection" + 0.010*"call" + 0.009*"company" + 0.009*"receive" + 0.008*"send"'), (4, '0.023*"call" + 0.019*"credit" + 0.017*"account" + 0.013*"card" + 0.012*"pay" + 0.011*"say" + 0.010*"get"')]


### Coherence Model basically divides the process into 4 different stages i.e. Segmentation, Probability Approch, Confirmation Measure and Aggregation. The topic coherence is applied to the top N words from the topic. It is defined as the average of the pairwise word-similarity scores of the words in the topic. Hence higher the score better the model is

In [17]:
print('Perplexity: ', ldamodel.log_perplexity(doc_term_matrix))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=ldamodel, texts=data, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -7.067617307273172
Coherence Score:  0.41391777481759445


In [18]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis