<a href="https://colab.research.google.com/github/srivatsan88/YouTubeLI/blob/master/Topic_Modeling_using_NMF_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

In [None]:
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' package, older ones from 'punkt'; request both
# so the notebook runs on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Read the zipped consumer-complaints CSV directly from the GitHub repository.
df = pd.read_csv(
    'https://github.com/srivatsan88/YouTubeLI/blob/master/dataset/consumer_compliants.zip?raw=true',
    compression='zip',
    sep=',',
    quotechar='"',
)

In [None]:
# Preview the raw table loaded by read_csv.
df

In [None]:
# Row count for each distinct value of the 'Product' column.
df['Product'].value_counts()

In [None]:
# Row count for each distinct value of the 'Company' column.
df['Company'].value_counts()

In [None]:
# Keep only the narrative text with its product and company columns, and give
# the long narrative column the shorter name 'complaints'.
complaints_df = (
    df[['Consumer complaint narrative', 'Product', 'Company']]
    .rename(columns={'Consumer complaint narrative': 'complaints'})
)

In [None]:
# Show complaint text without truncation. None means "no column-width limit";
# the older -1 sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
complaints_df

In [None]:
# Hold out 30% of the complaints; the fixed random_state makes the split repeatable.
X_train, X_hold = train_test_split(
    complaints_df,
    test_size=0.3,
    random_state=111,
)

In [None]:
# Row count for each 'Product' value within the training split.
X_train['Product'].value_counts()

In [None]:
# Fetch the NLTK stop-word corpus, then load the English list as a set.
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

# English Snowball stemmer, applied per token inside tokenize().
stemmer = nltk.stem.SnowballStemmer('english')

In [None]:
def tokenize(text):
   """Split a complaint into lower-cased, stop-word-free stems.

   A token produced by nltk.word_tokenize is kept only when all of these hold:
     * it is longer than 3 characters;
     * more than 2 characters remain after stripping leading/trailing
       'X', 'x' and '/' (this drops masks such as 'XXXX');
     * more than 3 characters remain once digit runs are removed from that
       stripped form (this drops dates such as 'XX/XX/2019').

   Kept tokens are lower-cased, filtered against the module-level
   ``stop_words`` set and stemmed with the module-level ``stemmer``.

   Args:
      text: Raw complaint string.

   Returns:
      List of stemmed tokens, in their original order.
   """
   kept = []
   for word in nltk.word_tokenize(text):
      core = word.strip('Xx/')
      # Raw string: '\d' inside a plain literal is an invalid escape sequence.
      if len(word) > 3 and len(core) > 2 and len(re.sub(r'\d+', '', core)) > 3:
         kept.append(word.lower())
   return [stemmer.stem(token) for token in kept if token not in stop_words]

In [None]:
# TF-IDF over the unigrams and bigrams produced by the custom tokenizer.
# Lower-casing and stop-word removal are switched off here because tokenize()
# already performs both.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    lowercase=False,
    stop_words=None,
    ngram_range=(1, 2),
    max_df=0.75,
    max_features=1000,
)
tfidf_vectors = vectorizer.fit_transform(X_train['complaints'])

In [None]:
# Dense view of the TF-IDF matrix. .toarray() replaces the .A shorthand, which
# newer SciPy releases no longer provide on sparse matrices.
tfidf_vectors.toarray()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary as an array.
vectorizer.get_feature_names_out()

In [None]:
# NMF model with 6 components; random_state is fixed so repeated runs use the same seed.
clf = decomposition.NMF(n_components=6, random_state=111)

# W1: result of fitting on the training TF-IDF matrix (one row per document, one column per component).
W1 = clf.fit_transform(tfidf_vectors)
# H1: the fitted components_ matrix (one row per component).
H1 = clf.components_

In [None]:
# Inspect the fitted NMF components matrix (clf.components_).
H1

array([[0.00000000e+00, 2.26701694e-01, 5.67180774e-02, ...,
        1.04962564e-01, 4.93310687e-02, 2.36510109e-01],
       [9.02557623e-02, 5.16984659e-03, 6.72853708e-03, ...,
        6.30409582e-02, 3.56037922e-02, 1.66265209e-01],
       [7.23475845e-03, 1.17355438e-01, 1.45066546e-02, ...,
        9.38266155e-02, 4.20839489e-02, 9.72722832e-02],
       [9.27737536e-02, 1.50914117e-01, 2.39515858e-02, ...,
        6.61443857e-02, 1.55528206e-02, 5.20053002e-01],
       [0.00000000e+00, 5.84106407e-05, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.02964646e-03, 8.09717204e-02, 2.67704104e-02, ...,
        8.54121270e-02, 1.32456081e-02, 1.66357867e-01]])

In [None]:
# Inspect the matrix returned by clf.fit_transform for the training documents.
W1

array([[0.05224125, 0.00941627, 0.        , 0.        , 0.        ,
        0.        ],
       [0.03875709, 0.        , 0.05520863, 0.01873516, 0.00121965,
        0.04867738],
       [0.        , 0.04892786, 0.0223263 , 0.        , 0.00657403,
        0.00129874],
       ...,
       [0.02200383, 0.        , 0.0426682 , 0.00044996, 0.        ,
        0.0314196 ],
       [0.        , 0.02779387, 0.00191482, 0.00111531, 0.00191333,
        0.02376908],
       [0.        , 0.        , 0.        , 0.07995555, 0.        ,
        0.        ]])

In [None]:
num_words = 15

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns the same vocabulary, indexed by TF-IDF column.
vocab = np.asarray(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the num_words highest-weighted vocabulary terms of row t, largest first."""
    # argsort is ascending, so the reversed tail slice yields the largest weights first.
    return [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]


topic_words = [top_words(t) for t in H1]
topics = [' '.join(t) for t in topic_words]

In [None]:
# One space-joined string of top terms per row of H1.
topics

['call told phone number would receiv said back ask time contact compani email inform work',
 'debt report credit collect credit report compani valid account disput remov agenc inform letter request provid',
 'account bank check close deposit open money fund check account bank america america close account chase branch transact',
 'payment loan mortgag month late interest paid make made amount servic year home appli escrow',
 'ident theft theft ident victim ident victim belong debt belong affidavit report attach legal pleas result debt fraudul',
 'card credit credit card charg purchas capit balanc disput citi use transact fraud interest chase merchant']

In [None]:
# Tabulate the per-document topic weights of the training set.
colnames = ["Topic" + str(i) for i in range(clf.n_components)]
docnames = ["Doc" + str(i) for i in range(W1.shape[0])]
# Weights are rounded to two decimals for display only.
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
# Pick the dominant topic from the unrounded weights: after rounding, distinct
# weights can tie and argmax would then return the lowest topic index instead
# of the truly largest weight.
significant_topic = np.argmax(W1, axis=1)
df_doc_topic['dominant_topic'] = significant_topic

In [None]:
# Display the document-topic table built from W1.
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.05,0.01,0.00,0.00,0.00,0.00,0
Doc1,0.04,0.00,0.06,0.02,0.00,0.05,2
Doc2,0.00,0.05,0.02,0.00,0.01,0.00,1
Doc3,0.03,0.00,0.01,0.00,0.00,0.01,0
Doc4,0.01,0.00,0.04,0.00,0.00,0.01,2
...,...,...,...,...,...,...,...
Doc40212,0.01,0.00,0.00,0.00,0.00,0.06,5
Doc40213,0.00,0.01,0.05,0.01,0.00,0.02,2
Doc40214,0.02,0.00,0.04,0.00,0.00,0.03,2
Doc40215,0.00,0.03,0.00,0.00,0.00,0.02,1


In [None]:
# First rows of the training split, for reading alongside the topic table.
X_train.head()

Unnamed: 0,complaints,Product,Company
30936,"Throughout XX/XX/2019, I have received multiple calls from this company. I have also asked them to not call me or anyone associated with me. They have called my mother in law way too many times asking for my partner. They have called my partner multiple times asking for me. I'm not sure what kind of operation is being ran there, but it needs to stop. I'm giving notice to cease all contact with me or anyone else except the creditor about this claimed debt. Only contact me in writing, not by phone. Thank you.",Debt collection,TRANSWORLD SYSTEMS INC
34878,"I have called several times, even spoken to a manager to try to clear this matter up and am not able to resolve it. \n\nCiti is claiming there is fraudulent activity on my account, at which they are requesting that I call in to resolve it. I have spoken to about five or six of their representatives, trying to resolve it. They are determining that it is inconclusive, and they are unable to move any further by phone. \n\nAt this particular time, they have "" frozen '' my spending. There is a statement on my page that will not print out that tells me charges to my account may be limited. When I try to print it, it does not show on that page because of the formatting that they have used on their web site. \n\nI have not received any correspondence from them in over a month. I am writing to them today to request that they close my account and apply the balance toward what I owe, because I am not able to use the account. \n\nFor some reason, I made a payment on the XXXX of XXXX and that payment has been cancelled. \n\nEvery time I have tried to contact Citi, they give some reason or another as to why we can not resolve the matter. I have asked them what do I need to do to resolve it, but they are unable to provide a resolution. \n\nThe problem is that I have changed my phone number. My address is still the same, but I have been staying with my sister temporarily off-and-on since her mild XXXX in XXXX. I refuse to change my address to hers, even though I have noticed that my creditors have done so against my will. I told them to keep it as my temporary address, before advising others not to change it at all. I am back and forth between her house and my mailing address listed on this account. \n\nMy problem is with their fraud department. They are unable to give me a solid reason why my account is tagged for fraud. 
They have failed to send me notification stating steps to verify my information, and to notify me of any results from an "" investigation '' that they are having or holding on my account. I have been unable to use my account since XXXX. \n\nThe last time I called before my account was "" flagged '', I was trying to make a payment by phone. My phone number had changed, and I verified my information. Something XXXX me off and I hung up the phone ( I believe they asked me to enter my full credit card number ), which is usually unusual when you verify the information. When I called back, they did the same thing - at which I explained that I was driving. Almost every time I called, I was driving. I was not about to risk getting into a wreck to get a credit card out, but was willing to verify all of my other information. \n\nSame scenario, repeatedly. I was trying to make a payment, and when I decided not to on that phone call and hung up, it was only then that I mysteriously received information saying my account had been flagged. Prior to this, the only charges that had been made on the account were to make my XXXX XXXX payments, so there are no indications of fraud on this account. \n\nNow that my account has been flagged, I would like my money back. This is a secured card, where I invested my money with Citi and they have locked me from using the card. I have made 2 payments since then, and now, like once before, for some reason, one of my payments didn't go through. \n\nNot only that, my account disappeared from their system for a little over 48 hours that I had used prior on the account. They could not process a payment on this account, so I had to use my debit card tied to this account. I have no idea why they could not use this account. However, it appears that the same thing has been going on, because a recent payment that I made has been cancelled. I am not sure why this time, but they have accessed a late fee on my account. 
\n\nAt this point, I am tired of Citi. I had a good relationship with them when I was in college, and they were called Citibank or Citigroup before they branched off into Citi. \n\nMy secured account is set to mature in XXXX ; however, I just want my money back. I want them to take the money that I owe them and pay off my account, and say that it is paid, and closed at customer request. This is too much headache to go through for a secured credit card. \n\nIf they have rules surrounding verification of my account, and new phone numbers, they should have just tried to issue a refund of my monies ... and let my account go ... \n\nI will pay the balance of {$7.00}, once they issue a credit to my card. But I am done with Citi ... I will never bank with them by myriad of choice again.",Credit card or prepaid card,"CITIBANK, N.A."
2078,I have disputed this account and have reached out to the company/creditors MANY times but failed. This account is being reported inaccurately and incorrectly on my credit report.,Debt collection,Resurgent Capital Services L.P.
35922,"I have called Synchrony Bank/ Care Credit repeatedly 3 days in a row while I hold on the line for 3 hours and no success. I get online where they direct us to for live chat which responds to "" online chat not available. Please try again later '' They claim they have heavy amount of calls as recorded while holding for hours. I do not believe they answer calls or concerned with customers. In the midst of this national pandemic coronavirus that is causing financial hardships, they are not helping customers.. We as citizens are reaching out for help to no avail to this bank.",Credit card or prepaid card,SYNCHRONY FINANCIAL
15465,"PNC recently changed their policy to not allow it's users to link their accounts or debit cards to third parties, thus denying access to my money that they are holding. This is illegal and needs to stop. I was told by my investment company PNC no longer allows me to link my account with them so I can not automatically deposit funds.",Checking or savings account,PNC Bank N.A.


In [None]:
# Score the first five held-out complaints with the already-fitted vectorizer
# and NMF model (transform only, no refitting).
hold_sample_tfidf = vectorizer.transform(X_hold.complaints[:5])
WHold = clf.transform(hold_sample_tfidf)

In [None]:
# Tabulate the per-document topic weights of the held-out sample.
colnames = ["Topic" + str(i) for i in range(clf.n_components)]
docnames = ["Doc" + str(i) for i in range(WHold.shape[0])]
# Weights are rounded to two decimals for display only.
df_doc_topic = pd.DataFrame(np.round(WHold, 2), columns=colnames, index=docnames)
# Pick the dominant topic from the unrounded weights: after rounding, distinct
# weights can tie and argmax would then return the lowest topic index instead
# of the truly largest weight.
significant_topic = np.argmax(WHold, axis=1)
df_doc_topic['dominant_topic'] = significant_topic

In [None]:
# Display the document-topic table built from WHold.
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic
Doc0,0.01,0.0,0.1,0.0,0.0,0.0,2
Doc1,0.01,0.02,0.02,0.04,0.0,0.0,3
Doc2,0.0,0.0,0.04,0.01,0.0,0.04,2
Doc3,0.01,0.0,0.0,0.06,0.0,0.0,3
Doc4,0.0,0.0,0.01,0.04,0.0,0.04,3


In [None]:
# First rows of the held-out split, i.e. the five complaints scored above.
X_hold.head()

Unnamed: 0,complaints,Product,Company
30060,"I have a business checking account at BB & T. On XX/XX/2019, I attempted to deposit a check into my account and I received a message stating that I was over my monthly mobile deposit limit. I was confused because it was the first of the month and I had not deposited any checks since the previous month. I called BB & T and they said that I couldnt deposit checks into business accounts via the mobile app even though I had done that before. \n\nI was instructed to open a personal account, into which I could deposit checks via the mobile app. I was told that if I opened the account online I would have immediate access, that I could link my personal and business accounts, and immediately be able to transfer money between them. \n\nOn XX/XX/XXXX, I opened my personal account online. Though I successfully opened online, I did not have online access as I had been promised. Because I was traveling in an area where there were no BB & T branches, I could not go into a branch until XX/XX/2019. In the intervening time, my business account became overdrawn. I had the money in my personal account to bring it back into the black, but BB & T could not make the transfer until I went into a branch. During this time, I incurred an astonishing {$320.00} in overdraft fees in my business account because I had no way of transferring money between the two accounts. \n\nWhen I went into the branch, I met with XXXX XXXX in XXXX XXXX County, Florida. She investigated the accounts and told me she would link them and that I would have online access in XXXX hours. The online access still never material, I was not able to transfer money and I received an additional {$36.00} overdraft fee. This brought the total to {$360.00} in overdraft fees. \n\nI had only opened the second account on BB & Ts advice ; I could have had my customer send a bank wire directly into business account in the first week of XXXX to keep the account positive. 
My TOTAL deposits between the two accounts was always positive. \n\nI requested a refund and was told they would not refund any of the fees.",Checking or savings account,BB&T CORPORATION
53473,"To who it may concern, My concern is regarding Shellpoint Mortgage Servicing Company. This company has received my monthly mortgage payments and have failed to account it as payments received. My bank has sent me proof the account was cleared and paid. This company is trying to foreclosed on my house and has sent me a letter indicating they sent my file to their Loss Mitigation dept. The person who they assigned it to is XXXX XXXX XXXX x XXXX. Their corporate number is XXXX. \n\nI have can send proof of my claim from above should you need it. Please assist.",Mortgage,"Shellpoint Partners, LLC"
35879,I contacted XXXX about fraudulent charges that were made on my account and the customer service representative told me that they wouldnt be able to issue anew card or remove my fraudulent charges since the account was closed due to non-payment. I was unaware of this these charging that were made.. To my knowledge I didnt owe a payment because the account wasnt being used.,Credit card or prepaid card,"EQUIFAX, INC."
20993,"I first applied for the Fedloan Serving program in XXXX in hopes of getting loan forgiveness since I've been paying on my student loans since late XXXX or early XXXX and I have been a public servant since XXXX, XXXX. I have never missed a payment to any of the various loan companies that owned my loans. The Fedloan Program turned me down in XXXX, saying my loans, which were originally Federal Stafford Loans, did not qualify me for the Program. In XXXX, XXXX, I was solicited by XXXX XXXX and told I could get into this program to reduce my loan payments. I was charged {$690.00} by XXXX XXXX to get me into the Fedloan Serving program. My loan payments went down to $ XXXX/month for the first year. In XXXX, XXXX, Fedloan Servicing informed me via email that I needed to recertify my "" income-driven repayment plan. Their customer service helped me submit the form XXXX XXXX ) via the internet and it involved the IRS. For reasons I don't understand, Fedloan is now increasing my monthly payments to {$380.00}, effective XX/XX/XXXX. This is a 60 % increase. On XX/XX/XXXX, I called Fedloan regarding this issue. The customer service person suggested I ask for a recalculation and provide pay-stubs rather than IRS information. On the same call, I was transferred to a second agent who listened to my concern and suggested I get out of the Fedloan Program since I really didn't qualify for it anyway and I could save money by going to a regular payment plan with a different servicer such as XXXX which had serviced my loan previously. I feel like I've been miss-lead by the Fedloan Serving Program and by Mr. XXXX XXXX, the case manager at XXXX XXXX ( http : //www.nexum-servicing.com ). After receiving my {$690.00}, I was never able to talk to XXXX on the phone again. He had convinced me I could save money with Fedloan and he told me my payments would go down to {$0.00} when I retire from State Service in XXXX. 
He told me I would receive loan forgiveness once my payments went to {$0.00}. I have concluded his advise was incorrect and or miss-leading. I regret having my loan transferred to the Fedloan Program because its representatives have indicated I will not receive loan forgiveness for my 20 years of public service. I am very concerned about the cost of my student loan debt, especially since I intend to retire from State service in XXXX. I am also concerned about how I will be able to afford these student loan payments once I am receiving a pension. I am also very disappointed that I haven't been able to "" qualify '' for loan forgiveness based on my long years of public service. I think this whole Fedloan Program is complete scam and I have been duped.",Student loan,AES/PHEAA
53,"On several occasions ( XXXX ) I have tried to reach someone in costumer service today XX/XX/2020 I tried again to get through and was on hold for over 20 minutes and hung up. \n\n3 Years ago I have made no more than {$600.00} in purchases, and while making payments on my account with a {$9000.00} limit the interest charges are deducted from the credit limit and the interest was based on the balance each month. this left my interest and charge fess higher than my monthly payment which amounted to around {$240.00} each month. \n\nIn all of this time the interest was so big that it completely eat up the limit balance on the card, which left in me in a vicious cycle of never paying down the money owed. with in 2 years I was able to get the balance down {$2000.00} and I asked Walmart Capital One to reduce the balance down. Now it is back up to the high bank fees and interest charges that have again gone beyond what I can afford monthly the {$240.00}. This is a unfair and infantile game that is forced on me as an consumer.",Credit card or prepaid card,CAPITAL ONE FINANCIAL CORPORATION
