In [1]:
import gensim
import gzip
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import numpy as np
import re

In [2]:
comp_df = pd.read_csv('D:\\Datasets\\DeepLearning\\NLP\\complaints.csv')

In [3]:
comp_df.head()

Unnamed: 0,Consumer complaint narrative,Product
0,I have outdated information on my credit repor...,Credit reporting
1,I purchased a new car on XXXX XXXX. The car de...,Consumer Loan
2,An account on my credit report has a mistaken ...,Credit reporting
3,This company refuses to provide me verificatio...,Debt collection
4,This complaint is in regards to Square Two Fin...,Debt collection


In [4]:
comp_df.shape

(179776, 2)

Tokenizer

In [5]:
def complaint_to_words(comp):
    
    words = RegexpTokenizer('\w+').tokenize(comp)
    words = [re.sub(r'([xx]+)|([XX]+)|(\d+)', '', w).lower() for w in words]
    words = list(filter(lambda a: a != '', words))
    
    return words

Vocabulary

In [6]:
all_words = list()
for comp in comp_df['Consumer complaint narrative']:
    for w in complaint_to_words(comp):
        all_words.append(w)

In [None]:
print('Size of vocabulary: {}'.format(len(set(all_words))))

## It will return size of vocabulary = 76908... it will take time

In [8]:
print('Complaint\n', comp_df['Consumer complaint narrative'][10], '\n')
print('Tokens\n\n', complaint_to_words(comp_df['Consumer complaint narrative'][10]))

Complaint
 Without provocation, I received notice that my credit line was being decreased by nearly 100 %. My available credit was reduced from $ XXXX to XXXX ( the rough amount of my available balance ). 

When I called to question the change, I was provided a nob-descript response referencing my XXXX report. It was my understanding that under the FCRA I was entitled to a copy of this report, but was refused by Citi and have been given no further explanation. 

This is predatory in that it affects my utilization of credit, further subjecting me to increase in APrs, etc and a higher cost of credit without any reason. 

Tokens

 ['without', 'provocation', 'i', 'received', 'notice', 'that', 'my', 'credit', 'line', 'was', 'being', 'decreased', 'by', 'nearly', 'my', 'available', 'credit', 'was', 'reduced', 'from', 'to', 'the', 'rough', 'amount', 'of', 'my', 'available', 'balance', 'when', 'i', 'called', 'to', 'question', 'the', 'change', 'i', 'was', 'provided', 'a', 'nob', 'descript', 'res

Gensim Package

In [9]:
comp_df1 = comp_df[['Consumer complaint narrative','Product']]

In [10]:
# For each row, combine all the columns into one column
comp_df2 = comp_df1.apply(lambda x: ','.join(x.astype(str)), axis=1)

In [11]:
# Store them in a pandas dataframe
comp_df_final = pd.DataFrame({'Summary': comp_df2})

In [12]:
complaints_list = [row.split(',') for row in comp_df_final['Summary']]

In [13]:
complaints_list

[['I have outdated information on my credit report that I have previously disputed that has yet to be removed this information is more then seven years old and does not meet credit reporting requirements',
  'Credit reporting'],
 ['I purchased a new car on XXXX XXXX. The car dealer called Citizens Bank to get a 10 day payoff on my loan',
  ' good till XXXX XXXX. The dealer sent the check the next day. When I balanced my checkbook on XXXX XXXX. I noticed that Citizens bank had taken the automatic payment out of my checking account at XXXX XXXX XXXX Bank. I called Citizens and they stated that they did not close the loan until XXXX XXXX. ( stating that they did not receive the check until XXXX. XXXX. ). I told them that I did not believe that the check took that long to arrive. XXXX told me a check was issued to me for the amount overpaid',
  ' they deducted additional interest. Today ( XXXX XXXX',
  ' ) I called Citizens Bank again and talked to a supervisor named XXXX',
  ' because on 

In [14]:
model = gensim.models.Word2Vec(complaints_list, min_count=1,size= 100,workers=5, window =2, sg = 1)

In [15]:
wrd = ['Mortgage']
model.wv.most_similar(positive=wrd)

[('Credit card', 0.9991791248321533),
 ('Consumer Loan', 0.9991637468338013),
 ('Money transfers', 0.9989559054374695),
 ('Bank account or service', 0.9984868168830872),
 ('Student loan', 0.9984151124954224),
 ('Debt collection', 0.9974035024642944),
 ('Prepaid card', 0.9969540238380432),
 (' again', 0.9935368299484253),
 ('On XX/XX/XXXX', 0.9933454990386963),
 ('Other financial service', 0.9931204915046692)]

In [16]:
wrd = ['Bank account or service']
model.wv.most_similar(negative=wrd)

[(' which can allow me to seek damages from a collection agency.',
  0.9590843915939331),
 (' expenses or charges of any kind to the original debt.',
  0.9312505722045898),
 (' which is in violation of the FDCPA.', 0.9311723709106445),
 (' I filed a police report ( see attached ). This account is unknown and was not opened by me.',
  0.9288846254348755),
 ('Consistently calling me at work even though I told them that I can not take calls at work',
  0.9217177033424377),
 ('I have sent the creditor multiple certified letters asking for proper validation of this 6 year old debt. All they send me is a partial statement and a letter stating that the debt is valid. They are not showing me anything that is bearing my signature',
  0.9182758331298828),
 (' we continue to receive demand payment letters.', 0.9152593016624451),
 (' Have been lied too twice about getting it done first yesterday XXXX and again today XXXX/XXXX/2016',
  0.9121207594871521),
 (' Defamation of character and Identity T

In [17]:
model.wv.similarity(w1='Debt collection',w2='Credit reporting')

0.9465815

In [18]:
model.wv.similarity(w1='Bank account or service',w2='Virtual currency')

0.82751167

In [20]:
model.most_similar('Virtual currency')[:5]

  """Entry point for launching an IPython kernel.


[(' inquiries', 0.9822816848754883),
 (' by certified mail', 0.9811859726905823),
 (' maintenance', 0.9797521829605103),
 (' accounts', 0.978130578994751),
 (' employees', 0.9779316186904907)]

In [21]:
model['Virtual currency']  ### Get word embeddings 

  """Entry point for launching an IPython kernel.


array([-0.00519705, -0.01239634, -0.00086866, -0.00340993, -0.03007155,
       -0.0214224 , -0.0028906 ,  0.01169615,  0.01083078,  0.01033507,
        0.00605114,  0.04106498, -0.02106198,  0.02221211,  0.01397375,
        0.00981188, -0.00671482,  0.01206845, -0.02884769, -0.00384241,
        0.01868324,  0.0359574 , -0.00886882,  0.00025531, -0.00875994,
        0.00469928,  0.01758083, -0.00913033,  0.00855864,  0.00556441,
        0.00303321,  0.01485432,  0.05136244,  0.0132432 ,  0.0290224 ,
       -0.00221163,  0.03132263, -0.01431412, -0.00905181, -0.01235155,
        0.02184692, -0.05767189,  0.03107389,  0.02271506,  0.01862708,
       -0.03675528, -0.00062167,  0.01601755, -0.0139142 ,  0.024488  ,
       -0.00017115, -0.01309853, -0.00046829,  0.03619125, -0.02081721,
        0.04420558, -0.01444558, -0.01080661, -0.00544311, -0.00563703,
       -0.04628185,  0.01825911, -0.00646264,  0.00249977,  0.00625345,
       -0.02071907, -0.02663035,  0.00464804, -0.00988837, -0.02

Example 2 

Create your own sentence corpus ..

In [22]:
sentences = [['He', 'is', 'an', 'Intelligent', 'person'],
['He', 'is', 'tall', 'and', 'muscular'],
['He', 'loves', 'playing','Cricket'],
['He', 'likes', 'to','analyze','the','data'],
['and', 'is', 'good', 'in','visualization']]

In [23]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, min_count=1)

To see the dictionary of unique words that exist at least twice in the corpus

In [24]:
print(model.wv.vocab)

{'He': <gensim.models.keyedvectors.Vocab object at 0x000001B7D844AC48>, 'is': <gensim.models.keyedvectors.Vocab object at 0x000001B7D844A608>, 'an': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442688>, 'Intelligent': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442848>, 'person': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442FC8>, 'tall': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442E88>, 'and': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442D88>, 'muscular': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442B88>, 'loves': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442F48>, 'playing': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442E08>, 'Cricket': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442308>, 'likes': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442908>, 'to': <gensim.models.keyedvectors.Vocab object at 0x000001B7D8442948>, 'analyze': <gensim.models.keyedvectors

Word2Vec model converts words to their corresponding vectors

In [25]:
model.wv['Intelligent']

array([-1.4073958e-03, -2.6569541e-03, -4.6726293e-03, -3.9569749e-03,
       -3.5724954e-03, -3.6935743e-03, -4.9592336e-03, -2.2116359e-03,
        3.3630207e-03, -5.8258360e-04, -4.2070714e-03, -4.2298972e-03,
       -1.5518513e-05, -3.4769948e-03, -4.7729621e-03,  2.0767963e-03,
        2.4370704e-05,  4.3757772e-03,  2.1032824e-03, -4.9774419e-03,
        1.3721166e-03,  3.9308569e-03,  3.1698274e-03,  1.3993001e-03,
       -1.8647485e-04,  3.2635299e-03, -1.6392850e-03,  4.9448824e-03,
        2.1441649e-03,  4.2011389e-03,  4.4056824e-03,  1.0584828e-03,
       -4.0046936e-03,  2.3425457e-03,  3.6671960e-03, -4.7474192e-04,
        5.9689215e-04, -4.9779215e-03,  3.6404254e-03, -2.8890232e-04,
        2.7298683e-03,  7.8199257e-04, -4.1608992e-03,  4.3491921e-03,
       -1.7197850e-03, -4.2847735e-03,  9.5890180e-05, -4.0739235e-03,
        1.2449332e-03, -4.1655768e-03, -4.1291295e-03,  5.8153929e-04,
        1.1080471e-03, -3.3416206e-04,  1.9402002e-03,  3.0727526e-03,
      

In [26]:
model.wv['Cricket']

array([ 4.31537628e-03, -4.11170861e-03, -9.00422128e-06, -3.23250541e-03,
       -5.00730122e-04, -4.97560436e-03,  1.03644654e-03,  1.33047358e-03,
        7.25523045e-04,  2.79305922e-03,  3.02267703e-03,  2.16625887e-03,
        3.43738194e-03, -1.24129365e-04,  1.18102389e-03, -6.38108118e-04,
       -2.25458667e-03,  1.27671612e-03, -3.83235654e-03,  1.03377062e-03,
        2.97653442e-03, -1.73066813e-03,  3.83375533e-04, -2.30908999e-03,
        2.74521788e-03,  1.59929891e-03, -2.45929579e-03,  4.62241378e-03,
        1.15413417e-03, -3.95803014e-03, -3.03371507e-03, -3.62670282e-03,
       -1.50691217e-03, -4.18696646e-03,  2.76121357e-03,  3.96764232e-03,
        2.70279328e-04, -1.10778795e-03, -4.88911942e-03,  2.25040712e-03,
       -4.39131458e-04,  2.55714171e-03,  1.26374292e-03, -1.86271884e-03,
        3.03448667e-03,  1.32725970e-03, -2.68801651e-03,  2.38355412e-03,
        3.34559521e-03,  2.54933420e-03, -1.94908248e-03, -2.74454965e-03,
       -1.25133898e-03, -

In [27]:
model.wv.most_similar('visualization')

[('is', 0.14307232201099396),
 ('data', 0.10009141266345978),
 ('and', 0.07863322645425797),
 ('good', 0.054370056837797165),
 ('loves', 0.04528298228979111),
 ('person', 0.03788425773382187),
 ('tall', -0.005685862153768539),
 ('analyze', -0.006426125764846802),
 ('an', -0.008233942091464996),
 ('in', -0.017930950969457626)]