In [2]:
# We need the following libraries to carry out the activities
import re

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

Read the text data

In [3]:
complaints_data = pd.read_csv("../data/consumer_loan_complaints.csv")


In [4]:
# preview the data

complaints_data.head()

Unnamed: 0,user_id,Date received,Product,Issue,Consumer complaint narrative,State,ZIP code,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,44fefdad-7045-4be5-890e-12e84ae6fdc9,01/27/2016,Consumer Loan,Account terms and changes,,AL,35180,Phone,01/27/2016,Closed with explanation,Yes,No,1760486
1,c49d5d60-909f-406b-b7ff-51143fcb650b,08/26/2014,Consumer Loan,Account terms and changes,,NC,278XX,Phone,08/29/2014,Closed with non-monetary relief,Yes,No,1001740
2,9b2cd5d2-900e-4052-831f-6489f6d568af,08/22/2012,Consumer Loan,Account terms and changes,,TN,37205,Referral,08/23/2012,Closed with non-monetary relief,Yes,No,140039
3,b7e5b324-268e-4502-81a1-1a025673c2a0,05/07/2013,Consumer Loan,Problems when you are unable to pay,,OH,43081,Web,05/08/2013,Closed with explanation,Yes,Yes,401541
4,684eeb4c-c9c3-4a97-8213-f3962a6c0aba,06/15/2016,Consumer Loan,Managing the line of credit,,NC,27216,Phone,09/08/2016,Closed with non-monetary relief,Yes,No,1970341


In [5]:
# check shape of the data


complaints_data['Consumer disputed?'].value_counts()


No     1466
Yes     358
Name: Consumer disputed?, dtype: int64

In [6]:
# check the structure of the data

complaints_data.info()

# lots of missing values for the consumer complaints

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824 entries, 0 to 1823
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   user_id                       1824 non-null   object
 1   Date received                 1824 non-null   object
 2   Product                       1824 non-null   object
 3   Issue                         1824 non-null   object
 4   Consumer complaint narrative  44 non-null     object
 5   State                         1801 non-null   object
 6   ZIP code                      1789 non-null   object
 7   Submitted via                 1824 non-null   object
 8   Date sent to company          1824 non-null   object
 9   Company response to consumer  1824 non-null   object
 10  Timely response?              1824 non-null   object
 11  Consumer disputed?            1824 non-null   object
 12  Complaint ID                  1824 non-null   int64 
dtypes: int64(1), objec

In [7]:
# look at the different types of issues

complaints_data.Issue.value_counts()

# we could simply encode this information

Managing the line of credit            806
Account terms and changes              484
Shopping for a line of credit          301
Problems when you are unable to pay    233
Name: Issue, dtype: int64

In [8]:
# look at the different types of responses to consumer

complaints_data['Company response to consumer'].value_counts()

Closed with explanation            1291
Closed with non-monetary relief     184
Closed with monetary relief         182
Closed without relief                75
Closed                               65
Closed with relief                   19
Untimely response                     8
Name: Company response to consumer, dtype: int64

Subset data to consider only consumer complaints

In [9]:
# number of missing consumer complaints

print("# of users with no comaplaints data:", complaints_data['Consumer complaint narrative'].isnull().sum())

# of users with no comaplaints data: 1780


In [10]:
# reduce the dataset to only contain the text data we are interested in processing

text_data = complaints_data[~complaints_data['Consumer complaint narrative'].isnull()][['user_id', 'Consumer complaint narrative']]
text_data.head(n = 3)

Unnamed: 0,user_id,Consumer complaint narrative
53,1a1448a4-bfe5-455f-bc29-dc79ec5fb2c0,"NONE OF YOUR "" MY LOAN IS A '' below apply to ..."
59,5fede48c-096e-4f82-997d-8229007d8318,XX/XX/2014 I received a letter from the IRS st...
65,fd9fc5ff-19bc-424c-880e-c159c110d21f,This was a revolving account in which I paid W...


In [11]:
# print shape of the data

text_data.shape

(44, 2)

In [12]:
# preview an example of the consumer complaints

sample_text = text_data['Consumer complaint narrative'].iloc[0]
sample_text

# can see data contains punctations, numbers, stop words

'NONE OF YOUR " MY LOAN IS A \'\' below apply to this situation! This was a car loan but the company is providing fraudulent information this is damaging my credit! \n\nRE : MidAtlantic Finance Company Account No. XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so. It is damaging my credit so much so that I was told I did n\'t qualify for a mortgage. \n\nMost recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ). But that is just the most recent false information that was reported. It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}. Please refer to the following as I NEVER owed MAF {$8100.00} as it reported. That was the origin

### NLP Fundamentals

In [13]:
! pip install spacy

You should consider upgrading via the '/Users/shaq/.pyenv/versions/3.8.10/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [13]:
! python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.1 MB/s eta 0:00:01
You should consider upgrading via the '/Users/shaq/.pyenv/versions/3.8.10/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# For the purpose of cleaning text, going to use the SpaCy library
# # make sure your downloaded the english model with "python -m spacy download en"

import spacy
nlp = spacy.load('en_core_web_sm')


In [15]:
document = nlp(sample_text)

Tokenize the sample_text into sentences

In [16]:
for sentence in document.sents:
    print(sentence)

NONE OF YOUR " MY LOAN IS A '' below apply to this situation!
This was a car loan but the company is providing fraudulent information this is damaging my credit!


RE : MidAtlantic Finance Company Account
No.
XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so.
It is damaging my credit so much so that I was told I did n't qualify for a mortgage.


Most recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ).
But that is just the most recent false information that was reported.
It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}.
Please refer to the following as I NEVER owed MAF {$8100.00} as it reported.
That was the original amoun

Tokenize the sentences into words

In [17]:
# For the purpose of showing how to process text data, let's take a snippet of the sample text to create a small "document"

sentence = nlp('It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}.')

In [18]:
# SpaCy automatically breaks your document into "tokens" when a document is created using the model

for token in sentence:
    print(token.text)

It
is
showing
a
debt
of
{
$
250.00
}
per
month
along
with
XXXX
different
amounts
charged
off
of
the
{
$
950.00
}
(
plus
interest
)
and
another
{
$
5100.00
}
that
INCLUDES
the
{
$
950.00
}
.


In [19]:
# can also lower case all the sentences

for token in sentence:
    print(token.text, token.lower_)

It it
is is
showing showing
a a
debt debt
of of
{ {
$ $
250.00 250.00
} }
per per
month month
along along
with with
XXXX xxxx
different different
amounts amounts
charged charged
off off
of of
the the
{ {
$ $
950.00 950.00
} }
( (
plus plus
interest interest
) )
and and
another another
{ {
$ $
5100.00 5100.00
} }
that that
INCLUDES includes
the the
{ {
$ $
950.00 950.00
} }
. .


Identify Parts of Speech of the tokens

In [20]:
# We can also see the parts of speech of each of these tokens using the .pos_ attribute shown below:

for token in sentence:
  print(token, token.pos_)

It PRON
is AUX
showing VERB
a DET
debt NOUN
of ADP
{ PUNCT
$ SYM
250.00 NUM
} PUNCT
per ADP
month NOUN
along ADP
with ADP
XXXX NOUN
different ADJ
amounts NOUN
charged VERB
off ADP
of ADP
the DET
{ PUNCT
$ SYM
950.00 NUM
} PUNCT
( PUNCT
plus CCONJ
interest NOUN
) PUNCT
and CCONJ
another PRON
{ PUNCT
$ SYM
5100.00 NUM
} PUNCT
that PRON
INCLUDES VERB
the DET
{ PUNCT
$ SYM
950.00 NUM
} PUNCT
. PUNCT


In [21]:
# note for Jason - can tidy up the output if needed
pos = []

for token in sentence:
  pos.append({"word": token,
              "part of speech": token.pos_
              })

pd.DataFrame(pos)

Unnamed: 0,word,part of speech
0,It,PRON
1,is,AUX
2,showing,VERB
3,a,DET
4,debt,NOUN
5,of,ADP
6,{,PUNCT
7,$,SYM
8,250.00,NUM
9,},PUNCT


Identify Stopwords

In [22]:
# In computing, stop words are words which are filtered out before or after processing of natural language data

for token in sentence:
  print(token, token.is_stop)
  
# True means the token is a stop word and should be removed

It True
is True
showing False
a True
debt False
of True
{ False
$ False
250.00 False
} False
per True
month False
along True
with True
XXXX False
different False
amounts False
charged False
off True
of True
the True
{ False
$ False
950.00 False
} False
( False
plus False
interest False
) False
and True
another True
{ False
$ False
5100.00 False
} False
that True
INCLUDES False
the True
{ False
$ False
950.00 False
} False
. False


Identify Punctuation

In [23]:
for token in sentence:
  print(token, token.is_punct)
  
# True means the token is punctuation and should be removed

It False
is False
showing False
a False
debt False
of False
{ True
$ False
250.00 False
} True
per False
month False
along False
with False
XXXX False
different False
amounts False
charged False
off False
of False
the False
{ True
$ False
950.00 False
} True
( True
plus False
interest False
) True
and False
another False
{ True
$ False
5100.00 False
} True
that False
INCLUDES False
the False
{ True
$ False
950.00 False
} True
. True


Identify numbers

In [24]:
for token in sentence:
  print(token, token.like_num)

It False
is False
showing False
a False
debt False
of False
{ False
$ False
250.00 True
} False
per False
month False
along False
with False
XXXX False
different False
amounts False
charged False
off False
of False
the False
{ False
$ False
950.00 True
} False
( False
plus False
interest False
) False
and False
another False
{ False
$ False
5100.00 True
} False
that False
INCLUDES False
the False
{ False
$ False
950.00 True
} False
. False


Stemming the text

In [25]:
# Stemming refers to reducing a word to its root form. W
# There are two types of stemmers in NLTK: Porter Stemmer and Snowball stemmers. Both of them have been implemented using different algorithms.

In [26]:
# We can not actually perform stemming in SpaCy so we will use another popular library for NLP

import nltk
nltk.download('punkt')
  
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to /Users/shaq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# because we are using a different library we need to tokenize the sample sentence again. Let's try another sentence.

text = 'This was a car loan but the company is providing fraudulent information this is damaging my credit!'
print(word_tokenize(text))


['This', 'was', 'a', 'car', 'loan', 'but', 'the', 'company', 'is', 'providing', 'fraudulent', 'information', 'this', 'is', 'damaging', 'my', 'credit', '!']


Porter Stemmer

In [28]:
stemmer = PorterStemmer()

for token in word_tokenize(text):
  print(token, '-->' ,stemmer.stem(token))
  
# information becomes inform, company compani, providing becomes provid, fraudulent become fraudul, this, become thi - none of these words exists :(


This --> thi
was --> wa
a --> a
car --> car
loan --> loan
but --> but
the --> the
company --> compani
is --> is
providing --> provid
fraudulent --> fraudul
information --> inform
this --> thi
is --> is
damaging --> damag
my --> my
credit --> credit
! --> !


Snowball Stemmer

In [29]:
stemmer = SnowballStemmer(language='english')

for token in word_tokenize(text):
    print(token, '-->' , stemmer.stem(token))
    
# damaging become damag..

This --> this
was --> was
a --> a
car --> car
loan --> loan
but --> but
the --> the
company --> compani
is --> is
providing --> provid
fraudulent --> fraudul
information --> inform
this --> this
is --> is
damaging --> damag
my --> my
credit --> credit
! --> !


Lemmatizing the text

In [30]:
# Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. 
# Lemmatization is similar to stemming but it brings context to the words. So it links words with similar meaning to one word.

# Let's use the same text as above

parsed_text = nlp(text)

for token in parsed_text:
  print(token, '-->', token.lemma_)
  
# We can see that [IS/is, was] were lemmatized to [be], 
# [providing] was lemmatized to [provide]
# no changes to [fraudulent] and [information]

This --> this
was --> be
a --> a
car --> car
loan --> loan
but --> but
the --> the
company --> company
is --> be
providing --> provide
fraudulent --> fraudulent
information --> information
this --> this
is --> be
damaging --> damage
my --> my
credit --> credit
! --> !


### Text Preprocessing

In [31]:
def spacy_cleaner(original_text):
    """Cleans text data to be processed. Removes punctuations, whitespaces, numbers, stopwords from the text and lemmatizes each token"""

    nlp = spacy.load("en_core_web_sm")

    final_tokens = []
    parsed_text = nlp(original_text)

    for token in parsed_text:
        if token.is_punct or token.is_space or token.like_num or token.is_stop:
            pass
        else:
            if token.lemma_ == '-PRON-':
                final_tokens.append(str(token))
            else:
                sc_removed = re.sub("[^a-zA-Z]", '', str(token.lemma_))  # code to keep pronouns as they are
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    preprocessed_text = re.sub(r'(.)\1+', r'\1\1', joined)

    return preprocessed_text

In [32]:
# apply definition to clean the sample_text

spacy_cleaner(sample_text)

'loan apply situation car loan company provide fraudulent information damage credit MidAtlantic Finance Company Account xx confused current MAF loan MidAtlantic Finance Company report false item xx credit reporting agency continue damage credit tell qualify mortgage recently settle account agreement xx xx xx MAF report payment claim owe dispute xx xx recent false information report show debt month xx different amount charge plus interest include refer following owe MAF report original finance xx xx xx xx payment month xx xx xx xx car purchase xx xx finance HOUSE xx xx MAF statement XX xx xx payment MAF xx xx responsible prior late payment MAF record delinquency XX xx credit report know consider own MAF xx xx charge follow reason Car total xx xx xx pay xx payment plus additional interest fee xx xx xx insurance company pay xx xx xx leave balance MAF dispute XX xx give payoff xx xx expiration date dispute month give finally MAF send accounting xx xx support claim payment wrongfully charge

In [33]:
# before cleaning

sample_text

'NONE OF YOUR " MY LOAN IS A \'\' below apply to this situation! This was a car loan but the company is providing fraudulent information this is damaging my credit! \n\nRE : MidAtlantic Finance Company Account No. XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so. It is damaging my credit so much so that I was told I did n\'t qualify for a mortgage. \n\nMost recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ). But that is just the most recent false information that was reported. It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}. Please refer to the following as I NEVER owed MAF {$8100.00} as it reported. That was the origin

In [34]:
# apply definition to clean all the text

text_data['consumer_complaints_cleaned'] = text_data['Consumer complaint narrative'].apply(lambda x: spacy_cleaner(x))

text_data.head(n = 3)

Unnamed: 0,user_id,Consumer complaint narrative,consumer_complaints_cleaned
53,1a1448a4-bfe5-455f-bc29-dc79ec5fb2c0,"NONE OF YOUR "" MY LOAN IS A '' below apply to ...",loan apply situation car loan company provide ...
59,5fede48c-096e-4f82-997d-8229007d8318,XX/XX/2014 I received a letter from the IRS st...,XX XX receive letter IRS state owe agency ask ...
65,fd9fc5ff-19bc-424c-880e-c159c110d21f,This was a revolving account in which I paid W...,revolve account pay Wells Fargo National Bank ...


### Text Feature Extraction

In [35]:
# sample corpus - some snippets of text from the consumer complaints

corpus = ['car loan company provide fraudulent information',
          'damage credit MidAtlantic Finance Company Account.',
          'qualify mortgage recently settle account agreement',
          'XX XX XX MAF report payment claim owe dispute.']


Bag of words

In [36]:
# Text Analysis is a major application field for machine learning algorithms. 
# However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors.
# In order to address this, scikit-learn provides utilities for the most common ways to extract numerical features from text content, namely:
# tokenizing strings and giving an integer id for each possible token, for instance by using white-spaces and punctuation as token separators.
# each individual token occurrence frequency (normalized or not) is treated as a feature.

# We call vectorization the general process of turning a collection of text documents into numerical feature vectors.

# This specific strategy (tokenization, counting and normalization) is called the Bag of Words or “Bag of n-grams” representation. 
# Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

vectorizer

CountVectorizer()

In [38]:
X = vectorizer.fit_transform(corpus)
X

<4x23 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [39]:
# feature extraction of sample text

pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())

Unnamed: 0,account,agreement,car,claim,company,credit,damage,dispute,finance,fraudulent,...,midatlantic,mortgage,owe,payment,provide,qualify,recently,report,settle,xx
0,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,1,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,1,0
3,0,0,0,1,0,0,0,1,0,0,...,0,0,1,1,0,0,0,1,0,3


In [40]:
# number of words generated

print('-'*50)
print("number of words extracted from cleaned text:", len(vectorizer.get_feature_names()))
print('-'*50)

--------------------------------------------------
number of words extracted from cleaned text: 23
--------------------------------------------------


In [41]:
# Bag of words for the cleaned consumer complaints

X = vectorizer.fit_transform(text_data['consumer_complaints_cleaned'])
X

<44x930 sparse matrix of type '<class 'numpy.int64'>'
	with 2182 stored elements in Compressed Sparse Row format>

In [42]:
# number of words generated

print('-'*50)
print("number of words extracted from cleaned text:", len(vectorizer.get_feature_names()))
print('-'*50)

--------------------------------------------------
number of words extracted from cleaned text: 930
--------------------------------------------------


In [43]:
text_features_df = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names()).head()
text_features_df

Unnamed: 0,able,absolutely,accept,acceptable,access,accident,accordingly,account,accounting,acct,...,workshop,worth,write,wrong,wrongfully,xx,xxwhy,year,yes,yr
0,0,0,0,0,0,0,1,6,3,0,...,0,0,1,0,1,95,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,21,0,1,0,0
2,0,0,0,0,0,0,0,8,0,0,...,0,0,0,0,0,5,0,0,0,0
3,0,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,17,0,1,0,0
4,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,2,0,0,0,0


Principal Component Analysis for Dimensionality Reduction

In [44]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
# prepare transform on dataset
pca.fit(text_features_df)
# apply transform to dataset
reduced = pca.transform(text_features_df)

reduced

array([[ 71.1977141 ,   0.36531191],
       [ -8.72383083, -13.07471184],
       [-23.73229916,  -3.14313894],
       [-12.41008309,  14.56434135],
       [-26.33150102,   1.28819752]])

In [45]:
reduced_df = pd.DataFrame(reduced, columns=['PCA1', 'PCA2'])

reduced_df

Unnamed: 0,PCA1,PCA2
0,71.197714,0.365312
1,-8.723831,-13.074712
2,-23.732299,-3.143139
3,-12.410083,14.564341
4,-26.331501,1.288198


In [46]:
pca = PCA(n_components = 5)
# prepare transform on dataset
pca.fit(text_features_df)
# apply transform to dataset
reduced = pca.transform(text_features_df)

reduced

array([[ 7.11977141e+01,  3.65311908e-01,  2.62508380e+00,
         4.70094407e-01,  2.02615702e-15],
       [-8.72383083e+00, -1.30747118e+01, -8.66676714e+00,
        -3.82018761e+00,  1.29549149e-14],
       [-2.37322992e+01, -3.14313894e+00,  3.39074056e+00,
         1.08333528e+01,  4.39787096e-14],
       [-1.24100831e+01,  1.45643413e+01, -8.06196648e+00,
        -4.84247073e-01, -6.04932771e-14],
       [-2.63315010e+01,  1.28819752e+00,  1.07129093e+01,
        -6.99901252e+00,  1.74166237e-15]])

In [47]:
reduced_df = pd.DataFrame(reduced, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])

reduced_df

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5
0,71.197714,0.365312,2.625084,0.470094,2.026157e-15
1,-8.723831,-13.074712,-8.666767,-3.820188,1.295491e-14
2,-23.732299,-3.143139,3.390741,10.833353,4.397871e-14
3,-12.410083,14.564341,-8.061966,-0.484247,-6.049328e-14
4,-26.331501,1.288198,10.712909,-6.999013,1.741662e-15


TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# In a large text corpus, some words will be very present (e.g. “the”, “a”, “is” in English) hence carrying very little meaningful information about the actual contents of the document. 
# If we were to feed the direct count data directly to a classifier those very frequent terms would shadow the frequencies of rarer yet more interesting terms.
# In order to re-weight the count features into floating point values suitable for usage by a classifier it is very common to use the tf–idf transform.
# Tf means term-frequency while tf–idf means term-frequency times inverse document-frequency: .

tfidf = TfidfVectorizer(sublinear_tf = True, norm = 'l2')
# set sublinear_tf to true, to use the log of frequency, as word frequency follows an exponential distribution
# normalize the vectors so that length of a document does not bias its representation


tfidf

TfidfVectorizer(sublinear_tf=True)

In [49]:
X = tfidf.fit_transform(corpus)
X

<4x23 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [50]:
pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names())

# The higher the number, the more important the word is in the document 

Unnamed: 0,account,agreement,car,claim,company,credit,damage,dispute,finance,fraudulent,...,midatlantic,mortgage,owe,payment,provide,qualify,recently,report,settle,xx
0,0.0,0.0,0.421765,0.0,0.332524,0.0,0.0,0.0,0.0,0.421765,...,0.0,0.0,0.0,0.0,0.421765,0.0,0.0,0.0,0.0,0.0
1,0.344315,0.0,0.0,0.0,0.344315,0.436719,0.436719,0.0,0.436719,0.0,...,0.436719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.332524,0.421765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.421765,0.0,0.0,0.0,0.421765,0.421765,0.0,0.421765,0.0
3,0.0,0.0,0.0,0.310025,0.0,0.0,0.0,0.310025,0.0,0.0,...,0.0,0.0,0.310025,0.310025,0.0,0.0,0.0,0.310025,0.0,0.650622


In [51]:
X = tfidf.fit_transform(text_data['consumer_complaints_cleaned'])
X

<44x930 sparse matrix of type '<class 'numpy.float64'>'
	with 2182 stored elements in Compressed Sparse Row format>

In [52]:
text_features_df = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names()).head(n = 5)
text_features_df

Unnamed: 0,able,absolutely,accept,acceptable,access,accident,accordingly,account,accounting,acct,...,workshop,worth,write,wrong,wrongfully,xx,xxwhy,year,yes,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.080791,0.08706,0.169549,0.0,...,0.0,0.0,0.072827,0.0,0.080791,0.12469,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.107828,0.0,0.058402,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148543,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.090619,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088229,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.11602,0.0,0.066303,0.0,0.0
4,0.0,0.0,0.0,0.120108,0.120108,0.0,0.0,0.046361,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.056512,0.0,0.0,0.0,0.0


Principal Component Analysis for Dimensionality Reduction

In [53]:
pca = PCA(n_components = 5)
# prepare transform on dataset
pca.fit(text_features_df)
# apply transform to dataset
reduced = pca.transform(text_features_df)

reduced

array([[ 2.64546180e-01, -3.61162747e-01, -5.71455448e-01,
        -3.96965150e-01,  1.60028241e-16],
       [-5.11105745e-01, -3.72418466e-01, -9.53391133e-02,
         5.48972789e-01, -1.57426155e-16],
       [-5.38687690e-01,  2.30456708e-01,  3.20220863e-01,
        -5.10891706e-01,  4.92227786e-16],
       [ 2.22362072e-01,  7.45494570e-01, -2.34538634e-01,
         2.69064957e-01,  6.67868538e-16],
       [ 5.62885183e-01, -2.42370065e-01,  5.81112331e-01,
         8.98191096e-02, -1.08593690e-15]])

In [54]:
reduced_df = pd.DataFrame(reduced, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])

reduced_df

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5
0,0.264546,-0.361163,-0.571455,-0.396965,1.600282e-16
1,-0.511106,-0.372418,-0.095339,0.548973,-1.574262e-16
2,-0.538688,0.230457,0.320221,-0.510892,4.922278e-16
3,0.222362,0.745495,-0.234539,0.269065,6.678685e-16
4,0.562885,-0.24237,0.581112,0.089819,-1.085937e-15


Word Embeddings

In [55]:
! python3 -m pip install tensorflow-hub

You should consider upgrading via the '/Users/shaq/.pyenv/versions/3.8.10/bin/python3 -m pip install --upgrade pip' command.[0m


In [56]:
# In natural language processing, Word embedding is a term used for the representation of words for text analysis,
#  typically in the form of a real-valued vector that encodes the meaning of the word such that the words that are closer in the vector space are expected to be similar in meaning.


import tensorflow_hub as hub

model = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2")
embeddings = model(corpus)

print(embeddings.shape)  

(4, 128)


In [57]:
pd.DataFrame(embeddings.numpy())

# black box - the words are embedded based on the pre trained model that is used - transforms the text into numbers
# https://monkeylearn.com/blog/word-embeddings-transform-text-numbers/

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.038073,0.264463,0.048195,-0.141991,-0.247594,0.06398,0.122019,-0.02096,-0.176111,0.307387,...,0.279637,0.069571,-0.005267,-0.134918,-0.012667,-0.010323,-0.014336,0.124786,0.100044,-0.021039
1,-0.069166,0.185754,0.013495,-0.097312,-0.112262,-0.002485,0.017564,-0.140766,0.052705,-0.040677,...,0.090535,0.252718,-0.148882,0.028575,-0.157523,-0.005775,0.014667,0.126651,-0.077523,0.036436
2,0.143959,0.040201,0.131933,-0.163261,-0.153106,0.173849,0.085214,0.096441,0.077923,0.14729,...,0.101364,0.21483,0.020511,0.005997,-0.184784,-0.000469,-0.027314,0.211081,0.0454,-0.037874
3,-0.007749,0.201764,-0.024864,-0.285442,-0.008982,0.046572,-0.175117,0.31593,-0.052329,0.078106,...,0.048163,0.109207,0.034085,-0.067236,-0.031543,-0.046939,0.001894,0.063037,-0.015323,-0.2294


In [58]:
# embedding all the text

embeddings = model(text_data['consumer_complaints_cleaned'])

print(embeddings.shape)  

(44, 128)


In [59]:
pd.DataFrame(embeddings.numpy())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.515928,0.886004,-0.095039,-1.90936,0.472184,1.199747,-0.096305,1.225368,-0.035325,0.673008,...,0.284308,1.267973,0.493323,-0.034267,-0.086624,0.060018,0.8012,0.011742,0.582289,-1.256829
1,0.486961,1.115987,-0.068814,-1.208759,0.079021,0.358721,0.397423,0.249672,-0.443215,0.660866,...,-0.102162,0.20548,-0.040274,-0.343966,0.361158,0.286191,0.022067,-0.149214,0.574115,-0.152614
2,0.596453,0.620757,0.016074,-0.784194,0.034492,0.130245,0.270735,0.126117,0.080723,0.555793,...,-0.094275,0.352287,-0.08143,-0.255998,0.005563,0.166629,0.100017,0.061587,0.543561,-0.053615
3,0.584578,0.45776,0.549587,-0.932812,-0.101567,0.456298,0.346412,0.478145,0.010464,0.540493,...,-0.017639,0.949757,0.113978,-0.277338,-0.346869,0.075336,0.128787,0.009813,0.706084,-0.541401
4,0.478477,0.608943,-0.196919,-0.804961,-0.260767,0.28327,0.319497,0.186822,-0.120601,0.888518,...,0.026355,0.476502,-0.022652,-0.517649,-0.077582,0.212551,-0.117033,0.164736,0.389277,-0.062754
5,0.924856,0.925044,0.025454,-1.695489,-0.329399,0.043155,0.561555,0.220304,0.136015,0.605098,...,-0.107479,0.875495,0.057763,-0.307264,-0.162559,0.291617,0.160788,0.286411,0.818867,-0.1537
6,0.8147,1.065405,-0.029501,-2.292808,0.153121,0.90449,-0.019001,1.262061,-0.000946,0.956282,...,0.050122,0.883754,0.453948,-0.090303,0.098723,0.240984,0.236366,-0.082214,0.835546,-1.152121
7,0.19813,0.301659,0.084797,-0.245663,-0.167861,0.083673,-0.101693,0.071951,-0.131065,0.178811,...,-0.04619,0.151211,0.006037,0.00531,0.085914,0.004807,-0.029271,0.090201,0.061405,-0.222765
8,0.138408,0.318974,0.028185,-0.526174,-0.14703,0.115458,0.038369,-0.03313,-0.005632,0.299921,...,-0.033131,0.118216,-0.002534,-0.014765,0.096689,0.09959,-0.144528,0.005114,0.141343,-0.15125
9,-0.036331,0.508712,-0.137589,-0.370079,0.059352,0.061299,0.02786,0.082903,-0.106118,0.123751,...,-0.111689,-0.047067,0.145813,-0.095298,0.078018,0.009336,0.121719,-0.101795,0.025952,0.080499


In [60]:
# Resources:

# https://theiconic.tech/learning-to-rank-is-good-for-your-ml-career-part-1-background-and-word-embeddings-1867c8703c4c
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# https://realpython.com/natural-language-processing-spacy-python/