# Import software libraries

In [1]:
import sys                                                    # Read system parameters.
import numpy as np                                            # Work with multi-dimensional arrays.
import pandas as pd
import spacy                                                  # Process text.
import nltk                                                   # Process text.
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.data.path.append('/home/jovyan/work/nltk_data/')
import re                                                     # Work with regular expressions.
import warnings                                               # Suppress warnings.
warnings.filterwarnings('ignore')

# Summarize software libraries used.
print('Libraries used in this project:')
print('- Python {}'.format(sys.version))
print('- NumPy {}'.format(np.__version__))
print('- pandas {}'.format(pd.__version__))
print('- spaCy {}'.format(spacy.__version__))
print('- NLTK {}'.format(nltk.__version__))

Libraries used in this project:
- Python 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 23:03:20) 
[GCC 7.3.0]
- NumPy 1.19.2
- pandas 1.1.3
- spaCy 3.0.5
- NLTK 3.5


# Read and preview the text data

In [2]:
complaints_data = pd.read_csv('/home/jovyan/work/Text/data/consumer_loan_complaints.csv')

complaints_data.head()



Unnamed: 0,user_id,Date received,Product,Issue,Consumer complaint narrative,State,ZIP code,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,44fefdad-7045-4be5-890e-12e84ae6fdc9,01/27/2016,Consumer Loan,Account terms and changes,,AL,35180,Phone,01/27/2016,Closed with explanation,Yes,No,1760486
1,c49d5d60-909f-406b-b7ff-51143fcb650b,08/26/2014,Consumer Loan,Account terms and changes,,NC,278XX,Phone,08/29/2014,Closed with non-monetary relief,Yes,No,1001740
2,9b2cd5d2-900e-4052-831f-6489f6d568af,08/22/2012,Consumer Loan,Account terms and changes,,TN,37205,Referral,08/23/2012,Closed with non-monetary relief,Yes,No,140039
3,b7e5b324-268e-4502-81a1-1a025673c2a0,05/07/2013,Consumer Loan,Problems when you are unable to pay,,OH,43081,Web,05/08/2013,Closed with explanation,Yes,Yes,401541
4,684eeb4c-c9c3-4a97-8213-f3962a6c0aba,06/15/2016,Consumer Loan,Managing the line of credit,,NC,27216,Phone,09/08/2016,Closed with non-monetary relief,Yes,No,1970341


# Check the shape of the data

In [3]:

complaints_data.shape

(1824, 13)

# Retrieve information about the data

In [4]:

complaints_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1824 entries, 0 to 1823
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   user_id                       1824 non-null   object
 1   Date received                 1824 non-null   object
 2   Product                       1824 non-null   object
 3   Issue                         1824 non-null   object
 4   Consumer complaint narrative  44 non-null     object
 5   State                         1801 non-null   object
 6   ZIP code                      1789 non-null   object
 7   Submitted via                 1824 non-null   object
 8   Date sent to company          1824 non-null   object
 9   Company response to consumer  1824 non-null   object
 10  Timely response?              1824 non-null   object
 11  Consumer disputed?            1824 non-null   object
 12  Complaint ID                  1824 non-null   int64 
dtypes: int64(1), objec

In [5]:
complaints_data.Issue.value_counts()

Managing the line of credit            806
Account terms and changes              484
Shopping for a line of credit          301
Problems when you are unable to pay    233
Name: Issue, dtype: int64

In [6]:
complaints_data['Company response to consumer'].value_counts()

Closed with explanation            1291
Closed with non-monetary relief     184
Closed with monetary relief         182
Closed without relief                75
Closed                               65
Closed with relief                   19
Untimely response                     8
Name: Company response to consumer, dtype: int64

# Extract a subset of data to consider only consumer complaints

In [7]:
print('Number of users with no complaints data:',
      complaints_data['Consumer complaint narrative'].isnull().sum())


Number of users with no complaints data: 1780


In [8]:
# Remove records with missing complaint narratives.

text_data = complaints_data[~complaints_data \
                            ['Consumer complaint narrative'].isnull()] \
                           [['user_id', 'Consumer complaint narrative']]

text_data.head(n = 3)

Unnamed: 0,user_id,Consumer complaint narrative
53,1a1448a4-bfe5-455f-bc29-dc79ec5fb2c0,"NONE OF YOUR "" MY LOAN IS A '' below apply to ..."
59,5fede48c-096e-4f82-997d-8229007d8318,XX/XX/2014 I received a letter from the IRS st...
65,fd9fc5ff-19bc-424c-880e-c159c110d21f,This was a revolving account in which I paid W...


In [9]:
text_data.shape

(44, 2)

# Preview an example of the consumer complaints

In [10]:
sample_text = text_data['Consumer complaint narrative'].iloc[0]
sample_text


'NONE OF YOUR " MY LOAN IS A \'\' below apply to this situation! This was a car loan but the company is providing fraudulent information this is damaging my credit! \n\nRE : MidAtlantic Finance Company Account No. XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so. It is damaging my credit so much so that I was told I did n\'t qualify for a mortgage. \n\nMost recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ). But that is just the most recent false information that was reported. It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}. Please refer to the following as I NEVER owed MAF {$8100.00} as it reported. That was the origin

# Tokenize the sample text into sentences

In [11]:
nlp = spacy.load('/home/jovyan/work/spacy_data/' +
                 'en_core_web_sm/en_core_web_sm-3.0.0/')

document = nlp(sample_text)

In [12]:
for sentence in document.sents:
    print(sentence)


NONE OF YOUR " MY LOAN IS A '' below apply to this situation!
This was a car loan but the company is providing fraudulent information this is damaging my credit!


RE : MidAtlantic Finance Company Account
No.
XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so.
It is damaging my credit so much so that I was told I did n't qualify for a mortgage.


Most recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ).
But that is just the most recent false information that was reported.
It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}.
Please refer to the following as I NEVER owed MAF {$8100.00} as it reported.
That was the original amoun

# Tokenize the sentences into words

In [None]:
sentence = nlp('It is showing a debt of {$250.00} per month along ' \
               'with XXXX different amounts charged off of the ' \
               '{$950.00} ( plus interest ) and another {$5100.00} ' \
               'that INCLUDES the {$950.00}.')

In [13]:
for token in sentence:
    print(token.text)


4
)
MAF
-
Agreement
XX
/
XX
/
XXXX
-
Agreement
clearly
stating
that
if
the
{
$
500.00
}
was
paid
by
XXXX
XXXX
,
XXXX
,
my
account
would
be
considered
"
settlement
of
the
account
''
yet
MAF
has
reported
it
as
a
PAYMENT
.


# Identify the parts of speech for each token

In [14]:
pos = []

for token in sentence:
    pos.append({'Word': token,
                'Part of Speech': token.pos_
               })

pd.DataFrame(pos)








Unnamed: 0,Word,Part of Speech
0,4,NUM
1,),PUNCT
2,MAF,PROPN
3,-,PUNCT
4,Agreement,PROPN
5,XX,PROPN
6,/,SYM
7,XX,PROPN
8,/,SYM
9,XXXX,PROPN


# Identify stop words

In [16]:


stop = []

for token in sentence:
    stop.append({'Word': token,
                 'Stop Word?': token.is_stop
               })

pd.DataFrame(stop)







Unnamed: 0,Word,Stop Word?
0,4,False
1,),False
2,MAF,False
3,-,False
4,Agreement,False
5,XX,False
6,/,False
7,XX,False
8,/,False
9,XXXX,False


# Stem the text

In [17]:
text = 'This was a car loan but the company is providing ' \
       'fraudulent information this is damaging my credit!'

print(word_tokenize(text))

['This', 'was', 'a', 'car', 'loan', 'but', 'the', 'company', 'is', 'providing', 'fraudulent', 'information', 'this', 'is', 'damaging', 'my', 'credit', '!']


In [18]:
 stemmer = SnowballStemmer(language = 'english')

for token in word_tokenize(text):
    print(token, '-->' , stemmer.stem(token))




This --> this
was --> was
a --> a
car --> car
loan --> loan
but --> but
the --> the
company --> compani
is --> is
providing --> provid
fraudulent --> fraudul
information --> inform
this --> this
is --> is
damaging --> damag
my --> my
credit --> credit
! --> !


# Lemmatize the text

In [20]:
parsed_text = nlp(text)

for token in parsed_text:
    print(token, '-->', token.lemma_)

This --> this
was --> be
a --> a
car --> car
loan --> loan
but --> but
the --> the
company --> company
is --> be
providing --> provide
fraudulent --> fraudulent
information --> information
this --> this
is --> be
damaging --> damage
my --> my
credit --> credit
! --> !


# Transform the text

In [21]:
def spacy_cleaner(original_text):
    """Cleans text data to be processed.
    Removes punctuation, whitespace, numbers, stopwords from the text
    and lemmatizes each token."""

    final_tokens = []
    parsed_text = nlp(original_text)

    for token in parsed_text:
        if token.is_punct or token.is_space or token.like_num or token.is_stop:
            pass
        else:
            if token.lemma_ == '-PRON-':
                final_tokens.append(str(token))  # Keep pronouns as they are.
            else:
                sc_removed = re.sub('[^a-zA-Z]', '', str(token.lemma_))
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    preprocessed_text = re.sub(r'(.)\1+', r'\1\1', joined)

    return preprocessed_text

In [22]:
# Apply transformation to sample.

spacy_cleaner(sample_text)

'LOAN apply situation car loan company provide fraudulent information damage credit MidAtlantic Finance Company Account xx confused current MAF loan MidAtlantic Finance Company report false item xx credit reporting agency continue damage credit tell qualify mortgage recently settle account agreement XX XX XX MAF report payment claim owe dispute XX XX recent false information report show debt month xx different amount charge plus interest include refer following owe MAF report original finance XX xx XX XX payment month XX xx xx XX car purchase XX XX finance HOUSE XX XX MAF statement XX XX XX payment MAF XX XX responsible prior late payment MAF record delinquency xx XX credit report know consider own MAF XX XX charge follow reason car total XX XX XX pay xx payment plus additional interest fee XX XX XX insurance company pay XX XX XX leave balance MAF dispute XX XX give payoff XX xx expiration date dispute month give finally MAF send accounting XX xx support claim payment wrongfully charge

In [23]:
# Compare to sample before transformation.

sample_text

'NONE OF YOUR " MY LOAN IS A \'\' below apply to this situation! This was a car loan but the company is providing fraudulent information this is damaging my credit! \n\nRE : MidAtlantic Finance Company Account No. XXXX - NOT TO BE CONFUSED with my current MAF loan MidAtlantic Finance Company has reported several false items to all XXXX credit reporting agencies, and continues to do so. It is damaging my credit so much so that I was told I did n\'t qualify for a mortgage. \n\nMost recently, I settled this account per agreement on XXXX XXXX, XXXX, yet MAF reported it is a payment on the amount claimed owed ( which has been disputed since XXXX XXXX ). But that is just the most recent false information that was reported. It is showing a debt of {$250.00} per month along with XXXX different amounts charged off of the {$950.00} ( plus interest ) and another {$5100.00} that INCLUDES the {$950.00}. Please refer to the following as I NEVER owed MAF {$8100.00} as it reported. That was the origin

In [24]:
# Apply transformation to entire dataset.

text_data['consumer_complaints_cleaned'] = \
text_data['Consumer complaint narrative'].apply(lambda x: spacy_cleaner(x))

text_data.head(n = 3)




Unnamed: 0,user_id,Consumer complaint narrative,consumer_complaints_cleaned
53,1a1448a4-bfe5-455f-bc29-dc79ec5fb2c0,"NONE OF YOUR "" MY LOAN IS A '' below apply to ...",LOAN apply situation car loan company provide ...
59,5fede48c-096e-4f82-997d-8229007d8318,XX/XX/2014 I received a letter from the IRS st...,XX XX receive letter IRS state owe agency ask ...
65,fd9fc5ff-19bc-424c-880e-c159c110d21f,This was a revolving account in which I paid W...,revolving account pay Wells Fargo National Ban...
