In [None]:
import pandas as pd

In [39]:
# Import data
df = pd.read_csv('complaints.csv')

df.head()

  df = pd.read_csv('complaints.csv')


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2023-01-10,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,,,"EQUIFAX, INC.",AL,36736.0,,Other,Web,2023-01-10,In progress,Yes,,6414057
1,2023-01-09,Checking or savings account,Checking account,Managing an account,Cashing a check,,,FIFTH THIRD FINANCIAL CORPORATION,OH,45208.0,,,Phone,2023-01-09,Closed with explanation,Yes,,6416102
2,2023-01-09,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,,,"Bull City Financial Solutions, Inc",NC,27858.0,,,Web,2023-01-09,Closed with explanation,Yes,,6415994
3,2023-01-09,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Difficulty submitting a dispute or getting inf...,,,"EQUIFAX, INC.",TX,76002.0,,,Web,2023-01-09,In progress,Yes,,6418427
4,2023-01-09,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Difficulty submitting a dispute or getting inf...,,,"EQUIFAX, INC.",TX,76002.0,,,Web,2023-01-09,In progress,Yes,,6418451


In [40]:
# Select Products and Consumer Complaint narrative
new_df = df[['Product','Consumer complaint narrative']]
new_df.shape

(3244309, 2)

In [41]:
new_df['Consumer complaint narrative'].isna().sum()

2072247

In [42]:
# Remove NA values
new_df = new_df.dropna()
new_df.shape

(1172062, 2)

In [43]:
# Get rid of columns with multiple product lables
new_df = new_df[new_df['Product'].str.count(',') == 0]
new_df.head()

Unnamed: 0,Product,Consumer complaint narrative
40,Debt collection,"This debt collector company by name, Credit Ma..."
63,Mortgage,We were a victim of Hurricane Ian. My wife and...
159,Debt collection,"I sent a letter to I.C. Systems on XX/XX/2022,..."
185,Debt collection,I have been a member with USAA Federal Savings...
467,Vehicle loan or lease,I bought my leased vehicle from XXXX XXXX in X...


In [44]:
# For testing purpose, deal with the first 100 rows first
new_df = new_df.iloc[:1000,]

In [80]:
# Get part-of-speech tags for each token in a complaint
import nltk
from nltk import word_tokenize

new_df['tokenized'] = new_df['Consumer complaint narrative'].str.lower().apply(nltk.word_tokenize)
new_df[['tokenized']].head()

Unnamed: 0,tokenized
40,"[this, debt, collector, company, by, name, ,, ..."
63,"[we, were, a, victim, of, hurricane, ian, ., m..."
159,"[i, sent, a, letter, to, i.c, ., systems, on, ..."
185,"[i, have, been, a, member, with, usaa, federal..."
467,"[i, bought, my, leased, vehicle, from, xxxx, x..."


In [81]:
## Move this to the end

from nltk import pos_tag

new_df['tagged'] = new_df['tokenized'].apply(nltk.pos_tag)
new_df[['tagged']].head()

Unnamed: 0,tagged
40,"[(this, DT), (debt, NN), (collector, NN), (com..."
63,"[(we, PRP), (were, VBD), (a, DT), (victim, NN)..."
159,"[(i, NN), (sent, VBD), (a, DT), (letter, NN), ..."
185,"[(i, NNS), (have, VBP), (been, VBN), (a, DT), ..."
467,"[(i, NN), (bought, VBD), (my, PRP$), (leased, ..."


In [50]:
# Get rid of all punctuation
import string
regular_punct = list(string.punctuation)

# text as the row
def remove_punctuation(text,punct_list):
    for punc in punct_list:
        if punc in text:
            text = list(map(lambda x: x.replace(punc,''),text))
    return text

In [147]:
new_df['punct_removed'] = [remove_punctuation(w,regular_punct) for w in new_df['tokenized']]
new_df['punct_removed'].head()


40     [this, debt, collector, company, by, name, , c...
63     [we, were, a, victim, of, hurricane, ian, , my...
159    [i, sent, a, letter, to, ic, , systems, on, xx...
185    [i, have, been, a, member, with, usaa, federal...
467    [i, bought, my, leased, vehicle, from, xxxx, x...
Name: punct_removed, dtype: object

In [113]:
# Remove stop words
from nltk.corpus import stopwords

stop_words = nltk.corpus.stopwords.words('english')

# Query as the row 
def remove_stopwords(query):
    result = [word for word in query if word not in stop_words]
    #result = ' '.join(resultwords)
    return result

In [115]:
new_df['stop_words_removed'] = [remove_stopwords(w) for w in new_df['punct_removed']]
#[w for w in new_df['punct_removed'] if w not in stop_words]
new_df['stop_words_removed'].head()

40     [debt, collector, company, name, , credit, man...
63     [victim, hurricane, ian, , wife, took, time, g...
159    [sent, letter, ic, , systems, xx/xx/2022, , co...
185    [member, usaa, federal, savings, bank, 25, yea...
467    [bought, leased, vehicle, xxxx, xxxx, xx/xx/20...
Name: stop_words_removed, dtype: object

In [148]:
from nltk import pos_tag

new_df['test_tagged'] = new_df['stop_words_removed'].apply(nltk.pos_tag)
new_df[['test_tagged']].head()

Unnamed: 0,test_tagged
40,"[(debt, NN), (collector, NN), (company, NN), (..."
63,"[(victim, NN), (hurricane, NN), (ian, JJ), (, ..."
159,"[(sent, JJ), (letter, NN), (ic, NN), (, NNP), ..."
185,"[(member, NN), (usaa, JJ), (federal, JJ), (sav..."
467,"[(bought, NN), (leased, VBD), (vehicle, NN), (..."


In [149]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

# where treebank_tag is the second element in the tuple of ['tagged']

In [190]:
#lemmatizer.lemmatize('owe', get_wordnet_pos('PRP'))
get_wordnet_pos('PRP')

''

In [185]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

for (token,tag) in new_df.iloc[0]['tagged']:
    print(token,tag)
#    print(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))



this DT
debt NN
collector NN
company NN
by IN
name NN
, ,
credit NN
management NN
lp NN
is VBZ
falsely RB
representing VBG
that IN
i JJ
owe VBP
the DT
sum NN
of IN
{ (
$ $
59.00 CD
} )
to TO
xxxx VB
xxxx NNP
xxxx NNP
xxxxxxxx NNP
xxxx NNP
xxxx NNP
. .
i NN
have VBP
never RB
come VBN
across IN
the DT
name NN
of IN
xxxx NNP
, ,
xxxxxxxx NNP
xxxx NNP
xxxx NNP
xxxx NNP
xxxx NNP
, ,
let VB
alone JJ
order NN
or CC
request VB
any DT
service NN
or CC
product NN
from IN
them PRP
. .
i NN
have VBP
also RB
never RB
received VBD
any DT
bill NN
from IN
this DT
xxxx JJ
xxxxxxxx NNP
xxxx NNP
xxxxxxxx NNP
xxxx NNP
. .
i NN
received VBD
only RB
one CD
debt NN
collection NN
notice NN
from IN
credit NN
management NN
lp NN
and CC
this DT
was VBD
dated VBN
xx/xx/22 NNP
. .
i NN
responded VBD
to TO
them PRP
asking VBG
for IN
details NNS
of IN
this DT
unknown JJ
debt NN
and CC
requested VBN
verification NN
and CC
validation NN
amongst RB
other JJ
details NNS
. .
credit NN
management NN
lp NN
responded VBD
wi

In [154]:
# Lemmatize all the tokens based on the POS tags you created
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# text as the row
def lemmatize_text(text):
    result = []
    for (token,tag) in text: #new_df.iloc[1]['test_tagged']:
        result.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

    #return [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for (token, tag) in text]
    return result
    
test = [lemmatize_text(w) for w in new_df['test_tagged']]
test
    
#new_df['text_lemmatized'] = new_df['stop_words_removed'].apply(lemmatize_text)


KeyError: ''

In [None]:
x = [lemmatizer.lemmatize(token,get_wordnet_pos(tag)) for (token,tag) in tokenize_pos(i) and set(list(token)) != {'x'}]
x

In [144]:
new_df.head()

Unnamed: 0,Product,Consumer complaint narrative,tokenized,tagged,punct_removed,stop_words_removed,text_lemmatized,test_tagged
40,Debt collection,"This debt collector company by name, Credit Ma...","[this, debt, collector, company, by, name, ,, ...","[(this, DT), (debt, NN), (collector, NN), (com...","[this, debt, collector, company, by, name, , c...","[debt, collector, company, name, , credit, man...","[debt, collector, company, name, , credit, man...","[(debt, NN), (collector, NN), (company, NN), (..."
63,Mortgage,We were a victim of Hurricane Ian. My wife and...,"[we, were, a, victim, of, hurricane, ian, ., m...","[(we, PRP), (were, VBD), (a, DT), (victim, NN)...","[we, were, a, victim, of, hurricane, ian, , my...","[victim, hurricane, ian, , wife, took, time, g...","[victim, hurricane, ian, , wife, took, time, g...","[(victim, NN), (hurricane, NN), (ian, JJ), (, ..."
159,Debt collection,"I sent a letter to I.C. Systems on XX/XX/2022,...","[i, sent, a, letter, to, i.c, ., systems, on, ...","[(i, NN), (sent, VBD), (a, DT), (letter, NN), ...","[i, sent, a, letter, to, ic, , systems, on, xx...","[sent, letter, ic, , systems, xx/xx/2022, , co...","[sent, letter, ic, , system, xx/xx/2022, , col...","[(sent, JJ), (letter, NN), (ic, NN), (, NNP), ..."
185,Debt collection,I have been a member with USAA Federal Savings...,"[i, have, been, a, member, with, usaa, federal...","[(i, NNS), (have, VBP), (been, VBN), (a, DT), ...","[i, have, been, a, member, with, usaa, federal...","[member, usaa, federal, savings, bank, 25, yea...","[member, usaa, federal, saving, bank, 25, year...","[(member, NN), (usaa, JJ), (federal, JJ), (sav..."
467,Vehicle loan or lease,I bought my leased vehicle from XXXX XXXX in X...,"[i, bought, my, leased, vehicle, from, xxxx, x...","[(i, NN), (bought, VBD), (my, PRP$), (leased, ...","[i, bought, my, leased, vehicle, from, xxxx, x...","[bought, leased, vehicle, xxxx, xxxx, xx/xx/20...","[bought, leased, vehicle, xxxx, xxxx, xx/xx/20...","[(bought, NN), (leased, VBD), (vehicle, NN), (..."
