# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import keras
import nltk
import re
import codecs
import vaderSentiment
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from collections import namedtuple

Using TensorFlow backend.


In [2]:
# Read the complaint narratives file; the first CSV column becomes the row index.
nars=pd.read_csv("narratives.csv",index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows to check that the columns loaded as expected.
nars.head()

Unnamed: 0,date_rec,prod,subprod,issue,sub_issue,narrative,pub_resp,company,state,zip,Tags,consent,how_submit,date_to_company,comp_resp,timely_resp,cons_disp,id
177,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,"""Wells Fargo is charging me an exorbitant amou...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,90016,,Consent provided,Web,01/11/2019,Closed with explanation,Yes,,3122170
186,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,"""I have filed several complaints against Wells...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,90016,,Consent provided,Web,01/11/2019,Closed with explanation,Yes,,3122004
188,01/11/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,"""Negative remark on my report that I don't rec...",Company has responded to the consumer and the ...,"HCFS Health Care Financial Services, Inc.",FL,330XX,Servicemember,Consent provided,Web,01/11/2019,Closed with explanation,Yes,,3122533
189,01/11/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,"""On XX/XX/2016, this debt for {$160.00} appear...",Company believes it acted appropriately as aut...,Penn Credit Corporation,GA,302XX,,Consent provided,Web,01/11/2019,Closed with explanation,Yes,,3122250
195,01/11/2019,Mortgage,VA mortgage,Trouble during payment process,,"""My mortgage was with XXXX and was sold to Fla...",Company has responded to the consumer and the ...,"FLAGSTAR BANK, FSB",WA,983XX,Servicemember,Consent provided,Web,01/11/2019,Closed with explanation,Yes,,3122028


In [4]:
# (rows, columns) of the loaded frame.
nars.shape

(359209, 18)

The first thing to do is look at the response variable in greater detail. We previously coded the response variable into three categories, yet one of them, "untimely response", does not really provide any actionable decision for a user. Additionally, very few complaints receive untimely responses (less than 1%), so we will remove these from consideration for now.

In [5]:
# Tally each raw company-response label; dropna=False also counts missing values.
nars['comp_resp'].value_counts(dropna=False)

Closed with explanation            289812
Closed with non-monetary relief     42580
Closed with monetary relief         20566
Closed                               3741
Untimely response                    2506
NaN                                     4
Name: comp_resp, dtype: int64

We'll recode these responses into three categories: closed, closed with relief, and untimely response. Additionally, we will remove any rows with a missing response for now.

In [6]:
# First, drop the rows whose company response (comp_resp) is missing,
# then display the resulting (rows, columns) shape.
nars=nars.dropna(subset=['comp_resp'])
nars.shape

(359205, 18)

In [7]:
#re-define response variable
def response(series):
    """Collapse one raw company-response label into a coarse category.

    Parameters
    ----------
    series : object
        A single value from the ``comp_resp`` column (despite the name, this
        is one label, not a whole Series).

    Returns
    -------
    str
        'closed' for 'Closed with explanation' / 'Closed',
        'relief' for either of the two 'Closed with ... relief' labels,
        'late' for 'Untimely response', and 'unknown' for anything else.
    """
    if series in ('Closed with explanation', 'Closed'):
        return 'closed'
    if series in ('Closed with non-monetary relief', 'Closed with monetary relief'):
        return 'relief'
    if series == 'Untimely response':
        return 'late'
    # Any label not listed above (including missing values) is unrecognised.
    return 'unknown'
    
# Recode every raw comp_resp label into the new 'response' column.
nars['response']=nars['comp_resp'].apply(response)

In [8]:
# Drop the complaints recoded as 'late' (raw label 'Untimely response'),
# then display the resulting (rows, columns) shape.
nars=nars[nars.response !='late']
nars.shape

(356699, 19)

# Getting the sentiment of texts

We first need to get the sentiment of the texts as a meta-feature. We'll compute the polarity using this function, and assign negative complaints as -1, neutral complaints as 0, and positive complaints as 1.

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single analyser instance, created once and reused by sentiment_analyzer_scores.
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(text):
    """Label a text as positive (1), neutral (0) or negative (-1).

    The label is derived from the 'compound' entry of
    ``analyser.polarity_scores(text)``:

    * compound >= 0.05          ->  1
    * -0.05 < compound < 0.05   ->  0
    * anything else             -> -1
    """
    compound = analyser.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 1
    if -0.05 < compound < 0.05:
        return 0
    return -1

In [11]:
# Score every narrative: pull out the narrative column, then collect one
# sentiment label (-1, 0 or 1) per complaint, in row order.
narratives = nars['narrative']

sentiment = [sentiment_analyzer_scores(text) for text in narratives]

In [27]:
# Number of sentiment labels collected (one per narrative scored above).
len(sentiment)

356699

In [52]:

#sent_lis=pd.DataFrame(sentiment)
#write sentiment to hard drive to read in and merge later
#sent_lis.to_csv('sentiment_28jan.csv')
#sent_lis.shape
#sent_w_ids=pd.merge(ids,sent_lis,left_index=True)
#sent_w_ids.shape

# Getting the part of speech tags

We also want the part-of-speech tags for each complaint: how many adjectives, nouns, verbs, and other parts of speech are present in each complaint?

In [15]:
import spacy
from collections import Counter
# Load the English spaCy pipeline used by postag.
# NOTE(review): 'en' is a spaCy 2.x shortcut link; spaCy 3+ needs a full
# package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
nlp=spacy.load('en')

def postag(text):
    """Count how often each part-of-speech tag occurs in ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``pos_`` attribute of every resulting token is tallied.

    Returns
    -------
    collections.Counter
        Maps each tag string to its number of occurrences.
    """
    tokens = nlp(text)
    tag_counts = Counter(token.pos_ for token in tokens)
    return tag_counts

In [16]:
# One Counter of part-of-speech tags per narrative, in row order.
narratives = nars['narrative']

pos_sents = [postag(text) for text in narratives]

In [35]:
#part_of_speech=pd.DataFrame(pos_sents)
#part_of_speech.head()
#part_of_speech.shape
#part_of_speech.to_csv('all_pos.csv')

In [None]:
# Number of part-of-speech tallies collected (one per narrative tagged above).
len(pos_sents)

In [21]:
#nars_w_sent_pos=pd.merge(nars_w_sent,part_of_speech,left_index=True,right_index=True)
#nars_w_sent_pos.to_csv("nars_senti_pos.csv")

(157241, 36)

# Functions to Clean Narrative Text

Now that the data frame contains only the observations we are interested in considering further, we can clean the text and derive features from it. We'll consider two sets of features: "meta-features" that describe the complaint, and the unigrams and bigrams that appear in the complaint.

The meta-features we will consider are the length of the complaint, in terms of the number of words, the number of sentences, and the average number of words per sentence. In addition to these descriptives, we will also consider the sentiment of the complaint and the different parts-of-speech in each complaint. 

We'll first derive the features about the length of the complaint.

In [53]:
# Sentence/word tokenizer producing the length meta-features.
def sent_word_tok(text):
    """Compute length descriptors for a single narrative.

    Parameters
    ----------
    text : str
        The narrative to measure.

    Returns
    -------
    dict
        'num_word'       -- number of tokens from ``nltk.word_tokenize``
        'num_sent'       -- number of sentences from ``nltk.sent_tokenize``
        'avg_words_sent' -- num_word / num_sent, or 0.0 when the text has no
                            words or no sentences.
    """
    num_sents = len(nltk.sent_tokenize(text))
    num_words = len(nltk.word_tokenize(text))

    # Empty or whitespace-only text has nothing to average; assign 0.0
    # explicitly instead of dividing by a zero sentence count.
    if num_words == 0 or num_sents == 0:
        avg_word_sent = 0.0
    else:
        avg_word_sent = num_words / num_sents
    return {'num_word': num_words, 'num_sent': num_sents, 'avg_words_sent': avg_word_sent}

In [54]:
# One dict of length descriptors per narrative, in row order.
length = [sent_word_tok(text) for text in narratives]

In [56]:
# One row per narrative, with the num_word, num_sent and avg_words_sent columns.
sent_desc=pd.DataFrame(length)
# Uncomment to persist these features; a later cell reads 'sent_desc_28jan.csv' back in.
#sent_desc.to_csv('sent_desc_28jan.csv')

In [57]:
#nars_w_sent_pos_len=pd.merge(nars_w_sent_pos,sent_desc,left_index=True,right_index=True)
#nars_w_sent_pos_len.to_csv("nars_senti_pos_len.csv")
#nars_w_sent_pos_len.head()

# Combine the derived features to train the models

Now we will read in the CSV files that were saved to disk and combine the derived features with the other data about the complaints. We will first read in the CSV files and merge them together, then merge the result with the main file containing information about the complaints.

In [9]:
# Read the previously saved feature files; in each, the first CSV column is the row index.
# Sentiment labels: the single data column is named '0' on disk, so rename it.
sentiment_df=pd.read_csv('sentiment_28jan.csv',index_col=0)
sentiment_df=sentiment_df.rename(columns={'0':'sentiment'})
# Part-of-speech tag counts.
pos_df=pd.read_csv('all_pos.csv',index_col=0)
# Length descriptors (word count, sentence count, average words per sentence).
leng_df=pd.read_csv('sent_desc_28jan.csv',index_col=0)

In [10]:
# Join the three derived-feature tables on their row index.
sent_and_pos = sentiment_df.merge(pos_df, left_index=True, right_index=True)
meta_all = sent_and_pos.merge(leng_df, left_index=True, right_index=True)

# Work on a copy of nars with a fresh 0..n-1 index (the old index is kept as
# an 'index' column); presumably this matches the positional index of the
# saved feature files -- verify the row order is unchanged since they were written.
short_nars = nars.copy().reset_index()

# Attach the derived features to the complaint records.
nars_meta = short_nars.merge(meta_all, left_index=True, right_index=True)

nars_meta.head()

Unnamed: 0,index,date_rec,prod,subprod,issue,sub_issue,narrative,pub_resp,company,state,...,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,avg_words_sent,num_sent,num_word
0,177,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,"""Wells Fargo is charging me an exorbitant amou...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,...,37.0,94.0,62,,2.0,144.0,,34.545455,22,760
1,186,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,"""I have filed several complaints against Wells...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,...,7.0,27.0,26,,1.0,38.0,,30.125,8,241
2,188,01/11/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,"""Negative remark on my report that I don't rec...",Company has responded to the consumer and the ...,"HCFS Health Care Financial Services, Inc.",FL,...,1.0,,2,,,2.0,,12.0,1,12
3,189,01/11/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,"""On XX/XX/2016, this debt for {$160.00} appear...",Company believes it acted appropriately as aut...,Penn Credit Corporation,GA,...,18.0,10.0,26,,2.0,45.0,,21.333333,12,256
4,195,01/11/2019,Mortgage,VA mortgage,Trouble during payment process,,"""My mortgage was with XXXX and was sold to Fla...",Company has responded to the consumer and the ...,"FLAGSTAR BANK, FSB",WA,...,22.0,5.0,19,1.0,,48.0,,22.444444,9,202


Now that meta-features have been derived, we need to get unigrams and bigrams that appear in the narratives. We will first clean the text and then generate a matrix representation.

In [11]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; standardize_text removes these words.
stop = stopwords.words('english')

def standardize_text(df,text_field):
    """Clean one text column of ``df`` in place and return ``df``.

    Steps, applied in order to ``df[text_field]``:

    1. remove tokens starting with "http" (URLs), then any leftover "http";
    2. remove "@..." handles;
    3. replace every character outside ``A-Za-z0-9(),!?@'`"_`` and newline
       with a space;
    4. replace any remaining "@" with "at" and drop double quotes;
    5. lower-case the text;
    6. drop whitespace-separated words found in the module-level ``stop`` list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; the column is overwritten.
    text_field : str
        Name of the column to clean. Every value must be a string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_field`` cleaned.
    """
    # (pattern, replacement) pairs; every pattern is a regular expression.
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
        (r"\"", ""),
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would silently skip these steps.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    cleaned = cleaned.str.lower()

    # A set gives constant-time membership tests for the per-word filter.
    stop_words = set(stop)
    cleaned = cleaned.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words)
    )

    df[text_field] = cleaned
    return df

In [12]:
# Clean the 'narrative' column; this overwrites the raw text in nars_meta.
nars_meta = standardize_text(nars_meta, "narrative")

In [13]:
# Preview the rows again to inspect the cleaned narrative text.
nars_meta.head()

Unnamed: 0,index,date_rec,prod,subprod,issue,sub_issue,narrative,pub_resp,company,state,...,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,avg_words_sent,num_sent,num_word
0,177,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,wells fargo charging exorbitant amount fees at...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,...,37.0,94.0,62,,2.0,144.0,,34.545455,22,760
1,186,01/11/2019,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,"filed several complaints wells fargo, opposing...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,...,7.0,27.0,26,,1.0,38.0,,30.125,8,241
2,188,01/11/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,negative remark report recognize,Company has responded to the consumer and the ...,"HCFS Health Care Financial Services, Inc.",FL,...,1.0,,2,,,2.0,,12.0,1,12
3,189,01/11/2019,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,"xx xx 2016, debt 160 00 appeared credit report...",Company believes it acted appropriately as aut...,Penn Credit Corporation,GA,...,18.0,10.0,26,,2.0,45.0,,21.333333,12,256
4,195,01/11/2019,Mortgage,VA mortgage,Trouble during payment process,,mortgage xxxx sold flagstar tried process firs...,Company has responded to the consumer and the ...,"FLAGSTAR BANK, FSB",WA,...,22.0,5.0,19,1.0,,48.0,,22.444444,9,202


In [14]:
# Write the frame with the cleaned narratives and meta-features to CSV.
nars_meta.to_csv("nars_clean_meta_28jan.csv")