In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import seaborn as sns
%matplotlib inline
#%matplotlib notebook
import nltk
import gensim
import spacy

In [2]:
# load data
df=pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,Complaint-ID,Date-received,Transaction-Type,Complaint-reason,Company-response,Date-sent-to-company,Complaint-Status,Consumer-disputes,Consumer-complaint-summary
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d..."
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel..."
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea..."
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...
5,Tr-6,8/2/2016,Mortgage,"Loan modification,collection,foreclosure",,8/3/2016,Closed with explanation,Yes,The owner of my original mortgage filed for ba...
6,Tr-7,3/26/2017,Credit reporting,Incorrect information on credit report,,3/26/2017,Closed with explanation,No,J'ai été victime d'une fraude d'identité et j'...
7,Tr-8,10/15/2016,Bank account or service,Problems caused by my funds being low,Company has responded to the consumer and the ...,10/15/2016,Closed with explanation,No,"Je suis en train de faire faillite et, par con..."
8,Tr-9,1/18/2016,Debt collection,Cont'd attempts collect debt not owed,,1/18/2016,Closed with explanation,Yes,Una agencia de cobranza me hizo adulterar de q...
9,Tr-10,8/17/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,8/17/2015,Closed with non-monetary relief,No,"Le XXXX / XXXX / 2015, j'ai reçu une réponse d..."


In [3]:
df.info

<bound method DataFrame.info of       Complaint-ID Date-received  \
0             Tr-1    11/11/2015   
1             Tr-2      7/7/2015   
2             Tr-3      5/7/2015   
3             Tr-4    11/12/2016   
4             Tr-5     9/29/2016   
5             Tr-6      8/2/2016   
6             Tr-7     3/26/2017   
7             Tr-8    10/15/2016   
8             Tr-9     1/18/2016   
9            Tr-10     8/17/2015   
10           Tr-11     1/10/2016   
11           Tr-12      3/5/2016   
12           Tr-13     1/29/2016   
13           Tr-14     9/24/2015   
14           Tr-15    12/28/2016   
15           Tr-16     8/17/2017   
16           Tr-17      5/2/2017   
17           Tr-18     9/29/2016   
18           Tr-19      3/2/2017   
19           Tr-20     8/31/2016   
20           Tr-21     2/18/2016   
21           Tr-22      9/8/2017   
22           Tr-23     4/28/2016   
23           Tr-24     5/19/2016   
24           Tr-25     3/10/2017   
25           Tr-26     6/22/2017

In [4]:
df.isnull().sum()

Complaint-ID                      0
Date-received                     0
Transaction-Type                  0
Complaint-reason                  0
Company-response              22506
Date-sent-to-company              0
Complaint-Status                  0
Consumer-disputes              7698
Consumer-complaint-summary        0
dtype: int64

In [5]:
df.text

AttributeError: 'DataFrame' object has no attribute 'text'

In [6]:
df['Consumer-complaint-summary']

0        Seterus, Inc a déposé un faux rapport auprès d...
1        XX / XX / XXXX La requête en faillite n ° XXXX...
2        El XXXX / XXXX / 15, estaba preparando el vuel...
3        The loan was paid in XXXX XXXX. In XXXX, 4 yea...
4        J'ai obtenu un compte de crédit de soins pour ...
5        The owner of my original mortgage filed for ba...
6        J'ai été victime d'une fraude d'identité et j'...
7        Je suis en train de faire faillite et, par con...
8        Una agencia de cobranza me hizo adulterar de q...
9        Le XXXX / XXXX / 2015, j'ai reçu une réponse d...
10       My mortgage provider, United Wholesale Mortgag...
11       My boyfriend and I bought a XXXX sofa and we h...
12       Le XXXX XXXX, XXXX à environ XXXX heures, j’ai...
13       Both accounts below are settled and closed. Yo...
14       In XX / XX / XXXX we open a lease with XXXX DB...
15       I had scheduled a full TIME payment in their o...
16       He llamado a esta compañía muchas veces para i.

In [7]:
# Fill any blank fields
# all_text=df['Consumer-complaint-summary'].fillna("", inplace=True)

# Join the title and text
# all_text = df.title.str.cat(df.text, sep=' ')

# Tokenize using nltk
words = nltk.word_tokenize(" ".join(df['Consumer-complaint-summary'].tolist()))

In [8]:
len(words)

9293734

In [9]:
from collections import Counter
w=Counter(words)
print(w.most_common(10))

[('XXXX', 391671), ('.', 388421), (',', 295657), ('the', 250426), ('I', 199642), ('de', 169060), ('to', 163347), ('a', 153220), ('and', 142593), ('that', 117835)]


In [10]:
from nltk.corpus import stopwords
import string

stop = stopwords.words('english') 
cleanwords = [i for i in words if i not in stop and i.isalpha() and len(i) > 2]

In [11]:
len(cleanwords)

4564186

In [12]:
w=Counter(cleanwords)
print(w.most_common(10))

[('XXXX', 391671), ('que', 92236), ('credit', 34450), ('account', 32246), ('The', 22305), ('payment', 20708), ('una', 20012), ('would', 18307), ('les', 17519), ('loan', 17489)]


In [13]:
bigrams = nltk.bigrams(cleanwords)
counter = Counter(bigrams)
print(counter.most_common(10))

[(('XXXX', 'XXXX'), 177114), (('credit', 'report'), 8957), (('Wells', 'Fargo'), 5369), (('credit', 'card'), 4775), (('Bank', 'America'), 3828), (('XXXX', 'The'), 3235), (('XXXX', 'que'), 2805), (('informe', 'crédito'), 2745), (('mon', 'compte'), 2703), (('account', 'XXXX'), 2561)]


In [15]:
df['Transaction-Type'].nunique()

18

In [37]:
d1 = [pd.get_dummies(df['Transaction-Type'])]

In [38]:
d2 = [pd.get_dummies(df['Complaint-reason'])]


In [45]:
d3 = [pd.get_dummies(df['Company-response'])]

In [46]:
d4 = [pd.get_dummies(df['Consumer-disputes'])]

In [48]:
df_1=pd.concat(d1,d2,d3,d4,axis=0)

TypeError: concat() got multiple values for argument 'axis'

In [49]:
y=df['Complaint-Status']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(df['Consumer-complaint-summary'], y, test_size=0.33, random_state=53)

In [51]:
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [52]:
count_vectorizer.get_feature_names()[:10]

['00', '000', '0000', '0001', '000k', '000xx', '001', '002', '0054', '0077']

In [54]:
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
count_df.head(10) 

MemoryError: 