In [105]:
import pandas as pd
import numpy as np

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn import metrics

In [107]:
data = pd.read_csv("./data/consumer_complaints.csv.zip", compression='zip',dtype='unicode')

In [108]:
data = pd.read_csv("./data/consumer_complaints.csv", dtype='unicode')

In [109]:
data.dropna(subset = ['consumer_complaint_narrative'], axis =0, inplace = True)

In [110]:
data.count()

date_received                   66806
product                         66806
sub_product                     46351
issue                           66806
sub_issue                       32932
consumer_complaint_narrative    66806
company_public_response         32776
company                         66806
state                           66620
zipcode                         66617
tags                            11417
consumer_consent_provided       66806
submitted_via                   66806
date_sent_to_company            66806
company_response_to_consumer    66806
timely_response                 66806
consumer_disputed?              66806
complaint_id                    66806
dtype: int64

In [112]:
data['product'].isnull().sum()

0

In [121]:
data['product'].unique()

array(['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card',
       'Credit reporting', 'Student loan', 'Bank account or service',
       'Payday loan', 'Money transfers', 'Other financial service',
       'Prepaid card'], dtype=object)

In [113]:
data['consumer_complaint_narrative'].count()

66806

In [122]:
text, y = data['consumer_complaint_narrative'], data['product']

In [123]:
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=142)

In [124]:
np.unique(y_train)

array(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers',
       'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan'], dtype=object)

In [125]:
np.unique(y_test)

array(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers',
       'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan'], dtype=object)

In [126]:
bow_vectorizer = CountVectorizer().fit(text_train)
bow_train_features = bow_vectorizer.transform(text_train)
bow_test_features = bow_vectorizer.transform(text_test)

In [127]:
print("bow_train_features:\n{}".format(repr(bow_train_features)))

bow_train_features:
<44760x43778 sparse matrix of type '<class 'numpy.int64'>'
	with 4058498 stored elements in Compressed Sparse Row format>


In [128]:
feature_names = bow_vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 43778
First 20 features:
['00', '000', '0000', '00000', '0001', '000a', '000dollars', '000if', '000ii', '000xx', '001', '0018', '002', '003', '0077', '009', '00amount', '00but', '00eventually', '00for']
Features 20010 to 20030:
['inducement', 'inducements', 'induces', 'inducing', 'induction', 'indulgent', 'indulging', 'industrial', 'industries', 'industry', 'industrycurrent', 'industryphone', 'industrywide', 'indwelling', 'indy', 'indymac', 'indymaci', 'indymax', 'indytmac', 'ine']
Every 2000th feature:
['00', 'aceptance', 'approval', 'billionaireds', 'cheques', 'copnay', 'detrimentally', 'employers', 'finially', 'hanging', 'indoor', 'laundering', 'minavient', 'nstar', 'pension', 'purchace', 'reportedindividual', 'selectingxxxx', 'stautoin', 'thepurpose', 'understated', 'wheredone']


In [129]:
min_bow_vectorizer = CountVectorizer(min_df=5).fit(text_train)
min_bow_train_features = min_bow_vectorizer.transform(text_train)
min_bow_test_features = min_bow_vectorizer.transform(text_test)

In [130]:
print("bow_train_features with min_df: {}".format(repr(min_bow_train_features)))

bow_train_features with min_df: <44760x12512 sparse matrix of type '<class 'numpy.int64'>'
	with 4013412 stored elements in Compressed Sparse Row format>


In [131]:
min_feature_names = min_bow_vectorizer.get_feature_names()

In [132]:
print("First 50 features:\n{}".format(min_feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(min_feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(min_feature_names[::700]))

First 50 features:
['00', '000', '001', '01', '02', '05', '06', '09', '0f', '0n', '10', '100', '1000', '10000', '100000', '1000000', '1005', '100k', '101', '102', '1024', '1026', '1031', '1036', '104', '108', '109', '10th', '10years', '10yrs', '11', '110', '1100', '11000', '110000', '115', '11th', '12', '120', '1200', '12000', '120000', '125', '12months', '13', '130', '1300', '13000', '130000', '135']
Features 20010 to 20030:
[]
Every 700th feature:
['00', 'ach', 'auctioning', 'case', 'continue', 'die', 'equates', 'formerly', 'hostile', 'jacking', 'medial', 'oklahoma', 'preceded', 'refiling', 'satisfaction', 'standby', 'thy', 'variance']


In [133]:
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword:\n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword:
['afterwards', 'becomes', 'yet', 'after', 'anyhow', 'beyond', 'seemed', 'above', 'six', 'then', 'other', 'thereby', 'has', 'latter', 'being', 'do', 'these', 'go', 'nevertheless', 'anyone', 'whereafter', 'once', 'others', 'fifty', 'mill', 'with', 'everyone', 'during', 'find', 'see', 'wherever', 'because']


In [134]:
eng_bow_vectorizer = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
eng_bow_train_features = eng_bow_vectorizer.transform(text_train)
eng_bow_test_features = eng_bow_vectorizer.transform(text_test)

In [135]:
print("bow_train_features with stop words:\n{}".format(repr(eng_bow_train_features)))

bow_train_features with stop words:
<44760x12204 sparse matrix of type '<class 'numpy.int64'>'
	with 2366693 stored elements in Compressed Sparse Row format>


In [136]:
def get_metrics(true_labels, predicted_labels):
    print ('Accuracy:', np.round(metrics.accuracy_score(true_labels,predicted_labels),2))
    print ('Precision:', np.round(metrics.precision_score(true_labels,predicted_labels,average='weighted'),2))
    print ('Recall:', np.round(metrics.recall_score(true_labels, predicted_labels,average='weighted'),2))
    print ('F1 Score:', np.round(metrics.f1_score(true_labels, predicted_labels,average='weighted'),2))  

In [137]:
def train_predict_evaluate_model(classifier,train_features, train_labels, test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions    

In [138]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [139]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words="english")),
                     ('clf', MultinomialNB()),
                    ])

In [140]:
mnb_parameters = {
             'vect__min_df': (5, 7),
             'clf__alpha': (1e-2, 1e-3), 
             'clf__fit_prior': (True, False)}

In [141]:
mnb = MultinomialNB()

In [142]:
gs_mnb = GridSearchCV(text_clf,mnb_parameters)

In [143]:
gs_mnb

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor...nizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vect__min_df': (5, 7), 'clf__alpha': (0.01, 0.001), 'clf__fit_prior': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [144]:
mnb_fitted = gs_mnb.fit(text_train,y_train)

In [145]:
mnb_fitted.best_score_

0.81584003574620201

In [146]:
for param_name in sorted(mnb_parameters.keys()):
    print("%s: %r" % (param_name, mnb_fitted.best_params_[param_name]))

clf__alpha: 0.01
clf__fit_prior: True
vect__min_df: 5


In [147]:
mnb_best = MultinomialNB(alpha = 0.001,fit_prior = True)

In [148]:
# Multinomial Naive Bayes with bag of words features
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb_best,
                                           train_features=bow_train_features,
                                           train_labels=y_train,
                                           test_features=bow_test_features,
                                           test_labels=y_test)

Accuracy: 0.81
Precision: 0.81
Recall: 0.81
F1 Score: 0.81


In [149]:
def tfidf_extractor(corpus, ngram_range=(1,1)):    
    vectorizer = TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [150]:
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(text_train)  
tfidf_test_features = tfidf_vectorizer.transform(text_test) 

In [151]:
# Multinomial Naive Bayes with tfidf features                                           
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb_best,
                                           train_features=tfidf_train_features,
                                           train_labels=y_train,
                                           test_features=tfidf_test_features,
                                           test_labels=y_test)

Accuracy: 0.8
Precision: 0.8
Recall: 0.8
F1 Score: 0.8


In [152]:
svm_best = SGDClassifier(loss='hinge', max_iter=100)

In [153]:
# Support Vector Machine with bag of words features
svm_bow_predictions = train_predict_evaluate_model(classifier=svm_best,
                                           train_features=bow_train_features,
                                           train_labels=y_train,
                                           test_features=bow_test_features,
                                           test_labels=y_test)

Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1 Score: 0.82


In [160]:
cm = metrics.confusion_matrix(y_test, svm_bow_predictions)
pd.DataFrame(cm, index=range(1,12), columns=range(1,12))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11
1,1250,38,112,57,147,42,61,35,16,67,7
2,13,702,34,96,174,7,54,8,31,8,13
3,99,75,1835,183,285,20,27,24,12,23,5
4,21,31,59,3684,307,2,48,5,6,1,8
5,28,93,104,359,5136,4,83,6,50,4,49
6,33,4,5,7,18,141,3,3,1,11,0
7,32,49,17,142,193,8,4468,22,13,5,6
8,7,1,0,2,7,2,5,7,0,0,3
9,4,27,1,12,89,2,11,2,90,1,1
10,23,0,23,5,9,7,1,1,1,208,0


In [161]:
class_names = y

In [181]:
for document, label, predicted_label in zip(text_test, y_test, svm_bow_predictions):
    print (label,predicted_label)

Credit reporting Credit reporting
Consumer Loan Consumer Loan
Credit card Credit card
Student loan Student loan
Student loan Debt collection
Debt collection Debt collection
Debt collection Debt collection
Mortgage Mortgage
Debt collection Credit reporting
Credit card Debt collection
Credit card Credit card
Debt collection Debt collection
Debt collection Debt collection
Bank account or service Bank account or service
Credit card Credit card
Debt collection Debt collection
Mortgage Mortgage
Debt collection Debt collection
Debt collection Debt collection
Credit reporting Credit reporting
Mortgage Mortgage
Mortgage Mortgage
Debt collection Debt collection
Mortgage Mortgage
Credit reporting Credit reporting
Credit card Credit card
Debt collection Credit reporting
Mortgage Mortgage
Credit reporting Credit reporting
Mortgage Mortgage
Credit card Credit card
Credit reporting Credit reporting
Credit card Credit reporting
Mortgage Mortgage
Credit reporting Credit reporting
Debt collection Debt c

In [170]:
svm_bow_predictions

array(['Credit reporting', 'Consumer Loan', 'Credit card', ..., 'Mortgage',
       'Mortgage', 'Debt collection'],
      dtype='<U23')