In [124]:
import warnings
warnings.filterwarnings("ignore")

In [125]:
import pandas as pd

In [126]:
imdb = pd.read_csv('IMDB Dataset.csv')

In [127]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [128]:
imdb.shape

(50000, 2)

In [129]:
from sklearn.model_selection import train_test_split

In [130]:
imdb.review

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [131]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

vec = CountVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review,imdb.sentiment)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregressioncv', LogisticRegressionCV())])

In [132]:
from sklearn import metrics

def print_report(pipe):
    y_test = imdb.sentiment
    y_pred = pipe.predict(imdb.review)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)


              precision    recall  f1-score   support

    negative       0.96      0.95      0.95     25000
    positive       0.95      0.96      0.95     25000

    accuracy                           0.95     50000
   macro avg       0.95      0.95      0.95     50000
weighted avg       0.95      0.95      0.95     50000

accuracy: 0.951


In [133]:
!pip install eli5



In [134]:
import eli5
eli5.show_weights(clf, top=10) #this result is not meaningful, as weight and feature names are not there

Weight?,Feature
+0.875,x31039
… 51895 more positive …,… 51895 more positive …
… 49991 more negative …,… 49991 more negative …
-0.775,x31913
-0.793,x25426
-0.813,x89908
-0.858,x69434
-0.889,x11720
-0.970,x25428
-1.071,x7242


In [135]:
eli5.show_weights(clf,feature_names=vec.get_feature_names(),target_names=set(imdb.sentiment)) 
#make sense

Weight?,Feature
+0.875,excellent
+0.753,refreshing
+0.733,perfect
+0.716,superb
… 51892 more positive …,… 51892 more positive …
… 49984 more negative …,… 49984 more negative …
-0.713,lacks
-0.718,poor
-0.726,forgettable
-0.728,laughable


In [136]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment)) # explain local prediction

Contribution?,Feature
1.715,Highlighted in text (sum)
-0.016,<BIAS>


In [137]:
eli5.show_prediction(clf, imdb.review[123], vec=vec,
                     target_names=set(imdb.sentiment)) # explain local prediction

Contribution?,Feature
3.861,Highlighted in text (sum)
0.016,<BIAS>


In [138]:
eli5.show_prediction(clf, imdb.review[100], vec=vec,
                     target_names=set(imdb.sentiment)) # explain local prediction

Contribution?,Feature
0.725,Highlighted in text (sum)
-0.016,<BIAS>


In [139]:
# the BIAS is the intercept term from the model

In [140]:
# the above approach uses a bag of words vectorizer and a linear classifier

In [141]:
# for other classifiers the feature selection could be harder

In [142]:
# Improvement in Modeling pipeline to make more sense out of text classification

In [143]:
vec = CountVectorizer(stop_words='english')
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)


              precision    recall  f1-score   support

    negative       0.95      0.94      0.95     25000
    positive       0.94      0.95      0.95     25000

    accuracy                           0.95     50000
   macro avg       0.95      0.95      0.95     50000
weighted avg       0.95      0.95      0.95     50000

accuracy: 0.948


In [144]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
1.315,Highlighted in text (sum)
-0.008,<BIAS>


In [145]:
# Model improvement using the TF*IDF Vector

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)

              precision    recall  f1-score   support

    negative       0.96      0.95      0.95     25000
    positive       0.95      0.96      0.95     25000

    accuracy                           0.95     50000
   macro avg       0.95      0.95      0.95     50000
weighted avg       0.95      0.95      0.95     50000

accuracy: 0.954


In [147]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
1.526,Highlighted in text (sum)
-0.186,<BIAS>


In [148]:
# TF*IDF with Stop word removal process

In [149]:
vec = TfidfVectorizer(stop_words='english')
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)

              precision    recall  f1-score   support

    negative       0.96      0.95      0.96     25000
    positive       0.95      0.96      0.96     25000

    accuracy                           0.96     50000
   macro avg       0.96      0.96      0.96     50000
weighted avg       0.96      0.96      0.96     50000

accuracy: 0.956


In [150]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
1.227,Highlighted in text (sum)
-0.102,<BIAS>


In [151]:
# N-gram based on words vs. N-Gram based on characters of words, which one is better

In [152]:
vec = TfidfVectorizer(stop_words='english', analyzer='char',
                      ngram_range=(2,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00     25000
    positive       1.00      1.00      1.00     25000

    accuracy                           1.00     50000
   macro avg       1.00      1.00      1.00     50000
weighted avg       1.00      1.00      1.00     50000

accuracy: 0.998


In [153]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
3.106,Highlighted in text (sum)
-0.192,<BIAS>


In [154]:
# char analyzer creates character n-grams but it takes a lot of time
# char_wb analyzer creates character n-grams, but those characters does not cross word boundaries 

In [155]:
vec = TfidfVectorizer(stop_words='english', analyzer='char_wb',
                      ngram_range=(2,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)

              precision    recall  f1-score   support

    negative       0.99      0.99      0.99     25000
    positive       0.99      0.99      0.99     25000

    accuracy                           0.99     50000
   macro avg       0.99      0.99      0.99     50000
weighted avg       0.99      0.99      0.99     50000

accuracy: 0.989


In [156]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
3.595,Highlighted in text (sum)
-0.137,<BIAS>


In [157]:
# to deal with large vocabularies we can use Hashing vectorizer instead of count or tfidf vectorizer
# this time we need a complex algorithm to classify, instead of simple linear model

In [159]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vec = HashingVectorizer(stop_words='english', ngram_range=(1,2))
clf = SGDClassifier(random_state=42)
pipe = make_pipeline(vec, clf)
pipe.fit(imdb.review, imdb.sentiment)

print_report(pipe)

              precision    recall  f1-score   support

    negative       0.92      0.90      0.91     25000
    positive       0.90      0.92      0.91     25000

    accuracy                           0.91     50000
   macro avg       0.91      0.91      0.91     50000
weighted avg       0.91      0.91      0.91     50000

accuracy: 0.909


In [160]:
eli5.show_prediction(clf, imdb.review[0], vec=vec,
                     target_names=set(imdb.sentiment),
                     targets=['positive'])

Contribution?,Feature
0.243,Highlighted in text (sum)
0.025,<BIAS>


In [161]:
# InvertableHashingVectorizer. It can be used to get feature names for HahshingVectorizer 
# without fitiing a huge vocabulary

In [162]:
from eli5.sklearn import InvertableHashingVectorizer
import numpy as np

In [163]:
ivec = InvertableHashingVectorizer(vec)
sample_size = len(imdb.review) // 10
X_sample = np.random.choice(imdb.review, size=sample_size)
ivec.fit(X_sample)

InvertableHashingVectorizer(vec=HashingVectorizer(ngram_range=(1, 2),
                                                  stop_words='english'))

In [164]:
eli5.show_weights(clf, vec=ivec, top=20,
                  target_names=set(imdb.sentiment))

Weight?,Feature
+4.547,excellent
+3.812,great
+3.293,wonderful …
+3.270,perfect …
+3.195,amazing …
+3.066,brilliant …
+3.038,best
… 456067 more positive …,… 456067 more positive …
… 455556 more negative …,… 455556 more negative …
-3.070,supposed


If a library is not supported by eli5 directly, or the text processing pipeline is too complex for eli5, eli5 can still help - it provides an implementation of LIME (Ribeiro et al., 2016) algorithm which allows to explain predictions of arbitrary classifiers, including text classifiers. eli5.lime can also help when it is hard to get exact mapping between model coefficients and text features, e.g. if there is dimension reduction involved.

In [165]:
import pandas as pd
df = pd.read_csv('complaints.csv')
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989953 entries, 0 to 1989952
Data columns (total 18 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   Date received                 object
 1   Product                       object
 2   Sub-product                   object
 3   Issue                         object
 4   Sub-issue                     object
 5   Consumer complaint narrative  object
 6   Company public response       object
 7   Company                       object
 8   State                         object
 9   ZIP code                      object
 10  Tags                          object
 11  Consumer consent provided?    object
 12  Submitted via                 object
 13  Date sent to company          object
 14  Company response to consumer  object
 15  Timely response?              object
 16  Consumer disputed?            object
 17  Complaint ID                  int64 
dtypes: int64(1), object(17)
memory usage: 273.

In [166]:
# Create a new dataframe with two columns
df1 = df[['Product', 'Consumer complaint narrative']].copy()


In [167]:
# Remove missing values (NaN)
df1 = df1[pd.notnull(df1['Consumer complaint narrative'])]


In [168]:
# Renaming second column for a simpler name
df1.columns = ['Product', 'Consumer_complaint'] 


In [169]:
df1.shape


(675946, 2)

In [170]:
# Because the computation is time consuming (in terms of CPU), the data was sampled
df2 = df1.sample(20000, random_state=1).copy()


In [171]:
# Renaming categories
df2.replace({'Product': 
             {'Credit reporting, credit repair services, or other personal consumer reports': 
              'Credit reporting, repair, or other', 
              'Credit reporting': 'Credit reporting, repair, or other',
             'Credit card': 'Credit card or prepaid card',
             'Prepaid card': 'Credit card or prepaid card',
             'Payday loan': 'Payday loan, title loan, or personal loan',
             'Money transfer': 'Money transfer, virtual currency, or money service',
             'Virtual currency': 'Money transfer, virtual currency, or money service'}}, 
            inplace= True)


In [172]:
df2.head()

Unnamed: 0,Product,Consumer_complaint
868190,Mortgage,"Dear CFPB, I currently have a Bank of America ..."
34778,Student loan,XXXX XXXX XXXX XXXX XXXX XXXX. # XXXX XXXX Oh ...
752231,"Money transfer, virtual currency, or money ser...",Here is a brief timeline of events that I can ...
715237,"Credit reporting, repair, or other",I sent a total of 5 letter to TransUnion regar...
171294,"Credit reporting, repair, or other",I checked my credit using XXXX on XX/XX/XXXX a...


In [173]:
pd.DataFrame(df2.Product.unique())


Unnamed: 0,0
0,Mortgage
1,Student loan
2,"Money transfer, virtual currency, or money ser..."
3,"Credit reporting, repair, or other"
4,Debt collection
5,Credit card or prepaid card
6,Checking or savings account
7,Consumer Loan
8,Vehicle loan or lease
9,"Payday loan, title loan, or personal loan"


In [174]:
# Create a new column 'category_id' with encoded categories 
df2['category_id'] = df2['Product'].factorize()[0]
category_id_df = df2[['Product', 'category_id']].drop_duplicates()


In [175]:
# Dictionaries for future use
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Product']].values)


In [176]:
# New dataframe
df2.head()


Unnamed: 0,Product,Consumer_complaint,category_id
868190,Mortgage,"Dear CFPB, I currently have a Bank of America ...",0
34778,Student loan,XXXX XXXX XXXX XXXX XXXX XXXX. # XXXX XXXX Oh ...,1
752231,"Money transfer, virtual currency, or money ser...",Here is a brief timeline of events that I can ...,2
715237,"Credit reporting, repair, or other",I sent a total of 5 letter to TransUnion regar...,3
171294,"Credit reporting, repair, or other",I checked my credit using XXXX on XX/XX/XXXX a...,3


In [184]:
df2.Product.value_counts()

Credit reporting, repair, or other                    8231
Debt collection                                       4075
Mortgage                                              2216
Credit card or prepaid card                           2000
Checking or savings account                            849
Student loan                                           788
Bank account or service                                441
Vehicle loan or lease                                  374
Money transfer, virtual currency, or money service     358
Payday loan, title loan, or personal loan              312
Consumer Loan                                          296
Money transfers                                         53
Other financial service                                  7
Name: Product, dtype: int64

In [185]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

vec = CountVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint,df2.Product)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregressioncv', LogisticRegressionCV())])

In [186]:
from sklearn import metrics

def print_report(pipe):
    y_test = df2.Product
    y_pred = pipe.predict(df2.Consumer_complaint)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.97      0.74      0.84       441
                       Checking or savings account       0.91      0.87      0.89       849
                                     Consumer Loan       0.98      0.59      0.74       296
                       Credit card or prepaid card       0.91      0.85      0.88      2000
                Credit reporting, repair, or other       0.87      0.95      0.91      8231
                                   Debt collection       0.87      0.85      0.86      4075
Money transfer, virtual currency, or money service       0.92      0.80      0.86       358
                                   Money transfers       1.00      0.70      0.82        53
                                          Mortgage       0.96      0.95      0.95      2216
                           Other financial service       1.00      0.57      0.

In [187]:
import eli5
eli5.show_weights(clf, top=10) 
#this result is not meaningful, as weight and feature names are not there


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+0.613,x20505,,,,,,,,,,,
+0.373,x459,,,,,,,,,,,
+0.372,x16514,,,,,,,,,,,
+0.369,x11035,,,,,,,,,,,
+0.368,x6856,,,,,,,,,,,
+0.362,x3427,,,,,,,,,,,
+0.327,x19601,,,,,,,,,,,
… 3669 more positive …,… 3669 more positive …,,,,,,,,,,,
… 22528 more negative …,… 22528 more negative …,,,,,,,,,,,
-0.341,x16879,,,,,,,,,,,

Weight?,Feature
+0.613,x20505
+0.373,x459
+0.372,x16514
+0.369,x11035
+0.368,x6856
+0.362,x3427
+0.327,x19601
… 3669 more positive …,… 3669 more positive …
… 22528 more negative …,… 22528 more negative …
-0.341,x16879

Weight?,Feature
+0.423,x20486
+0.387,x5119
+0.365,x3427
+0.341,<BIAS>
… 4772 more positive …,… 4772 more positive …
… 21425 more negative …,… 21425 more negative …
-0.365,x459
-0.371,x16885
-0.397,x447
-0.417,x6495

Weight?,Feature
+0.604,x24811
+0.556,x4596
+0.400,x14062
+0.352,x459
+0.351,x9959
+0.300,x9953
+0.285,x6830
… 2772 more positive …,… 2772 more positive …
… 23425 more negative …,… 23425 more negative …
-0.283,x7505

Weight?,Feature
+0.899,<BIAS>
+0.763,x4600
+0.445,x4564
+0.427,x18244
+0.420,x13949
+0.417,x14841
+0.388,x19968
… 6429 more positive …,… 6429 more positive …
… 19768 more negative …,… 19768 more negative …
-0.416,x15196

Weight?,Feature
+2.290,<BIAS>
+1.208,x8938
+1.078,x9380
+0.999,x23651
+0.607,x12664
+0.565,x10483
+0.540,x15196
+0.483,x12216
+0.464,x19452
+0.447,x19315

Weight?,Feature
+1.828,<BIAS>
+0.733,x6864
+0.592,x5386
+0.568,x14625
+0.534,x5379
+0.492,x5391
+0.458,x4475
+0.456,x16597
+0.436,x11729
+0.404,x2540

Weight?,Feature
+0.807,x5346
+0.584,x2570
+0.479,x16903
+0.475,x23593
+0.458,x13970
+0.451,x15115
+0.429,x17397
+0.370,x10610
+0.345,x23238
… 3155 more positive …,… 3155 more positive …

Weight?,Feature
+0.557,x15115
+0.459,x16903
+0.437,x23593
+0.342,x25508
+0.303,x3427
+0.288,x25350
+0.265,x20716
+0.261,x10610
+0.240,x24914
… 1229 more positive …,… 1229 more positive …

Weight?,Feature
+1.119,x15196
+0.777,<BIAS>
+0.769,x9037
+0.621,x15082
+0.533,x20813
+0.506,x18974
+0.486,x7980
+0.463,x5262
… 7456 more positive …,… 7456 more positive …
… 18741 more negative …,… 18741 more negative …

Weight?,Feature
+0.229,x14201
+0.201,x6495
+0.191,x11457
+0.167,x2853
+0.167,x15115
+0.153,x13149
+0.150,x22755
… 424 more positive …,… 424 more positive …
… 25773 more negative …,… 25773 more negative …
-0.157,x1528

Weight?,Feature
+0.578,x14062
+0.535,x16848
+0.362,x10873
+0.358,x4665
+0.311,x13958
+0.296,x1987
… 2746 more positive …,… 2746 more positive …
… 23451 more negative …,… 23451 more negative …
-0.295,x15196
-0.315,x22119

Weight?,Feature
+1.081,x15458
+0.776,x22119
+0.735,x14077
+0.473,x19395
+0.436,x1888
+0.427,x8458
+0.419,x14062
+0.410,x20478
+0.380,x10211
… 4286 more positive …,… 4286 more positive …

Weight?,Feature
+0.607,x4596
+0.597,x24811
+0.524,x3171
+0.395,x20363
+0.352,x1467
+0.318,x13715
+0.310,x15678
+0.301,x23501
… 3355 more positive …,… 3355 more positive …
… 22842 more negative …,… 22842 more negative …


In [190]:
eli5.show_weights(clf,feature_names=vec.get_feature_names(),target_names=set(df2.Product)) 
#make sense


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+0.613,scottrade,,,,,,,,,,,
+0.373,2016,,,,,,,,,,,
+0.372,overdraft,,,,,,,,,,,
+0.369,greentree,,,,,,,,,,,
+0.368,debit,,,,,,,,,,,
+0.362,bank,,,,,,,,,,,
+0.327,requirements,,,,,,,,,,,
+0.323,2015,,,,,,,,,,,
+0.310,branch,,,,,,,,,,,
+0.306,deposited,,,,,,,,,,,

Weight?,Feature
+0.613,scottrade
+0.373,2016
+0.372,overdraft
+0.369,greentree
+0.368,debit
+0.362,bank
+0.327,requirements
+0.323,2015
+0.310,branch
+0.306,deposited

Weight?,Feature
+0.423,schwab
+0.387,citibank
+0.365,bank
+0.341,<BIAS>
+0.340,deposit
+0.314,overdraft
+0.311,bonus
+0.307,debit
+0.289,charged
+0.288,branch

Weight?,Feature
+0.604,vehicle
+0.556,car
+0.400,loan
+0.352,2016
+0.351,financial
+0.300,finance
+0.285,dealer
+0.265,contract
+0.253,corporation
+0.252,paying

Weight?,Feature
+0.899,<BIAS>
+0.763,card
+0.445,capital
+0.427,purchase
+0.420,limit
+0.417,minimum
+0.388,rewards
+0.386,visa
+0.381,unemployment
+0.378,issued

Weight?,Feature
+2.290,<BIAS>
+1.208,equifax
+1.078,experian
+0.999,transunion
+0.607,inquiries
+0.565,freeze
+0.540,mortgage
+0.483,inaccurate
+0.464,report
+0.447,remove

Weight?,Feature
+1.828,<BIAS>
+0.733,debt
+0.592,collection
+0.568,medical
+0.534,collect
+0.492,collections
+0.458,calling
+0.456,owe
+0.436,hospital
+0.404,apartment

Weight?,Feature
+0.807,coinbase
+0.584,app
+0.479,paypal
+0.475,transfer
+0.458,link
+0.451,money
+0.429,pnc
+0.370,funds
+0.345,tickets
+0.335,transaction

Weight?,Feature
+0.557,money
+0.459,paypal
+0.437,transfer
+0.342,wire
+0.303,bank
+0.288,western
+0.265,send
+0.261,funds
+0.240,via
+0.231,transaction

Weight?,Feature
+1.119,mortgage
+0.777,<BIAS>
+0.769,escrow
+0.621,modification
+0.533,servicer
+0.506,refinance
+0.486,ditech
+0.463,closing
+0.454,lender
+0.442,appraisal

Weight?,Feature
+0.229,lower
+0.201,credit
+0.191,help
+0.167,as
+0.167,money
+0.153,irs
+0.150,td
+0.141,hard
+0.129,score
+0.124,karma

Weight?,Feature
+0.578,loan
+0.535,payday
+0.362,go
+0.358,cash
+0.311,line
+0.296,against
+0.286,company
+0.283,like
+0.278,calls
+0.270,amex

Weight?,Feature
+1.081,navient
+0.776,student
+0.735,loans
+0.473,repayment
+0.436,aes
+0.427,education
+0.419,loan
+0.410,school
+0.380,forbearance
+0.339,interest

Weight?,Feature
+0.607,car
+0.597,vehicle
+0.524,auto
+0.395,santander
+0.352,acceptance
+0.318,lease
+0.310,nissan
+0.301,toyota
+0.288,financial
+0.286,gm


In [197]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product)) # explain local prediction


Contribution?,Feature
0.67,Highlighted in text (sum)
-0.287,<BIAS>

Contribution?,Feature
2.345,Highlighted in text (sum)
0.341,<BIAS>

Contribution?,Feature
-0.585,<BIAS>
-0.939,Highlighted in text (sum)

Contribution?,Feature
0.899,<BIAS>
0.392,Highlighted in text (sum)

Contribution?,Feature
2.29,<BIAS>
-1.285,Highlighted in text (sum)

Contribution?,Feature
1.828,<BIAS>
-1.769,Highlighted in text (sum)

Contribution?,Feature
1.018,Highlighted in text (sum)
0.091,<BIAS>

Contribution?,Feature
-0.404,Highlighted in text (sum)
-1.898,<BIAS>

Contribution?,Feature
4.229,Highlighted in text (sum)
0.777,<BIAS>

Contribution?,Feature
-0.405,Highlighted in text (sum)
-2.364,<BIAS>

Contribution?,Feature
0.145,Highlighted in text (sum)
-0.537,<BIAS>

Contribution?,Feature
0.053,<BIAS>
-2.257,Highlighted in text (sum)

Contribution?,Feature
-0.61,<BIAS>
-1.739,Highlighted in text (sum)


In [196]:
np.array(df2.Consumer_complaint)[0]

'Dear CFPB, I currently have a Bank of America mortgage and am unable to contact " customer service \'\' for any assistance on my loan or impound account. On XXXX XXXX, XXXX, I was " on hold \'\' XXXX hours trying to reach a representative to ask a simple question about my impound account and finally got disconnected from their phone bank. \nTHIS IS UNACCEPTABLE. I am calling about my home. I am a customer. The use of a so called " customer service \'\' number by Bank of America is hypocrisy because what is going on is instead customer ABUSE. \nWhy should I have to wait XXXX hours during my work week to try to speak with a representative about my home mortgage? They should be required to have a call back service or something that does not hold customers XXXX for hours to their negligence, disregard and abuse of customers. \nPlease advise what you plan to do about this. \nSincerely, Customer'

In [198]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[123], vec=vec,
                     target_names=set(df2.Product)) # explain local prediction


Contribution?,Feature
0.271,Highlighted in text (sum)
-0.287,<BIAS>

Contribution?,Feature
0.341,<BIAS>
-0.203,Highlighted in text (sum)

Contribution?,Feature
0.881,Highlighted in text (sum)
-0.585,<BIAS>

Contribution?,Feature
0.899,<BIAS>
-0.346,Highlighted in text (sum)

Contribution?,Feature
2.29,<BIAS>
-0.553,Highlighted in text (sum)

Contribution?,Feature
2.009,Highlighted in text (sum)
1.828,<BIAS>

Contribution?,Feature
0.612,Highlighted in text (sum)
0.091,<BIAS>

Contribution?,Feature
-0.69,Highlighted in text (sum)
-1.898,<BIAS>

Contribution?,Feature
0.777,<BIAS>
-0.926,Highlighted in text (sum)

Contribution?,Feature
-0.81,Highlighted in text (sum)
-2.364,<BIAS>

Contribution?,Feature
-0.024,Highlighted in text (sum)
-0.537,<BIAS>

Contribution?,Feature
0.339,Highlighted in text (sum)
0.053,<BIAS>

Contribution?,Feature
-0.561,Highlighted in text (sum)
-0.61,<BIAS>


In [199]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[100], vec=vec,
                     target_names=set(df2.Product)) # explain local prediction


Contribution?,Feature
1.13,Highlighted in text (sum)
-0.287,<BIAS>

Contribution?,Feature
1.07,Highlighted in text (sum)
0.341,<BIAS>

Contribution?,Feature
-0.509,Highlighted in text (sum)
-0.585,<BIAS>

Contribution?,Feature
0.899,<BIAS>
0.56,Highlighted in text (sum)

Contribution?,Feature
2.29,<BIAS>
0.861,Highlighted in text (sum)

Contribution?,Feature
1.828,<BIAS>
0.116,Highlighted in text (sum)

Contribution?,Feature
0.638,Highlighted in text (sum)
0.091,<BIAS>

Contribution?,Feature
0.137,Highlighted in text (sum)
-1.898,<BIAS>

Contribution?,Feature
0.777,<BIAS>
-1.275,Highlighted in text (sum)

Contribution?,Feature
-0.741,Highlighted in text (sum)
-2.364,<BIAS>

Contribution?,Feature
-0.524,Highlighted in text (sum)
-0.537,<BIAS>

Contribution?,Feature
0.053,<BIAS>
-0.862,Highlighted in text (sum)

Contribution?,Feature
-0.602,Highlighted in text (sum)
-0.61,<BIAS>


In [200]:
vec = CountVectorizer(stop_words='english')
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)

print_report(pipe)

                                                    precision    recall  f1-score   support

                           Bank account or service       0.95      0.71      0.81       441
                       Checking or savings account       0.90      0.85      0.87       849
                                     Consumer Loan       0.96      0.52      0.68       296
                       Credit card or prepaid card       0.89      0.84      0.86      2000
                Credit reporting, repair, or other       0.86      0.95      0.90      8231
                                   Debt collection       0.86      0.84      0.85      4075
Money transfer, virtual currency, or money service       0.92      0.80      0.85       358
                                   Money transfers       0.97      0.62      0.76        53
                                          Mortgage       0.95      0.94      0.95      2216
                           Other financial service       1.00      0.43      0.

In [201]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))


Contribution?,Feature
1.196,Highlighted in text (sum)
-0.195,<BIAS>

Contribution?,Feature
1.772,Highlighted in text (sum)
0.371,<BIAS>

Contribution?,Feature
-0.223,Highlighted in text (sum)
-0.507,<BIAS>

Contribution?,Feature
0.989,<BIAS>
0.295,Highlighted in text (sum)

Contribution?,Feature
2.315,<BIAS>
-1.588,Highlighted in text (sum)

Contribution?,Feature
1.856,<BIAS>
-1.568,Highlighted in text (sum)

Contribution?,Feature
1.024,Highlighted in text (sum)
0.094,<BIAS>

Contribution?,Feature
-0.206,Highlighted in text (sum)
-2.091,<BIAS>

Contribution?,Feature
3.469,Highlighted in text (sum)
0.831,<BIAS>

Contribution?,Feature
0.026,Highlighted in text (sum)
-2.931,<BIAS>

Contribution?,Feature
-0.128,Highlighted in text (sum)
-0.468,<BIAS>

Contribution?,Feature
0.212,<BIAS>
-3.197,Highlighted in text (sum)

Contribution?,Feature
-0.477,<BIAS>
-0.873,Highlighted in text (sum)


In [202]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)
print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.95      0.73      0.82       441
                       Checking or savings account       0.86      0.90      0.88       849
                                     Consumer Loan       0.92      0.53      0.68       296
                       Credit card or prepaid card       0.88      0.89      0.89      2000
                Credit reporting, repair, or other       0.91      0.94      0.92      8231
                                   Debt collection       0.88      0.88      0.88      4075
Money transfer, virtual currency, or money service       0.88      0.83      0.86       358
                                   Money transfers       0.96      0.49      0.65        53
                                          Mortgage       0.93      0.97      0.95      2216
                           Other financial service       0.00      0.00      0.

In [203]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))


Contribution?,Feature
0.588,Highlighted in text (sum)
-0.171,<BIAS>

Contribution?,Feature
1.685,Highlighted in text (sum)
0.28,<BIAS>

Contribution?,Feature
-0.24,<BIAS>
-1.145,Highlighted in text (sum)

Contribution?,Feature
0.949,<BIAS>
0.462,Highlighted in text (sum)

Contribution?,Feature
2.313,<BIAS>
-1.56,Highlighted in text (sum)

Contribution?,Feature
2.03,<BIAS>
-2.114,Highlighted in text (sum)

Contribution?,Feature
0.438,Highlighted in text (sum)
0.105,<BIAS>

Contribution?,Feature
-0.279,Highlighted in text (sum)
-1.516,<BIAS>

Contribution?,Feature
3.167,Highlighted in text (sum)
0.765,<BIAS>

Contribution?,Feature
0.305,Highlighted in text (sum)
-3.494,<BIAS>

Contribution?,Feature
0.12,Highlighted in text (sum)
-0.366,<BIAS>

Contribution?,Feature
-0.048,<BIAS>
-1.231,Highlighted in text (sum)

Contribution?,Feature
-0.437,Highlighted in text (sum)
-0.608,<BIAS>


In [204]:
vec = TfidfVectorizer(stop_words='english')
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)
print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.93      0.76      0.83       441
                       Checking or savings account       0.87      0.90      0.88       849
                                     Consumer Loan       0.94      0.60      0.74       296
                       Credit card or prepaid card       0.88      0.90      0.89      2000
                Credit reporting, repair, or other       0.91      0.94      0.93      8231
                                   Debt collection       0.89      0.88      0.89      4075
Money transfer, virtual currency, or money service       0.89      0.85      0.87       358
                                   Money transfers       0.96      0.49      0.65        53
                                          Mortgage       0.94      0.97      0.96      2216
                           Other financial service       0.00      0.00      0.

In [205]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))


Contribution?,Feature
0.969,Highlighted in text (sum)
-0.152,<BIAS>

Contribution?,Feature
1.708,Highlighted in text (sum)
0.257,<BIAS>

Contribution?,Feature
-0.267,<BIAS>
-0.824,Highlighted in text (sum)

Contribution?,Feature
0.966,<BIAS>
0.492,Highlighted in text (sum)

Contribution?,Feature
2.219,<BIAS>
-1.758,Highlighted in text (sum)

Contribution?,Feature
1.919,<BIAS>
-2.069,Highlighted in text (sum)

Contribution?,Feature
0.734,Highlighted in text (sum)
-0.147,<BIAS>

Contribution?,Feature
-0.074,Highlighted in text (sum)
-1.69,<BIAS>

Contribution?,Feature
3.153,Highlighted in text (sum)
0.847,<BIAS>

Contribution?,Feature
-0.021,Highlighted in text (sum)
-3.142,<BIAS>

Contribution?,Feature
0.091,Highlighted in text (sum)
-0.496,<BIAS>

Contribution?,Feature
0.072,<BIAS>
-1.823,Highlighted in text (sum)

Contribution?,Feature
-0.387,<BIAS>
-0.578,Highlighted in text (sum)


In [206]:
vec = TfidfVectorizer(stop_words='english', analyzer='char',
                      ngram_range=(2,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)

print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.95      0.75      0.84       441
                       Checking or savings account       0.88      0.91      0.90       849
                                     Consumer Loan       0.98      0.56      0.71       296
                       Credit card or prepaid card       0.89      0.91      0.90      2000
                Credit reporting, repair, or other       0.92      0.95      0.94      8231
                                   Debt collection       0.89      0.90      0.90      4075
Money transfer, virtual currency, or money service       0.90      0.85      0.87       358
                                   Money transfers       1.00      0.38      0.55        53
                                          Mortgage       0.94      0.97      0.95      2216
                           Other financial service       0.00      0.00      0.

In [207]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))


Contribution?,Feature
1.012,Highlighted in text (sum)
-0.219,<BIAS>

Contribution?,Feature
1.211,Highlighted in text (sum)
0.334,<BIAS>

Contribution?,Feature
-0.282,<BIAS>
-0.613,Highlighted in text (sum)

Contribution?,Feature
0.884,<BIAS>
0.855,Highlighted in text (sum)

Contribution?,Feature
2.535,<BIAS>
-2.162,Highlighted in text (sum)

Contribution?,Feature
2.116,<BIAS>
-2.513,Highlighted in text (sum)

Contribution?,Feature
0.52,Highlighted in text (sum)
0.276,<BIAS>

Contribution?,Feature
-0.191,Highlighted in text (sum)
-1.611,<BIAS>

Contribution?,Feature
3.37,Highlighted in text (sum)
0.619,<BIAS>

Contribution?,Feature
0.409,Highlighted in text (sum)
-3.709,<BIAS>

Contribution?,Feature
-0.235,<BIAS>
-0.397,Highlighted in text (sum)

Contribution?,Feature
-0.145,<BIAS>
-1.049,Highlighted in text (sum)

Contribution?,Feature
-0.453,Highlighted in text (sum)
-0.563,<BIAS>


In [208]:
vec = TfidfVectorizer(stop_words='english', analyzer='char_wb',
                      ngram_range=(2,5))
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)

print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.92      0.63      0.75       441
                       Checking or savings account       0.82      0.87      0.84       849
                                     Consumer Loan       0.94      0.49      0.64       296
                       Credit card or prepaid card       0.85      0.88      0.87      2000
                Credit reporting, repair, or other       0.90      0.93      0.92      8231
                                   Debt collection       0.86      0.87      0.86      4075
Money transfer, virtual currency, or money service       0.86      0.80      0.83       358
                                   Money transfers       1.00      0.32      0.49        53
                                          Mortgage       0.92      0.96      0.94      2216
                           Other financial service       0.00      0.00      0.

In [209]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))

Contribution?,Feature
0.991,Highlighted in text (sum)
-0.26,<BIAS>

Contribution?,Feature
1.694,Highlighted in text (sum)
0.329,<BIAS>

Contribution?,Feature
-0.241,<BIAS>
-0.738,Highlighted in text (sum)

Contribution?,Feature
0.824,<BIAS>
0.778,Highlighted in text (sum)

Contribution?,Feature
2.694,<BIAS>
-2.193,Highlighted in text (sum)

Contribution?,Feature
2.205,<BIAS>
-2.815,Highlighted in text (sum)

Contribution?,Feature
0.669,Highlighted in text (sum)
0.291,<BIAS>

Contribution?,Feature
-0.399,Highlighted in text (sum)
-1.492,<BIAS>

Contribution?,Feature
3.422,Highlighted in text (sum)
0.575,<BIAS>

Contribution?,Feature
0.386,Highlighted in text (sum)
-3.694,<BIAS>

Contribution?,Feature
-0.222,Highlighted in text (sum)
-0.302,<BIAS>

Contribution?,Feature
-0.251,<BIAS>
-1.257,Highlighted in text (sum)

Contribution?,Feature
-0.317,Highlighted in text (sum)
-0.677,<BIAS>


In [211]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vec = HashingVectorizer(stop_words='english', ngram_range=(1,2))
clf = SGDClassifier(random_state=42)
pipe = make_pipeline(vec, clf)
pipe.fit(df2.Consumer_complaint, df2.Product)
print_report(pipe)


                                                    precision    recall  f1-score   support

                           Bank account or service       0.97      0.47      0.63       441
                       Checking or savings account       0.77      0.79      0.78       849
                                     Consumer Loan       0.98      0.49      0.66       296
                       Credit card or prepaid card       0.82      0.83      0.83      2000
                Credit reporting, repair, or other       0.86      0.93      0.89      8231
                                   Debt collection       0.86      0.82      0.84      4075
Money transfer, virtual currency, or money service       0.91      0.73      0.81       358
                                   Money transfers       1.00      0.68      0.81        53
                                          Mortgage       0.86      0.96      0.91      2216
                           Other financial service       0.50      0.14      0.

In [212]:
eli5.show_prediction(clf, np.array(df2.Consumer_complaint)[0], vec=vec,
                     target_names=set(df2.Product))

Contribution?,Feature
0.098,Highlighted in text (sum)
-1.254,<BIAS>

Contribution?,Feature
0.346,Highlighted in text (sum)
-1.506,<BIAS>

Contribution?,Feature
-0.058,Highlighted in text (sum)
-1.134,<BIAS>

Contribution?,Feature
0.313,Highlighted in text (sum)
-1.253,<BIAS>

Contribution?,Feature
-0.335,<BIAS>
-2.003,Highlighted in text (sum)

Contribution?,Feature
-0.771,<BIAS>
-1.086,Highlighted in text (sum)

Contribution?,Feature
0.081,Highlighted in text (sum)
-1.228,<BIAS>

Contribution?,Feature
-0.0,Highlighted in text (sum)
-1.182,<BIAS>

Contribution?,Feature
2.05,Highlighted in text (sum)
-1.27,<BIAS>

Contribution?,Feature
0.005,Highlighted in text (sum)
-1.098,<BIAS>

Contribution?,Feature
-0.09,Highlighted in text (sum)
-1.184,<BIAS>

Contribution?,Feature
-0.565,Highlighted in text (sum)
-1.221,<BIAS>

Contribution?,Feature
0.031,Highlighted in text (sum)
-1.203,<BIAS>


In [213]:
from eli5.sklearn import InvertableHashingVectorizer
import numpy as np

ivec = InvertableHashingVectorizer(vec)
sample_size = len(df2.Consumer_complaint) // 10
X_sample = np.random.choice(df2.Consumer_complaint, size=sample_size)
ivec.fit(X_sample)

eli5.show_weights(clf, vec=ivec, top=20,
                  target_names=set(df2.Product))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12
+1.096,requirements,,,,,,,,,,,
+0.725,promotion,,,,,,,,,,,
+0.661,citigold,,,,,,,,,,,
+0.658,overdraft,,,,,,,,,,,
+0.649,miles,,,,,,,,,,,
+0.633,met,,,,,,,,,,,
+0.629,fee,,,,,,,,,,,
+0.608,citi,,,,,,,,,,,
+0.573,FEATURE[428730],,,,,,,,,,,
+0.570,2016,,,,,,,,,,,

Weight?,Feature
+1.096,requirements
+0.725,promotion
+0.661,citigold
+0.658,overdraft
+0.649,miles
+0.633,met
+0.629,fee
+0.608,citi
+0.573,FEATURE[428730]
+0.570,2016

Weight?,Feature
+2.169,branch
+2.145,deposit
+1.696,bonus
+1.641,bank
+1.591,deposited
+1.447,funds
+1.433,checking …
+1.414,atm
+1.351,savings account
+1.341,savings

Weight?,Feature
+0.432,vehicle
+0.409,finance …
+0.404,car
+0.403,auto loan
+0.394,dealer
+0.380,vehicle xxxx …
+0.375,FEATURE[503180]
+0.350,bmw
+0.346,corporation
+0.336,xxxx vehicle

Weight?,Feature
+4.756,card
+2.565,purchase
+2.422,capital
+2.404,minimum
+2.228,discover
+2.187,synchrony
+2.154,express
+2.144,cards
+2.015,citi
+1.987,amex

Weight?,Feature
+6.313,experian …
+6.148,equifax
+5.390,transunion
+3.496,inquiries
+2.969,report
+2.916,reporting
+2.775,inquiry
+2.500,00 xxxx …
+2.330,xxxx account
+2.128,removed

Weight?,Feature
+5.378,debt …
+3.592,collection
+3.478,collect
+3.426,owe
+2.883,calling …
+2.685,collections
+2.553,hospital
+2.477,owed …
+2.324,recovery
+2.192,medical

Weight?,Feature
+4.269,coinbase
+1.652,app
+1.593,paypal
+1.314,transfer …
+1.228,cash app
+1.150,tickets
+0.898,transaction …
+0.881,wallet …
+0.846,buyer
+0.823,moneygram

Weight?,Feature
+0.411,wire
+0.398,western
+0.394,western union
+0.369,sent money
+0.306,money …
+0.302,union
+0.267,service
+0.265,paypal
+0.264,ebay
+0.257,refused …

Weight?,Feature
+7.850,mortgage
+5.020,escrow
+4.525,modification
+3.341,home
+2.990,foreclosure
+2.959,closing
+2.955,appraisal
+2.848,refinance …
+2.627,ocwen
+2.621,ditech

Weight?,Feature
+0.180,lower
+0.120,help
+0.114,money …
+0.104,irs
+0.087,karma
+0.087,credit karma
+0.086,credit score
+0.081,return
+0.081,process
+0.079,forced

Weight?,Feature
+0.836,00 loan
+0.810,payday
+0.510,cash
+0.484,loan
+0.465,lending
+0.459,borrowed
+0.459,taking
+0.429,apr
+0.415,payday loan
+0.377,store

Weight?,Feature
+6.329,navient
+3.108,loans
+3.027,student
+2.066,repayment
+1.920,aes
+1.877,school
+1.758,student loan
+1.575,private
+1.436,forbearance
+1.397,forgiveness

Weight?,Feature
+0.971,car
+0.688,santander
+0.611,toyota
+0.603,dmv
+0.557,auto
+0.548,vehicle
+0.526,nissan
+0.492,ally
+0.485,motor
+0.471,lease …
