## Consumer complaints classification model

#### This module builds machine learning model that can classify consumer complaints to different catogories
#### The model is saved in a serializable object using pickle
#### Input: Consumer_Complaints.csv
#### Output: complaint_classification_model.pkl

In [1]:
# import data processing libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# load the dataset
cc = pd.read_csv('consumer_complaints.csv')

In [4]:
# size of the data
cc.shape

(555957, 18)

In [5]:
# snapshot of data
cc.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [6]:
# check for null values
cc.isnull().sum()

date_received                        0
product                              0
sub_product                     158322
issue                                0
sub_issue                       343335
consumer_complaint_narrative    489151
company_public_response         470833
company                              0
state                             4887
zipcode                           4505
tags                            477998
consumer_consent_provided       432499
submitted_via                        0
date_sent_to_company                 0
company_response_to_consumer         0
timely_response                      0
consumer_disputed?                   0
complaint_id                         0
dtype: int64

In [7]:
# combine 3 columns and apply string function to make one single columns
cc['new_complaint_narrative'] = cc[['issue','sub_issue','consumer_complaint_narrative']].apply(lambda x: ' '.join(x.astype(str)),axis=1)

In [8]:
cc['new_complaint_narrative'].head()

0     Loan modification,collection,foreclosure nan nan
1     Loan servicing, payments, escrow account nan nan
2    Incorrect information on credit report Account...
3            Repaying your loan Repaying your loan nan
4    False statements or representation Attempted t...
Name: new_complaint_narrative, dtype: object

In [9]:
# take only two columns
df = cc[['product','new_complaint_narrative']]

In [10]:
df.shape

(555957, 2)

In [11]:
df.isnull().sum()

product                    0
new_complaint_narrative    0
dtype: int64

In [12]:
# what are the different catogories?
df['product'].value_counts()

Mortgage                   186475
Debt collection            101052
Credit reporting            91854
Credit card                 66468
Bank account or service     62563
Consumer Loan               20990
Student loan                15839
Payday loan                  3877
Money transfers              3812
Prepaid card                 2470
Other financial service       557
Name: product, dtype: int64

In [13]:
df.columns

Index(['product', 'new_complaint_narrative'], dtype='object')

In [14]:
# combining product categories
df['product'][df['product'] == 'Money transfer, virtual currency, or money service'] = 'Money transfers'
df['product'][df['product'] == 'Prepaid card'] = 'Credit card or prepaid card'
df['product'][df['product']== 'Virtual currency'] = 'Other financial service'

In [15]:
df['product'].value_counts()

Mortgage                       186475
Debt collection                101052
Credit reporting                91854
Credit card                     66468
Bank account or service         62563
Consumer Loan                   20990
Student loan                    15839
Payday loan                      3877
Money transfers                  3812
Credit card or prepaid card      2470
Other financial service           557
Name: product, dtype: int64

In [16]:
# build ML model pipeline
comp_class = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [17]:
# test and train split
X_train, X_test, y_train, y_test = train_test_split(df['new_complaint_narrative'], df['product'], random_state = 0)
# train the model
comp_class.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [18]:
# accuracy on train data
comp_class.score(X_train,y_train)

0.9510320960651563

In [19]:
# accuracy on test data
comp_class.score(X_test,y_test)

0.9506655155047126

In [20]:
# Need to further tune the parameters or try with another ML algorithm to get better accuracy

In [21]:
new_complaint = ["This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine."]
print(comp_class.predict(new_complaint))

['Debt collection']


In [22]:
# save the model
import pickle
pickle.dump(comp_class, open("complaint_classification_model.pkl", 'wb'))