In [1]:
#Import the libraries

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn.metrics import accuracy_score, classification_report
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from textblob import TextBlob
from nltk.stem import PorterStemmer,SnowballStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

from io import StringIO
import os
import string
import gensim
from gensim.models import Word2Vec
import itertools
import scipy
from scipy import spatial
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
from nltk import word_tokenize, FreqDist
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import the dataset
df = pd.read_csv(r"D:\LEARNING\WELLS FARGO\NLP\CFPB\Data\complaints-2023-08-29_03_42.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921585 entries, 0 to 921584
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Date received                 921585 non-null  object 
 1   Product                       921585 non-null  object 
 2   Sub-product                   921585 non-null  object 
 3   Issue                         921585 non-null  object 
 4   Sub-issue                     921585 non-null  object 
 5   Consumer complaint narrative  387120 non-null  object 
 6   Company public response       921585 non-null  object 
 7   Company                       921585 non-null  object 
 8   State                         921585 non-null  object 
 9   ZIP code                      921585 non-null  object 
 10  Tags                          921585 non-null  object 
 11  Consumer consent provided?    880274 non-null  object 
 12  Submitted via                 921585 non-nul

In [4]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,02/06/23,Mortgage,Conventional home mortgage,Struggling to pay mortgage,,,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,MA,01570,Older American,Consent not provided,Web,02/06/23,Closed with explanation,Yes,,6521596
1,03/26/23,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Personal information incorrect,,,"EQUIFAX, INC.",TX,75002,,Consent not provided,Web,03/26/23,Closed with explanation,Yes,,6751978
2,09/13/22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,Hi I am submitting this XXXX XXXX this isn't a...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",IL,60586,,Consent provided,Web,09/13/22,Closed with non-monetary relief,Yes,,5978090
3,03/29/23,Mortgage,Other type of mortgage,Applying for a mortgage or refinancing an exis...,,"I applied for, Again for a Pre-approval on a P...",Company has responded to the consumer and the ...,"BANK OF AMERICA, NATIONAL ASSOCIATION",PA,XXXXX,,Consent provided,Web,03/29/23,Closed with explanation,Yes,,6762421
4,03/28/23,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Problem with personal statement of dispute,there investigation found that the information...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,VA,22193,,Consent provided,Web,03/28/23,Closed with non-monetary relief,Yes,,6765663


In [5]:
# Filtering the relevant columns
df = df[['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative']]

In [6]:
#Renaming the filtered columns
df = df.rename(columns={"Product": "product", "Sub-product": "subproduct", "Issue": "issue", "Sub-issue": "subissue", "Consumer complaint narrative": "narrative"})

##### Data Inspection

In [7]:
df.shape

(921585, 5)

In [8]:
#Check for na values in narrative columns
df['narrative'].isna().sum()

534465

In [9]:
#Removing the rows having nan values in narrative column
df = df[df['narrative'].notna()]
df =df.reset_index(drop=True)
df.shape

(387120, 5)

In [10]:
df['product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    274742
Debt collection                                                                  31685
Credit card or prepaid card                                                      23842
Checking or savings account                                                      22374
Mortgage                                                                         12746
Money transfer, virtual currency, or money service                                8291
Vehicle loan or lease                                                             5778
Student loan                                                                      4171
Payday loan, title loan, or personal loan                                         3491
Name: product, dtype: int64

##### Text pre-processing

In [11]:
# Convert text data to lower case
df['narrative'] =df['narrative'].apply(lambda x: ' '.join([i.lower() for i in x.split()]))

In [12]:
# Removing the punctuations
df['narrative'] =df['narrative'].str.replace(r'[^\w\s]',"")

In [13]:
#Removing the patterns xxxx
df['narrative'] = df['narrative'].str.replace(r"xx+\s","")

In [14]:
#Removing the stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['narrative'] =df['narrative'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop]))
#Remove the numbers 
df['narrative'] = df['narrative'].str.replace('\d+', '')

In [15]:
#Lemmatizing
from textblob import Word

df['narrative'] =df['narrative'].apply(lambda x:' '.join([Word(i).lemmatize() for i in x.split()]))

In [16]:
import nltk

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
df['narrative_tokens'] = df.narrative.apply(lemmatize_text)

##### Combine categories
- Rename "credit_reporting"
- Rename "debt_collection"
- Rename "credit_card"
- Rename "mortgage"
- Combine "checking" and "money transfer" into "retail_banking"
- Combine the loans into "loans"

In [17]:
df['product'].replace({'Credit reporting, credit repair services, or other personal consumer reports': 'credit_reporting',
                       'Debt collection': 'debt_collection',
                       'Credit card or prepaid card': 'credit_card',
                       'Mortgage': 'mortgage',
                       'Checking or savings account': 'retail_banking',
                       'Money transfer, virtual currency, or money service': 'retail_banking',
                       'Vehicle loan or lease': 'loans',
                       'Payday loan, title loan, or personal loan': 'loans',
                       'Student loan': 'loans'}, inplace=True)

In [18]:
df['product'].value_counts()

credit_reporting    274742
debt_collection      31685
retail_banking       30665
credit_card          23842
loans                13440
mortgage             12746
Name: product, dtype: int64

In [19]:
#Mortgage and loans are the smallest. Since they're both types of loans, combine them.
df['product'].replace({'mortgage': 'mortgages_and_loans',
                       'loans': 'mortgages_and_loans'}, inplace=True)
df['product'].value_counts()

credit_reporting       274742
debt_collection         31685
retail_banking          30665
mortgages_and_loans     26186
credit_card             23842
Name: product, dtype: int64

In [20]:
df.head()

Unnamed: 0,product,subproduct,issue,subissue,narrative,narrative_tokens
0,credit_reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else,hi submitting isnt influence third party trans...,"[hi, submitting, isnt, influence, third, party..."
1,mortgages_and_loans,Other type of mortgage,Applying for a mortgage or refinancing an exis...,,applied preapproval property order able view s...,"[applied, preapproval, property, order, able, ..."
2,credit_reporting,Credit reporting,Problem with a credit reporting company's inve...,Problem with personal statement of dispute,investigation found information disputed inacc...,"[investigation, found, information, disputed, ..."
3,credit_reporting,Credit reporting,Improper use of your report,Reporting company used your report improperly,accordance fair credit reporting act violated ...,"[accordance, fair, credit, reporting, act, vio..."
4,debt_collection,Other debt,Attempts to collect debt not owed,Debt is not yours,address credit reporting company tx dear cfpb ...,"[address, credit, reporting, company, tx, dear..."


##### Model Building

In [21]:
product_dict ={'credit_reporting': 0, 'debt_collection': 1, 'mortgages_and_loans': 2, 
               'credit_card': 3, 'retail_banking': 4}
df['product'].replace(product_dict, inplace=True)

In [22]:
df['product'].unique()

array([0, 2, 1, 4, 3], dtype=int64)

In [23]:
#Train-Test Split
X = df.drop(['product'], axis=1)
y = df['product']

X_train, X_test, y_train, y_test = train_test_split(X['narrative'], y, test_size=0.20, random_state=200)

In [24]:
tfidf = TfidfVectorizer(max_features=16000)
pipeline = Pipeline([('vectorizer', tfidf), 
                                ('model',GradientBoostingClassifier(random_state=123, max_depth=5))])

In [25]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(max_features=16000)),
                ('model',
                 GradientBoostingClassifier(max_depth=5, random_state=123))])

In [26]:
# Testing the Pipeline
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print('Accuracy: {} %'.format(100 * accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     54832
           1       0.78      0.65      0.71      6307
           2       0.85      0.78      0.81      5261
           3       0.79      0.72      0.75      4858
           4       0.88      0.86      0.87      6166

    accuracy                           0.91     77424
   macro avg       0.85      0.79      0.82     77424
weighted avg       0.90      0.91      0.90     77424

Accuracy: 90.5507336226493 %


In [29]:
# Saving the Pipeline

from joblib import dump
dump(pipeline, 'complaints_classifier.joblib')

['complaints_classifier.joblib']