You will be provided with an Excel dataset containing multiple columns. Your focus will be on the "Issue" and "Sub-Issue" columns, which contain textual information describing customer issues and more specific sub-issues. Your goal is to develop an NLP model that can predict the appropriate product category to which each issue belongs.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv("consumer_complaints_copy.csv")

In [3]:
data

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,12-05-2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30005,,,Referral,12-12-2014,Untimely response,No,No,1144671
1,11-10-2014,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",DE,19803,,,Referral,11/19/2014,Untimely response,No,No,1109287
2,08/26/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30014,,,Referral,09-08-2015,Untimely response,No,No,1536776
3,01/16/2014,Debt collection,Mortgage,Disclosure verification of debt,Not given enough info to verify debt,,,"(Former)Shapiro, Swertfeger & Hasty, LLP",GA,30087,,,Referral,02-11-2014,Untimely response,No,No,671539
4,06/25/2015,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,My mortgage company has misrepresented themsel...,,"1st 2nd Mortgage Company Of NJ, Inc.",NJ,074XX,,Consent provided,Web,07/22/2015,Closed,Yes,No,1437506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555952,01/26/2014,Debt collection,Non-federal student loan,Improper contact or sharing of info,Contacted employer after asked not to,,,Zwicker & Associates,MN,55428,,,Web,01/27/2014,Closed with non-monetary relief,Yes,No,685904
555953,01/26/2016,Debt collection,Non-federal student loan,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,,,Zwicker & Associates,NJ,070XX,Older American,Consent provided,Web,02-10-2016,Closed with non-monetary relief,Yes,No,1759548
555954,03/31/2016,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33837,,,Referral,04-04-2016,Closed with explanation,Yes,No,1859430
555955,10/13/2015,Debt collection,Credit card,Disclosure verification of debt,Not given enough info to verify debt,,,Zwicker & Associates,FL,33308,,,Phone,10/13/2015,Closed with non-monetary relief,Yes,No,1603745


In [4]:
df=data[["issue","sub_issue","product"]]

In [5]:
df

Unnamed: 0,issue,sub_issue,product
0,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
1,"Loan servicing, payments, escrow account",,Mortgage
2,"Loan modification,collection,foreclosure",,Mortgage
3,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
4,"Application, originator, mortgage broker",,Mortgage
...,...,...,...
555952,Improper contact or sharing of info,Contacted employer after asked not to,Debt collection
555953,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,Debt collection
555954,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
555955,Disclosure verification of debt,Not given enough info to verify debt,Debt collection


In [6]:
df.shape

(555957, 3)

In [7]:
df.head(50)

Unnamed: 0,issue,sub_issue,product
0,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
1,"Loan servicing, payments, escrow account",,Mortgage
2,"Loan modification,collection,foreclosure",,Mortgage
3,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
4,"Application, originator, mortgage broker",,Mortgage
5,Cont'd attempts collect debt not owed,Debt is not mine,Debt collection
6,"Application, originator, mortgage broker",,Mortgage
7,"Loan servicing, payments, escrow account",,Mortgage
8,"Loan servicing, payments, escrow account",,Mortgage
9,"Loan modification,collection,foreclosure",,Mortgage


In [8]:
df.sub_issue.isnull().sum()

343335

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555957 entries, 0 to 555956
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   issue      555957 non-null  object
 1   sub_issue  212622 non-null  object
 2   product    555957 non-null  object
dtypes: object(3)
memory usage: 12.7+ MB


In [10]:
df.isnull().sum()

issue             0
sub_issue    343335
product           0
dtype: int64

In [11]:
df["product"].value_counts()

Mortgage                   186475
Debt collection            101052
Credit reporting            91854
Credit card                 66468
Bank account or service     62563
Consumer Loan               20990
Student loan                15839
Payday loan                  3877
Money transfers              3812
Prepaid card                 2470
Other financial service       557
Name: product, dtype: int64

In [12]:
len(df["product"].value_counts())

11

In [13]:
df.sub_issue.unique()

array(['Not given enough info to verify debt', nan, 'Debt is not mine',
       'Talked to a third party about my debt',
       'Contacted employer after asked not to',
       'Threatened to sue on too old debt',
       'Seized/Attempted to seize property', 'Frequent or repeated calls',
       'Contacted me after I asked not to',
       'Attempted to collect wrong amount',
       'Threatened arrest/jail if do not pay',
       'Contacted me instead of my attorney', 'Information is not mine',
       'Applied for loan/did not receive money', 'Account status',
       'Threatened to take legal action',
       'Debt was discharged in bankruptcy', "Can't contact lender",
       "Can't stop charges to bank account",
       "Received a loan I didn't apply for",
       "Charged fees or interest I didn't expect",
       'Called after sent written cease of comm',
       'Used obscene/profane/abusive language',
       'Impersonated an attorney or official',
       'Indicated committed crime not payi

In [14]:
df.sub_issue.value_counts()

Account status                              26798
Debt is not mine                            26285
Information is not mine                     19900
Not given enough info to verify debt        12496
Debt was paid                               11328
                                            ...  
Receiving unwanted marketing/advertising      166
Report shared with employer w/o consent       127
Received marketing offer after opted out      125
Qualify for a better loan than offered        107
Insurance terms                                 4
Name: sub_issue, Length: 68, dtype: int64

In [15]:
len(df.sub_issue.value_counts())

68

In [16]:
df.issue.value_counts()

Loan modification,collection,foreclosure    97191
Incorrect information on credit report      66718
Loan servicing, payments, escrow account    60375
Cont'd attempts collect debt not owed       42285
Account opening, closing, or management     26661
                                            ...  
Lost or stolen money order                     25
Incorrect exchange rate                        16
Lender sold the property                        5
Lender damaged or destroyed vehicle             5
Lender damaged or destroyed property            1
Name: issue, Length: 95, dtype: int64

In [17]:
len(df.issue.value_counts())

95

In [18]:
df["product"].unique()

array(['Debt collection', 'Mortgage', 'Consumer Loan',
       'Bank account or service', 'Credit reporting', 'Payday loan',
       'Other financial service', 'Student loan', 'Money transfers',
       'Prepaid card', 'Credit card'], dtype=object)

In [19]:
df["sub_issue"]=df["sub_issue"].replace(np.nan," ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
df

Unnamed: 0,issue,sub_issue,product
0,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
1,"Loan servicing, payments, escrow account",,Mortgage
2,"Loan modification,collection,foreclosure",,Mortgage
3,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
4,"Application, originator, mortgage broker",,Mortgage
...,...,...,...
555952,Improper contact or sharing of info,Contacted employer after asked not to,Debt collection
555953,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,Debt collection
555954,Disclosure verification of debt,Not given enough info to verify debt,Debt collection
555955,Disclosure verification of debt,Not given enough info to verify debt,Debt collection


In [21]:
df["Main_issue"] = df['issue'].astype(str) +" "+ df["sub_issue"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
df

Unnamed: 0,issue,sub_issue,product,Main_issue
0,Disclosure verification of debt,Not given enough info to verify debt,Debt collection,Disclosure verification of debt Not given enou...
1,"Loan servicing, payments, escrow account",,Mortgage,"Loan servicing, payments, escrow account"
2,"Loan modification,collection,foreclosure",,Mortgage,"Loan modification,collection,foreclosure"
3,Disclosure verification of debt,Not given enough info to verify debt,Debt collection,Disclosure verification of debt Not given enou...
4,"Application, originator, mortgage broker",,Mortgage,"Application, originator, mortgage broker"
...,...,...,...,...
555952,Improper contact or sharing of info,Contacted employer after asked not to,Debt collection,Improper contact or sharing of info Contacted ...
555953,Cont'd attempts collect debt not owed,Debt was discharged in bankruptcy,Debt collection,Cont'd attempts collect debt not owed Debt was...
555954,Disclosure verification of debt,Not given enough info to verify debt,Debt collection,Disclosure verification of debt Not given enou...
555955,Disclosure verification of debt,Not given enough info to verify debt,Debt collection,Disclosure verification of debt Not given enou...


In [23]:
df1=df[["Main_issue","product"]]

In [24]:
df1

Unnamed: 0,Main_issue,product
0,Disclosure verification of debt Not given enou...,Debt collection
1,"Loan servicing, payments, escrow account",Mortgage
2,"Loan modification,collection,foreclosure",Mortgage
3,Disclosure verification of debt Not given enou...,Debt collection
4,"Application, originator, mortgage broker",Mortgage
...,...,...
555952,Improper contact or sharing of info Contacted ...,Debt collection
555953,Cont'd attempts collect debt not owed Debt was...,Debt collection
555954,Disclosure verification of debt Not given enou...,Debt collection
555955,Disclosure verification of debt Not given enou...,Debt collection


In [25]:
# for importing label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [26]:
# for applying label encoder
df1['product'] = le.fit_transform(df['product'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
df1

Unnamed: 0,Main_issue,product
0,Disclosure verification of debt Not given enou...,4
1,"Loan servicing, payments, escrow account",6
2,"Loan modification,collection,foreclosure",6
3,Disclosure verification of debt Not given enou...,4
4,"Application, originator, mortgage broker",6
...,...,...
555952,Improper contact or sharing of info Contacted ...,4
555953,Cont'd attempts collect debt not owed Debt was...,4
555954,Disclosure verification of debt Not given enou...,4
555955,Disclosure verification of debt Not given enou...,4


In [32]:
# X for input y for output/ prediction
X = df1['Main_issue']
y = df1['product']

In [33]:
X

0         Disclosure verification of debt Not given enou...
1                Loan servicing, payments, escrow account  
2                Loan modification,collection,foreclosure  
3         Disclosure verification of debt Not given enou...
4                Application, originator, mortgage broker  
                                ...                        
555952    Improper contact or sharing of info Contacted ...
555953    Cont'd attempts collect debt not owed Debt was...
555954    Disclosure verification of debt Not given enou...
555955    Disclosure verification of debt Not given enou...
555956    False statements or representation Impersonate...
Name: Main_issue, Length: 555957, dtype: object

In [34]:
y

0         4
1         6
2         6
3         4
4         6
         ..
555952    4
555953    4
555954    4
555955    4
555956    4
Name: product, Length: 555957, dtype: int32

In [35]:
# for stlyling only
from tqdm import tqdm
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

In [36]:
ps = PorterStemmer()
corpus = []

for i in tqdm(range(len(X))):
#     print(i, end=', ')
    review = re.sub("[^a-zA-Z]"," ",X[i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in set(stopwords.words("english"))]
    review = " ".join(review)
    corpus.append(review)

  1%|▋                                                                          | 5149/555957 [00:13<21:42, 422.93it/s]

KeyboardInterrupt: 