In [1]:
%%capture
# Install pandas via the %pip magic; %%capture (which must stay on the first line of the cell) hides the installer output.
%pip install pandas

In [2]:
import pandas as pd

In [10]:
# Load the consumer complaints CSV into a DataFrame
complaints_csv_path = "C:\\Users\\admin\\Downloads\\complaint_data.csv"
dataset_CC = pd.read_csv(complaints_csv_path)

# Transpose the first record so that every column is printed on its own row
first_record = dataset_CC.iloc[:1]
print(first_record.T)

                                                                             0
Date received                                                       08/09/2015
Product                                                       Credit reporting
Sub-product                                                                NaN
Issue                                   Incorrect information on credit report
Sub-issue                                              Information is not mine
Consumer complaint narrative                                               NaN
Company public response       Company chooses not to provide a public response
Company                                    Experian Information Solutions Inc.
State                                                                       NJ
ZIP code                                                                 08872
Tags                                                                       NaN
Consumer consent provided?                          

In [11]:
# Share of missing values per column, as a percentage rounded to two decimals.
# Note: there are 4 relevant category columns, namely 'Product', 'Sub-product', 'Issue' and 'Sub-issue'.
missing_counts = dataset_CC.isnull().sum()
total_rows = len(dataset_CC)
percent_missing = (missing_counts * 100 / total_rows).round(2)

# Show the (up to) 20 columns with the highest percentage of missing values
most_incomplete_columns = percent_missing.sort_values(ascending=False).head(20)
print(most_incomplete_columns)

Tags                            86.37
Consumer complaint narrative    67.31
Company public response         62.76
Consumer disputed?              47.79
Consumer consent provided?      41.39
Sub-issue                       37.71
Sub-product                     15.98
ZIP code                         9.32
State                            1.66
Date sent to company             0.00
Timely response?                 0.00
Company response to consumer     0.00
Date received                    0.00
Submitted via                    0.00
Product                          0.00
Company                          0.00
Issue                            0.00
Complaint ID                     0.00
dtype: float64


In [12]:
# Keep only the complaints that have a consumer complaint narrative, i.e. the text associated with the ticket.
# This step is crucial because that text is what will be used to infer the previously mentioned categories.
print(f"Raw Length: {len(dataset_CC)}\n")

# Boolean mask: True for rows whose narrative is present
has_narrative = dataset_CC['Consumer complaint narrative'].notna()
dataset_CC = dataset_CC.loc[has_narrative]
print(f"New Length: {len(dataset_CC)}\n")

# Preview the first remaining record
print(dataset_CC.iloc[:1])

Raw Length: 1471766

New Length: 481087

  Date received      Product Sub-product             Issue Sub-issue  \
4    03/04/2016  Credit card         NaN  Billing disputes       NaN   

                        Consumer complaint narrative Company public response  \
4  I am dissatisfied with the current outcome of ...                     NaN   

         Company State ZIP code Tags Consumer consent provided? Submitted via  \
4  DISCOVER BANK    NV    891XX  NaN           Consent provided           Web   

  Date sent to company Company response to consumer Timely response?  \
4           03/04/2016      Closed with explanation              Yes   

  Consumer disputed?  Complaint ID  
4                Yes       1816726  


In [13]:
# Inspect the distinct values found in the 'Company public response' column
public_responses = dataset_CC['Company public response'].unique()
print(public_responses)

[nan
 'Company has responded to the consumer and the CFPB and chooses not to provide a public response'
 'Company believes complaint caused principally by actions of third party outside the control or direction of the company'
 'Company chooses not to provide a public response'
 'Company disputes the facts presented in the complaint'
 'Company believes the complaint is the result of a misunderstanding'
 'Company believes it acted appropriately as authorized by contract or law'
 "Company can't verify or dispute the facts in the complaint"
 'Company believes complaint is the result of an isolated error'
 'Company believes complaint represents an opportunity for improvement to better serve consumers'
 'Company believes complaint relates to a discontinued policy or procedure']


In [27]:
# Distinct values of 'Company response to consumer': print how many there are, then preview up to ten.
# unique() reports NaN as a value of its own, so a missing response is included in the count.
company_response_column = dataset_CC['Company response to consumer']
unique_company_responses_to_consumers = company_response_column.unique()
print(f"Unique Company Responses to Consumers: {len(unique_company_responses_to_consumers)}\n")
print(unique_company_responses_to_consumers[:10])

Unique Company Responses to Consumers: 6

['Closed with explanation' 'Closed with monetary relief'
 'Closed with non-monetary relief' 'Untimely response' 'Closed' nan]


In [28]:
# Distinct values of the 'Product' category: print how many there are, then preview up to ten
product_column = dataset_CC['Product']
unique_products = product_column.unique()
print(f"Unique Products: {len(unique_products)}\n")
print(unique_products[:10])

Unique Products: 18

['Credit card'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Debt collection' 'Credit card or prepaid card' 'Mortgage'
 'Bank account or service' 'Credit reporting' 'Student loan'
 'Checking or savings account' 'Vehicle loan or lease']


In [29]:
# Distinct values of the 'Sub-product' category: print how many there are, then preview up to ten.
# unique() reports NaN as a value of its own, so a missing sub-product is included in the count.
sub_product_column = dataset_CC['Sub-product']
unique_sub_products = sub_product_column.unique()
print(f"Unique Sub-products: {len(unique_sub_products)}\n")
print(unique_sub_products[:10])

Unique Sub-products: 76

[nan 'Credit reporting' 'I do not know'
 'General-purpose credit card or charge card' 'FHA mortgage'
 'Credit card debt' 'Conventional home mortgage' 'Checking account'
 'Store credit card' 'Other (i.e. phone, health club, etc.)']


In [30]:
# Distinct values of the 'Issue' category: print how many there are, then preview up to ten
issue_column = dataset_CC['Issue']
unique_issues = issue_column.unique()
print(f"Unique Issues: {len(unique_issues)}\n")
print(unique_issues[:10])

Unique Issues: 161

['Billing disputes' 'Incorrect information on your report'
 'Attempts to collect debt not owed' 'Improper use of your report'
 'Problem with a purchase shown on your statement'
 'Loan modification,collection,foreclosure'
 'False statements or representation' 'Trouble during payment process'
 'Deposits and withdrawals' 'Problem when making payments']


In [31]:
# Distinct values of the 'Sub-issue' category: print how many there are, then preview up to ten.
# unique() reports NaN as a value of its own, so a missing sub-issue is included in the count.
sub_issue_column = dataset_CC['Sub-issue']
unique_sub_issues = sub_issue_column.unique()
print(f"Unique Sub-issues: {len(unique_sub_issues)}\n")
print(unique_sub_issues[:10])

Unique Sub-issues: 217

[nan 'Information belongs to someone else'
 'Debt was result of identity theft'
 "Credit inquiries on your report that you don't recognize"
 "Credit card company isn't resolving a dispute about a purchase on your statement"
 'Public record information inaccurate'
 'Attempted to collect wrong amount'
 'You never received your bill or did not know a payment was due'
 'Debt was paid' 'Information is not mine']


In [34]:
# Count how often each (Product, Sub-product, Issue, Sub-issue) combination occurs.
# dropna=False is required here: by default DataFrame.value_counts() discards every row
# that contains a NaN, which would silently exclude all complaints lacking a
# 'Sub-product' or 'Sub-issue' from the list of combinations.
classification_columns = ['Product', 'Sub-product', 'Issue', 'Sub-issue']
unique_classification_combinations = dataset_CC[classification_columns].value_counts(dropna=False)

# Show the ten most frequent combinations (value_counts sorts by count, descending)
print(unique_classification_combinations.head(10))

Product                                                                       Sub-product                                 Issue                                                                             Sub-issue                                                                       
Credit reporting, credit repair services, or other personal consumer reports  Credit reporting                            Incorrect information on your report                                              Information belongs to someone else                                                 30960
                                                                                                                          Problem with a credit reporting company's investigation into an existing problem  Their investigation did not fix an error on your report                             26321
                                                                                                                          Incor