In [1]:
%%capture
# Install pandas into the kernel's environment; %%capture keeps pip's output out of the notebook.
%pip install pandas

In [2]:
import os

import pandas as pd

In [33]:
# Load the consumer-complaints CSV.
# The location can be overridden through the COMPLAINT_DATA_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "COMPLAINT_DATA_CSV",
    "C:\\Users\\admin\\Downloads\\complaint_data.csv",
)
dataset_CC = pd.read_csv(DATA_PATH)

# Transpose the first record so that every column is shown as a row,
# which is easier to read for a wide frame.
print(dataset_CC.head(1).T)

                                                                             0  \
Date received                                                       08/09/2015   
Product                                                       Credit reporting   
Sub-product                                                                NaN   
Issue                                   Incorrect information on credit report   
Sub-issue                                              Information is not mine   
Consumer complaint narrative                                               NaN   
Company public response       Company chooses not to provide a public response   
Company                                    Experian Information Solutions Inc.   
State                                                                       NJ   
ZIP code                                                                 08872   
Tags                                                                       NaN   
Consumer consent

In [36]:
# Share of missing values per column, expressed in percent with two decimals.
# The category columns of interest are 'Product', 'Sub-product', 'Issue' and 'Sub-issue'.
missing_counts = dataset_CC.isnull().sum()
percent_missing = (missing_counts * 100 / len(dataset_CC)).round(2)

# List the columns from the most to the least incomplete.
most_incomplete = percent_missing.sort_values(ascending=False).head(20)
print(most_incomplete)

Tags                            86.37
Consumer complaint narrative    67.31
Company public response         62.76
Consumer disputed?              47.79
Consumer consent provided?      41.39
Sub-issue                       37.71
Sub-product                     15.98
ZIP code                         9.32
State                            1.66
Date sent to company             0.00
Timely response?                 0.00
Company response to consumer     0.00
Date received                    0.00
Submitted via                    0.00
Product                          0.00
Company                          0.00
Issue                            0.00
Complaint ID                     0.00
dtype: float64


In [38]:
# Keep only the complaints that come with a narrative, i.e. the free text attached to the ticket.
# That text is what the complaint categories will be inferred from, so rows without it are unusable.
# NOTE(review): dataset_CC is rebound to the filtered frame here, so running this
# cell a second time prints the already-filtered size for both lengths.
rows_before = len(dataset_CC)
print(f"Length: {rows_before}\n")

dataset_CC = dataset_CC.dropna(subset=['Consumer complaint narrative'])
rows_after = len(dataset_CC)
print(f"New Length: {rows_after}\n")
print(dataset_CC.head(1))

Length: 481087

New Length: 481087

  Date received      Product Sub-product             Issue Sub-issue  \
4    03/04/2016  Credit card         NaN  Billing disputes       NaN   

                        Consumer complaint narrative Company public response  \
4  I am dissatisfied with the current outcome of ...                     NaN   

         Company State ZIP code Tags Consumer consent provided? Submitted via  \
4  DISCOVER BANK    NV    891XX  NaN           Consent provided           Web   

  Date sent to company Company response to consumer Timely response?  \
4           03/04/2016      Closed with explanation              Yes   

  Consumer disputed?  Complaint ID  
4                Yes       1816726  


In [9]:
# List the distinct values found in the 'Company public response' column.
public_responses = dataset_CC['Company public response'].unique()
print(public_responses)

[nan
 'Company has responded to the consumer and the CFPB and chooses not to provide a public response'
 'Company believes complaint caused principally by actions of third party outside the control or direction of the company'
 'Company chooses not to provide a public response'
 'Company disputes the facts presented in the complaint'
 'Company believes the complaint is the result of a misunderstanding'
 'Company believes it acted appropriately as authorized by contract or law'
 "Company can't verify or dispute the facts in the complaint"
 'Company believes complaint is the result of an isolated error'
 'Company believes complaint represents an opportunity for improvement to better serve consumers'
 'Company believes complaint relates to a discontinued policy or procedure']


In [8]:
# List the distinct values found in the 'Company response to consumer' column.
consumer_responses = dataset_CC['Company response to consumer'].unique()
print(consumer_responses)

['Closed with explanation' 'Closed with monetary relief'
 'Closed with non-monetary relief' 'Untimely response' 'Closed' nan]


In [39]:
# Count how often each (Product, Sub-product, Issue, Sub-issue) combination occurs.
# NOTE(review): DataFrame.value_counts() skips rows with a NaN in any of these
# columns by default — confirm that leaving those combinations out is intended.
category_columns = ['Product', 'Sub-product', 'Issue', 'Sub-issue']
support_guide = dataset_CC[category_columns].value_counts()

# Preview the 25 most frequent combinations.
print(support_guide.head(25))

Product                                                                       Sub-product                                 Issue                                                                             Sub-issue                                                                            
Credit reporting, credit repair services, or other personal consumer reports  Credit reporting                            Incorrect information on your report                                              Information belongs to someone else                                                      30960
                                                                                                                          Problem with a credit reporting company's investigation into an existing problem  Their investigation did not fix an error on your report                                  26321
                                                                                                                