In [63]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [64]:
# Project locations, all given relative to the notebook's working directory.
DATA_FOLDER = Path('..') / 'DATA_STORE'
ARTIFACTS_STORE = Path('..') / 'artifacts'
# Sub-folders of DATA_FOLDER: one is read from, the other is written to.
RAW_DATA_FOLDER = DATA_FOLDER.joinpath('raw_data')
FINAL_DATA_FOLDER = DATA_FOLDER.joinpath('final_data')

In [65]:
# Location of the raw complaints export inside the raw-data folder.
csv_file = RAW_DATA_FOLDER/'complaints.csv'
# Read these columns as generic Python objects instead of letting pandas
# infer a dtype for them.
dtypes = {
    'Company public response': 'object',
    'Consumer complaint narrative': 'object',
    'Consumer consent provided?': 'object',
    'Consumer disputed?': 'object',
    'Tags': 'object',
    'ZIP code': 'object'
}

# Load the whole CSV into memory with pandas, applying the dtypes above.
df = pd.read_csv(csv_file, dtype=dtypes)

In [66]:
# Sanity check of the load: print (rows, columns), then render the first rows.
print(df.shape)
df.head()

(6140348, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2024-09-15,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",OR,97218,,,Web,2024-09-15,In progress,Yes,,10127439
1,2024-09-15,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",OR,97218,,,Web,2024-09-15,In progress,Yes,,10127441
2,2024-09-15,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",CA,95822,,,Web,2024-09-15,In progress,Yes,,10127444
3,2024-09-15,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,,,"EQUIFAX, INC.",UT,84765,,,Web,2024-09-15,In progress,Yes,,10127452
4,2024-09-15,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",CA,91344,,,Web,2024-09-15,In progress,Yes,,10127458


In [67]:
# Basic cleaning: keep complaints received in 2024 that have both a narrative
# and a sub-issue.
df['Date received'] = pd.to_datetime(df['Date received'])
df['year'] = df['Date received'].dt.year

received_in_2024 = df['year'].eq(2024)
has_narrative = df['Consumer complaint narrative'].notna()
has_sub_issue = df['Sub-issue'].notna()
df = df[received_in_2024 & has_narrative & has_sub_issue]

df.shape

(405000, 19)

In [68]:
# Collapse the raw product names into the coarser categories used as labels.
# Raw products that are neither a key nor one of the target categories become NaN.
product_map = {
    'Credit reporting or other personal consumer reports': 'Credit Reporting',
    'Debt collection': 'Debt collection',
    'Credit card': 'Credit card',
    'Prepaid card': 'Prepaid card',
    'Checking or savings account': 'Checking/savings account',
    'Mortgage': 'Loans / Mortgage',
    'Money transfer, virtual currency, or money service': 'Money transfer',
    'Student loan': 'Loans / Mortgage',
    'Vehicle loan or lease': 'Loans / Mortgage',
    'Payday loan, title loan, personal loan, or advance loan': 'Loans / Mortgage',
    'Debt or credit management': 'Debt/credit management'
}
# A plain `.map(product_map)` is not safe to run twice: values that were already
# mapped (e.g. 'Credit Reporting') are not keys of the dict and would turn into
# NaN on the second run. Letting the target categories pass through unchanged
# makes the cell idempotent while still sending unknown products to NaN.
canonical_products = set(product_map.values())
df['Product'] = df['Product'].map(
    lambda raw: product_map.get(raw, raw if raw in canonical_products else np.nan)
)

In [69]:
# Trim surrounding whitespace from the four label columns, then drop every row
# in which any of them is an empty string.
label_columns = ['Product', 'Sub-product', 'Issue', 'Sub-issue']
for column in label_columns:
    df[column] = df[column].str.strip()

# All four labels must be non-empty. Joining the tests with `|` would only drop
# a row when *every* label is empty, letting partially labelled rows through.
# NaN labels compare as "not equal to ''" and are therefore not removed here.
has_all_labels = (df[label_columns] != '').all(axis=1)
df = df[has_all_labels]
df.shape

(405000, 19)

In [70]:
# Count whitespace-separated tokens in each narrative and keep only the
# complaints with at least 30 of them.
narrative_lengths = df['Consumer complaint narrative'].map(lambda text: len(str(text).split()))
df = df.assign(narrative_word_count=narrative_lengths)
df = df[df['narrative_word_count'].ge(30)]
df.shape

(364520, 20)

In [9]:
# Two rows are duplicates when the narrative and all four labels match;
# the first occurrence is the one that survives.
dedup_keys = ['Consumer complaint narrative', 'Product', 'Sub-product', 'Issue', 'Sub-issue']
df = df.drop_duplicates(subset=dedup_keys)
df.shape

(227463, 20)

In [10]:
# Drop every copy of a narrative that still occurs two or more times in df.
narrative_col = 'Consumer complaint narrative'
filtered_narratives = df[narrative_col].value_counts()
narratives_to_remove = filtered_narratives.loc[filtered_narratives.ge(2)].index
is_repeated = df[narrative_col].isin(narratives_to_remove)
df = df[~is_repeated]
df.shape

(223155, 20)

In [21]:
def _rows_with_frequent_pair(frame, first, second, min_count=500):
    """Return the rows of ``frame`` whose (first, second) pair occurs at least ``min_count`` times.

    The pair is identified by joining the two columns with '_'; rows for which
    that joined key is missing are not returned.
    """
    pair_key = frame[first] + '_' + frame[second]
    pair_counts = pair_key.value_counts()
    frequent_pairs = pair_counts.loc[pair_counts.ge(min_count)].index
    return frame[pair_key.isin(frequent_pairs)]


# Apply the frequency threshold to the product pairs first, then to the issue
# pairs of whatever rows remain.
df = _rows_with_frequent_pair(df, 'Product', 'Sub-product')
df = _rows_with_frequent_pair(df, 'Issue', 'Sub-issue')

df.shape

(185049, 20)

In [28]:
# Minimum number of rows a single label value needs in order to be kept.
min_rows_per_label = 500

product_counts = df['Product'].value_counts()
sub_product_counts = df['Sub-product'].value_counts()
issue_counts = df['Issue'].value_counts()
sub_issue_counts = df['Sub-issue'].value_counts()

# A row is kept only when *every* one of its four labels is frequent enough.
# Joining these tests with `|` keeps a row as soon as any single label is
# frequent, which makes the filter a no-op once the (Issue, Sub-issue) pair
# filter has already guaranteed >= 500 rows per remaining Issue.
has_frequent_labels = (
    df['Product'].isin(product_counts[product_counts >= min_rows_per_label].index)
    & df['Sub-product'].isin(sub_product_counts[sub_product_counts >= min_rows_per_label].index)
    & df['Issue'].isin(issue_counts[issue_counts >= min_rows_per_label].index)
    & df['Sub-issue'].isin(sub_issue_counts[sub_issue_counts >= min_rows_per_label].index)
)
# Rows whose sub-product is the literal 'I do not know' are removed as well.
is_known_sub_product = df['Sub-product'] != 'I do not know'
df = df[has_frequent_labels & is_known_sub_product]
df.shape

(180577, 20)

In [61]:
# Keep only the four label columns and the narrative text.
req_df = df.loc[
    :,
    ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative'],
]

In [62]:
# to_csv does not create missing directories, so make sure the destination
# folder (and any missing parents) exists before writing.
FINAL_DATA_FOLDER.mkdir(parents=True, exist_ok=True)
# Write the selected columns without the DataFrame index.
req_df.to_csv(FINAL_DATA_FOLDER / 'filtered_complaints.csv', index = False)

# Extras

In [30]:
# Number of distinct Product and Sub-product values in the filtered data.
df[['Product', 'Sub-product']].nunique()

Product         5
Sub-product    15
dtype: int64

In [31]:
# Row count for every (Product, Sub-product) combination, most frequent first.
df[['Product', 'Sub-product']].value_counts()

Product                   Sub-product                               
Credit Reporting          Credit reporting                              144967
Credit card               General-purpose credit card or charge card      9183
Checking/savings account  Checking account                                6495
Debt collection           Credit card debt                                4150
                          Other debt                                      3098
                          Telecommunications debt                         2127
                          Rental debt                                     1755
Credit card               Store credit card                               1682
Debt collection           Medical debt                                    1541
Loans / Mortgage          Federal student loan servicing                  1410
                          Conventional home mortgage                      1073
Credit Reporting          Other personal consumer report      

In [34]:
# Number of distinct Issue and Sub-issue values in the filtered data.
df[['Issue', 'Sub-issue']].nunique()

Issue        16
Sub-issue    32
dtype: int64

In [35]:
# Row count per Issue, most frequent first.
df['Issue'].value_counts()

Issue
Incorrect information on your report                               64785
Improper use of your report                                        49636
Problem with a company's investigation into an existing problem    33685
Attempts to collect debt not owed                                   7305
Managing an account                                                 6036
Problem with a purchase shown on your statement                     4469
Written notification about debt                                     3938
Problem with a lender or other company charging your account        1671
False statements or representation                                  1436
Fees or interest                                                    1336
Dealing with your lender or servicer                                1248
Other features, terms, or problems                                  1113
Problem when making payments                                        1075
Getting a credit card                        

In [36]:
# Row count per Sub-issue, most frequent first.
df['Sub-issue'].value_counts()

Sub-issue
Reporting company used your report improperly                                            39365
Information belongs to someone else                                                      36626
Their investigation did not fix an error on your report                                  23357
Account information incorrect                                                            11112
Account status incorrect                                                                 10724
Credit inquiries on your report that you don't recognize                                 10271
Was not notified of investigation status or results                                       3714
Debt is not yours                                                                         3560
Credit card company isn't resolving a dispute about a purchase on your statement          3305
Deposits and withdrawals                                                                  3204
Personal information incorrect          

In [55]:
# Row count per (Issue, Sub-issue) pair, laid out as a table ordered by Issue.
(
    df[['Issue', 'Sub-issue']]
    .value_counts()
    .to_frame()
    .reset_index()
    .sort_values(by='Issue')
)

Unnamed: 0,Issue,Sub-issue,count
7,Attempts to collect debt not owed,Debt is not yours,3560
19,Attempts to collect debt not owed,Debt was paid,1511
14,Attempts to collect debt not owed,Debt was result of identity theft,2234
22,Dealing with your lender or servicer,Trouble with how payments are being handled,1248
20,False statements or representation,Attempted to collect wrong amount,1436
21,Fees or interest,Problem with fees,1336
28,Getting a credit card,Card opened without my consent or knowledge,1061
0,Improper use of your report,Reporting company used your report improperly,39365
5,Improper use of your report,Credit inquiries on your report that you don't...,10271
23,Incorrect information on your report,Public record information inaccurate,1229


In [43]:
# Number the distinct products in the order unique() returns them.
product_labels = df['Product'].unique().tolist()
product_label_mapping = dict(zip(product_labels, range(len(product_labels))))
product_label_mapping

{'Credit Reporting': 0,
 'Credit card': 1,
 'Debt collection': 2,
 'Checking/savings account': 3,
 'Loans / Mortgage': 4}

In [48]:
# label -> id is the existing product mapping (same dict object);
# id -> label is its inverse.
product_label2id = product_label_mapping
product_id2label = dict(zip(product_label2id.values(), product_label2id.keys()))
product_id2label

{0: 'Credit Reporting',
 1: 'Credit card',
 2: 'Debt collection',
 3: 'Checking/savings account',
 4: 'Loans / Mortgage'}

In [49]:
# sub_product_mapping is the list of distinct sub-products (in the order
# unique() returns them); each one is numbered by its position in that list.
sub_product_mapping = df['Sub-product'].unique().tolist()
sub_product_label_mapping = dict(zip(sub_product_mapping, range(len(sub_product_mapping))))
sub_product_label_mapping

{'Credit reporting': 0,
 'Store credit card': 1,
 'General-purpose credit card or charge card': 2,
 'Credit card debt': 3,
 'Checking account': 4,
 'Other personal consumer report': 5,
 'Auto debt': 6,
 'Federal student loan servicing': 7,
 'Other debt': 8,
 'Telecommunications debt': 9,
 'Medical debt': 10,
 'Conventional home mortgage': 11,
 'Other banking product or service': 12,
 'Rental debt': 13,
 'Savings account': 14}

In [50]:
# label -> id is the existing sub-product mapping (same dict object);
# id -> label is its inverse.
sub_product_label2id = sub_product_label_mapping
sub_product_id2label = dict(zip(sub_product_label2id.values(), sub_product_label2id.keys()))
sub_product_id2label

{0: 'Credit reporting',
 1: 'Store credit card',
 2: 'General-purpose credit card or charge card',
 3: 'Credit card debt',
 4: 'Checking account',
 5: 'Other personal consumer report',
 6: 'Auto debt',
 7: 'Federal student loan servicing',
 8: 'Other debt',
 9: 'Telecommunications debt',
 10: 'Medical debt',
 11: 'Conventional home mortgage',
 12: 'Other banking product or service',
 13: 'Rental debt',
 14: 'Savings account'}

In [51]:
# Number the distinct issues in the order unique() returns them.
issue_labels = df['Issue'].unique().tolist()
issue_label_mapping = dict(zip(issue_labels, range(len(issue_labels))))
issue_label_mapping

{'Incorrect information on your report': 0,
 'Problem when making payments': 1,
 'Getting a credit card': 2,
 "Problem with a company's investigation into an existing problem": 3,
 'Attempts to collect debt not owed': 4,
 'Managing an account': 5,
 'Improper use of your report': 6,
 'Problem with a purchase shown on your statement': 7,
 'False statements or representation': 8,
 'Dealing with your lender or servicer': 9,
 'Written notification about debt': 10,
 'Trouble during payment process': 11,
 'Fees or interest': 12,
 'Problem with a lender or other company charging your account': 13,
 'Took or threatened to take negative or legal action': 14,
 'Other features, terms, or problems': 15}

In [52]:
# label -> id is the existing issue mapping (same dict object);
# id -> label is its inverse.
issue_label2id = issue_label_mapping
issue_id2label = dict(zip(issue_label2id.values(), issue_label2id.keys()))
issue_id2label

{0: 'Incorrect information on your report',
 1: 'Problem when making payments',
 2: 'Getting a credit card',
 3: "Problem with a company's investigation into an existing problem",
 4: 'Attempts to collect debt not owed',
 5: 'Managing an account',
 6: 'Improper use of your report',
 7: 'Problem with a purchase shown on your statement',
 8: 'False statements or representation',
 9: 'Dealing with your lender or servicer',
 10: 'Written notification about debt',
 11: 'Trouble during payment process',
 12: 'Fees or interest',
 13: 'Problem with a lender or other company charging your account',
 14: 'Took or threatened to take negative or legal action',
 15: 'Other features, terms, or problems'}

In [53]:
# sub_issue_mapping is the list of distinct sub-issues (in the order unique()
# returns them); each one is numbered by its position in that list.
sub_issue_mapping = df['Sub-issue'].unique().tolist()
sub_issue_label_mapping = dict(zip(sub_issue_mapping, range(len(sub_issue_mapping))))
sub_issue_label_mapping

{'Information belongs to someone else': 0,
 'Problem during payment process': 1,
 'Card opened without my consent or knowledge': 2,
 'Their investigation did not fix an error on your report': 3,
 'Debt is not yours': 4,
 'Deposits and withdrawals': 5,
 'Reporting company used your report improperly': 6,
 'Account status incorrect': 7,
 'Account information incorrect': 8,
 'Was not notified of investigation status or results': 9,
 "Credit inquiries on your report that you don't recognize": 10,
 'Personal information incorrect': 11,
 'Card was charged for something you did not purchase with the card': 12,
 'Investigation took more than 30 days': 13,
 'Problem with personal statement of dispute': 14,
 'Old information reappears or never goes away': 15,
 'Attempted to collect wrong amount': 16,
 'Trouble with how payments are being handled': 17,
 'Public record information inaccurate': 18,
 'Debt was result of identity theft': 19,
 'Difficulty submitting a dispute or getting information ab

In [54]:
# label -> id is the existing sub-issue mapping (same dict object);
# id -> label is its inverse.
sub_issue_label2id = sub_issue_label_mapping
sub_issue_id2label = dict(zip(sub_issue_label2id.values(), sub_issue_label2id.keys()))
sub_issue_id2label

{0: 'Information belongs to someone else',
 1: 'Problem during payment process',
 2: 'Card opened without my consent or knowledge',
 3: 'Their investigation did not fix an error on your report',
 4: 'Debt is not yours',
 5: 'Deposits and withdrawals',
 6: 'Reporting company used your report improperly',
 7: 'Account status incorrect',
 8: 'Account information incorrect',
 9: 'Was not notified of investigation status or results',
 10: "Credit inquiries on your report that you don't recognize",
 11: 'Personal information incorrect',
 12: 'Card was charged for something you did not purchase with the card',
 13: 'Investigation took more than 30 days',
 14: 'Problem with personal statement of dispute',
 15: 'Old information reappears or never goes away',
 16: 'Attempted to collect wrong amount',
 17: 'Trouble with how payments are being handled',
 18: 'Public record information inaccurate',
 19: 'Debt was result of identity theft',
 20: 'Difficulty submitting a dispute or getting informatio

In [57]:
import json

# Destination folder for the exported lookup tables.
output_folder = ARTIFACTS_STORE

# Every label/id lookup table, keyed by the file name it is stored under.
mappings_by_filename = {
    'product_label2id.json': product_label2id,
    'product_id2label.json': product_id2label,
    'sub_product_label2id.json': sub_product_label2id,
    'sub_product_id2label.json': sub_product_id2label,
    'issue_label2id.json': issue_label2id,
    'issue_id2label.json': issue_id2label,
    'sub_issue_label2id.json': sub_issue_label2id,
    'sub_issue_id2label.json': sub_issue_id2label,
}

# Create the folder (and any missing parents) if it doesn't exist yet.
output_folder.mkdir(parents=True, exist_ok=True)

# Write one indented JSON file per mapping. Note that JSON object keys are
# always strings, so the integer ids of the *_id2label tables are stored as text.
for filename, mapping in mappings_by_filename.items():
    with open(output_folder / filename, 'w') as f:
        json.dump(mapping, f, indent=4)

print(f"All JSON files have been saved successfully in the '{output_folder}' folder!")


All JSON files have been saved successfully in the '../artifacts' folder!


In [75]:
# Only the last expression of a cell is rendered, so the output below is
# product_id2label; the bare product_label2id line is evaluated but not shown.
product_label2id
product_id2label

{0: 'Credit Reporting',
 1: 'Credit card',
 2: 'Debt collection',
 3: 'Checking/savings account',
 4: 'Loans / Mortgage'}

In [76]:
# Only the last expression of a cell is rendered, so the output below is
# sub_product_id2label; the bare sub_product_label2id line is not shown.
sub_product_label2id
sub_product_id2label

{0: 'Credit reporting',
 1: 'Store credit card',
 2: 'General-purpose credit card or charge card',
 3: 'Credit card debt',
 4: 'Checking account',
 5: 'Other personal consumer report',
 6: 'Auto debt',
 7: 'Federal student loan servicing',
 8: 'Other debt',
 9: 'Telecommunications debt',
 10: 'Medical debt',
 11: 'Conventional home mortgage',
 12: 'Other banking product or service',
 13: 'Rental debt',
 14: 'Savings account'}

In [77]:
# Only the last expression of a cell is rendered, so the output below is
# issue_id2label; the bare issue_label2id line is evaluated but not shown.
issue_label2id
issue_id2label

{0: 'Incorrect information on your report',
 1: 'Problem when making payments',
 2: 'Getting a credit card',
 3: "Problem with a company's investigation into an existing problem",
 4: 'Attempts to collect debt not owed',
 5: 'Managing an account',
 6: 'Improper use of your report',
 7: 'Problem with a purchase shown on your statement',
 8: 'False statements or representation',
 9: 'Dealing with your lender or servicer',
 10: 'Written notification about debt',
 11: 'Trouble during payment process',
 12: 'Fees or interest',
 13: 'Problem with a lender or other company charging your account',
 14: 'Took or threatened to take negative or legal action',
 15: 'Other features, terms, or problems'}

In [78]:
# Only the last expression of a cell is rendered, so the output below is
# sub_issue_id2label; the bare sub_issue_label2id line is not shown.
sub_issue_label2id
sub_issue_id2label

{0: 'Information belongs to someone else',
 1: 'Problem during payment process',
 2: 'Card opened without my consent or knowledge',
 3: 'Their investigation did not fix an error on your report',
 4: 'Debt is not yours',
 5: 'Deposits and withdrawals',
 6: 'Reporting company used your report improperly',
 7: 'Account status incorrect',
 8: 'Account information incorrect',
 9: 'Was not notified of investigation status or results',
 10: "Credit inquiries on your report that you don't recognize",
 11: 'Personal information incorrect',
 12: 'Card was charged for something you did not purchase with the card',
 13: 'Investigation took more than 30 days',
 14: 'Problem with personal statement of dispute',
 15: 'Old information reappears or never goes away',
 16: 'Attempted to collect wrong amount',
 17: 'Trouble with how payments are being handled',
 18: 'Public record information inaccurate',
 19: 'Debt was result of identity theft',
 20: 'Difficulty submitting a dispute or getting informatio