In [2]:
%pip install transformers

# Cell 1: Imports & Setup
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cell 2: Load the preprocessed complaints sample (output of Step 2).
# If the CSV is missing, re-run the preprocessing copied from eda_preprocessing.ipynb.
complaints_csv = 'data/complaints_10k.csv'
df = pd.read_csv(complaints_csv)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,0,2017-06-08,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,,,"EQUIFAX, INC.",MD,21108,,,Postal mail,2017-06-09,Closed with explanation,Yes,,2546268
1,1,2015-05-13,Bank account or service,Checking account,"Account opening, closing, or management",,,,TD BANK US HOLDING COMPANY,NJ,08759,Older American,,Phone,2015-05-14,Closed with monetary relief,Yes,No,1373196
2,2,2016-03-01,Credit card,,Billing disputes,,,,CAPITAL ONE FINANCIAL CORPORATION,LA,710XX,"Older American, Servicemember",,Phone,2016-03-01,Closed with explanation,Yes,No,1810934
3,3,2021-03-01,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,"The following accounts are not mines, and is e...",,"EQUIFAX, INC.",MO,,,Consent provided,Web,2021-03-01,Closed with explanation,Yes,,4173045
4,4,2019-08-01,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,I have submitted several disputes requesting V...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,TX,,,Consent provided,Web,2019-08-01,Closed with explanation,Yes,,3325520


In [4]:
# Keep only the free-text narrative and the Issue label (used further down as
# X and Y), discarding every complaint where either of the two is missing.
narrative_and_issue = ['Consumer complaint narrative', 'Issue']
df_subset = df.loc[:, narrative_and_issue].dropna()
df_subset.head()

Unnamed: 0,Consumer complaint narrative,Issue
3,"The following accounts are not mines, and is e...",Incorrect information on your report
4,I have submitted several disputes requesting V...,Problem with a credit reporting company's inve...
5,Referecnce CFPB Complaint : XXXX and XXXX This...,Attempts to collect debt not owed
10,Loancare continues to take my money but not pa...,"Loan servicing, payments, escrow account"
11,"SageStream , LLC, Consumer Office,XX/XX/XXXX X...",Incorrect information on your report


In [6]:
import nltk

# The cleaning step further down needs more than the stop-word list:
#   * nltk.word_tokenize loads the Punkt sentence models ('punkt'; newer NLTK
#     releases look for 'punkt_tab' instead),
#   * WordNetLemmatizer loads the WordNet corpus ('wordnet'; some NLTK releases
#     also require 'omw-1.4' alongside it).
# Fetch everything up front so the notebook survives Restart & Run All on a
# machine whose nltk_data directory is empty. nltk.download is a no-op for
# resources that are already present and, with default settings, returns False
# rather than raising for an id the installed NLTK index does not know.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amolc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# One shared WordNet lemmatizer instance; preprocess_text calls it for every token.
lemmatizer = nltk.WordNetLemmatizer()

In [8]:
import re


def preprocess_text(text):
    """Normalize one complaint narrative into a space-joined string of lemmas.

    Steps: lowercase the text, replace every run of characters other than
    ASCII letters and whitespace with a single space, tokenize with
    ``nltk.word_tokenize``, drop stop words and tokens of two characters or
    fewer, then lemmatize what is left with the notebook-level ``lemmatizer``.

    Args:
        text: Raw narrative. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and leak into the vocabulary (NaN is the only float that
    # is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Substitute a space rather than '' so that words separated only by
    # punctuation or digits ("Office,XX/XX/XXXX") stay separate tokens instead
    # of being fused into one ("officexxxxxxxx").
    cleaned = re.sub(r'[^a-zA-Z\s]+', ' ', str(text).lower())

    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if len(token) > 2 and token not in stop_words
    ]
    return ' '.join(kept)

In [9]:
# Clean every narrative with preprocess_text and store the result in a new column.
raw_narratives = df_subset['Consumer complaint narrative']
df_subset['processed_narrative'] = raw_narratives.apply(preprocess_text)

In [15]:
# Spot-check one cleaned narrative; 10 is an index *label* kept from the original frame.
print(df_subset.loc[10, 'processed_narrative'])

loancare continues take money pay mortgage xxxx xxxx keep calling wanting money month behind mortgage never missed payment find loancare sold xxxx xxxx release fund


In [16]:
# Features: the cleaned narrative text.
X = df_subset.loc[:, 'processed_narrative']

In [17]:
# Target: the complaint's Issue label.
Y = df_subset.loc[:, 'Issue']

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 2788, Test size: 697


In [21]:
# Handle class imbalance on the training split only: randomly duplicate rows so
# the Issue labels end up equally frequent (see the Counter printed below).
ros = RandomOverSampler(random_state=42)

# fit_resample is given a 2-D array, so the text is wrapped in a single column...
train_text_column = X_train.values.reshape(-1, 1)
X_train_resampled, y_train_resampled = ros.fit_resample(train_text_column, Y_train)

# ...and unwrapped again into a 1-D array of documents.
X_train_resampled = X_train_resampled.flatten()

resampled_counts = Counter(y_train_resampled)
print(f"Resampled train classes: {resampled_counts}")

Resampled train classes: Counter({'Disclosure verification of debt': 632, 'Attempts to collect debt not owed': 632, "Charged fees or interest you didn't expect": 632, 'Incorrect information on your report': 632, 'Loan servicing, payments, escrow account': 632, 'Managing an account': 632, 'Took or threatened to take negative or legal action': 632, 'Incorrect information on credit report': 632, 'Closing your account': 632, "Problem with a credit reporting company's investigation into an existing problem": 632, 'Dealing with your lender or servicer': 632, 'Struggling to pay mortgage': 632, 'Problem with a purchase shown on your statement': 632, 'Closing on a mortgage': 632, 'Dealing with my lender or servicer': 632, 'Fraud or scam': 632, "Cont'd attempts collect debt not owed": 632, 'Problem with a lender or other company charging your account': 632, 'Taking/threatening an illegal action': 632, 'Delinquent account': 632, 'Improper use of your report': 632, 'Problem with fraud alerts or se