In [2]:
import pandas as pd
import numpy as np

Data Source - https://catalog.data.gov/dataset/consumer-complaint-database

In [3]:
# data = pd.read_csv('classification_complaints.csv')
# master_data = data[['Consumer complaint narrative','Product']]
# master_data.to_csv('master_data.csv', index=False)

In [4]:
master_data = pd.read_csv('master_data.csv')
master_data.drop_duplicates(inplace=True)
master_data.sample(5)

Unnamed: 0,Consumer complaint narrative,Product
2405941,XXXX @ XXXX I called Experian to inquire about...,Credit reporting or other personal consumer re...
4511043,Hi! \n\nI'm writing to request that you help m...,Debt collection
3330813,XXXX is reporting unknown credit accounts. Req...,Credit reporting or other personal consumer re...
2654597,"On XX/XX/2019, XXXX XXXX XXXX XXXX placed a co...",Debt collection
453349,According to FCRA Section 605B ( a ) the CREDI...,Credit reporting or other personal consumer re...


In [5]:
master_data['label'] = master_data['Product'].factorize()[0]

In [6]:
master_data['Product'].value_counts()

Product
Credit reporting, credit repair services, or other personal consumer reports    586209
Credit reporting or other personal consumer reports                             275606
Debt collection                                                                 253605
Mortgage                                                                        121720
Checking or savings account                                                     106709
Credit card or prepaid card                                                     104132
Credit card                                                                      50851
Student loan                                                                     45495
Money transfer, virtual currency, or money service                               43321
Vehicle loan or lease                                                            33214
Credit reporting                                                                 29827
Payday loan, title loan, or persona

In [7]:
master_data['Consumer complaint narrative'].sample(10)

3671792    This company keeps calling and calling and cal...
5311196    OCWEN OWES US $ XXXX AS 3RD PARTY CLAIMS LISTE...
1528499    Hi My Name is XXXX XXXX, I have premier accoun...
6160554    I had a Toyota Corolla that was repossesed due...
1477095    attached is the CERTIFIED MAIL FROM USPS XXXX....
6537447    I had an account sent to collections that I do...
1467461    In accordance with the Fair Credit Reporting a...
3485317    The following information currently listed and...
826650     On XX/XX/XXXX, I submitted a request to block ...
815288     The debt is time-barred ie more than four and ...
Name: Consumer complaint narrative, dtype: object

In [8]:
master_data['Consumer complaint narrative'].isna().sum()

np.int64(21)

In [9]:
master_data[master_data['Consumer complaint narrative'].isna()]

Unnamed: 0,Consumer complaint narrative,Product,label
0,,Credit reporting or other personal consumer re...,0
24,,Credit card,1
25,,Debt collection,2
1280,,Vehicle loan or lease,4
1803,,Debt or credit management,5
2081,,"Payday loan, title loan, personal loan, or adv...",6
2962,,Mortgage,7
3616,,Checking or savings account,3
3671,,"Money transfer, virtual currency, or money ser...",8
3695,,Student loan,9


In [10]:
master_data.dropna(inplace=True)

In [11]:
master_data.label.nunique()

21

In [12]:
master_data.head(5)

Unnamed: 0,Consumer complaint narrative,Product,label
27,I need your assistance with this incorrect acc...,Credit reporting or other personal consumer re...,0
63,"On XX/XX/XXXX, I filed a formal complaint with...",Credit reporting or other personal consumer re...,0
1038,"Dear Sir/Ma'am, Be advised that the descriptio...",Credit reporting or other personal consumer re...,0
1040,I'm really not sure what happened. I have mail...,Credit reporting or other personal consumer re...,0
1041,I am requesting again in regard to the inaccur...,Credit reporting or other personal consumer re...,0


In [13]:
master_data.shape

(1707171, 3)

In [17]:
master_data['length'] = master_data['Consumer complaint narrative'].apply(lambda x: len(x.split(" "))).tolist()

In [19]:
master_data['length'].describe().astype('int')

count    1707171
mean         206
std          252
min            1
25%           76
50%          139
75%          248
max         6320
Name: length, dtype: int64

In [20]:
import torch.utils
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

class ClassificationDataset(Dataset):
    def __init__(self,ds, tokenizer, padding:bool, truncation:bool, max_length:int):
        self.ds = ds      
        self.tokenizer = tokenizer
        self.padding = padding
        self.truncation = truncation
        self.max_length = max_length
        
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, idx):
        complaint = self.ds.iloc[idx]['Consumer complaint narrative']
        label = self.ds.iloc[idx]['label']
        
        tokenized_complaint = self.tokenizer.encode_plus(complaint,
                                                                      padding = self.padding,
                                                                      truncation = self.truncation,
                                                                      max_length = self.max_length,
                                                                      return_attention_mask = True,
                                                                      return_tensors = 'pt')
        
        complaint_ids =  tokenized_complaint['input_ids'].flatten()
        attention_mask = tokenized_complaint['attention_mask'].flatten()
        label = torch.tensor(label,dtype=torch.int64)        
        
        return {   
            'complaint_ids': complaint_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [22]:
dataset = ClassificationDataset(master_data, tokenizer, padding=True, truncation=True, max_length=250)

In [23]:
train_size = int(len(dataset)*0.8)
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [25]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1)

In [26]:
next(iter(train_loader))

{'complaint_ids': tensor([[  101, 22038, 20348,  1010, 22038, 20348,  1010,  1004,  1041, 15549,
           7011,  2595,  2024,  5719,  2000,  3189, 16542,  2592,  2006,  2026,
           7325,  4311,  1012,  2104,  2976,  2375,  7325,  6736,  2024,  3223,
           2000,  5441,  2531,  1003, 10640,  1998,  1999,  2023,  2553,  2027,
           2024,  2025,  1012,  2045,  2024,  2195,  2171,  8358,  1998,  4769,
           2008,  2024, 16542,  1998,  2323,  2022,  3202, 17159,  1012,  2026,
           2171,  2003, 22038, 20348, 22038, 20348,  2045,  2323,  2022,  2053,
           2060,  3415,  3205,  2006,  2026,  7325,  3189,  2030, 14593,  2229,
           1012, 22038, 20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038,
          20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038,
          20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038,
          20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038, 20348, 22038,
          20348, 22038,

In [52]:
pd.DataFrame(tokenized_datasets['train'][5])

Unnamed: 0,label,input_ids,attention_mask
0,tensor(0),101,1
1,tensor(0),1045,1
2,tensor(0),2052,1
3,tensor(0),2404,1
4,tensor(0),2023,1
...,...,...,...
149,tensor(0),1026,1
150,tensor(0),7987,1
151,tensor(0),1013,1
152,tensor(0),1028,1


In [49]:
pd.DataFrame(tokenized_datasets['train'][5])

Unnamed: 0,text,label,input_ids,attention_mask
0,I would put this at the top of my list of film...,0,101,1
1,I would put this at the top of my list of film...,0,1045,1
2,I would put this at the top of my list of film...,0,2052,1
3,I would put this at the top of my list of film...,0,2404,1
4,I would put this at the top of my list of film...,0,2023,1
...,...,...,...,...
149,I would put this at the top of my list of film...,0,1026,1
150,I would put this at the top of my list of film...,0,7987,1
151,I would put this at the top of my list of film...,0,1013,1
152,I would put this at the top of my list of film...,0,1028,1


In [46]:
len(pd.DataFrame(tokenized_datasets['train'][5])['input_ids'])

154

In [47]:
len(pd.DataFrame(tokenized_datasets['train'][5])['text'][0].split())

123

In [13]:
from transformers import AutoModelForSequenceClassification
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased',
                                                               num_labels=21).to(device)
    

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
tokenizer.encode_plus("this is a car", padding=True, truncation=True, max_length=10, return_attention_mask=True)

{'input_ids': [101, 2023, 2003, 1037, 2482, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [28]:
tokenizer.encode_plus("this is a car", padding=True, truncation=True, max_length=6, return_attention_mask=True)['input_ids']

[101, 2023, 2003, 1037, 2482, 102]