## Project: Classify each customer query and send it to the related department in the bank



In [None]:
#Importing libraries

import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score

In [None]:
#importing data
df = pd.read_csv('/content/complaints_new.csv')

In [None]:
!pip install accelerate -U



In [None]:
#view data
df.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,6/13/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,,CAPITAL ONE FINANCIAL CORPORATION,PA,186XX,,Consent not provided,Web,6/13/2019,Closed with explanation,Yes,,3274605
1,11/1/2019,Vehicle loan or lease,Loan,Struggling to pay your loan,Denied request to lower payments,I contacted Ally on Friday XX/XX/XXXX after fa...,Company has responded to the consumer and the ...,ALLY FINANCIAL INC.,NJ,088XX,,Consent provided,Web,11/1/2019,Closed with explanation,Yes,,3425257
2,4/1/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,19067,,Consent not provided,Web,4/1/2019,Closed with explanation,Yes,,3198225
3,11/1/2021,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",GA,31707,,,Web,11/1/2021,In progress,Yes,,4863965
4,11/2/2021,Debt collection,Medical debt,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,,,"Medical Data Systems, Inc.",VA,22033,,,Web,11/2/2021,In progress,Yes,,4866449


In [None]:
#selecting the data we want in our model
df = df[["Product", "Consumer complaint narrative"]]

In [None]:
#viweing data
df.head(5)

Unnamed: 0,Product,Consumer complaint narrative
0,"Credit reporting, credit repair services, or o...",
1,Vehicle loan or lease,I contacted Ally on Friday XX/XX/XXXX after fa...
2,"Credit reporting, credit repair services, or o...",
3,"Credit reporting, credit repair services, or o...",
4,Debt collection,


In [None]:
#seeing shape of data
df.shape

(1048573, 2)

In [None]:
#percentage of null values
df.isna().sum()/len(df)

Unnamed: 0,0
Product,0.0
Consumer complaint narrative,0.598065


In [None]:
#dropping cuz we have enough values after discussing with client
df = df.dropna()

In [None]:
df.isna().sum()

Unnamed: 0,0
Product,0
Consumer complaint narrative,0


In [None]:
df.shape

(421458, 2)

In [None]:
df["Product"].unique()

array(['Vehicle loan or lease',
       'Credit reporting, credit repair services, or other personal consumer reports',
       'Credit card or prepaid card',
       'Money transfer, virtual currency, or money service', 'Mortgage',
       'Payday loan, title loan, or personal loan', 'Debt collection',
       'Checking or savings account', 'Student loan', 'Consumer Loan',
       'Money transfers', 'Credit card', 'Bank account or service',
       'Credit reporting', 'Prepaid card', 'Payday loan',
       'Other financial service', 'Virtual currency'], dtype=object)

In [None]:
df["Product"].nunique()

18

In [None]:
#checking the amount of data in percentage of each column in whole data
df["Product"].value_counts()/len(df["Product"])*100

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
"Credit reporting, credit repair services, or other personal consumer reports",50.259101
Debt collection,16.90987
Credit card or prepaid card,9.525267
Mortgage,8.034253
Checking or savings account,5.674587
"Money transfer, virtual currency, or money service",2.754723
Student loan,2.371767
Vehicle loan or lease,2.213981
"Payday loan, title loan, or personal loan",1.474168
Credit reporting,0.252931


In [None]:
##merging classes : after discussion with client
class_dict =  {

'Vehicle loan or lease' : 'loan',
'Credit reporting, credit repair services, or other personal consumer reports' : 'credit_report',
'Credit card or prepaid card' : 'card' ,
'Money transfer, virtual currency, or money service' : 'money_transfer',
'Mortgage' : 'Mortgage',
'Payday loan, title loan, or personal loan' : 'loan' ,
'Debt collection' : 'Debt collection',
'Checking or savings account' : 'account',
'Student loan' : 'loan',
'Consumer Loan' : 'loan',
'Money transfers' : 'money_transfer',
'Credit card' : 'card',
'Bank account or service' : 'account',
'Credit reporting' : 'credit_report',
'Prepaid card' : 'card',
'Payday loan' : 'loan',
'Other financial service' : 'credit_report',
'Virtual currency' : 'money_transfer'
}

In [None]:
df.replace({'Product': class_dict}, inplace=True)

In [None]:
df

Unnamed: 0,Product,Consumer complaint narrative
1,loan,I contacted Ally on Friday XX/XX/XXXX after fa...
7,credit_report,Hello This complaint is against the three cred...
8,credit_report,I am a victim of Identity Theft & currently ha...
10,credit_report,Two accounts are still on my credit history af...
13,credit_report,Receiving daily telephone call ( s ) from XXXX...
...,...,...
1048539,credit_report,XXXX XXXX is reporting an inaccurate late on m...
1048559,Debt collection,Company is re porting on all XXXX credit bur...
1048560,credit_report,I filed for bankruptcy almost eight years ago ...
1048567,loan,On XX/XX/XXXX I returned the Car that I had a ...


In [None]:
df['Product'].unique()

array(['loan', 'credit_report', 'card', 'money_transfer', 'Mortgage',
       'Debt collection', 'account'], dtype=object)

In [None]:
df['Product'].nunique()

7

In [None]:
df["Product"].value_counts()/len(df["Product"])*100

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
credit_report,50.520574
Debt collection,16.90987
card,9.726473
Mortgage,8.034253
loan,6.210583
account,5.816712
money_transfer,2.781535


In [None]:
df["Product"].value_counts()

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
credit_report,212923
Debt collection,71268
card,40993
Mortgage,33861
loan,26175
account,24515
money_transfer,11723


In [None]:
#data in imbalaced : undersampling
sampled_df = pd.DataFrame()

In [None]:
for col in df['Product'].unique():
    sample = df[df['Product'] == col].sample(500)
    sampled_df = pd.concat([sampled_df, sample], ignore_index=True)
sampled_df

Unnamed: 0,Product,Consumer complaint narrative
0,loan,"I applied for a loan of XXXX. I got the loan, ..."
1,loan,I made my fourth of three monthly payments to ...
2,loan,"I had paid on these loans for over 7 years, so..."
3,loan,Since XX/XX/2016 I have been trying and doing ...
4,loan,Purchased a used XXXX XXXX XXXX on XX/XX/XXXX ...
...,...,...
3495,account,I have made multiple requests to Robinhood and...
3496,account,I opened an estate checking account for my fat...
3497,account,After having a business checking account for 3...
3498,account,I opened a Wells Fargo Everyday Checking accou...


In [None]:
sampled_df["Product"].value_counts()

Unnamed: 0_level_0,count
Product,Unnamed: 1_level_1
loan,500
credit_report,500
card,500
money_transfer,500
Mortgage,500
Debt collection,500
account,500


In [None]:
sampled_df.head(5)

Unnamed: 0,Product,Consumer complaint narrative
0,loan,"I applied for a loan of XXXX. I got the loan, ..."
1,loan,I made my fourth of three monthly payments to ...
2,loan,"I had paid on these loans for over 7 years, so..."
3,loan,Since XX/XX/2016 I have been trying and doing ...
4,loan,Purchased a used XXXX XXXX XXXX on XX/XX/XXXX ...


In [None]:
sampled_df.shape

(3500, 2)

In [None]:
#data cleaning
# lower-casing
# removing multiple consecutive occurrences of X
# removing digits and punctuation

In [None]:
def text_cleaning(text):
  text = text.lower()   #lower case
  text = re.sub(r'[^\w\s]', '', text)  #removing punct
  text = re.sub(r'[0-9]', '', text)   #removing digits
  text = re.sub(r'[x]{2,}', '', text) # removing more than 2 occurance of X
  return text

In [None]:
sampled_df['Consumer complaint narrative'] = sampled_df['Consumer complaint narrative'].apply(text_cleaning)

In [None]:
sampled_df.head()

Unnamed: 0,Product,Consumer complaint narrative
0,loan,i applied for a loan of i got the loan but th...
1,loan,i made my fourth of three monthly payments to ...
2,loan,i had paid on these loans for over years some...
3,loan,since i have been trying and doing everything...
4,loan,purchased a used on from ga \nselling p...


In [None]:
sampled_df['Product'].unique()

array(['loan', 'credit_report', 'card', 'money_transfer', 'Mortgage',
       'Debt collection', 'account'], dtype=object)

In [None]:
classes = {'loan' : 0,
           'credit_report' :1,
           'card': 2,
           'money_transfer' : 3,
           'Mortgage' : 4,
           'Debt collection' : 5,
           'account' : 6
          }

sampled_df.replace({'Product' : classes}, inplace=True)

  sampled_df.replace({'Product' : classes}, inplace=True)


In [None]:
sampled_df

Unnamed: 0,Product,Consumer complaint narrative
0,0,i applied for a loan of i got the loan but th...
1,0,i made my fourth of three monthly payments to ...
2,0,i had paid on these loans for over years some...
3,0,since i have been trying and doing everything...
4,0,purchased a used on from ga \nselling p...
...,...,...
3495,6,i have made multiple requests to robinhood and...
3496,6,i opened an estate checking account for my fat...
3497,6,after having a business checking account for ...
3498,6,i opened a wells fargo everyday checking accou...


In [None]:
sampled_df.columns = ['label', 'text']
sampled_df.to_csv("complaint_dataset.csv", index=False)

In [None]:
#creating model

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

In [None]:
data = load_dataset('csv', data_files='complaint_dataset.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3500
    })
})

In [None]:
train_test = data['train'].train_test_split(test_size=0.2, seed=42)

In [None]:
train_test

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 700
    })
})

In [None]:
train_test['test']

Dataset({
    features: ['label', 'text'],
    num_rows: 700
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
#selecting bert model from hugging face
# NOTE(review): text_cleaning() lower-cases every narrative before training, while
# this checkpoint name is the cased variant — confirm whether an uncased checkpoint
# was intended.
model_name = 'bert-base-cased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 7)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(batch):
  """Tokenize the 'text' field of a batch with the notebook-level `tokenizer`.

  Truncation is enabled; no padding argument is passed here.

  Args:
    batch: a mapping with a 'text' entry (the dataset is mapped with
      batched=True, so this is presumably a list of strings — confirm).

  Returns:
    Whatever the tokenizer call returns for that batch.
  """
  return tokenizer(batch['text'], truncation=True )

# Apply the tokenizer to every split of train_test, one batch at a time.
tokenized_data = train_test.map(tokenize, batched=True)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Training configuration: only the output directory, the evaluation schedule and
# the epoch count are set here; everything else is left at the library defaults.
# NOTE(review): newer transformers releases are reported to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir = 'output_dir',
                                  evaluation_strategy = 'epoch',
                                  num_train_epochs= 2,
                                  )



In [None]:
# Define evaluation metrics, which we will pass during training

def compute_metrics(logits_and_labels):
  logits, labels = logits_and_labels
  predictions = np.argmax(logits, axis=-1)
  acc = np.mean(predictions == labels)
  f1 = f1_score(labels, predictions, average = 'micro')
  return {'accuracy': acc, 'f1_score': f1}

In [None]:
tokenized_data['train']

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2800
})

In [None]:
trainer = Trainer(model,
                  training_args,
                  train_dataset = tokenized_data['train'],
                  eval_dataset = tokenized_data['test'],
                  tokenizer=tokenizer,
                  compute_metrics = compute_metrics)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.818421,0.738571,0.738571
2,1.133600,0.669856,0.788571,0.788571


TrainOutput(global_step=700, training_loss=0.9963536725725446, metrics={'train_runtime': 503.2786, 'train_samples_per_second': 11.127, 'train_steps_per_second': 1.391, 'total_flos': 1268486563707600.0, 'train_loss': 0.9963536725725446, 'epoch': 2.0})

In [None]:
#PREDICTION

In [None]:
model_checkpoint= '/content/output_dir/checkpoint-500'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
from transformers import pipeline

In [None]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
train_test['test']['text'][:1]

['i have disputed the errors on my credit report multiple times but it is still showing on it the credit bureaus must provide proof of the negative accounts also the monthly payment terms credit limit past due and date of last payment as it is missing in the report if you are unable to do it please remove the negative accounts from my report also i have inquiries i didnt authorized']

In [None]:
train_test['test']['label'][:1]

[1]

In [None]:
# predicting labels for the first 100 test texts

In [None]:
predz = classifier(train_test['test']['text'][:100], padding=True, truncation=True)

In [None]:
predz

[{'label': 'LABEL_1', 'score': 0.9312503337860107},
 {'label': 'LABEL_3', 'score': 0.926616907119751},
 {'label': 'LABEL_6', 'score': 0.8300415873527527},
 {'label': 'LABEL_2', 'score': 0.8932764530181885},
 {'label': 'LABEL_5', 'score': 0.8955186605453491},
 {'label': 'LABEL_3', 'score': 0.9380978345870972},
 {'label': 'LABEL_0', 'score': 0.44093525409698486},
 {'label': 'LABEL_4', 'score': 0.9397725462913513},
 {'label': 'LABEL_0', 'score': 0.9329546093940735},
 {'label': 'LABEL_0', 'score': 0.8455503582954407},
 {'label': 'LABEL_0', 'score': 0.9087695479393005},
 {'label': 'LABEL_0', 'score': 0.9246225953102112},
 {'label': 'LABEL_2', 'score': 0.8774312734603882},
 {'label': 'LABEL_0', 'score': 0.9336698651313782},
 {'label': 'LABEL_2', 'score': 0.6000677347183228},
 {'label': 'LABEL_4', 'score': 0.9596453309059143},
 {'label': 'LABEL_2', 'score': 0.770052433013916},
 {'label': 'LABEL_6', 'score': 0.7380486130714417},
 {'label': 'LABEL_5', 'score': 0.4981657564640045},
 {'label': 'L

In [None]:
y_pred = [int(label['label'][-1]) for label in predz]

In [None]:
y_pred

[1,
 3,
 6,
 2,
 5,
 3,
 0,
 4,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 4,
 2,
 6,
 5,
 0,
 4,
 6,
 6,
 1,
 6,
 4,
 4,
 1,
 0,
 5,
 2,
 0,
 4,
 4,
 5,
 3,
 4,
 0,
 5,
 6,
 1,
 3,
 3,
 4,
 0,
 5,
 4,
 6,
 6,
 5,
 6,
 0,
 5,
 5,
 3,
 4,
 5,
 5,
 5,
 5,
 3,
 0,
 2,
 1,
 3,
 2,
 4,
 6,
 5,
 3,
 0,
 0,
 1,
 2,
 4,
 6,
 4,
 0,
 0,
 0,
 4,
 2,
 0,
 0,
 4,
 1,
 4,
 3,
 1,
 2,
 2,
 0,
 4,
 5,
 2,
 2,
 4,
 4,
 5,
 4]

In [None]:
y_actual = train_test['test']['label'][:100]

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
accuracy_score(y_actual, y_pred)

0.78

In [None]:
f1_score(y_actual, y_pred, average='macro')

0.7716271197474205