In [1]:
#import the libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string
import plotly
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Load the raw CFPB complaints export and keep only the two columns used below:
# the product category (the label) and the free-text complaint narrative.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable data directory.
df = pd.read_csv(r"D:\LEARNING\WELLS FARGO\NLP\CFPB\Data\complaints-2023-08-29_03_42.csv")
df = df[['Product', 'Consumer complaint narrative']].rename(
    columns={"Product": "product", "Consumer complaint narrative": "narrative"}
)

# Collapse the original product names into five short labels.
# Product values that are not keys of this mapping are left unchanged.
product_labels = {
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_reporting',
    'Debt collection': 'debt_collection',
    'Credit card or prepaid card': 'credit_card',
    'Mortgage': 'mortgages_and_loans',
    'Checking or savings account': 'retail_banking',
    'Money transfer, virtual currency, or money service': 'retail_banking',
    'Vehicle loan or lease': 'mortgages_and_loans',
    'Payday loan, title loan, or personal loan': 'mortgages_and_loans',
    'Student loan': 'mortgages_and_loans',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate object and
# does not update `df` when pandas Copy-on-Write is active.
df['product'] = df['product'].replace(product_labels)

In [3]:
# Keep only the complaints that have narrative text, then renumber the rows from 0
has_narrative = df['narrative'].notna()
df = df.loc[has_narrative].reset_index(drop=True)
df.shape

(387120, 2)

In [4]:
# Preview the first five rows of the filtered frame
df.head(n=5)

Unnamed: 0,product,narrative
0,credit_reporting,Hi I am submitting this XXXX XXXX this isn't a...
1,mortgages_and_loans,"I applied for, Again for a Pre-approval on a P..."
2,credit_reporting,there investigation found that the information...
3,credit_reporting,In accordance with the fair credit reporting a...
4,debt_collection,Address of credit reporting company XXXX XXXX ...


In [5]:
# Show the full narrative text of the row labelled 3
df.loc[3, 'narrative']

'In accordance with the fair credit reporting act XXXX XXXX XXXX has violated my rights 15 U.S.C 1681 section 602 A. States I have the right to privacy. 15 U.S.C 1681 section 604 A section 2 : It also states a consumer reporting agency can not furnish a account without my written instructions 15 U.S.C 166b A Creditor May not treat a payment on a credit card account under and open and consumer credit plan as late for any reason.'

In [6]:
# Number of complaints remaining
df.shape[0]

387120

In [7]:
# Tokens to discard during preprocessing (compared against lowercased tokens):
# NLTK's English stopwords, every single punctuation character, a few
# multi-character punctuation tokens, and 'xxxx' (the narratives contain
# 'XXXX' placeholders).
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
    '--', 'xxxx',
]

In [8]:
# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize a narrative and return its lowercased, purely alphabetic non-stopword tokens.

    Parameters
    ----------
    narrative : str
        Raw complaint text.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order, that are not in the
        module-level ``stopwords_list`` and consist only of alphabetic
        characters (so numbers and punctuation tokens are dropped).
    """
    # Snapshot the stopword list into a set once per call so each membership
    # test is a hash lookup instead of a scan over the whole list. Building it
    # here (rather than at import time) still honours later edits to
    # ``stopwords_list``.
    excluded = frozenset(stopwords_list)

    # Lowercase each token exactly once, lazily, then filter in a single pass.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(narrative))
    return [
        token for token in lowered_tokens
        if token not in excluded and token.isalpha()
    ]


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into one string separated by single spaces.

    Parameters
    ----------
    list_of_words : iterable of str
        Words to join, in order.

    Returns
    -------
    str
        The words separated by ' ', with leading and trailing whitespace of
        the overall result stripped. An empty input yields ''.
    """
    # str.join replaces the manual `+=` loop; the trailing strip() is kept so
    # whitespace at either end of the result is still removed.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each complaint into a single space-separated string

lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one space-separated string.

    Parameters
    ----------
    list_of_words : iterable
        Tokens of a single complaint. Float NaN entries are skipped.

    Returns
    -------
    str
        The lemmatized words joined by ``concat_words``.
    """
    # Drop NaN entries by value: NaN is the only float that is not equal to
    # itself. An identity test against np.nan would miss NaN floats that are
    # a different object (e.g. float('nan')).
    words = [
        word for word in list_of_words
        if not (isinstance(word, float) and word != word)
    ]

    # lemmatize each word with the shared module-level lemmatizer
    lemmatized_list = [lemm.lemmatize(word) for word in words]

    # make the list into a single string with the words separated by ' '
    return concat_words(lemmatized_list)

In [9]:
# Cast both columns to str before text processing
df = df.astype({'product': str, 'narrative': str})

In [10]:
#Prepare dataframe for modeling
# Tokenize, filter and lemmatize every narrative. Results are collected in a
# list and written back with one column assignment: assigning through
# df['narrative'].loc[i] is chained assignment, which triggers
# SettingWithCopyWarning and does not modify `df` when pandas Copy-on-Write is
# active. Iterating by position also removes the dependence on the index being
# exactly 0..len(df)-1.
processed_narratives = []
for i, raw_narrative in enumerate(df['narrative']):
    processed_narr = process_narrative(raw_narrative)
    processed_narratives.append(make_lemma_and_concat(processed_narr))
    # lightweight progress report every 3000 rows
    if i % 3000 == 0:
        print(f'Finished line number {i}')
df['narrative'] = processed_narratives
df.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000
Finished line number 24000
Finished line number 27000
Finished line number 30000
Finished line number 33000
Finished line number 36000
Finished line number 39000
Finished line number 42000
Finished line number 45000
Finished line number 48000
Finished line number 51000
Finished line number 54000
Finished line number 57000
Finished line number 60000
Finished line number 63000
Finished line number 66000
Finished line number 69000
Finished line number 72000
Finished line number 75000
Finished line number 78000
Finished line number 81000
Finished line number 84000
Finished line number 87000
Finished line number 90000
Finished line number 93000
Finished line number 96000
Finished line number 99000
Finished line number 102000
Finished line number 105000
Finished line number 108000
Finis

Unnamed: 0,product,narrative
0,credit_reporting,hi submitting influence third party transunion...
1,mortgages_and_loans,applied property order able view several prope...
2,credit_reporting,investigation found information disputed inacc...
3,credit_reporting,accordance fair credit reporting act violated ...
4,debt_collection,address credit reporting company tx dear cfpb ...


In [16]:
# Length in bytes of the longest processed narrative.
# Encode explicitly as UTF-8: converting with .astype(bytes) uses the ASCII
# codec and raises UnicodeEncodeError on any non-ASCII character (alphabetic
# non-ASCII tokens pass the isalpha() filter). For pure-ASCII text the result
# is the same.
int(df['narrative'].str.encode('utf-8').str.len().max())

21306

In [17]:
# Persist the processed frame as CSV. The row index is written too (to_csv
# default), so the file has a leading unnamed index column.
df.to_csv(path_or_buf=r'D:\LEARNING\WELLS FARGO\NLP\CFPB\Data\complaints_processed.csv')