In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

import spacy

try:
    # Try loading the model to check if it's installed
    nlp = spacy.load("en_core_web_lg")
    print("Model is already installed.")
except OSError:
    # If the model isn't installed, download it
    !python -m spacy download en_core_web_lg
    nlp = spacy.load("en_core_web_lg")
    print("Model installed and loaded.")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from wordcloud import WordCloud

from typing import Dict
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
from time import time
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
# import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

Model is already installed.


In [3]:
# Reading the CSV file into a DataFrame
# The file contains complaints data

data = pd.read_csv(r'../data/complaints_processed.csv')
data.shape

(162421, 3)

In [4]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [5]:
# Selecting only the 'product' and 'narrative' columns
data = data[['product', 'narrative']]

# Displaying the first 5 rows of the modified dataset
data.head(5)

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [6]:
# Creating a copy of the data
df = data.copy()

# Displaying the summary information of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162421 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [7]:
# Checking for missing values in the DataFrame
df.isna().sum()

product       0
narrative    10
dtype: int64

In [8]:
# Displaying rows where 'narrative' column has missing values
df[df['narrative'].isna()]

Unnamed: 0,product,narrative
1089,credit_reporting,
3954,credit_reporting,
3955,credit_reporting,
29690,credit_reporting,
139436,debt_collection,
151052,debt_collection,
154494,credit_reporting,
156902,retail_banking,
158538,credit_reporting,
159503,credit_reporting,


In [9]:
# Dropping rows with missing values and displaying updated information
df.dropna(axis=0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162411 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162411 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [10]:
# Counting the number of duplicated rows in the DataFrame
df.duplicated().sum()

37735

In [11]:
# Displaying the first 10 duplicated rows in the DataFrame
df[df.duplicated()].head(10)

Unnamed: 0,product,narrative
32,credit_reporting,name last four s account number writing disput...
44,credit_reporting,open account acct opened balance account acct ...
69,credit_reporting,hello name trying reach several time get probl...
70,credit_reporting,hello name trying reach several time get probl...
112,credit_reporting,name trying make change credit report since sa...
121,credit_reporting,name trying fix incorrect information credit r...
170,credit_reporting,block except otherwise provided section consum...
172,credit_reporting,block except otherwise provided section consum...
173,credit_reporting,block except otherwise provided section consum...
174,credit_reporting,block except otherwise provided section consum...


In [12]:
# Drop duplicate rows, keeping the first occurrence and ignoring the index
df.drop_duplicates(keep='first', inplace=True, ignore_index=True)

# Displaying DataFrame info after removing duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124676 entries, 0 to 124675
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    124676 non-null  object
 1   narrative  124676 non-null  object
dtypes: object(2)
memory usage: 1.9+ MB


In [13]:
df.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [14]:
# Count the number of empty strings in each column
empty_string_counts = df.apply(lambda x: (x == '').sum())
print(empty_string_counts)

product      0
narrative    0
dtype: int64


In [15]:
# Calculate the length of each text in the 'narrative' column and add it as a new column 'length'
df['length'] = df['narrative'].apply(lambda x: len(x))

# Display a random sample of 10 rows from the DataFrame
df.sample(10)

Unnamed: 0,product,narrative,length
99893,credit_reporting,information sending fraud found credit report ...,88
107245,debt_collection,law firm debt collector filing lawsuit collect...,238
87295,mortgages_and_loans,discover student loan put student loan repayme...,684
16356,credit_reporting,filed complaint experian credit bureau receive...,770
62697,credit_reporting,went file tax told consultant identity stolen ...,263
84254,credit_reporting,went onto transunion website dispute incorrect...,387
112662,debt_collection,trying refinance home told account collection ...,1972
45986,credit_reporting,two collection report mine owe account reporte...,458
119283,credit_reporting,reissued student loan listen completely fabric...,475
69453,credit_card,logged bank america edd account numerous time ...,113


In [16]:
# Display rows where the length of 'narrative' text is less than 10 characters
short_texts_df = df[df['length'] < 10]
print(short_texts_df)

                    product  narrative  length
874        credit_reporting    account       7
1357        debt_collection  debt idea       9
7559         retail_banking    connect       7
7571         retail_banking   link pnc       8
7594         retail_banking  ca access       9
12314       debt_collection  debt paid       9
12320       debt_collection  debt owed       9
12321      credit_reporting  debt owed       9
12595      credit_reporting    inquire       7
12740        retail_banking  dismissed       9
12913      credit_reporting     belong       6
16201      credit_reporting        fdc       3
17469        retail_banking     friday       6
31414       debt_collection   owe debt       8
42765      credit_reporting       idea       4
77137      credit_reporting       need       4
77379      credit_reporting   response       8
77462      credit_reporting   inquires       8
77463      credit_reporting       item       4
77632      credit_reporting    financa       7
77729        

In [17]:
# Remove rows from the DataFrame where the length of the 'narrative' text is less than 10 characters
df = df[~(df['length'] < 10)]

# Display information about the DataFrame after removing short texts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124633 entries, 0 to 124672
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    124633 non-null  object
 1   narrative  124633 non-null  object
 2   length     124633 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [18]:
# Get the count of each unique value in the 'product' column
df['product'].value_counts()

product
credit_reporting       56283
debt_collection        21106
mortgages_and_loans    18758
credit_card            15023
retail_banking         13463
Name: count, dtype: int64

In [28]:
# Define the desired sample size for each class
sample_size_per_class = 1000  # Adjust this number based on your requirements

# Perform stratified sampling
df_sampled = df.groupby('product', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size_per_class)))

# Reset index if necessary
df_sampled = df_sampled.reset_index(drop=True)

# Display the distribution of classes in the sampled dataset
print(df_sampled['product'].value_counts())

product
credit_card            1000
credit_reporting       1000
debt_collection        1000
mortgages_and_loans    1000
retail_banking         1000
Name: count, dtype: int64


In [29]:
df_sampled.shape

(5000, 3)

In [30]:
# Precompile the regular expression for whitespace replacement
whitespace_re = re.compile(r'\s+')

def text_pre_process(text):
    """
    Preprocesses the input text by lowercasing, trimming whitespace,
    replacing multiple spaces with a single space, lemmatizing,
    and removing stop words, punctuation, and digits.
    """
    text = text.lower()  # Convert text to lowercase
    text = text.strip()  # Trim leading and trailing whitespace
    text = whitespace_re.sub(' ', text)  # Replace multiple spaces with a single space

    # Process text with SpaCy
    doc = nlp(text)

    # Extract lemmatized tokens, excluding stop words, punctuation, and digits
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_digit]

    # Join tokens into a single string
    return ' '.join(tokens)

In [31]:
# Apply text preprocessing to the DataFrame
df_sampled['text_pre_processed'] = df_sampled['narrative'].apply(text_pre_process)

# Display the DataFrame
df_sampled.head()

Unnamed: 0,product,narrative,length,text_pre_processed
0,credit_card,business close since available made single pay...,412,business close available single payment busine...
1,credit_card,citi card stopped bank autopay became past due...,458,citi card stop bank autopay past day past acce...
2,credit_card,mad bank america got purcahse wanted dispute g...,203,mad bank america get purcahse want dispute get...
3,credit_card,got sick since countless left unable work left...,749,get sick countless leave unable work leave bre...
4,credit_card,reported fraudulent account opened name td fin...,1217,report fraudulent account open td find way com...


In [32]:
# Calculate the length of preprocessed text
df_sampled['length_text_pre_processed'] = df_sampled['text_pre_processed'].apply(lambda x: len(x))

# Display the DataFrame
df_sampled.head()

Unnamed: 0,product,narrative,length,text_pre_processed,length_text_pre_processed
0,credit_card,business close since available made single pay...,412,business close available single payment busine...,370
1,credit_card,citi card stopped bank autopay became past due...,458,citi card stop bank autopay past day past acce...,374
2,credit_card,mad bank america got purcahse wanted dispute g...,203,mad bank america get purcahse want dispute get...,176
3,credit_card,got sick since countless left unable work left...,749,get sick countless leave unable work leave bre...,637
4,credit_card,reported fraudulent account opened name td fin...,1217,report fraudulent account open td find way com...,1080


In [33]:

df_sampled['length_text_pre_processed'].describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]).T

count     5000.000000
mean       563.282200
std        704.293883
min         12.000000
5%          68.000000
10%        105.000000
25%        201.000000
50%        380.000000
75%        671.000000
90%       1153.100000
95%       1614.000000
max      11566.000000
Name: length_text_pre_processed, dtype: float64

In [34]:
# Filter the DataFrame to include only rows where the length of the preprocessed text is less than 10 characters
df_sampled[df_sampled['length_text_pre_processed'] < 10]

Unnamed: 0,product,narrative,length,text_pre_processed,length_text_pre_processed


### Wordcloud

In [26]:
def plot_wordcloud(df, label_name):
    """
    Plots a word cloud for the given label name from the DataFrame.

    Parameters:
    df (pandas.DataFrame): The input DataFrame containing 'narrative' and 'product' columns.
    label_name (str): The label name for which the word cloud is to be generated.
    """
    # Filter the DataFrame to include only rows where the 'product' column matches the label_name
    label_texts = df[df['product'] == label_name]['narrative']

    # Combine all the text for the specified label into a single string
    combined_text = " ".join(label_texts)

    # Generate the word cloud with a black border
    wordcloud = WordCloud(
        width=1600,
        height=800,
        background_color='white',
    ).generate(combined_text)

    # Plot the word cloud
    plt.figure(figsize=(10, 5))  # Set the figure size
    plt.imshow(wordcloud, interpolation='bilinear')  # Display the word cloud image
    plt.axis('off')  # Hide the axis
    plt.title(f'Word Cloud for {label_name}')  # Add a title to the plot
    plt.show()  # Show the plot

In [None]:
# Plot a word cloud for the first unique product label in the DataFrame
plot_wordcloud(df=df, label_name=df['product'].unique()[0])

In [None]:
# Plot a word cloud for the second unique product label in the DataFrame
plot_wordcloud(df=df, label_name=df['product'].unique()[1])

In [None]:
# Plot a word cloud for the third unique product label in the DataFrame
plot_wordcloud(df=df, label_name=df['product'].unique()[2])

In [None]:
# Plot a word cloud for the fourth unique product label in the DataFrame
plot_wordcloud(df=df, label_name=df['product'].unique()[3])

In [None]:
# Plot a word cloud for the fifth unique product label in the DataFrame
plot_wordcloud(df=df, label_name=df['product'].unique()[4])

In [None]:
# Compute the number of words in the 'text_pre_processed' column
df['num_words'] = df['text_pre_processed'].apply(lambda x: len(x.split()))
df.head()

In [None]:
# Plot box plot
plt.figure(figsize=(12, 6))
sns.boxplot(x='product', y='num_words', data=df)
plt.xlabel('Product Category')
plt.ylabel('Number of Words')
plt.title('Box Plot of Number of Words by Product Category')
plt.xticks(rotation=45)
plt.show()

In [None]:
from collections import Counter

# Define a function to plot the top N most common words for a specific product category
def plot_top_words_for_category(df, category):
    # Filter the DataFrame for the given category
    category_texts = df[df['product'] == category]['text_pre_processed']

    # Combine all text for the category into a single string
    combined_text = " ".join(category_texts)

    # Tokenize the combined text
    words = combined_text.split()

    # Compute word frequencies
    word_freq = Counter(words)

    # Convert the word frequencies to a DataFrame
    word_freq_df = pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency'])
    word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False)

    # Select the top N words
    N = 20  # You can change this value to display more or fewer words
    top_words = word_freq_df.head(N)

    # Plot the bar plot
    plt.figure(figsize=(12, 8))
    plt.barh(top_words['Word'], top_words['Frequency'], color='skyblue')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.title(f'Top {N} Most Common Words for {category}')
    plt.gca().invert_yaxis()  # To display the highest frequency at the top
    plt.show()

# Get unique product categories
unique_categories = df['product'].unique()

# Plot top words for each product category
for category in unique_categories:
    plot_top_words_for_category(df, category)

### Model Building

In [35]:
df_sampled.shape

(5000, 5)

In [36]:
df_sampled.head()

Unnamed: 0,product,narrative,length,text_pre_processed,length_text_pre_processed
0,credit_card,business close since available made single pay...,412,business close available single payment busine...,370
1,credit_card,citi card stopped bank autopay became past due...,458,citi card stop bank autopay past day past acce...,374
2,credit_card,mad bank america got purcahse wanted dispute g...,203,mad bank america get purcahse want dispute get...,176
3,credit_card,got sick since countless left unable work left...,749,get sick countless leave unable work leave bre...,637
4,credit_card,reported fraudulent account opened name td fin...,1217,report fraudulent account open td find way com...,1080


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned narratives
X = vectorizer.fit_transform(df_sampled['text_pre_processed'])

# Display the shape of the resulting feature matrix
print("Feature matrix shape:", X.shape)

Feature matrix shape: (5000, 5000)


In [38]:
from sklearn.preprocessing import LabelEncoder

# Initialize label encoder
label_encoder = LabelEncoder()

# Encode the 'product' column
y = label_encoder.fit_transform(df_sampled['product'])

# Display the encoded labels
print("Encoded labels:", y)

Encoded labels: [0 0 0 ... 4 4 4]


In [39]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the splits
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (4000, 5000)
Testing set size: (1000, 5000)


In [41]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

In [42]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.804
Classification Report:
                      precision    recall  f1-score   support

        credit_card       0.82      0.78      0.80       216
   credit_reporting       0.76      0.76      0.76       190
    debt_collection       0.79      0.77      0.78       192
mortgages_and_loans       0.79      0.84      0.82       210
     retail_banking       0.85      0.87      0.86       192

           accuracy                           0.80      1000
          macro avg       0.80      0.80      0.80      1000
       weighted avg       0.80      0.80      0.80      1000



In [43]:
import joblib

# Save the trained model
joblib.dump(model, '../model/text_classification_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, '../model/tfidf_vectorizer.pkl')

# Save the label encoder
joblib.dump(label_encoder, '../model/label_encoder.pkl')


['../model/label_encoder.pkl']

In [44]:
# Load the saved model, vectorizer, and label encoder
model = joblib.load('../model/text_classification_model.pkl')
vectorizer = joblib.load('../model/tfidf_vectorizer.pkl')
label_encoder = joblib.load('../model/label_encoder.pkl')

# Preprocess the new text
new_text = "I am having trouble paying my bills due to financial hardship caused by the COVID-19 pandemic."
cleaned_text = text_pre_process(new_text)

# Transform the text using the TF-IDF vectorizer
X_new = vectorizer.transform([cleaned_text])

# Predict the class
predicted_class = model.predict(X_new)

# Decode the class label
predicted_label = label_encoder.inverse_transform(predicted_class)
print("Predicted Product:", predicted_label[0])


Predicted Product: mortgages_and_loans
