In [None]:
# Data Science
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Natural Language Processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [None]:
# Load the Wells Fargo complaints CSV (the path is relative to the current working directory)
df = pd.read_csv('wells_fargo.csv')
# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# tokenize, lower case, remove non-alphanumeric characters, remove stopwords, lemmatize
# The stop word set and the lemmatizer are built once, on first use, and shared by
# every call instead of being re-created for each token.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on the first call."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives constant-time membership tests for the per-token filter.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, applied per token produced by ``word_tokenize``:
      1. lower-case the token;
      2. delete every character that is not a letter, digit or whitespace;
      3. drop the token if nothing is left or if it is an English stop word;
      4. lemmatize what remains with ``WordNetLemmatizer``.

    Lower-casing happens *before* the stop word test so that capitalised stop
    words (e.g. at the start of a sentence) are removed as well.

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, a NaN
            float from a missing CSV field, ...) is treated as empty text.

    Returns:
        list[str]: The cleaned tokens, in their original order. Empty for
        non-string or empty input.
    """
    # Missing values cannot be tokenized; return no tokens instead of raising.
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _get_preprocess_resources()

    cleaned = []
    for token in word_tokenize(text):
        token = re.sub(r'[^a-zA-Z0-9\s]', '', token.lower())
        # Pure-punctuation tokens collapse to '' after the substitution; skip
        # them so they do not end up as blank entries in the output.
        if token and token not in stop_words:
            cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

In [None]:
# run every narrative through preprocess() and glue its tokens back into one space-separated string
df['Preprocessed text'] = df['Consumer complaint narrative'].apply(
    lambda narrative: ' '.join(preprocess(narrative))
)
df.head()

In [None]:
# hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['Preprocessed text'],
    df['Product'],
    test_size=0.2,
    random_state=123,
)

In [None]:
# embed text into vectors using TF-IDF
# The vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# embed text into vectors using BERT
# NOTE(review): 'all-MiniLM-L6-v2' is loaded through sentence-transformers; it looks like a
# MiniLM sentence-embedding model rather than original BERT - confirm the "BERT" label.
# NOTE(review): the encoder receives the preprocessed strings from the split above (stop words
# removed, lemmatized), not the raw narratives - confirm this is intended.
model = SentenceTransformer('all-MiniLM-L6-v2')
X_train_bert = model.encode(X_train.values)
X_test_bert = model.encode(X_test.values)

In [None]:
# shape of each training representation, one per line: TF-IDF first, then the sentence embeddings
print(X_train_tfidf.shape, X_train_bert.shape, sep='\n')

In [None]:
# One hex colour per product category (six categories in total), listed as (category, colour) pairs
cat_colors = dict([
    ("Checking or savings account", "#1f77b4"),
    ("Mortgage", "#ff7f0e"),
    ("Credit reporting, credit repair services, or other personal consumer reports", "#2ca02c"),
    ("Credit card or prepaid card", "#d62728"),
    ("Money transfer, virtual currency, or money service", "#9467bd"),
    ("Vehicle loan or lease", "#8c564b"),
])

In [None]:
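# TODO: plot the latent space with UMAP (not implemented yet; `cat_colors` above is presumably the per-category palette for this plot)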

## Machine Learning Models

In [None]:
def print_metrics(y_test, y_pred):
    """Print the accuracy score, then the classification report, for the predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.
    """
    # Each metric is computed and printed in turn: accuracy first, report second.
    for metric in (accuracy_score, classification_report):
        print(metric(y_test, y_pred))

def print_confusion_matrix(model, y_test, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as an annotated heatmap.

    Rows are the actual classes and columns the predicted classes, both in the
    order of ``model.classes_``.

    Args:
        model: Fitted classifier exposing a ``classes_`` attribute.
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.
    """
    class_labels = model.classes_
    # Pin the matrix rows/columns to the same label order used for the tick labels.
    # Without `labels`, confusion_matrix derives the order from the labels that occur
    # in y_test/y_pred, which differs from model.classes_ whenever a class is missing
    # from the test data, so the ticks would no longer line up with the matrix.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(10, 10))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

### Logistic Regression

In [None]:
# Logistic Regression with TF-IDF
# The 'saga' solver shuffles the data while optimising, so it is seeded (same seed as the
# train/test split) to make the fitted model repeatable across runs.
lr_tfidf = LogisticRegression(C=1, max_iter=100, solver='saga', random_state=123)
lr_tfidf.fit(X_train_tfidf, y_train)
y_pred = lr_tfidf.predict(X_test_tfidf)

In [None]:
# Metrics for Logistic Regression + TF-IDF.
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for lr_tfidf.
# y_pred must still hold this model's predictions (later model cells overwrite it).
print_confusion_matrix(lr_tfidf, y_test, y_pred)

In [None]:
# Logistic Regression on the sentence embeddings, allowing the solver up to 1000 iterations
lr_bert = LogisticRegression(max_iter=1000).fit(X_train_bert, y_train)
y_pred = lr_bert.predict(X_test_bert)

In [None]:
# Metrics for Logistic Regression + BERT embeddings.
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for lr_bert.
# y_pred must still hold this model's predictions (later model cells overwrite it).
print_confusion_matrix(lr_bert, y_test, y_pred)

### Random Forest

In [None]:
# Random Forest on the TF-IDF features: 600 trees, depth capped at 15, seeded for repeatability
clf = RandomForestClassifier(
    max_depth=15,
    n_estimators=600,
    random_state=123,
).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

In [None]:
# Metrics for Random Forest + TF-IDF.
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for Random Forest + TF-IDF.
# Both clf and y_pred are names reused by later model cells; run this right after the matching fit cell.
print_confusion_matrix(clf, y_test, y_pred)

In [None]:
# Random Forest with BERT
# Seeded like the TF-IDF forest and the train/test split; an unseeded forest gives
# different trees, and therefore different metrics, on every run.
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train_bert, y_train)
y_pred = clf.predict(X_test_bert)

In [None]:
# Metrics for Random Forest + BERT embeddings.
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for Random Forest + BERT embeddings.
# Both clf and y_pred are names reused by later model cells; run this right after the matching fit cell.
print_confusion_matrix(clf, y_test, y_pred)

### SVM

In [None]:
# Linear-kernel SVM trained on the TF-IDF features
clf = SVC(kernel='linear').fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

In [None]:
# Metrics for SVM + TF-IDF.
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for SVM + TF-IDF.
# Both clf and y_pred are names reused by later model cells; run this right after the matching fit cell.
print_confusion_matrix(clf, y_test, y_pred)

In [None]:
# Linear-kernel SVM trained on the sentence embeddings
clf = SVC(kernel='linear').fit(X_train_bert, y_train)
y_pred = clf.predict(X_test_bert)

In [None]:
# Metrics for SVM + BERT embeddings, via the shared print_metrics helper like every other
# model section (this cell previously repeated the helper's body inline).
# y_pred is reassigned by every model cell, so run this right after the matching fit cell.
print_metrics(y_test, y_pred)

In [None]:
# Confusion matrix for SVM + BERT embeddings, via the shared print_confusion_matrix helper
# like every other model section (this cell previously repeated the helper's body inline).
# Both clf and y_pred are reused names; run this right after the matching fit cell.
print_confusion_matrix(clf, y_test, y_pred)