In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Read the complaints CSV into a DataFrame and preview its first rows.
csv_path = 'wells_fargo.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Mean narrative length in characters (truncated to an int) for the text as currently stored in df.
narrative_lengths = df['Consumer complaint narrative'].str.len()
print("The average length of the complaints is:", int(narrative_lengths.mean()), "characters.")

In [None]:
# Case normalisation: replace every narrative with its lower-cased form.
lowered_narratives = df['Consumer complaint narrative'].str.lower()
df['Consumer complaint narrative'] = lowered_narratives
df.head()

In [None]:
# Remove every character that is not an ASCII letter, a digit or whitespace.
# The vectorised .str accessor is used instead of apply + re.sub: it applies the
# same regular expression, but leaves missing narratives (NaN) untouched instead
# of raising TypeError when re.sub receives a float.
import re
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].str.replace(
    r'[^a-zA-Z0-9\s]', '', regex=True
)
df.head()

In [None]:
import nltk
# Download the NLTK data used by the text-cleaning cells: the stop-word lists
# and the WordNet corpus.
nltk.download('stopwords')
nltk.download('wordnet') # for lemmatization

In [None]:
# Remove English stop words from every narrative.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# stopwords.words() returns a list; a set makes the per-token membership test
# O(1) instead of a linear scan for every word of every complaint.
stop_word_set = set(stop_words)
# NOTE(review): punctuation is stripped by an earlier cell, so contractions
# arrive here without their apostrophe (e.g. "dont") and will not match
# apostrophe-containing stop-word entries - consider removing stop words
# before stripping punctuation.
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)
df.head()

In [None]:
# Lemmatize every narrative token by token.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# WordNetLemmatizer.lemmatize() operates on a single word. Passing the whole
# narrative (as the previous version did) looks the entire string up as one
# "word" and returns it unchanged, so the text is split on whitespace, each
# token is lemmatized on its own, and the tokens are joined back together.
df['Consumer complaint narrative'] = df['Consumer complaint narrative'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(token) for token in text.split())
)
df.head()

In [None]:
# Re-compute the mean narrative length (characters, truncated to an int) on the current, processed text.
processed_lengths = df['Consumer complaint narrative'].str.len()
print("The average length of the complaints is:", int(processed_lengths.mean()), "characters.")

In [None]:
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used to encode the complaint narratives.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
# Compute one sentence embedding per narrative.
# The previous version called model.encode() once per row through .apply()
# (noted as taking about 3 minutes); passing the whole list lets the model
# encode the texts in batches instead.
narrative_texts = df['Consumer complaint narrative'].tolist()
embedding_matrix = model.encode(narrative_texts, show_progress_bar=True)
# list() splits the 2-D result into one 1-D array per row, so every cell of the
# 'embedding' column still holds a single vector, as before.
df['embedding'] = list(embedding_matrix)
df.head()

In [None]:
# Shape of the embedding stored under index label 0 (sanity check of the vector size).
df.loc[0, "embedding"].shape

In [None]:
from sklearn.decomposition import PCA
def reduce_dimensions(embeddings, n_components):
    """Project `embeddings` onto `n_components` principal components.

    A fresh PCA is fitted on `embeddings` and then used to transform those
    same `embeddings`.

    Args:
        embeddings: the data handed to `PCA.fit` / `PCA.transform`
            (the notebook passes a NumPy array built from the embedding column).
        n_components: number of components forwarded to `PCA`.

    Returns:
        The transformed data converted with `.tolist()`.
    """
    reducer = PCA(n_components=n_components)
    reducer.fit(embeddings)
    projected = reducer.transform(embeddings)
    return projected.tolist()

In [None]:
# Add one column per target dimensionality; each cell holds the PCA-reduced
# embedding of that row as a NumPy array.
# NOTE(review): PCA is fitted here on every row of df, i.e. before the
# train/test split performed later in the notebook, so the projection has
# already seen the rows that end up in the test set.
all_embeddings = np.array(df['embedding'].to_list())
pca_components = [32, 64, 128, 256]
for i in pca_components:
    pca_column = 'embedding_pca_' + str(i)
    reduced_rows = reduce_dimensions(all_embeddings, i)
    df[pca_column] = reduced_rows
    # reduce_dimensions returns nested lists; turn each row back into an array.
    df[pca_column] = df[pca_column].apply(np.array)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
df = df.sample(frac=1, random_state=123) # shuffle
# 80/20 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
# The split returns row selections of df. Later cells add prediction columns
# to test_df, so take explicit copies: the assignments then write to
# independent frames and do not trigger pandas' SettingWithCopyWarning.
train_df, test_df = train_df.copy(), test_df.copy()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Grid search over k = 1..50 for a cosine-metric KNN classifier, run once per
# PCA size and once on the unreduced embedding; the mean cross-validated
# accuracy of every run is drawn as one curve.
from sklearn.model_selection import GridSearchCV
k_range = list(range(1, 51))
param_grid = {'n_neighbors': k_range}
knn_train_labels = train_df['Product']
for i in pca_components:
    grid = GridSearchCV(KNeighborsClassifier(metric='cosine'), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    reduced_features = list(train_df[f'embedding_pca_{i}'])
    grid.fit(reduced_features, knn_train_labels)
    print(f"Best accuracy for {i} components: {grid.best_score_}")
    mean_scores = grid.cv_results_['mean_test_score']
    plt.plot(k_range, mean_scores)
# The search object left over from the last loop iteration is refitted on the
# full-size embeddings.
full_features = list(train_df['embedding'])
grid.fit(full_features, knn_train_labels)
print(f"Best accuracy for full embedding: {grid.best_score_}")
plt.plot(k_range, grid.cv_results_['mean_test_score'], color='black')
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
# Curves were drawn in this order: one per PCA size, then the full embedding.
plt.legend(pca_components + ['full embedding'])
# TODO: enlarge the figure and save it at 300 dpi (not implemented yet).
plt.show()

In [None]:
# Fit the selected KNN configuration on the training split and store its
# predictions for the test split.
# NOTE(review): k=15 and the 128-component features are hard-coded, presumably
# read off the grid-search output - confirm they still match it.
knn_feature_column = 'embedding_pca_128'
knn = KNeighborsClassifier(n_neighbors=15, metric='cosine')
knn.fit(list(train_df[knn_feature_column]), train_df['Product'])
knn_test_features = list(test_df[knn_feature_column])
test_df["knn_pred"] = knn.predict(knn_test_features)

In [None]:
# Per-class precision / recall / F1 of the KNN predictions on the test split.
from sklearn.metrics import classification_report
knn_report = classification_report(test_df['Product'], test_df['knn_pred'])
print(knn_report)

In [None]:
# Confusion matrix of the KNN predictions on the test split.
from sklearn.metrics import confusion_matrix
import seaborn as sns
# confusion_matrix orders its rows/columns by sorted label unless `labels` is
# given, whereas df['Product'].unique() returns labels in order of appearance.
# Using the latter directly as tick labels mislabels the axes, so one sorted
# label list is passed to both the matrix and the heatmap.
class_labels = sorted(df['Product'].unique())
cm = confusion_matrix(test_df['Product'], test_df['knn_pred'], labels=class_labels)
plt.figure(figsize=(8, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.svm import SVC

In [None]:
# Grid search over C and gamma for an SVC: the same search object is refitted
# on every PCA size and finally on the unreduced embedding, so after this cell
# `grid` holds the results of the full-embedding run.
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]}
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_train_labels = train_df['Product']
for i in pca_components:
    svm_reduced_features = list(train_df[f'embedding_pca_{i}'])
    grid.fit(svm_reduced_features, svm_train_labels)
    print(f"Best accuracy for {i} components: {grid.best_score_}")
svm_full_features = list(train_df['embedding'])
grid.fit(svm_full_features, svm_train_labels)
print(f"Best accuracy for full embedding: {grid.best_score_}")

In [None]:
# Train an SVC on the full embeddings with the C / gamma pair stored in the
# current `grid` object's best_params_.
best_svm_params = grid.best_params_
svm = SVC(C=best_svm_params['C'], gamma=best_svm_params['gamma'])
svm_train_features = list(train_df['embedding'])
svm.fit(svm_train_features, train_df['Product'])

In [None]:
# Store the SVC predictions for the test split in a new column.
svm_test_features = list(test_df['embedding'])
test_df["svm_pred"] = svm.predict(svm_test_features)

In [None]:
# Per-class precision / recall / F1 of the SVC predictions on the test split.
svm_report = classification_report(test_df['Product'], test_df['svm_pred'])
print(svm_report)

In [None]:
# Confusion matrix of the SVC predictions on the test split.
# confusion_matrix orders its rows/columns by sorted label unless `labels` is
# given, whereas df['Product'].unique() returns labels in order of appearance.
# Using the latter directly as tick labels mislabels the axes, so one sorted
# label list is passed to both the matrix and the heatmap.
class_labels = sorted(df['Product'].unique())
cm = confusion_matrix(test_df['Product'], test_df['svm_pred'], labels=class_labels)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Grid search over the regularisation parameter C for logistic regression: the
# same search object is refitted on every PCA size and finally on the
# unreduced embedding.
param_grid = {'C': [0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
lr_train_labels = train_df['Product']
for i in pca_components:
    lr_reduced_features = list(train_df[f'embedding_pca_{i}'])
    grid.fit(lr_reduced_features, lr_train_labels)
    print(f"Best accuracy for {i} components: {grid.best_score_}")
lr_full_features = list(train_df['embedding'])
grid.fit(lr_full_features, lr_train_labels)
print(f"Best accuracy for full embedding: {grid.best_score_}")

In [None]:
# Train logistic regression on the 256-component PCA features.
# NOTE(review): C=1 and the 256-component features are hard-coded, presumably
# read off the grid-search output - confirm they still match it.
lr_feature_column = 'embedding_pca_256'
lr = LogisticRegression(C=1)
lr_train_features = list(train_df[lr_feature_column])
lr.fit(lr_train_features, train_df['Product'])

In [None]:
# Store the logistic-regression predictions for the test split in a new column.
lr_test_features = list(test_df['embedding_pca_256'])
test_df["lr_pred"] = lr.predict(lr_test_features)

In [None]:
# Per-class precision / recall / F1 of the logistic-regression predictions on the test split.
lr_report = classification_report(test_df['Product'], test_df['lr_pred'])
print(lr_report)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
# confusion_matrix orders its rows/columns by sorted label unless `labels` is
# given, whereas df['Product'].unique() returns labels in order of appearance.
# Using the latter directly as tick labels mislabels the axes, so one sorted
# label list is passed to both the matrix and the heatmap.
class_labels = sorted(df['Product'].unique())
cm = confusion_matrix(test_df['Product'], test_df['lr_pred'], labels=class_labels)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()