In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Read the complaints CSV into a DataFrame and preview its first rows
csv_path = 'wells_fargo.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Strips every character that is not a letter, digit or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Lazily filled cache so the stopword corpus is read and the lemmatizer is
# constructed once, instead of once per token.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stopword_set, lemmatizer)``, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stopwords'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
    """Tokenize and normalise one piece of text.

    Steps, in order: tokenize, lower-case, drop English stopwords, strip
    non-alphanumeric characters, drop tokens left empty, lemmatize.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. a NaN from a missing cell)
        is treated as empty text.

    Returns
    -------
    list of str
        The cleaned tokens; an empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    stop_words, lemmatizer = _get_preprocess_resources()

    # Lower-case BEFORE the stopword check: the stopword list is lower-case,
    # so "The" or "I" would otherwise slip through.
    tokens = [token.lower() for token in word_tokenize(text)]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [_NON_ALNUM_RE.sub('', token) for token in tokens]
    # Pure-punctuation tokens are now empty strings; discard them.
    return [lemmatizer.lemmatize(token) for token in tokens if token]

# Clean each complaint narrative and store it as one space-separated string
narratives = df['Consumer complaint narrative']
df['Preprocessed text'] = narratives.apply(lambda narrative: ' '.join(preprocess(narrative)))
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
features = df['Preprocessed text']
labels = df['Product']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=123
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text only, then apply the fitted
# vectorizer to the test text.
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

# NOTE: X_train / X_test are rebound from text to TF-IDF matrices here, so
# re-run the train/test split cell before re-running this one.
X_train, X_test = train_matrix, test_matrix

## SVM

In [None]:
# SVM
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the training matrix
clf = SVC(kernel='linear').fit(X_train, y_train)

# Predict a product label for every held-out row
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy first, then the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(accuracy)
print(report)

In [None]:
# heatmap confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Use ONE explicit label order for both the matrix and the tick labels.
# Without `labels=`, confusion_matrix only covers the classes present in
# y_test / y_pred, which can differ from clf.classes_ (e.g. a rare product
# missing from the test split) and would mislabel the axes.
class_labels = np.union1d(clf.classes_, y_test)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()