In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [8]:
# Load the raw complaints table from a CSV in the working directory.
# Later cells read its 'Product' and 'Consumer complaint narrative' columns.
df = pd.read_csv("complaints.csv")

In [9]:
# Discard every row that has a missing value in at least one column
# (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
# NOTE(review): this also removes rows whose only gap is in a column that the
# cells shown here never use -- confirm that losing those rows is intended.
df = df.dropna(axis=0, how="any")

In [10]:
# Encode each 'Product' label as an integer category code; this column is the
# classification target used by the train/test split.
df['Product Code'] = df['Product'].astype('category').cat.codes

In [11]:
# Text-cleaning helper for the complaint narratives.

# Lazily-populated cache of the NLTK English stop-word set. It is shared by
# every preprocess() call so the stop-word list is loaded once instead of once
# per row when the function is applied to a whole column.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess(text):
    """Clean a single complaint narrative for vectorization.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``nltk.word_tokenize``,
    drop English stop words, and re-join the remaining tokens with single
    spaces.

    Parameters
    ----------
    text : str
        Raw complaint text.

    Returns
    -------
    str
        The cleaned text; an empty string if no tokens survive.

    Notes
    -----
    Relies on the ``stopwords`` corpus and ``nltk.word_tokenize`` being usable
    in the current NLTK installation.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits (anything but letters/whitespace).
    # NOTE(review): apostrophes are stripped here, so any stop word that
    # contains one can never match a token below -- confirm that is acceptable.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords, using the cached set rather than reloading it per call
    stop_words = _english_stop_words()
    tokens = [token for token in nltk.word_tokenize(text) if token not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

In [12]:
# Run every raw narrative through preprocess() and keep the cleaned text in a
# new 'Complaint Text' column next to the original.
df['Complaint Text'] = df['Consumer complaint narrative'].map(preprocess)

In [13]:
# Hold out 20% of the cleaned complaints as a test set; the fixed seed keeps
# the partition the same on every run.
# NOTE(review): the split is not stratified on the target -- consider whether
# it should be if the product classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['Complaint Text'],
    df['Product Code'],
    test_size=0.2,
    random_state=42,
)


In [14]:
# Create a TfidfVectorizer with its default settings. The same instance is
# fitted on the training text and then reused to transform the test set and
# the example complaint, so all of them share one vocabulary.
tfidf = TfidfVectorizer()

In [15]:
# Fit the TF-IDF vocabulary on the training text and vectorize it.
# The result is kept as the sparse matrix that fit_transform() returns: the
# estimators trained on it (RandomForestClassifier, LinearSVC) and
# cross_val_score all accept sparse input, whereas a dense copy stores every
# zero of the documents-by-vocabulary matrix and wastes memory.
X = tfidf.fit_transform(X_train)

In [16]:
# Train a Random Forest Classifier on the vectorized data.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and every score derived from it, is reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_train)

# Train a Support Vector Classifier on the vectorized data, seeded for the
# same reason.
svm = LinearSVC(random_state=42)
svm.fit(X, y_train)

LinearSVC()

In [17]:
# Score both classifiers with 5-fold cross-validation on the training data.
rf_scores = cross_val_score(rf, X, y_train, cv=5)
svm_scores = cross_val_score(svm, X, y_train, cv=5)

# Report per-fold scores plus their mean and spread, one section per model.
for heading, fold_scores in (
    ("Random Forest Classifier cross-validation scores:", rf_scores),
    ("\nSupport Vector Classifier cross-validation scores:", svm_scores),
):
    print(heading, fold_scores)
    print("Mean score:", fold_scores.mean())
    print("Standard deviation:", fold_scores.std())

Random Forest Classifier cross-validation scores: [0.964      0.95791583 0.96192385 0.95991984 0.95791583]
Mean score: 0.9603350701402805
Standard deviation: 0.0023593969970613733

Support Vector Classifier cross-validation scores: [0.974      0.96392786 0.97795591 0.9759519  0.96993988]
Mean score: 0.9723551102204409
Standard deviation: 0.004977218537160005


In [18]:
# Evaluate the performance of the Support Vector Classifier on the test set.
# The test text is vectorized with the vocabulary already fitted on the
# training data and handed to predict() as a sparse matrix; LinearSVC accepts
# sparse input, so no dense copy of the matrix is made.
y_pred = svm.predict(tfidf.transform(X_test))

print("Support Vector Classifier:\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Support Vector Classifier:

Accuracy: 0.9807692307692307
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       596
           1       0.90      0.64      0.75        28

    accuracy                           0.98       624
   macro avg       0.94      0.82      0.87       624
weighted avg       0.98      0.98      0.98       624



In [28]:
# Example consumer complaint
complaint = "I have an error on my credit report that I would like to dispute."

# Clean the text with the same preprocessing applied to the training data
complaint_processed = preprocess(complaint)

# Vectorize the complaint with the fitted TF-IDF vocabulary. transform()
# expects an iterable of documents, hence the one-element list. The result is
# left sparse, which the classifier's predict() accepts directly.
complaint_vector = tfidf.transform([complaint_processed])

In [29]:
# Classify the vectorized complaint. predict() returns one label per input
# row, so the first entry is the label for the single example.
predicted_labels = svm.predict(complaint_vector)
product_code = predicted_labels[0]

print("Predicted product code:", product_code)

Predicted product code: 0
