In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw complaints export and print a quick overview of its contents.
df = pd.read_csv("complaints.csv")
print(df.head())
print(df.isnull().sum())
print(df['Product'].value_counts())

# Product categories kept for classification. Matching below is exact, so each
# label has to be spelled exactly as it appears in the 'Product' column.
selected_products = [
    'Credit reporting, repair, or other',
    'Debt collection',
    'Consumer Loan',
    'Mortgage'
]

# isin() silently ignores labels that match nothing, which would quietly leave
# the model with fewer classes than intended. Surface that case instead.
missing_products = sorted(set(selected_products) - set(df['Product'].dropna().unique()))
if missing_products:
    print("Warning: selected products not found in data:", missing_products)

# Keep only the selected categories that actually have a narrative to learn
# from. The explicit copy makes the result an independent frame, so columns
# added to it later are not writes into a slice of the raw data.
df = (
    df[df['Product'].isin(selected_products)]
    .dropna(subset=['Consumer complaint narrative'])
    .copy()
)

# Class balance of the filtered data.
sns.countplot(y='Product', data=df)
plt.title("Complaint Distribution by Category")
plt.show()


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources that the cleaning step relies on.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by clean_text: one lemmatizer instance and a set for fast
# stop-word membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a complaint narrative into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is replaced by a space, the result is split on whitespace,
    tokens found in the module-level ``stop_words`` set are dropped, and the
    remaining tokens are passed through the module-level ``lemmatizer``.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (for example
            a missing value) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is
        left or the input is not a string.
    """
    if not isinstance(text, str):
        return ''
    # Substitute a space rather than deleting: deleting would fuse words that
    # are separated only by punctuation ("and/or" -> "andor") and would turn
    # contractions such as "don't" into "dont", which the stop-word check
    # below would not recognise.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Clean every narrative once and keep the result next to the raw text.
df['clean_text'] = df['Consumer complaint narrative'].map(clean_text)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn product names into integer class ids. The fitted encoder is kept so
# predictions can be mapped back to product names later.
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['Product'])

# 80/20 split, stratified on the label and seeded so the partition is the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at 5000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
# Fit on the training text only; the test text is transformed with the
# already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# Candidate classifiers, keyed by the display name used in reports and plots.
# Estimators that draw random numbers during fitting get a fixed seed (the
# same value as the train/test split) so reported scores are repeatable.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Linear SVM": LinearSVC(random_state=42)
}

# Fit each model on the training features and report held-out accuracy plus
# the per-class precision/recall/F1 breakdown.
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, preds):.4f}")

    # target_names maps the integer labels back to product names in the report.
    print(classification_report(y_test, preds, target_names=encoder.classes_))


In [None]:
# Held-out accuracy for every fitted model, keyed by display name.
results = {
    name: accuracy_score(y_test, model.predict(X_test_tfidf))
    for name, model in models.items()
}

# One bar per model so the accuracies can be compared side by side.
plt.bar(list(results), list(results.values()))
plt.title("Model Comparison")
plt.ylabel("Accuracy")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Classify one hand-written complaint with the fitted Linear SVM.
sample = ["I am being harassed by debt collectors even after paying my dues."]
# The vectorizer was fitted on clean_text output, so new text has to go
# through the same cleaning before it is transformed; raw text would produce
# tokens the fitted vocabulary was never built from.
sample_clean = [clean_text(text) for text in sample]
sample_tfidf = vectorizer.transform(sample_clean)
pred_label = models["Linear SVM"].predict(sample_tfidf)
# Map the integer prediction back to its product name.
print("Predicted Category:", encoder.inverse_transform(pred_label)[0])
