In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download the NLTK resources used by clean_text (stop words, WordNet, tokenizer).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load the 'punkt_tab' tables instead of
# the pickled 'punkt' models, and raise LookupError when they are missing.
# nltk.download() reports failures by printing and returning False rather than
# raising, so requesting the extra resource is safe on older releases too.
nltk.download('punkt_tab')

# Load dataset
df = pd.read_csv("complaints.csv")  # Update with correct path

# Keep only the target column and the free-text column; rows missing either are dropped.
df = df[['Product', 'Consumer complaint narrative']].dropna()

# Mapping target labels (product name -> integer class id).
# NOTE(review): the keys must match the CSV's 'Product' strings exactly; rows
# with any other product are discarded below. Confirm the spellings against the data.
category_mapping = {
    "Credit reporting, repair, or other": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}
# Explicit copy so the column assignments that follow operate on an independent
# frame rather than on a filtered selection of the original.
df = df[df['Product'].isin(list(category_mapping))].copy()
if df.empty:
    # Fail here with an actionable message instead of later inside train_test_split.
    raise ValueError(
        "No rows in 'Product' match the configured categories: "
        f"{list(category_mapping)}"
    )
df['Product'] = df['Product'].map(category_mapping)

# Text Preprocessing
# Pre-built helpers shared by every clean_text call.
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NLP_RESOURCES = {}


def _nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use.

    Loading is deferred to the first call so the NLTK corpora only need to be
    available by the time text is actually cleaned, not when this is defined.
    """
    if not _NLP_RESOURCES:
        # A frozenset gives O(1) membership tests; the original re-read the
        # stop-word list and scanned it linearly for every single token.
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize a complaint narrative into a space-joined string of lemmas.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation,
    tokenize with NLTK, drop English stop words, lemmatize each remaining token.

    Args:
        text: The raw narrative as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    stop_words, lemmatizer = _nlp_resources()
    text = text.lower()
    text = _DIGIT_RUN.sub('', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    tokens = nltk.word_tokenize(text)
    # Punctuation is stripped before this filter, so contractions arrive as
    # e.g. "dont" and are compared in that form.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Clean every narrative and store the result alongside the raw text.
raw_narratives = df['Consumer complaint narrative']
df['Processed_Text'] = raw_narratives.apply(clean_text)

# Hold out 20% of the rows for testing, stratified on the class label and
# seeded so the split is reproducible.
features = df['Processed_Text']
labels = df['Product']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF features: the vocabulary is learned from the training split only and
# then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Gradient-boosted tree classifiers trained on the same TF-IDF matrix.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'learning_rate': 0.1,
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train_tfidf, y_train)

lgbm_params = {
    'n_estimators': 200,
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.1,
    'random_state': 42,
}
lgbm_model = LGBMClassifier(**lgbm_params)
lgbm_model.fit(X_train_tfidf, y_train)

# Predictions & Evaluation
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        model: A fitted classifier exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True class ids corresponding to ``X_test``.
        model_name: Label used in the printed header and the plot title.

    Returns:
        None. Results are printed and shown as a matplotlib figure.
    """
    # Fix the class order from category_mapping so the matrix always has one
    # row/column per configured class, in the same order as the tick labels,
    # even when a class is absent from y_test or from the predictions.
    class_names = list(category_mapping)
    class_ids = list(category_mapping.values())

    y_pred = model.predict(X_test)
    print(f"\n{model_name} Model Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"{model_name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Report on both fitted models against the same held-out TF-IDF matrix.
for fitted_model, display_name in ((xgb_model, "XGBoost"), (lgbm_model, "LightGBM")):
    evaluate_model(fitted_model, X_test_tfidf, y_test, display_name)

# Sample Prediction
def make_prediction(model, text_sample):
    """Predict the product category name for one raw complaint text.

    The text goes through the same clean_text + TF-IDF pipeline used for
    training before being passed to ``model.predict``.

    Args:
        model: A fitted classifier exposing ``predict``.
        text_sample: The raw complaint text as a ``str``.

    Returns:
        The key of ``category_mapping`` whose class id the model predicted.

    Raises:
        KeyError: If the predicted id is not a value of ``category_mapping``.
    """
    processed_text = clean_text(text_sample)
    text_tfidf = vectorizer.transform([processed_text])
    # predict() returns one entry per input row; there is exactly one row here.
    predicted_id = int(model.predict(text_tfidf)[0])
    # Invert the name -> id mapping for a direct lookup; an unknown id surfaces
    # as a KeyError naming the id rather than an opaque empty-list IndexError.
    id_to_name = {class_id: name for name, class_id in category_mapping.items()}
    return id_to_name[predicted_id]

# Smoke-test the XGBoost model on a single hand-written complaint.
sample_text = "I have been wrongly charged for a credit report issue. The company is not responding."
predicted_category = make_prediction(xgb_model, sample_text)
print("Predicted Category:", predicted_category)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rohit\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'consumer_complaints.csv'