In [None]:
# Credit Card Defaulter Detection - Full Assignment Notebook

# Step 1: Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Steps 2-3: Load the dataset, rename the target column and drop ID
# header=1 makes the second CSV row the header, skipping the metadata row above it.
df = (
    pd.read_csv("default of credit card clients.csv", header=1)
    .rename(columns={'default payment next month': 'target'})
    .drop(columns='ID')  # removed here so ID never ends up among the model features
)

# Step 4: Encode categorical variables (SEX, EDUCATION, MARRIAGE)
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
label_encoder = LabelEncoder()
for col in categorical_cols:
    # fit_transform() refits on every call, so a single encoder instance is
    # reused; each column's values are replaced by integer codes.
    df[col] = label_encoder.fit_transform(df[col])

# Step 5: EDA - Visualizations
# Bar chart of how many rows fall into each target class.
ax = sns.countplot(data=df, x='target')
ax.set_title("Target Class Distribution (0: No Default, 1: Default)")
plt.show()

# Heatmap of the pairwise correlations between all columns of df.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Step 6: Outlier Detection
# Side-by-side boxplots of LIMIT_BAL and AGE.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=df[['LIMIT_BAL', 'AGE']], ax=ax)
ax.set_title("Boxplot for Outlier Detection")
plt.show()

# Step 7: Feature and Target Split
X = df.drop(columns='target')
y = df['target']

# Step 8: Train/Test Split, then Feature Scaling
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training data
# (data leakage) and make the held-out metrics optimistic.
# test_size and random_state are unchanged, so the row partition is the same
# as when the split was done on the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics

# Full feature matrix transformed with the train-fitted scaler; kept because
# later steps reference X_scaled.
X_scaled = scaler.transform(X)

# Step 9: Train Decision Tree Classifier
# fit() returns the estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
report_dt = classification_report(y_test, y_pred_dt)
print("\nDecision Tree Classification Report:", report_dt, sep="\n")

# Step 10: Train Logistic Regression
# max_iter is raised to 1000 iterations for the solver.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
report_lr = classification_report(y_test, y_pred_lr)
print("\nLogistic Regression Classification Report:", report_lr, sep="\n")

# Step 11: Cross-Validation
# Scaling is wrapped in a pipeline together with each model and the raw
# feature frame X is passed in, so every fold fits StandardScaler on its own
# training portion only. Cross-validating on a matrix that was scaled with
# statistics from all rows leaks each validation fold into preprocessing.
# cross_val_score clones the estimator it is given, so the already-fitted
# dt and lr objects are not modified.
dt_cv_pipeline = make_pipeline(StandardScaler(), dt)
lr_cv_pipeline = make_pipeline(StandardScaler(), lr)

print("\nCross-Validation (5-Fold):")
print("Decision Tree CV Accuracy:", cross_val_score(dt_cv_pipeline, X, y, cv=5).mean())
print("Logistic Regression CV Accuracy:", cross_val_score(lr_cv_pipeline, X, y, cv=5).mean())

# Step 12: ROC Curve Plot
# Decision tree: scores from column index 1 of predict_proba, then ROC points and AUC.
y_proba_dt = dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt = roc_curve(y_test, y_proba_dt)[:2]  # thresholds are not needed
auc_dt = roc_auc_score(y_test, y_proba_dt)

# Logistic regression: same three quantities.
y_proba_lr = lr.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr = roc_curve(y_test, y_proba_lr)[:2]
auc_lr = roc_auc_score(y_test, y_proba_lr)

plt.plot(fpr_dt, tpr_dt, label=f'Decision Tree AUC = {auc_dt:.2f}')
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression AUC = {auc_lr:.2f}')
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Step 13: Confusion Matrices
cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Both matrices are drawn the same way; only the data, title and colour map differ.
for matrix, title, palette in (
    (cm_dt, "Confusion Matrix - Decision Tree", 'Blues'),
    (cm_lr, "Confusion Matrix - Logistic Regression", 'Greens'),
):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette)  # fmt='d': integer counts
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Step 14: Feature Importance - Decision Tree
# Label each importance score of the fitted tree with its feature column name.
feature_importance = pd.Series(dt.feature_importances_, index=X.columns)
top_importances = feature_importance.nlargest(10)
ax = top_importances.plot.barh()
ax.set_title("Top 10 Important Features - Decision Tree")
plt.show()

# Step 15: Feature Coefficients - Logistic Regression
# lr.coef_[0] is the coefficient row of the fitted model, labelled by feature name.
coeffs = pd.Series(lr.coef_[0], index=X.columns)

# Rank by absolute value: a strongly negative coefficient is as predictive as a
# strongly positive one, and ranking the signed values would silently drop it.
# The signed values are still what gets plotted, so direction stays visible.
top10_by_magnitude = coeffs.loc[coeffs.abs().nlargest(10).index]
top10_by_magnitude.plot(kind='barh', color='orange')
plt.title("Top 10 Predictive Features - Logistic Regression")
plt.show()

# Step 16: Summary of Insights
top5_by_magnitude = coeffs.loc[coeffs.abs().nlargest(5).index]
print("\nTop Predictive Features from Decision Tree:\n", feature_importance.nlargest(5))
print("\nTop Predictive Coefficients from Logistic Regression:\n", top5_by_magnitude)
# NOTE(review): the sentence below is hard-coded rather than derived from the
# fitted models - confirm it against the rankings printed above.
print("\nBusiness Insight: Features like PAY_0, LIMIT_BAL, and PAY_AMT1 are strong predictors of credit default.\n")
