# 📊 Credit Card Default Prediction
## Internship-Ready Project
This project predicts whether a credit card customer will default next month based on their past payment history and demographic details.

In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Set seaborn's global plotting theme, using the 'whitegrid' style.
sns.set(style='whitegrid')

In [None]:
# 2. Load Dataset
# Point `csv_path` at your own copy of the dataset file.
csv_path = "credit_card_default.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then leave the first rows as the cell's output.
print("Dataset Shape:", df.shape)
df.head()

In [None]:
# 3. Data Understanding & Cleaning
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() emits a stray "None" line.
df.info()

# Summary statistics as computed by DataFrame.describe().
print(df.describe())

# Per-column count of missing values and total count of fully duplicated rows.
print("Missing Values:\n", df.isnull().sum())
print("Duplicate Rows:", df.duplicated().sum())

In [None]:
# 4. Exploratory Data Analysis (EDA)
# Class balance of the target column.
plt.figure(figsize=(6,4))
sns.countplot(x='default.payment.next.month', data=df)
plt.title('Default vs Non-Default Customers')
plt.show()

# Pairwise correlations between columns.
# Only numeric/boolean columns are passed to corr(): pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, so selecting the
# dtypes up front keeps this cell working across pandas versions.
plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 5. Feature Engineering
# Target is the 'default.payment.next.month' column; features are every other column.
y = df['default.payment.next.month']
X = df.drop(columns=['default.payment.next.month'])

# Standardise the features with StandardScaler.
# NOTE(review): the scaler is fitted on every row of X in this cell.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# 6. Train-Test Split
# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting the scaler on the full dataset before splitting leaks the
# test set's means/variances into training and makes evaluation optimistic.
# stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-create the scaler so the fitted statistics come from the training split alone;
# the test split is only transformed, never fitted on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("Training size:", X_train.shape, "Testing size:", X_test.shape)

In [None]:
# 7. Model Training
# fit() returns the estimator itself, so construction and fitting are chained.

# Logistic regression, allowing the solver up to 1000 iterations.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Random forest with a fixed random_state.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Leave the fitted forest as the cell's displayed output.
rf

In [None]:
# 8. Model Evaluation
# Class predictions from each fitted model on the test split.
y_pred_log, y_pred_rf = (model.predict(X_test) for model in (log_reg, rf))

# One classification report per model, printed in the same order as above.
for report_title, report_preds in (
    ("Logistic Regression Report:\n", y_pred_log),
    ("Random Forest Report:\n", y_pred_rf),
):
    print(report_title, classification_report(y_test, report_preds))

# Confusion Matrix
# sklearn's confusion_matrix puts true classes on rows and predicted classes
# on columns; the axes are labelled so the heatmap can be read on its own.
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix (Random Forest)')
plt.show()

# ROC Curve
# Plot both models with their area under the curve (AUC) and a chance
# diagonal, so the comparison does not depend on a single decision threshold.
# Column 1 of predict_proba is the second entry of classes_
# (assumed to be the "default" class — TODO confirm the target is coded 0/1).
y_prob = rf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
rf_auc = auc(fpr, tpr)

y_prob_log = log_reg.predict_proba(X_test)[:,1]
fpr_log, tpr_log, thresholds_log = roc_curve(y_test, y_prob_log)
log_auc = auc(fpr_log, tpr_log)

plt.plot(fpr, tpr, label=f'Random Forest (AUC = {rf_auc:.3f})')
plt.plot(fpr_log, tpr_log, label=f'Logistic Regression (AUC = {log_auc:.3f})')
# Diagonal = performance of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Feature Importance
# Pair each importance score from the fitted forest with its column name.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)

# Horizontal bars, sorted ascending by importance.
ranked = importances.sort_values(ascending=True)
ranked.plot.barh(figsize=(8,6))
plt.title('Feature Importance')
plt.show()

## 🏁 Conclusion
- We trained Logistic Regression and Random Forest models.
- Random Forest performed better with higher precision and recall.
- Feature importance analysis shows which variables affect default risk most.
- **Business Insight:** Banks can use this model to identify risky customers and take preventive action (such as reducing the credit limit or asking for extra documentation).