In [None]:
# Cell 1: Imports and data generation (FIXED NOTEBOOK)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# Synthetic loan dataset. Every draw below comes from NumPy's global RNG, so the
# seed plus the order of the np.random calls fully determines the data.
np.random.seed(42)
n_samples = 1000

# Numeric applicant / loan attributes.
income = np.random.lognormal(mean=10.5, sigma=0.5, size=n_samples)
credit_score = np.clip(
    np.random.normal(loc=680, scale=60, size=n_samples), 300, 850
)  # bounded to [300, 850]
loan_amount = np.random.lognormal(mean=10.0, sigma=0.6, size=n_samples)
term_months = np.random.choice([36, 60], size=n_samples, p=[0.6, 0.4])
dti_ratio = np.random.uniform(0.05, 0.5, size=n_samples)

# Categorical attributes: education uses explicit weights, purpose is uniform.
education_level = np.random.choice(
    ["High School", "Bachelors", "Masters", "PhD"],
    size=n_samples,
    p=[0.3, 0.4, 0.25, 0.05],
)
loan_purpose = np.random.choice(
    ["Debt Consolidation", "Home Improvement", "Car", "Medical", "Vacation", "Other"],
    size=n_samples,
)

# Default probability synthesis: a 0.15 floor plus a fixed bump per 0/1 risk flag.
low_credit = (credit_score < 620).astype(int)
high_dti = (dti_ratio > 0.35).astype(int)
large_loan = (loan_amount > np.median(loan_amount)).astype(int)
long_term = (term_months == 60).astype(int)
base = 0.15 + 0.25 * low_credit + 0.20 * high_dti + 0.15 * large_loan + 0.10 * long_term

# Gaussian jitter on the probability, then cap it to [0, 0.95] before sampling
# the binary outcome against a uniform draw.
noise = 0.05 * np.random.randn(n_samples)
default_prob = (base + noise).clip(0, 0.95)
default = (np.random.rand(n_samples) < default_prob).astype(int)

# Assemble the frame; dict insertion order fixes the column order.
loan_columns = {
    "income": income,
    "credit_score": credit_score,
    "loan_amount": loan_amount,
    "term_months": term_months,
    "dti_ratio": dti_ratio,
    "education_level": education_level,
    "loan_purpose": loan_purpose,
    "default": default,
}
df = pd.DataFrame(loan_columns)

print("Dataset shape:", df.shape)
print("Default rate:", df["default"].mean())
df.head()


In [None]:
# Cell 2: Build the feature matrix and one-hot encode the categorical columns
feature_cols = [
    "income",
    "credit_score",
    "loan_amount",
    "term_months",
    "dti_ratio",
    "education_level",
    "loan_purpose",
]
X_raw = df[feature_cols]
y = df["default"]

# Expand each categorical column into indicator columns; drop_first=True leaves
# out the first level of each so the remaining dummies are not redundant.
X = pd.get_dummies(
    X_raw,
    columns=["education_level", "loan_purpose"],
    drop_first=True,
)

print("Encoded features:", len(X.columns))

# Show (up to) the first ten generated dummy columns, alphabetically.
dummy_cols = sorted(
    col for col in X.columns if "education_level" in col or "loan_purpose" in col
)
print(dummy_cols[:10])


In [None]:
# Cell 3: Stratified 80/20 split, then fit and score a random forest
split = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=42
).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Accuracy takes no averaging argument; the other three are scored for the
# positive class via average="binary".
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average="binary")
    for scorer in (precision_score, recall_score, f1_score)
)

# Plain Python floats keep the dict free of NumPy scalar types.
model_metrics = {
    label: float(value)
    for label, value in zip(
        ("accuracy", "precision", "recall", "f1_score"),
        (accuracy, precision, recall, f1),
    )
}

print("Metrics:", model_metrics)


In [None]:
# Cell 4: Map each encoded feature name to its importance in the fitted forest
importances = model.feature_importances_
feature_names = X.columns.tolist()
# Pair names with scores positionally, converting each score to a Python float.
feature_importance_dict = dict(zip(feature_names, map(float, importances)))

# Rank by importance, highest first, and keep the ten strongest.
ranked = sorted(
    feature_importance_dict.items(), key=lambda item: item[1], reverse=True
)
top10 = ranked[:10]

print("Top 10 features:")
for i, (name, score) in enumerate(top10, start=1):
    print(f"{i}. {name}: {score:.4f}")

total_importance = sum(feature_importance_dict.values())
print("Sum of importances:", total_importance)


In [None]:
# Cell 5: Predicted probability of the positive class (default=1) for each test row
proba = model.predict_proba(X_test)

# predict_proba orders its columns like model.classes_, so look the positive
# label up there instead of assuming it sits in column 1. list.index raises
# ValueError if label 1 is absent from model.classes_.
positive_class_idx = list(model.classes_).index(1)
default_probabilities = proba[:, positive_class_idx]  # numpy array, one value per test row

print("default_probabilities shape:", default_probabilities.shape)
print("Range:", float(default_probabilities.min()), float(default_probabilities.max()))
