In [1]:
import pandas as pd

data = pd.read_csv('heloc.csv')

In [2]:
print("Shape of data:", data.shape)
print("Column names:", data.columns.tolist())

Shape of data: (10459, 24)
Column names: ['RiskPerformance', 'ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades', 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec', 'PercentTradesNeverDelq', 'MSinceMostRecentDelq', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'NumTotalTrades', 'NumTradesOpeninLast12M', 'PercentInstallTrades', 'MSinceMostRecentInqexcl7days', 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden', 'NetFractionInstallBurden', 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance', 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance']


### Column Details and Logical Grouping

#### 1. Target Variable
- **RiskPerformance**: Indicates credit risk outcome (`Good` or `Bad`).

#### 2. Credit Risk Scores
- **ExternalRiskEstimate**: External credit risk score.

#### 3. Trade History
- **MSinceOldestTradeOpen**: Months since oldest trade opened.
- **MSinceMostRecentTradeOpen**: Months since most recent trade opened.
- **AverageMInFile**: Average months in file.
- **NumTotalTrades**: Total number of trades.
- **NumSatisfactoryTrades**: Number of satisfactory trades.
- **NumTradesOpeninLast12M**: Number of trades opened in last 12 months.

#### 4. Delinquency & Derogatory Records
- **NumTrades60Ever2DerogPubRec**: Number of trades 60+ days overdue with derogatory public record.
- **NumTrades90Ever2DerogPubRec**: Number of trades 90+ days overdue with derogatory public record.
- **PercentTradesNeverDelq**: Percentage of trades never delinquent.
- **MSinceMostRecentDelq**: Months since most recent delinquency.
- **MaxDelq2PublicRecLast12M**: Maximum delinquency in public record in last 12 months.
- **MaxDelqEver**: Maximum delinquency ever.

#### 5. Inquiry Information
- **MSinceMostRecentInqexcl7days**: Months since most recent inquiry (excluding last 7 days).
- **NumInqLast6M**: Number of inquiries in last 6 months.
- **NumInqLast6Mexcl7days**: Number of inquiries in last 6 months (excluding last 7 days).

#### 6. Trade Types & Utilization
- **PercentInstallTrades**: Percentage of installment trades.
- **NetFractionRevolvingBurden**: Net fraction of revolving burden.
- **NetFractionInstallBurden**: Net fraction of installment burden.
- **NumRevolvingTradesWBalance**: Number of revolving trades with balance.
- **NumInstallTradesWBalance**: Number of installment trades with balance.
- **NumBank2NatlTradesWHighUtilization**: Number of bank/national trades with high utilization.
- **PercentTradesWBalance**: Percentage of trades with balance.

---

These logical groups help in understanding the structure of the dataset and facilitate feature engineering for modeling tasks.

In [3]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Exclude the target variable 'RiskPerformance' from normalization
features = data.drop(columns=['RiskPerformance'])
normalized_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Combine normalized features with the target variable
normalized_data = pd.concat([normalized_features, data['RiskPerformance']], axis=1)
normalized_data.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,0.621359,0.188424,0.033163,0.237245,0.329545,0.428571,0.321429,0.844037,0.119565,0.666667,...,0.272727,0.12,0.12,0.174274,0.002083,0.414634,0.3125,0.37037,0.715596,Bad
1,0.679612,0.082512,0.061224,0.127551,0.125,0.464286,0.464286,1.0,0.021739,0.5,...,0.272727,0.12,0.12,0.037344,0.002083,0.219512,0.03125,0.037037,0.082569,Bad
2,0.737864,0.092365,0.035714,0.084184,0.204545,0.321429,0.321429,1.0,0.021739,0.888889,...,0.272727,0.173333,0.173333,0.257261,0.15625,0.317073,0.34375,0.37037,0.87156,Bad
3,0.728155,0.219212,0.02551,0.209184,0.420455,0.357143,0.357143,0.93578,0.923913,0.833333,...,0.272727,0.186667,0.173333,0.3361,0.191667,0.365854,0.40625,0.444444,0.917431,Bad
4,0.873786,0.421182,0.091837,0.359694,0.238636,0.321429,0.321429,1.0,0.021739,0.888889,...,0.272727,0.133333,0.133333,0.248963,0.204167,0.292683,0.3125,0.333333,0.816514,Bad


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import statsmodels.api as sm

# Prepare features and target
X = normalized_features
le = LabelEncoder()
y = le.fit_transform(normalized_data['RiskPerformance'])  # Good=1, Bad=0

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision
precision = precision_score(y_test, y_pred)

# F1 Score
f1 = f1_score(y_test, y_pred)

# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)

# R-square (Pseudo R2 using statsmodels)
X_sm = sm.add_constant(X_train)
logit_model = sm.Logit(y_train, X_sm)
result = logit_model.fit(disp=0)
pseudo_r2 = result.prsquared

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {auc:.4f}")
print(f"Pseudo R-square: {pseudo_r2:.4f}")

# F-statistic is not defined for logistic regression, but you can use the likelihood ratio test from statsmodels
print(f"Likelihood Ratio (F-statistic analog): {result.llr:.4f}")

Accuracy: 0.6922
Precision: 0.6871
F1 Score: 0.6724
ROC AUC: 0.7534
Pseudo R-square: 0.1977
Likelihood Ratio (F-statistic analog): 2289.7599


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

# Metrics
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_prob)

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Random Forest Precision: {rf_precision:.4f}")
print(f"Random Forest F1 Score: {rf_f1:.4f}")
print(f"Random Forest ROC AUC: {rf_auc:.4f}")

Random Forest Accuracy: 0.6979
Random Forest Precision: 0.7053
Random Forest F1 Score: 0.6691
Random Forest ROC AUC: 0.7662


In [6]:
# Unconditional probability of default (i.e., percentage of 'Bad' cases in all data)
unconditional_default_prob = (y == 0).mean() * 100
print(f"Unconditional Probability of Default: {unconditional_default_prob:.2f}%")

Unconditional Probability of Default: 52.19%


In [7]:
# R-square for logistic regression (Pseudo R2 already computed)
print(f"Logistic Regression Pseudo R-square: {pseudo_r2:.4f}")

# R-square for Random Forest (using the coefficient of determination on test set)
rf_r2 = rf.score(X_test, y_test)
print(f"Random Forest R-square: {rf_r2:.4f}")

Logistic Regression Pseudo R-square: 0.1977
Random Forest R-square: 0.6979
