In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import pylab
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import binarize
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

In [5]:
import pandas as pd
# Read the raw dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="heloc_dataset.csv")

In [6]:
# Work on an independent deep copy so the frame loaded from disk stays untouched.
df = data.copy(deep=True)

In [7]:
# Integer encoding of the target labels: 'Bad' -> 0, 'Good' -> 1
mapping = {'Bad': 0, 'Good': 1}

# Encode the 'RiskPerformance' column.
# .map() is used instead of .replace(): replacing strings with ints relies on
# silent downcasting of the object column, which pandas >= 2.2 deprecates with
# a FutureWarning, and .replace() lets unknown labels pass through unnoticed.
# The isin() guard keeps the cell safe to re-run: once the column already
# holds only the encoded values it is left untouched.
risk_labels = df['RiskPerformance']
if not risk_labels.isin(list(mapping.values())).all():
    encoded = risk_labels.map(mapping)
    if encoded.isna().any():
        # Fail here rather than leaking unmapped labels into the target.
        unexpected = sorted(risk_labels[encoded.isna()].astype(str).unique())
        raise ValueError(f"Unexpected RiskPerformance labels: {unexpected}")
    df['RiskPerformance'] = encoded.astype('int64')

In [8]:
# Print column dtypes and non-null counts of the working frame after the
# target has been encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10459 entries, 0 to 10458
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   RiskPerformance                     10459 non-null  int64
 1   ExternalRiskEstimate                10459 non-null  int64
 2   MSinceOldestTradeOpen               10459 non-null  int64
 3   MSinceMostRecentTradeOpen           10459 non-null  int64
 4   AverageMInFile                      10459 non-null  int64
 5   NumSatisfactoryTrades               10459 non-null  int64
 6   NumTrades60Ever2DerogPubRec         10459 non-null  int64
 7   NumTrades90Ever2DerogPubRec         10459 non-null  int64
 8   PercentTradesNeverDelq              10459 non-null  int64
 9   MSinceMostRecentDelq                10459 non-null  int64
 10  MaxDelq2PublicRecLast12M            10459 non-null  int64
 11  MaxDelqEver                         10459 non-null  int64
 12  NumT

In [9]:
# Split the frame into the label vector (y) and the feature matrix (X)
y = df['RiskPerformance']               # target column only
X = df.drop(columns='RiskPerformance')  # every remaining column is a feature

In [10]:
# Treat the sentinel value -9 as missing everywhere in the feature frame.
# NOTE(review): negative values such as -7 and -8 are still present in the
# displayed frame further down; if they are special codes as well, they are
# NOT converted to NaN here - confirm this is intended.
X = X.replace(to_replace=-9, value=np.nan)

In [11]:
# Dummy-encode the features, then fill the NaNs with MICE-style iterative imputation.
# df.info() reports 24 columns (target included) and X.info() reports 23 after
# this cell, so get_dummies adds no columns on this data.
# NOTE(review): the imputer is fit on every row, before the train/test split
# performed later in the notebook, so held-out rows influence the imputed
# training values - consider fitting it on the training rows only.
X = pd.get_dummies(data=X, dummy_na=True, drop_first=True)

imputer = IterativeImputer(random_state=99)  # fixed seed for repeatable imputations
imputed = imputer.fit_transform(X)           # plain ndarray, column labels are lost
X_imputed = pd.DataFrame(data=imputed, columns=X.columns)  # restore column labels

In [12]:
# Continue with an independent deep copy of the imputed feature frame.
X = X_imputed.copy(deep=True)

In [13]:
# Print dtypes and non-null counts of the imputed feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10459 entries, 0 to 10458
Data columns (total 23 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ExternalRiskEstimate                10459 non-null  float64
 1   MSinceOldestTradeOpen               10459 non-null  float64
 2   MSinceMostRecentTradeOpen           10459 non-null  float64
 3   AverageMInFile                      10459 non-null  float64
 4   NumSatisfactoryTrades               10459 non-null  float64
 5   NumTrades60Ever2DerogPubRec         10459 non-null  float64
 6   NumTrades90Ever2DerogPubRec         10459 non-null  float64
 7   PercentTradesNeverDelq              10459 non-null  float64
 8   MSinceMostRecentDelq                10459 non-null  float64
 9   MaxDelq2PublicRecLast12M            10459 non-null  float64
 10  MaxDelqEver                         10459 non-null  float64
 11  NumTotalTrades                      10459

In [14]:
# Display the imputed feature frame (shown as the cell's last expression).
X

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,55.0,144.0,4.0,84.0,20.0,3.0,0.0,83.0,2.0,3.0,...,43.0,0.0,0.0,0.0,33.0,-8.0,8.0,1.0,1.0,69.0
1,61.0,58.0,15.0,41.0,2.0,4.0,4.0,100.0,-7.0,0.0,...,67.0,0.0,0.0,0.0,0.0,-8.0,0.0,-8.0,-8.0,0.0
2,67.0,66.0,5.0,24.0,9.0,0.0,0.0,100.0,-7.0,7.0,...,44.0,0.0,4.0,4.0,53.0,66.0,4.0,2.0,1.0,86.0
3,66.0,169.0,1.0,73.0,28.0,1.0,1.0,93.0,76.0,6.0,...,57.0,0.0,5.0,4.0,72.0,83.0,6.0,4.0,3.0,91.0
4,81.0,333.0,27.0,132.0,12.0,0.0,0.0,100.0,-7.0,7.0,...,25.0,0.0,1.0,1.0,51.0,89.0,3.0,1.0,0.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,73.0,131.0,5.0,57.0,21.0,0.0,0.0,95.0,80.0,6.0,...,19.0,7.0,0.0,0.0,26.0,-8.0,5.0,2.0,0.0,100.0
10455,65.0,147.0,39.0,68.0,11.0,0.0,0.0,92.0,28.0,6.0,...,42.0,1.0,1.0,1.0,86.0,53.0,2.0,2.0,1.0,80.0
10456,74.0,129.0,6.0,64.0,18.0,1.0,1.0,100.0,-7.0,6.0,...,33.0,3.0,4.0,4.0,6.0,-8.0,5.0,-8.0,0.0,56.0
10457,72.0,234.0,12.0,113.0,42.0,2.0,2.0,96.0,35.0,6.0,...,20.0,6.0,0.0,0.0,19.0,-8.0,4.0,1.0,0.0,38.0


In [15]:
# Display the encoded target series (shown as the cell's last expression).
y

0        0
1        0
2        0
3        0
4        0
        ..
10454    1
10455    0
10456    0
10457    0
10458    0
Name: RiskPerformance, Length: 10459, dtype: int64

In [16]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X is the imputed feature frame and y the 0/1-encoded 'RiskPerformance'
# target prepared in the cells above.
# y is cast to float (it stays a pandas Series) so Keras receives
# floating-point labels; both objects get a fresh 0..n-1 index so rows stay
# aligned by position.
X = X.reset_index(drop=True)
y = y.astype(float).reset_index(drop=True)

# Seed Python, NumPy and TensorFlow so that weight initialisation, batch
# shuffling and later NumPy-based label permutations are repeatable on
# Restart & Run All.
tf.keras.utils.set_random_seed(99)

# Hold out 20% of the rows, then standardise using statistics from the
# training rows only (the scaler is fit on X_train and merely applied to X_test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Single-hidden-layer network fit as a regression (linear output, MSE loss).
# The input shape is declared with an explicit Input object: passing
# `input_shape=` to the first Dense layer is deprecated and triggers the
# warning visible in the saved output.
# NOTE(review): y holds 0/1 class labels, yet the model is trained as a
# regression - confirm this is intended. The saved output below also reports
# `accuracy`, which this compile() call does not request, so the stored output
# appears to come from an earlier version of this cell.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(25, activation='relu'),
    tf.keras.layers.Dense(1, activation='linear'),  # Linear output for regression
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])

# Stop once val_loss has not improved by at least 1e-4 for 10 consecutive
# epochs and roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=1e-4,
    patience=10,
    restore_best_weights=True
)

# Train with 20% of the training rows held back as the validation set that
# early stopping monitors.
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1  # per-epoch progress output
)

# Per-feature sensitivity statistic: mean squared input gradient
def calculate_test_statistics(X, model):
    """Return the mean squared gradient of the model output for each input column.

    Args:
        X: Array-like of inputs; converted to a float32 tensor.
        model: Callable model (e.g. a Keras model) applied to the whole of X
            in a single forward pass.

    Returns:
        NumPy array holding, for every column of X, the average over rows
        (axis 0) of the squared gradient of the model output with respect to
        that column.
    """
    inputs = tf.convert_to_tensor(X, dtype=tf.float32)
    with tf.GradientTape() as tape:
        # A plain tensor is not tracked automatically, so watch it explicitly.
        tape.watch(inputs)
        outputs = model(inputs)
    input_grads = tape.gradient(outputs, inputs)
    return tf.reduce_mean(tf.square(input_grads), axis=0).numpy()

# Observed statistics: evaluate the trained network on the scaled training rows
test_statistics = calculate_test_statistics(X=X_train_scaled, model=model)

# Function to calculate p-values with a label-permutation test
def calculate_p_values(X, y, model, num_permutations=500):
    """Estimate a permutation-test p-value for every feature.

    The observed statistics come from ``calculate_test_statistics(X, model)``.
    The null distribution is built by shuffling ``y``, fitting a fresh network
    of the same architecture to the shuffled labels, and recomputing the
    statistics; this is repeated ``num_permutations`` times.

    Args:
        X: 2-D array of (already scaled) features, one column per feature.
        y: Target values aligned with the rows of X.
        model: Trained model whose statistics are being tested.
        num_permutations: Number of label shuffles / refits.

    Returns:
        NumPy array with one p-value per column of X, computed as
        ``(b + 1) / (num_permutations + 1)`` where ``b`` counts the permuted
        statistics that are >= the observed one. The +1 terms keep a
        Monte-Carlo p-value from ever being exactly zero.

    NOTE(review): the permuted models are trained for a fixed 10 epochs with
    no validation split or early stopping, so they may not be trained the same
    way as ``model`` - keep that in mind when interpreting the p-values.
    """
    original_stats = calculate_test_statistics(X, model)
    n_features = X.shape[1]
    exceed_counts = np.zeros(n_features)
    for _ in range(num_permutations):
        # Drop the Keras global state left by the previous iteration's model;
        # without this, building hundreds of models in a loop steadily grows
        # memory use.
        tf.keras.backend.clear_session()
        y_permuted = np.random.permutation(y)
        # Fresh, untrained network for every permutation so nothing learned
        # from the real labels carries over.
        new_model = tf.keras.Sequential([
            tf.keras.Input(shape=(n_features,)),  # explicit Input instead of deprecated input_shape=
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.Dense(1, activation='linear'),
        ])
        new_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
        new_model.fit(X, y_permuted, epochs=10, batch_size=32, verbose=0)  # Reduced epochs for speed
        permuted_stats = calculate_test_statistics(X, new_model)
        exceed_counts += (permuted_stats >= original_stats)
    return (exceed_counts + 1) / (num_permutations + 1)

# Permutation p-values per feature, computed on the scaled training rows
p_values = calculate_p_values(X=X_train_scaled, y=y_train, model=model)

# Gather feature names, observed statistics and p-values in a single table
results_df = pd.DataFrame(
    data={
        'Feature': X.columns,  # column labels of the feature frame
        'Test Statistic': test_statistics,
        'P-Value': p_values,
    }
)
# One print call with a newline separator: heading first, then the table
print("Test Statistics and P-values for each feature:", results_df, sep="\n")

Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6000 - loss: 0.6817 - val_accuracy: 0.7103 - val_loss: 0.5795
Epoch 2/150
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7116 - loss: 0.5617 - val_accuracy: 0.7145 - val_loss: 0.5700
Epoch 3/150
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7207 - loss: 0.5565 - val_accuracy: 0.7115 - val_loss: 0.5699
Epoch 4/150
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7089 - loss: 0.5612 - val_accuracy: 0.7109 - val_loss: 0.5676
Epoch 5/150
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7149 - loss: 0.5592 - val_accuracy: 0.7121 - val_loss: 0.5666
Epoch 6/150
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7213 - loss: 0.5497 - val_accuracy: 0.7103 - val_loss: 0.5662
Epoch 7/150
[1m210/210[0m [32m━