In [14]:
import pandas as pd
import numpy as np
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

# -----------------------
# Step 0: Load dataset
# -----------------------
import os

# Folder that holds the scorecard input files. The default is the original
# location; set the SCORECARD_DATA_DIR environment variable to point the
# notebook at another folder without editing this cell.
data_dir = os.environ.get(
    "SCORECARD_DATA_DIR",
    r"C:\Users\rajat\OneDrive\Desktop\Inputs scorecard",
)
file_path = os.path.join(data_dir, "EDA_final_dataset.xlsx")
final_df = pd.read_excel(file_path, sheet_name="Sheet1")

# Set target column
target = "Default_y"

In [15]:
# -----------------------
# Step 1: Regulatory Exclusions (if any)
# -----------------------
# Columns to remove before modelling (e.g. ID columns or leakage variables).
# Only names that are actually present in the frame end up in the list.
exclude_vars = [name for name in ["Customer_ID"] if name in final_df.columns]
final_df = final_df.drop(columns=exclude_vars, errors="ignore")

print(
    "Step 1 Done ✅ → Variables after regulatory exclusion:",
    len(final_df.columns),
)

Step 1 Done ✅ → Variables after regulatory exclusion: 29


In [16]:
import pandas as pd
import numpy as np

# ------------------------------
# Step 2: IV Calculation with Classification
# ------------------------------

def calculate_iv(df, feature, target, bins=10):
    """
    Calculate Information Value (IV) for a single feature.

    The feature is ranked (ties broken by row order via ``method="first"``)
    and the ranks are cut into quantile buckets. IV is the sum over buckets of
    ``(event share - non-event share) * ln(event share / non-event share)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `feature` and `target`. It is NOT modified.
    feature : str
        Name of the column to evaluate.
    target : str
        Name of the target column. Its per-bucket sum is used as the event
        count and ``count - sum`` as the non-event count (assumes a 0/1 flag
        — confirm against the data).
    bins : int, default 10
        Number of quantile buckets requested from ``pd.qcut``.

    Returns
    -------
    float
        The Information Value of `feature` with respect to `target`.
    """
    # Keep the bucket labels in a local Series rather than writing a "bin"
    # column onto `df`: the caller's frame must not gain a column that later
    # cells would pick up as if it were a feature.
    bin_labels = pd.qcut(df[feature].rank(method="first"), bins, duplicates="drop")

    # observed=False keeps every category and silences the pandas warning
    # about the default changing for categorical groupers.
    grouped = df[target].groupby(bin_labels, observed=False).agg(["count", "sum"])
    grouped["non_event"] = grouped["count"] - grouped["sum"]

    # Share of all events / non-events that falls in each bucket.
    dist_event = grouped["sum"] / grouped["sum"].sum()
    dist_non_event = grouped["non_event"] / grouped["non_event"].sum()

    # The 1e-10 offset only guards against log(0) and division by zero.
    # NOTE: a bucket with zero events (or zero non-events) still contributes a
    # large term, because the log of a ~1e-10 ratio is about -23.
    iv = ((dist_event - dist_non_event) *
          np.log((dist_event + 1e-10) / (dist_non_event + 1e-10))).sum()

    return iv

def classify_iv(iv):
    """
    Translate an IV value into a predictive-power label.

    Bands (upper bounds are exclusive):
        iv < 0.02 -> "Not Useful"
        iv < 0.1  -> "Weak"
        iv < 0.3  -> "Medium"
        iv < 0.5  -> "Strong"
        otherwise -> "Suspicious / Overfit"
    """
    # (exclusive upper bound, label) pairs, checked from lowest to highest.
    bands = (
        (0.02, "Not Useful"),
        (0.1, "Weak"),
        (0.3, "Medium"),
        (0.5, "Strong"),
    )
    for upper_bound, label in bands:
        if iv < upper_bound:
            return label
    # Anything that matched no band, i.e. 0.5 and above.
    return "Suspicious / Overfit"

# ------------------------------
# Apply to final_df
# ------------------------------

target = "Default_y"
# Every column except the target is treated as a candidate feature.
features = [name for name in final_df.columns if name != target]

iv_results = {}

for feature in features:
    try:
        feature_iv = calculate_iv(final_df, feature, target, bins=10)
    except Exception as err:
        # A feature that cannot be scored is reported and left out.
        print(f"⚠️ Skipping {feature} due to error: {err}")
    else:
        iv_results[feature] = feature_iv

# One row per scored feature, plus its strength label.
iv_df = pd.DataFrame(
    {
        "Variable": list(iv_results.keys()),
        "IV": list(iv_results.values()),
    }
)
iv_df["Strength"] = iv_df["IV"].map(classify_iv)

print("\nStep 2 Done ✅ → IV Calculation")
print(iv_df.sort_values("IV", ascending=False))



Step 2 Done ✅ → IV Calculation
                                Variable        IV              Strength
2                             Income_INR  3.850526  Suspicious / Overfit
20                     Outstanding_Loans  3.501148  Suspicious / Overfit
10                    No_of_Inquiries_6M  3.491895  Suspicious / Overfit
1                Behavior_Spending_Score  3.453279  Suspicious / Overfit
4                     Loan_Tenure_Months  3.407736  Suspicious / Overfit
15              Checking_Account_Balance  3.369120  Suspicious / Overfit
19                           Loan_Amount  3.283799  Suspicious / Overfit
6               Credit_Utilization_Ratio  3.206567  Suspicious / Overfit
13                 No_of_Closed_Accounts  0.589163  Suspicious / Overfit
0                        Education_Level  0.589163  Suspicious / Overfit
9               Behavior_Repayment_Score  0.558637  Suspicious / Overfit
25                                DPD_30  0.511931  Suspicious / Overfit
11                 

In [17]:
import pandas as pd
import numpy as np

# ------------------------------
# Step 3: WOE Monotonic Trend Check 
# ------------------------------

def check_almost_monotonic_woe(df, feature, target, bins=5, tolerance=2):
    """
    Check if WOE values for a feature are *almost monotonic*.

    The feature is ranked (ties broken by row order), cut into quantile
    buckets, and the WOE of each bucket is computed as
    ``ln(event share / non-event share)``. The trend is accepted when the
    WOE sequence reverses direction at most `tolerance` times. Flat steps
    (two neighbouring buckets with identical WOE) are not reversals and are
    ignored when counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `feature` and `target`. It is NOT modified.
    feature : str
        Name of the column to evaluate.
    target : str
        Name of the target column. Its per-bucket sum is used as the event
        count (assumes a 0/1 flag — confirm against the data).
    bins : int, default 5
        Number of quantile buckets requested; fewer buckets smooth noise.
    tolerance : int, default 2
        Maximum number of direction changes allowed.

    Returns
    -------
    bool
        True when the number of direction changes is <= `tolerance`.
    """
    # Bucket labels stay local: writing a "bin" column onto `df` would leak a
    # scratch column into the caller's frame.
    bin_labels = pd.qcut(df[feature].rank(method="first"), bins, duplicates="drop")

    grouped = df[target].groupby(bin_labels, observed=False).agg(["count", "sum"])
    grouped["non_event"] = grouped["count"] - grouped["sum"]

    dist_event = grouped["sum"] / grouped["sum"].sum()
    dist_non_event = grouped["non_event"] / grouped["non_event"].sum()

    # The 1e-10 offset only guards against log(0) and division by zero.
    woe_values = np.log((dist_event + 1e-10) / (dist_non_event + 1e-10)).values

    # Direction of each step between neighbouring buckets: +1 up, -1 down.
    directions = np.sign(np.diff(woe_values))
    # Drop flat steps (sign 0). Otherwise an up/flat/up sequence would be
    # counted as two direction changes even though it never reverses.
    directions = directions[directions != 0]
    changes = int(np.sum(directions[1:] != directions[:-1]))

    return changes <= tolerance   # Allow up to N changes

# ------------------------------
# Apply to final_df
# ------------------------------

target = "Default_y"
# Candidate features: every column except the target. Any leftover "bin"
# scratch column (binning output, not a predictor) is skipped as well so it
# is never evaluated as if it were a feature.
features = [col for col in final_df.columns if col not in (target, "bin")]

monotonic_vars = []

for feature in features:
    try:
        if check_almost_monotonic_woe(final_df, feature, target, bins=5, tolerance=2):
            monotonic_vars.append(feature)
    except Exception as e:
        # A feature that cannot be binned/scored is reported and left out.
        print(f"⚠️ Skipping {feature} due to error: {e}")

# Keep only selected variables + target
final_df_step3 = final_df[monotonic_vars + [target]]

print("\nStep 3 Done ✅ → Variables with relaxed monotonic WOE:")
print(monotonic_vars)
print(f"Dataset shape after Step 3: {final_df_step3.shape}")



Step 3 Done ✅ → Variables with relaxed monotonic WOE:
['Loan_Tenure_Months', 'Months_Since_Most_Recent_Delinquency', 'Credit_Utilization_Ratio', 'Credit_Card_Utilization', 'No_of_Inquiries_6M', 'Newest_Trade_Open_Months', 'Checking_Account_Balance', 'Delinquency_12M', 'Total_Current_Balance', 'No_of_Open_Accounts', 'Savings_Account_Balance']
Dataset shape after Step 3: (100, 12)


In [18]:
# -----------------------
# Step 4: Multicollinearity (VIF)
# -----------------------
# NOTE(review): this step starts from final_df, not final_df_step3, so the
# Step 3 selection is not applied here — confirm that this is intended.
# Only numeric predictors take part in the VIF computation.
X = final_df.drop(columns=[target]).select_dtypes(include=[np.number])

if X.empty:
    print("\nNo numeric variables left for VIF check.")
else:
    # Add the constant column before computing VIFs.
    X_const = add_constant(X)
    vif_df = pd.DataFrame(
        {
            "Variable": list(X_const.columns),
            "VIF": [
                variance_inflation_factor(X_const.values, position)
                for position in range(len(X_const.columns))
            ],
        }
    )
    # The constant's own VIF is not reported.
    vif_df = vif_df[vif_df["Variable"].ne("const")]

    # Keep variables with VIF < 5
    final_vars = vif_df.loc[vif_df["VIF"].lt(5), "Variable"].tolist()
    # NOTE(review): final_df is narrowed to the surviving numeric variables
    # plus the target, so non-numeric columns are not carried forward.
    final_df = final_df[final_vars + [target]]

    print("\nStep 4 Done ✅ → VIF values")
    print(vif_df)
    print("Variables kept after VIF filtering:", final_vars)

# -----------------------
# Final Output
# -----------------------
print("\n✅ Final dataset shape:", final_df.shape)
print("✅ Final variables used:", final_df.columns.tolist())


Step 4 Done ✅ → VIF values
                                Variable       VIF
1                Behavior_Spending_Score  1.597960
2                             Income_INR  1.584497
3                     Total_Credit_Limit  1.275800
4                     Loan_Tenure_Months  1.538533
5   Months_Since_Most_Recent_Delinquency  1.551407
6               Credit_Utilization_Ratio  1.537819
7                Credit_Card_Utilization  1.490912
8                       Employment_Years  1.522870
9               Behavior_Repayment_Score  1.446245
10                    No_of_Inquiries_6M  1.701075
11                 Credit_History_Length  1.394881
12                  Worst_Current_Status  1.459092
13                 No_of_Closed_Accounts  1.446409
14              Newest_Trade_Open_Months  1.560629
15              Checking_Account_Balance  1.506160
16                   No_of_Inquiries_12M  1.404393
17                       Delinquency_12M  1.538489
18                           Loan_Amount  1.471337
19 

In [19]:
# -----------------------
# Save Final Dataset
# -----------------------
import os

# Output folder. The default is the original location; set the
# SCORECARD_DATA_DIR environment variable to write somewhere else without
# editing this cell.
output_dir = os.environ.get(
    "SCORECARD_DATA_DIR",
    r"C:\Users\rajat\OneDrive\Desktop\Inputs scorecard",
)
output_path = os.path.join(output_dir, "Final_Model_Dataset.xlsx")

# Save final_df into Excel
final_df.to_excel(output_path, sheet_name="Model_Data", index=False)

print(f"✅ Final dataset saved to: {output_path}")
print("Shape:", final_df.shape)
print("Columns:", final_df.columns.tolist())


✅ Final dataset saved to: C:\Users\rajat\OneDrive\Desktop\Inputs scorecard\Final_Model_Dataset.xlsx
Shape: (100, 27)
Columns: ['Behavior_Spending_Score', 'Income_INR', 'Total_Credit_Limit', 'Loan_Tenure_Months', 'Months_Since_Most_Recent_Delinquency', 'Credit_Utilization_Ratio', 'Credit_Card_Utilization', 'Employment_Years', 'Behavior_Repayment_Score', 'No_of_Inquiries_6M', 'Credit_History_Length', 'Worst_Current_Status', 'No_of_Closed_Accounts', 'Newest_Trade_Open_Months', 'Checking_Account_Balance', 'No_of_Inquiries_12M', 'Delinquency_12M', 'Loan_Amount', 'Outstanding_Loans', 'Oldest_Trade_Open_Months', 'Total_Current_Balance', 'Max_Credit_Exposure', 'No_of_Open_Accounts', 'DPD_30', 'Savings_Account_Balance', 'Cluster', 'Default_y']
