In [None]:
import os
from utils.core.config import initialize_daanish, load_project_config
from utils.data_io import load_data
from utils.core.feature_manager import FeatureManager
from utils.eda.descriptive import DescriptiveAnalysis
from utils.viz.display import DisplayUtils
from utils.core.save_manager import SaveUtils
from utils.core.format_utils import FormatUtils
from utils.eda.statistical import StatisticalAnalysis
from utils.eda.visualisation.general_viz import Visualisation
from utils.eda.correlation import CorrelationAnalyzer
import pandas as pd
import numpy as np
from utils.preprocessing.missing_values import MissingValueHandler
from utils.features.selector import FeatureSelector
from utils.eda.outlier_detection import OutlierDetector
from utils.preprocessing.outlier_treatment import OutlierHandler
from utils.dimensionality.multicollinearity import MulticollinearityDetector
from utils.dimensionality.pca_analyzer import PCAAnalyzer
from utils.dimensionality.mca_analyzer import MCAAnalyzer
from utils.dimensionality.famd_analyzer import FAMDAnalyzer
from utils.eda.visualisation.dimensionality_viz import DimensionalityViz
from utils.modelling.clustering.kmeans_cluster import KMeansClustering
from utils.eda.visualisation.cluster_viz import ClusterVisualisation
from utils.modelling.clustering.hierarchical_cluster import HierarchicalClustering
from utils.modelling.clustering.dbscan_cluster import DBSCANClustering  
from utils.modelling.clustering.kmodes_cluster import KModesClustering
from utils.modelling.clustering.kprototypes_cluster import KPrototypesClustering
from utils.modelling.clustering.hierarchical_gower_cluster import HierarchicalGowerClustering
from utils.features.binning import Binner  
from utils.importance.woe_encoder import WOEEncoder
from utils.eda.visualisation.feature_importance_viz import FeatureImportanceVisualiser
from utils.importance.iv_calculator import IVCalculator
from utils.preprocessing.data_encoder import DataEncoder
from utils.modelling.classification.logistic_regression import LogisticModel
from utils.modelling.classification.random_forest import RandomForestModel
from utils.modelling.classification.xgboost import XGBoostModel


#### Step 1: Project Initialization and Data Loading 
In this step, we:
- Initialize the Daanish core setup
- Access global and project-specific configuration values
- Construct input and output paths based on project settings
- Load the main dataset for modeling
- Load the list of model features along with their attributes

- Initialize the Daanish core setup

In [None]:
# Run the Daanish initialisation and keep what it returns; this object is
# passed as `global_config` to `load_data` and `FeatureManager` further down.
global_config = initialize_daanish()


- Access global and project-specific configuration values

In [None]:

# The current working directory is treated as the project root, both for
# locating the project configuration and for building data paths later on.
project_root = os.getcwd()
project_config = load_project_config(project_root)

# [paths] -- data folders, joined onto `project_root` in the next cell
input_data_folder, output_data_folder = (
    project_config.get('paths', option)
    for option in ('input_data_folder', 'output_data_folder')
)

# [input_files] -- main dataset and feature-attributes file, joined onto the input folder when loading
main_dataset, model_features = (
    project_config.get('input_files', option)
    for option in ('main_dataset', 'features_attributes')
)

# [datasource_type] -- forwarded to the loaders as `source_type`
source_type = project_config.get('datasource_type', 'source_type')

# [db_queries] -- forwarded to the loaders as `query`
main_dataset_query, model_features_query = (
    project_config.get('db_queries', option)
    for option in ('main_dataset_query', 'model_features_query')
)

- Construct input and output paths based on project settings

In [None]:

# Anchor both configured data folders at the project root.
input_path, output_path = (
    os.path.join(project_root, folder)
    for folder in (input_data_folder, output_data_folder)
)

- Load the main dataset for modeling

In [None]:
# Resolve the dataset location once, then hand both the file path and the
# query to the loader together with the configured source type.
main_dataset_path = os.path.join(input_path, main_dataset)
main_df = load_data(
    source_type=source_type,
    input_path=main_dataset_path,
    query=main_dataset_query,
    global_config=global_config,
)

- Load the list of model features along with their attributes

In [None]:
# Load the feature-attributes definition through the same source settings as
# the main dataset; every per-feature setting below is read from this manager.
model_features_path = os.path.join(input_path, model_features)
feature_manager = FeatureManager(
    source_type=source_type,
    input_path=model_features_path,
    global_config=global_config,
    query=model_features_query,
)

# --- Feature groups by type, plus the target ---
nominal_features = feature_manager.get_nominal_features()
ordinal_features = feature_manager.get_ordinal_features()
numerical_features = feature_manager.get_numerical_features()
target_variable = feature_manager.get_target_variable()
all_features = feature_manager.get_all_features()

# --- Missing-value settings (consumed by the imputation cell in Step 3.1) ---
missing_value_strategies = feature_manager.get_missing_value_strategies()
missing_fill_values = feature_manager.get_missing_fill_values()

# --- Display names handed to the visualisation classes ---
display_names = feature_manager.get_display_names()

# --- Outlier detection / imputation settings ---
outlier_strategies = feature_manager.get_outlier_detection_strategies()
outlier_params = feature_manager.get_outlier_detection_params()
outlier_imputation_methods = feature_manager.get_outlier_imputation_methods()
outlier_imputation_values = feature_manager.get_outlier_imputation_values()
# The bundle is what the outlier detection and handling cells in Step 3.2 actually receive
outlier_config_bundle = feature_manager.get_outlier_config_bundle()

# --- Binning settings ---
binning_config = feature_manager.get_binning_config_bundle()

# Uncomment any of the lines below to inspect what was loaded:
# print("Nominal Features:", nominal_features)
# print("Ordinal Features:", ordinal_features)
# print("Numerical Features:", numerical_features)
# print("Target Variable:", target_variable)
# print("All Features:", all_features)
# print("Missing Value Strategies:", missing_value_strategies)
# print("Missing Fill Values:", missing_fill_values)
# print("Display Names:", display_names)
# print("Binning Config:", binning_config)

#### Step 2: Preliminary Exploratory Data Analysis (EDA)

In this step, we explore the raw dataset to understand its structure, identify potential issues (e.g., missing values, outliers, inconsistent types), and gain initial insights into data distributions. This provides the foundation for informed preprocessing and feature engineering decisions later.

In [None]:
# initializing the `DescriptiveAnalysis` class with our main dataset
# `eda_desc` wraps the raw dataset and is reused by the sample, dataset-summary
# and feature-summary cells of Step 2.
eda_desc = DescriptiveAnalysis(main_df)

- 2.1 data sample

In [None]:
# Fetch data samples (the argument 5 is presumably the number of rows -- confirm
# against DescriptiveAnalysis.get_data_samples) and render them in the notebook.
sample_data = eda_desc.get_data_samples(5)
DisplayUtils.show_dataframe_notebook(sample_data)

- 2.2 dataset summary

In [None]:
# Build the dataset-level summary and show it with the console-style renderer.
dataset_summary = eda_desc.get_dataset_summary()
DisplayUtils.show_summary_console(dataset_summary)

- 2.3 Summary of Feature(s) 

In [None]:
# Single-feature variant, kept for ad-hoc inspection:
# single_feature_summary = eda_desc.get_feature_summary("loan_amnt")
# Summaries for all features; reused by the display cell and by the
# DataFrame formatting / export cells below.
All_features_summary = eda_desc.get_all_feature_summaries()

- Display Summary

In [None]:
# Single-feature variant (requires `single_feature_summary` from the cell above):
# DisplayUtils.print_feature_summary("loan_amnt", single_feature_summary)
# Print the high-level view of the all-feature summaries.
DisplayUtils.print_high_level_summary(All_features_summary)

- Save High Level Descriptive Analysis Summary to CSV and Excel

This cell formats the high-level feature summaries into a structured DataFrame and saves it as a CSV or Excel file. The formatting is handled by `FormatUtils`, which extracts selected statistics for each feature, and the output is saved using `SaveUtils`.

In [None]:
# initializing the `SaveUtils` class
# One `SaveUtils` instance is created here and reused by later export calls
# (e.g. saving the correlation matrix in Step 4.1).
save_utils = SaveUtils()

In [None]:
# Format the all-feature summaries into a DataFrame
df_summary = FormatUtils.high_level_summary_to_dataframe(All_features_summary)

# The exports below are disabled by default; uncomment the one you need.
# Both write into `output_path`.

# Save as a CSV file
# save_utils.save_dataframe_to_csv(df_summary, os.path.join(output_path, "descriptive_summary.csv"), overwrite=True)

# Save as an Excel file
# save_utils.save_dataframe_to_excel(df_summary, os.path.join(output_path, "descriptive_summary.xlsx"), sheet_name='Descriptive Summary')

- Save Detailed Descriptive Analysis Summary to JSON
This cell saves the detailed descriptive analysis summary as a JSON file.  
It is intended for use by applications or services that need to consume and display the analysis results.

In [None]:
# Save as JSON file
# save_utils.save_json(All_features_summary, os.path.join(output_path, "descriptive_summary.json"), overwrite=True)

### 🔍 2.4 Find the Best-Fit Probability Distribution for Selected Features
This cell identifies the best-fit probability distribution for each feature in the given list.
- **Method**: Defines the criterion for selecting the best fit. Options are:
  - `'sumsquare_error'` *(default)*
  - `'aic'`
  - `'bic'`

- **common_distributions (bool)**:  
  - If `True`, only fits a curated list of commonly used distributions:  
    `'norm'`, `'expon'`, `'lognorm'`, `'gamma'`, `'beta'`, `'weibull_min'`, `'chi2'`, `'pareto'`, `'uniform'`, `'t'`, `'gumbel_r'`, `'burr'`, `'invgauss'`, `'triang'`, `'laplace'`, `'logistic'`, `'genextreme'`, `'skewnorm'`, `'genpareto'`, `'burr12'`, `'fatiguelife'`, `'geninvgauss'`, `'halfnorm'`, `'exponpow'`

  - If `False`, fits from an extended list of over 100 SciPy continuous distributions.

In [None]:
# Wrap the raw dataset for statistical analysis; `eda_stat` is also used by
# the crosstab cell later in Step 2.
eda_stat = StatisticalAnalysis(main_df)

# Features to fit. Swap in `numerical_features` to fit every numerical feature.
# NOTE(review): `distribution_results` is later passed to
# `detect_outliers_featurewise`; confirm that fitting only this feature is
# enough for the configured outlier strategies.
features_to_fit = ['person_income']

# Finding the best-fit probability distribution for each listed feature
distribution_results = eda_stat.fit_best_distribution(
    features_to_fit,
    method='sumsquare_error',
    common_distributions=True,
    timeout=60,
)

- Plotting the best-fit distributions for a given feature(s) set

In [None]:
# General-purpose plotting helper bound to the raw dataset and the display
# names; used by the (commented-out) histogram, scatter and box-plot cells.
# NOTE: the name `viz` is rebound to a `DimensionalityViz` instance in Step 4.3,
# so re-run this cell before using those plotting cells after Step 4.3.
viz = Visualisation(main_df, display_names)

# Plotting best-fit distributions
# viz.plot_distributions(fitted_distributions = distribution_results,variables=numerical_features)
# viz.plot_distributions(fitted_distributions = distribution_results,variables=['person_age', 'loan_amnt'])

- 📊Plot Histograms for Selected Features
This cell visualizes the distribution of selected features using histograms.

In [None]:
    # viz.plot_histogram(variables=numerical_features, orientation="vertical")
# viz.plot_histogram(variables=['loan_amnt'], orientation="vertical")

    # viz.plot_histogram(variables=nominal_features, orientation="horizontal")
    # viz.plot_histogram(variables=['loan_intent'], orientation="horizontal")


- 🔵 Scatter Plots for Relationship Analysis
This cell provides visualizations to explore relationships between two numerical variables, with optional grouping and trendlines.

In [None]:
# Scatter plots coloured by `loan_status`
# viz.plot_scatter(x_var="person_income", y_var="person_age", hue_var="loan_status", trendline=True)
# viz.plot_scatter(x_var="person_income", y_var="loan_amnt", hue_var="loan_status", trendline=True)
# viz.plot_scatter(x_var="loan_int_rate", y_var="loan_amnt", hue_var="loan_status", trendline=True)
# viz.plot_scatter(x_var="loan_int_rate", y_var="person_income", hue_var="loan_status", trendline=True)


# Scatter plot with trendline
# viz.plot_scatter(x_var="person_age", y_var="person_income", trendline=True)

- 🟩 Box Plots for Distribution Comparison

This cell uses box plots to compare the distribution of a numerical variable across categories of another feature.

In [None]:
# viz.plot_boxplot(column='person_income', by='loan_intent')
# viz.plot_boxplot(column='loan_amnt', by='person_home_ownership')
# viz.plot_boxplot(column='loan_int_rate', by='cb_person_default_on_file')
# viz.plot_boxplot(column='person_age', by='loan_intent')


- 📊 Crosstab Analysis
This section generates cross-tabulation (contingency) tables to explore the relationship between categorical variables.

In [None]:
# Two-way tables: `loan_status` against each categorical feature in turn,
# all with normalize="index".
crosstab_result_1, crosstab_result_2, crosstab_result_3 = (
    eda_stat.crosstab("loan_status", other_feature, normalize="index")
    for other_feature in (
        "person_home_ownership",
        "loan_intent",
        "cb_person_default_on_file",
    )
)

# Three-way table
crosstab_result_4 = eda_stat.crosstab_three_way(
    "loan_status",
    "cb_person_default_on_file",
    "person_home_ownership",
)


In [None]:
# Display: only the last bare expression of a notebook cell is rendered, so
# leave exactly one of these uncommented.
# crosstab_result_1
# crosstab_result_2
# crosstab_result_3
crosstab_result_4

## Step 3: Data Preprocessing

- 3.1 Missing Value Handling  
- 3.2 Outlier Handling


#### 🧼 3.1 Missing Value Handling Pipeline

This pipeline addresses missing data in a modular and strategy-driven manner. It includes detection, filtering, and imputation using configurable rules per feature.

---

##### 🔍 Identify and Remove Features with Excessive Missingness

A threshold (e.g., 30%) is used to detect features with too many missing values. These features are removed from the dataset to prevent model instability, unreliable imputations, or learning bias.

---

##### 🛠️ Impute Remaining Missing Values with Defined Strategies

For the remaining features, missing values are imputed according to user-specified strategies. Each feature can have a tailored strategy from the options below:

- `"drop"`: Remove rows with missing values in this feature.
- `"fill_mean"`: Fill with the mean (numeric features only).
- `"fill_median"`: Fill with the median (numeric features only).
- `"fill_mode"`: Fill with the most frequent value.
- `"fill_value"`: Fill using a custom value (requires `fill_values`).
- `"ffill"`: Forward fill (carry last valid value forward).
- `"bfill"`: Backward fill (use next valid value).
- `"fill_interpolate"`: Linearly interpolate between valid values.
- `"none"` or `"keep"`: Retain missing values (no action taken).

The missing value handler processes these strategies on a per-feature basis and returns:

- **`imputed_records`**: A subset of rows where imputation occurred, with an `affected_features` column.
- **`imputed_dataset`**: The complete dataset after imputation and/or row removal.


In [None]:
# Step 1: Initialize handler and identify features with high missing rates
# (as reported by `features_with_many_missing` for threshold=0.3)
dp = MissingValueHandler(main_df)
features_with_high_missing = dp.features_with_many_missing(threshold=0.3)
print("Features with high missing values: ", features_with_high_missing)

# Step 2: Drop high-missing features and update the working dataset
fs = FeatureSelector(main_df)
updated_df = fs.drop_features(features_with_high_missing)

# Step 3: Reinitialize missing value handler with cleaned dataset
dp = MissingValueHandler(updated_df)

# Step 4: Filter feature lists and strategies to match updated dataset.
# The per-type lists are filtered as well as `all_features`: later cells select
# and drop columns by these lists, and a stale name for a column removed in
# Step 2 would raise a KeyError there.
dropped_features = set(features_with_high_missing)
all_features = [f for f in all_features if f not in dropped_features]
nominal_features = [f for f in nominal_features if f not in dropped_features]
ordinal_features = [f for f in ordinal_features if f not in dropped_features]
numerical_features = [f for f in numerical_features if f not in dropped_features]

retained_features = set(all_features)
strategies = {k: v for k, v in missing_value_strategies.items() if k in retained_features}
fill_values = {k: v for k, v in missing_fill_values.items() if k in retained_features}

# Step 5: Apply missing value imputation strategies.
# Returns the records touched by imputation and the full imputed dataset.
imputed_records, imputed_dataset = dp.handle(
    all_features, strategies=strategies, fill_values=fill_values)

#### 📉 3.2 Outlier Detection and Handling
This stage identifies and flags outliers in numerical features using a variety of statistical and machine learning methods.
Available techniques include:
- IQR-based detection
- Z-score filtering
- Isolation Forest
- Local Outlier Factor (LOF)
- Distribution fitting (e.g., lognorm, gamma)
- Custom user-defined bounds

Once detection strategies are tested and validated, a unified pipeline is executed using detect_outliers_featurewise(), which applies the appropriate detection method per feature based on predefined configuration.
Then, outliers will be treated based on defined imputation methods, such as replacing with mean, median, mode, or using a custom value.

##### 3.2.1 Testing Strategies

In [None]:
# Scratch cell for trying individual detection strategies on the imputed data.
# Keep exactly one `outlier_df = ...` line active at a time. The result is
# overwritten by the unified pipeline in 3.2.2, so nothing here feeds forward.
outliers = OutlierDetector(imputed_dataset)
# outlier_df = outliers.detect_outliers_distribution(distribution_results,confidence_interval=0.999)
outlier_df = outliers.detect_outliers_iqr(['person_income'])
# outlier_df = outliers.detect_outliers_isolation_forest(['person_income'])
# outlier_df = outliers.detect_outliers_lof(['person_income'])
# outlier_df = outliers.detect_outliers_zscore(['person_income'])
# outlier_df = outliers.detect_custom_outliers(['person_age'],upper_bounds={'person_age': 100})
# print(outlier_df)


##### 3.2.2 Executing a unified pipeline to identify outliers 

In [None]:
# Fresh detector on the imputed data, so this cell does not depend on the
# scratch cell in 3.2.1 having been run.
outliers = OutlierDetector(imputed_dataset)
# Per-feature detection driven by the outlier config bundle loaded from the
# FeatureManager; the fitted distributions from Step 2.4 are passed along.
outlier_df = outliers.detect_outliers_featurewise(
    method_config=outlier_config_bundle,
    distribution_results=distribution_results
)
# print(outlier_df)

##### 3.2.3 Executing imputation methods

Removes rows where the proportion of outlier features exceeds the threshold.

In [None]:
# Row-level filter on the imputed dataset using the outlier flags from 3.2.2,
# with threshold=0.5.
handler = OutlierHandler(imputed_dataset)
row_wise_filtered_df = handler.filter_outlier_heavy_rows(outliers_df=outlier_df,threshold=0.5)

Apply imputation method per feature's outliers

In [None]:
# Rebind the handler to the row-filtered dataset, then treat the remaining
# outliers per feature according to the config bundle.
# NOTE(review): `outlier_df` was computed on `imputed_dataset`, which still has
# the rows removed above -- confirm `handle_from_config` matches rows by index.
handler = OutlierHandler(row_wise_filtered_df)
# `cleaned_df` is the dataset used by every Step 4 analysis below.
handled_records, cleaned_df = handler.handle_from_config(
    outlier_config_bundle=outlier_config_bundle,
    outliers_df=outlier_df
)
# print(handled_records)


## Step 4: Full Exploratory Data Analysis (EDA)
- Analyse feature correlations, multicollinearity, and interactions.
- Use dimensionality-reduction techniques (e.g., PCA) or clustering to explore structure.
- Assess relationships between features and the target variable (Default).


#### 🔗 4.1 Examine Variable Correlations

This section calculates and displays correlations between different types of variables in the cleaned (`cleaned_df`) dataset.

- **`num_method` (str)**: Defines the method for calculating correlation between numerical variables. Allowed values are:
    - `'pearson'` *(default)*: Standard Pearson linear correlation coefficient.
    - `'spearman'`: Spearman's rank correlation coefficient (for monotonic relationships).
    - `'kendall'`: Kendall's tau correlation coefficient (for ordinal or non-normally distributed data).

- **`cat_method` (str)**: Defines the method for calculating association between categorical variables. Allowed values are:
    - `'cramers_v'` *(default)*: Cramer's V (measures association between nominal categorical variables).
    - `'mutual_info'`: Mutual Information (measures the statistical dependence between two random variables).

- **`cat_num_method` (str)**: Defines the method for calculating association between 
    categorical and numerical variables. Allowed values are:
    - `'correlation_ratio'` *(default)*: Correlation Ratio (Eta squared, measures variance explained).
    - `'f_test'`: F-statistic from ANOVA (assesses the difference in means across categories).
    - `'mutual_info'`: Mutual Information (measures the statistical dependence).
    - `'kruskal'`: Non-parametric alternative to ANOVA. Compares distributions of a continuous variable across categories. Good when your numerical variables are not normally distributed
    - `'target_spearman'`: Replaces each category with the mean of the target variable (e.g. default rate). Then computes correlation with numerical features. Captures ordinal structure or monotonic trends across groups

In [None]:
# Association measure to use for each pairing of variable types
# (see the option list in the section text above).
correlation_methods = {
    "num_method": "spearman",
    "cat_method": "cramers_v",
    "cat_num_method": "mutual_info",
}

corr_analyzer = CorrelationAnalyzer(cleaned_df)
# With return_matrix=True the call returns two objects; the second one is the
# matrix that is plotted, saved and passed to the multicollinearity detector.
corr_df, corr_matrix = corr_analyzer.correlation_matrix(
    **correlation_methods,
    return_matrix=True,
)

Visualisation.plot_heatmap_matrix(corr_matrix, title="Correlation Matrix")

# Persist the matrix next to the other outputs (overwrites an existing file)
correlation_csv_path = os.path.join(output_path, "correlation_matrix.csv")
save_utils.save_dataframe_to_csv(corr_matrix, correlation_csv_path, overwrite=True)


#### 4.2 Multicollinearity Detection

This section identifies multicollinearity among the numerical features of the dataset and suggests features to drop, using:
- **Variance Inflation Factor (VIF)**.
- **Pairwise correlation analysis**.

In [None]:
# Reuse the correlation matrix computed in 4.1 instead of recomputing it
detector = MulticollinearityDetector(cleaned_df, correlation_matrix=corr_matrix)

# VIF values
vif_values = detector.compute_vif()
print(vif_values)

# Correlated pairs
correlated_pairs = detector.high_correlation_pairs()
print('High Correlation Pairs: ', correlated_pairs)

# Suggestions for dropping features with high correlation and multicollinearity
drop_suggestions = detector.suggest_features_to_drop()
print('Suggestion for Features to drop: ', drop_suggestions)

#### 4.3 PCA, MCA and FAMD Analysis
In this section, we reduce the dimensionality of our dataset using Principal Component Analysis (PCA) for numeric features, Multiple Correspondence Analysis (MCA) for categorical features, and Factor Analysis of Mixed Data (FAMD) for the combined feature set. These methods help uncover latent patterns, simplify complexity, and enhance visualization by projecting high-dimensional data into lower-dimensional spaces while preserving as much variability as possible.

##### 4.3.1 PCA Analysis on Numerical Features

In this step, we perform Principal Component Analysis (PCA) to explore the structure of the numerical feature space and understand the variance explained by the principal components.

Steps:
1. **Drop non-numerical features and the target variable** to ensure PCA only processes numerical data.
2. **Fit PCA** on the cleaned numerical dataset.
3. **Plot explained variance** to decide how many components capture most of the data’s variance.
4. **Plot loadings heatmap** to understand how original features contribute to each principal component.
5. **Visualize PCA scores by target classes** to see how the target variable (e.g., loan status) is distributed in PCA space.


In [None]:
# Normalise the target to a list of column names first, so the concatenation
# and the drops below work whether FeatureManager returned a single name
# (string) or a list of names.
target_columns = list(target_variable) if isinstance(target_variable, (list, tuple)) else [target_variable]

# Single target name as a string (used when colouring plots by target)
target = target_columns[0]

# Categorical columns plus the target, de-duplicated in case the target is
# itself listed among the nominal/ordinal features.
cat_col_with_target = list(dict.fromkeys(nominal_features + ordinal_features + target_columns))
print(cat_col_with_target)

# Categorical feature names without the target
cat_features_without_target = [col for col in cat_col_with_target if col not in target_columns]
print(cat_features_without_target)

# Numerical-only frame: everything except the categorical columns and the target
num_features_df = cleaned_df.drop(columns=cat_col_with_target)

# Categorical-only frame. Selecting the already-filtered names avoids dropping
# the target from a frame that may not contain it (which raises a KeyError).
categorical_df = cleaned_df[cat_features_without_target]

# Mixed frame for FAMD: all columns except the target
all_except_target_df = cleaned_df.drop(columns=target_columns)

In [None]:
# Fit PCA on the numerical-only frame prepared in the previous cell
pca_analyzer = PCAAnalyzer()
pca_analyzer.fit(num_features_df)

# NOTE: this rebinds `viz` (previously the general `Visualisation` helper)
viz = DimensionalityViz(data=num_features_df, display_names=display_names)

# To find how many PCs are needed to capture e.g. 90–95% of the variance.
viz.plot_explained_variance(pca_analyzer.explained_variance)

# Loadings show which original features contribute to each component.
viz.plot_pca_loadings(pca_analyzer.loadings)

# Visualize target classes in PCA space: attach the target to the PCA scores.
# `target` is the normalised target name from the preparation cell; using it
# (rather than indexing `target_variable` again) keeps string and list targets
# working alike.
# NOTE(review): the column assignment aligns on the index -- confirm that
# `get_scores_df()` keeps the index of `cleaned_df`, otherwise targets can end
# up misaligned or NaN after rows were removed during preprocessing.
scores_df = pca_analyzer.get_scores_df()
scores_df[target] = cleaned_df[target]
viz.plot_pca_scores(scores_df, target_column=target)

##### 4.3.2 MCA Analysis on Categorical Features

In this step, we perform Multiple Correspondence Analysis (MCA) to explore the structure of the categorical feature space to understand the relationships between different categories and how much "inertia" (similar to variance in PCA) is explained by the resulting dimensions.

Steps:
1. **Drop numerical features and the target variable** to ensure MCA only processes categorical data.
2. **Fit MCA** on the cleaned categorical dataset.
3. **Plot Explained Inertia** to visualize the cumulative explained inertia for the MCA dimensions. It helps in deciding how many dimensions are significant and should be retained for further analysis or interpretation.
4. **Plot MCA Column Coordinates (Category Contributions)** to visualize the coordinates of the original variable categories in the reduced MCA space. It reveals associations between categories 
5. **Visualize MCA Row Coordinates (Individual Scores) by Target Class** Plot the positions (coordinates) of each individual observation (row) in the reduced dimensional MCA space. This helps in understanding if the patterns captured by MCA among categorical features are related to the outcome variable.
6. **Plot contribution heatmap** to understand how the original features contribute to each MCA dimension.

In [None]:
# MCA on categorical data

# Fit MCA on the categorical-only frame prepared in 4.3.1
mca_analyzer = MCAAnalyzer(n_components=None)
mca_analyzer.fit(categorical_df)

# Fetch the explained inertia once and reuse it for printing and plotting
explained_inertia = mca_analyzer.get_explained_inertia()
print(explained_inertia)

# NOTE: this rebinds `viz` to a helper bound to the categorical frame
viz = DimensionalityViz(data=categorical_df, display_names=display_names)

# Plot explained inertia (variance) to determine how many dimensions to retain
viz.plot_mca_explained_inertia(explained_inertia)

# MCA column coordinates: positions of each category in the reduced space
viz.plot_mca_column_coordinates(mca_analyzer.get_column_coordinates_df())

# Position of each row in the reduced space, coloured by the target variable.
# `target` is the normalised target name from the preparation cell in 4.3.1.
# NOTE(review): the column assignment aligns on the index -- confirm that
# `get_row_coordinates_df()` keeps the index of `cleaned_df`.
row_coords = mca_analyzer.get_row_coordinates_df()
row_coords[target] = cleaned_df[target]
viz.plot_mca_row_coordinates(row_coords, target_column=target)

# Plot contributions of original variables to each MCA dimension
contributions_df = mca_analyzer.get_column_contributions_df()
viz.plot_mca_column_contributions(contributions_df)
print(contributions_df)

##### 4.3.3 FAMD Analysis on Mixed Features

In this step, we perform **Factor Analysis of Mixed Data (FAMD)** to uncover latent patterns in datasets containing both **categorical and numerical** features. FAMD combines the strengths of PCA (for numerical data) and MCA (for categorical data) to jointly analyze mixed-type variables while ensuring balanced influence from each.

---

**Steps:**

1. **Prepare a mixed dataset**: Combine both **numerical** and **categorical** features, but exclude the **target variable** to avoid leakage during decomposition.

2. **Fit FAMD** on the cleaned dataset using `FAMDAnalyzer`.

3. **Plot Explained Inertia** to visualize the cumulative variance (inertia) explained by each FAMD dimension. This helps in selecting how many components are meaningful for interpretation.

4. **Plot FAMD Column Coordinates (Feature Category Contributions)** to visualize how different feature categories or continuous variables are projected into the FAMD-reduced space. This reveals clustering and relationships across both variable types.

5. **Visualize FAMD Row Coordinates (Individual Scores) by Target Class**: Plot the reduced coordinates for each data point and color by the target variable. This allows assessment of how well the FAMD components separate or cluster observations based on the outcome.

6. **Plot Contribution Heatmap** to understand how much each original variable (categorical or numerical) contributes to each principal dimension. This is essential for interpretation of the latent components and identifying feature importance across the reduced space.

---

**Note**: FAMD automatically scales numerical features and encodes categorical variables internally. No manual preprocessing like standardization or one-hot encoding is required.


In [None]:
# Fit FAMD on full (mixed) dataset
# Fit FAMD on the full (mixed) dataset without the target
famd_analyzer = FAMDAnalyzer(
    n_components=None,
    scale_numerical=True,       # Explicitly enabling
    scaling_method='zscore',    # Use scaling method
    handle_skew=True,           # Turn on skew correction
    skew_method='log',          # Use log transform
    skew_threshold=1.5,         # Set a new threshold for skewness
)
famd_analyzer.fit(all_except_target_df)

# Fetch the explained inertia once and reuse it for printing and plotting
explained_inertia = famd_analyzer.get_explained_inertia()
print(explained_inertia)

# Set up the visualisation object (rebinds `viz` to the mixed frame)
viz = DimensionalityViz(data=all_except_target_df, display_names=display_names)

# Plot explained inertia (variance) to determine how many components to retain
viz.plot_famd_explained_inertia(explained_inertia)

# Plot column coordinates (showing variable positions in FAMD space)
viz.plot_famd_column_coordinates(famd_analyzer.get_column_coordinates_df())

# Plot row coordinates (samples) colored by the target variable.
# `target` is the normalised target name from the preparation cell in 4.3.1.
# NOTE(review): the column assignment aligns on the index -- confirm that
# `get_row_coordinates_df()` keeps the index of `cleaned_df`.
row_coords = famd_analyzer.get_row_coordinates_df()
row_coords[target] = cleaned_df[target]
viz.plot_famd_row_coordinates(row_coords, target_column=target)

# Plot contributions of original variables to each FAMD dimension
contributions_df = famd_analyzer.get_contributions()
viz.plot_famd_column_contributions(contributions_df)
print(contributions_df)


#### 4.4 Clustering Analysis
This section investigates natural groupings or underlying patterns in the data by applying clustering techniques. The objective is to uncover homogeneous subgroups within the dataset that may not be visible through standard univariate or bivariate analysis.
We explore clustering using:
- Numerical features: K-Means, Hierarchical, and DBSCAN Clustering analysis 
- Categorical features
- Combined numerical and categorical features

##### 4.4.1 Clustering Analysis for Numerical Features

##### K-Means Clustering Analysis
We applied K-Means clustering to segment the dataset based on numerical features. The process included:
- Scaling and log transformation to handle skewness.
- Grid search for optimal number of clusters using Silhouette scores (Optional).
- Dimensionality reduction using PCA for 2D visualization.
- Cluster profiling to interpret group characteristics.

In [None]:
# # Initialize the KMeansClustering class
# kmeans_analyser = KMeansClustering(df = cleaned_df,features=numerical_features,n_clusters=2,
#                                    scale=True, handle_skew=True, skew_method='log', 
#                                    skew_threshold=1, random_state=42, 
#                                    tune_mode=None, # Optional: None or 'grid' for grid search
#                                    cluster_range=list(range(2, 11)))

# print(kmeans_analyser.grid_search_results)

# # Fit and predict cluster labels
# labels = kmeans_analyser.fit_predict()

# # Get 2D PCA projection of the clustered data
# pca_components = kmeans_analyser.transform_pca()

# # Project centroids into PCA space
# centroids_scaled = kmeans_analyser.get_centroids()
# centroids_pca = kmeans_analyser.pca_.transform(centroids_scaled)


# # Visualize clusters in 2D
# ClusterVisualisation.plot_clusters_2d(
#     components=pca_components,
#     labels=labels,
#     centroids=centroids_pca,
#     title='K-Means Clustering (PCA Projection)',
#     xlabel="PCA Component 1", ylabel="PCA Component 2"
# )

# # Profile clusters
# pivoted_profile = kmeans_analyser.profile_clusters(pivot=True)

# save_utils.save_dataframe_to_csv(pivoted_profile, os.path.join(output_path, "kmeans_analysis.csv"), overwrite=True, index=True)
# # print(pivoted_profile)

##### Hierarchical Clustering Analysis (Numerical Features - PCA)
We applied Hierarchical clustering to segment the dataset based on numerical features. The process included:
- Scaling and log transformation to handle skewness.
- Grid search for optimal number of clusters using Silhouette scores (Optional).
- Dimensionality reduction using PCA for 2D visualization.
- Cluster profiling to interpret group characteristics.

In [None]:
# # Initialize the HierarchicalClustering class
# hierarchical_analyser = HierarchicalClustering(
#     df=cleaned_df,
#     features=numerical_features,
#     scale=True,
#     handle_skew=True,
#     skew_method='log',
#     skew_threshold=1.0,
#     linkage='ward',       # For ward, metric must be 'euclidean'
#     metric='euclidean',
#     tune_mode=None,     # Optional: None or 'grid' for grid search  
#     cluster_range=list(range(2, 11)),
#     n_clusters=3,    # default in case grid not used
#     mca_mode=False
# )

# # Fit and predict cluster labels
# labels = hierarchical_analyser.fit_predict()

# # Show grid results
# print(hierarchical_analyser.grid_search_results)

# # Get 2D PCA projection of the clustered data
# pca_components = hierarchical_analyser.project_2d()

# # Hierarchical clustering doesn't have centroids, but we can use mean positions per cluster
# centroids_pca = hierarchical_analyser.get_cluster_means_in_2d_space(pca_components)

# # Visualize clusters in 2D
# ClusterVisualisation.plot_clusters_2d(
#     components=pca_components,
#     labels=labels,
#     centroids=centroids_pca,
#     title='Hierarchical Clustering (PCA Projection)',
#     xlabel="PCA Component 1", ylabel="PCA Component 2"
# )

# # Profile clusters
# pivoted_profile = hierarchical_analyser.profile_clusters(pivot=True)


# # Save cluster profiling results
# save_utils.save_dataframe_to_csv(
#     pivoted_profile,
#     os.path.join(output_path, "hierarchical_analysis_pca.csv"),
#     overwrite=True,
#     index=True
# )

# # Optionally print
# # print(pivoted_profile)

##### DBSCAN Clustering analysis
We applied DBSCAN clustering to segment the dataset based on numerical features. The process included:
- Scaling and log transformation to handle skewness.
- Optional tuning of the `eps` and `min_samples` parameters (DBSCAN does not take a preset number of clusters).
- Dimensionality reduction using PCA for 2D visualization.
- Cluster profiling to interpret group characteristics.

In [None]:
# # Initialize the DBSCANClustering class
# dbscan_analyser = DBSCANClustering(
#     df=cleaned_df,
#     features=numerical_features,
#     eps='auto',                  # Distance threshold for clustering, "auto" for tuning or fixed input such as 0.3, 0.5, 0.7, 1.0
#     min_samples=5,               # Minimum samples per core point
#     metric='euclidean',
#     scale=True,                  # Whether to scale features
#     handle_skew=True,            # Whether to reduce skewness
#     skew_method='log',           # Method of skewness reduction
#     skew_threshold=1.0,          # Skewness threshold to apply transformation
#     scaling_method='zscore',     # Scaling method: 'zscore' or 'minmax'
#     suggest_eps_percentile = 90,  # For tuning eps
#     tune_mode=None,  # Options: None, 'eps', 'tune_min_samples', 'grid'
#     min_samples_range=None,
#     eps_percentiles=None
# )

# # print(dbscan_analyser.grid_search_results)

# # Fit and predict cluster labels
# labels = dbscan_analyser.fit_predict()

# # Get 2D PCA projection of the clustered data
# pca_components = dbscan_analyser.project_pca()

# # DBSCAN doesn’t compute centroids, but we can calculate average positions per cluster (ignoring noise)
# centroids_pca = dbscan_analyser.get_cluster_means_in_pca_space(pca_components)

# # Visualize clusters in 2D
# ClusterVisualisation.plot_clusters_2d(
#     components=pca_components,
#     labels=labels,
#     centroids=centroids_pca,
#     title='DBSCAN Clustering (PCA Projection)',
#     xlabel="PCA Component 1", ylabel="PCA Component 2"
# )

# # Profile clusters (excluding noise if your logic skips -1)
# pivoted_profile = dbscan_analyser.profile_clusters(pivot=True)

# # Save cluster profiling results
# save_utils.save_dataframe_to_csv(
#     pivoted_profile,
#     os.path.join(output_path, "dbscan_analysis.csv"),
#     overwrite=True,
#     index=True
# )

# # Optionally print
# # print(pivoted_profile)

##### 4.4.2 Clustering Analysis for Categorical Features

##### K-Modes Clustering Analysis (Categorical Features - MCA)
We applied K-Modes clustering to segment the dataset based purely on categorical features, leveraging MCA for visualization. The process included:

- Clustering using the K-Modes algorithm, which uses modes instead of means and a simple matching dissimilarity for categorical attributes.
- Optional grid search to determine the best number of clusters and initialization method (Huang or Cao) using internal validation scores.
- Dimensionality reduction with Multiple Correspondence Analysis (MCA) to project the clusters into 2D space for visualization.
- Cluster visualization in MCA space alongside projected centroids for interpretability.
- Detailed cluster profiling based on the distribution of original categorical features.



In [None]:
# # Initialize the KModesClustering class
# kmodes_analyser = KModesClustering(
#     df=cleaned_df,
#     features= cat_features_without_target,   # List of categorical columns excluding target variable
#     n_clusters=3,                            # or leave it out and set tune_mode='grid'
#     init='Huang',                            # or 'Cao'
#     n_init=10,
#     verbose=0,
#     random_state=42,
#     tune_mode=None,                         # Optional: None or 'grid' for grid search  
#     cluster_range=list(range(2, 6)),        # Optional: range of clusters to test
#     init_methods=['Huang', 'Cao']           # Optional: init methods to test
# )

# # Fit and predict cluster labels
# labels = kmodes_analyser.fit_predict()

# # Project data and centroids to MCA space
# mca_components = np.asarray(kmodes_analyser.project_mca(), dtype=float)
# centroids_mca = np.asarray(kmodes_analyser.project_centroids_to_mca(), dtype=float)


# # Visualize clusters in 2D
# ClusterVisualisation.plot_clusters_2d(
#     components=mca_components,
#     labels=kmodes_analyser.labels_,
#     centroids=centroids_mca,
#     title='KModes Clustering (MCA Projection)',
#     xlabel="MCA Component 1", ylabel="MCA Component 2"
# )

# # Profile clusters
# cluster_profile = kmodes_analyser.profile_clusters(include_counts=True)

# # Save cluster profiling to CSV
# save_utils.save_dataframe_to_csv(
#     cluster_profile,
#     os.path.join(output_path, "kmodes_analysis.csv"),
#     overwrite=True,
#     index=True
# )


##### Hierarchical Clustering Analysis (Categorical Features - MCA)
We applied Hierarchical clustering to segment the dataset based on **categorical features** using MCA (Multiple Correspondence Analysis). The process included:
- Dimensionality reduction using MCA to project categorical data into a numeric space.
- Optional grid search to identify the optimal number of clusters using Silhouette scores.
- Clustering using Agglomerative Hierarchical Clustering.
- Cluster profiling based on the original categorical feature distribution to interpret segment characteristics.

In [None]:
# # Initialize the HierarchicalClustering class
# hierarchical_analyser = HierarchicalClustering(
#     df=cleaned_df,
#     features=cat_features_without_target,
#     scale=False,
#     handle_skew=False,
#     skew_method='log',
#     skew_threshold=1.0,
#     linkage='ward',       # For ward, metric must be 'euclidean'
#     metric='euclidean',
#     tune_mode=None,     # Optional: None or 'grid' for grid search  
#     cluster_range=list(range(2, 11)),
#     n_clusters=3,    # default in case grid not used
#     mca_mode=True
# )

# # Fit and predict cluster labels
# labels = hierarchical_analyser.fit_predict()

# # Show grid results
# print(hierarchical_analyser.grid_search_results)

# # Get 2D MCA projection of the clustered data
# mca_components = hierarchical_analyser.project_2d()

# # Approximate cluster centers in MCA space
# centroids_mca = hierarchical_analyser.get_cluster_means_in_2d_space(mca_components)

# # Visualize clusters in 2D
# ClusterVisualisation.plot_clusters_2d(
#     components=mca_components,
#     labels=labels,
#     centroids=centroids_mca,
#     title='Hierarchical Clustering (MCA Projection)',
#     xlabel="MCA Component 1", ylabel="MCA Component 2"
# )

# # Profile clusters
# pivoted_profile = hierarchical_analyser.profile_clusters(pivot=True)


# # Save cluster profiling results
# save_utils.save_dataframe_to_csv(
#     pivoted_profile,
#     os.path.join(output_path, "hierarchical_analysis_mca.csv"),
#     overwrite=True,
#     index=True
# )

# # Optionally print
# # print(pivoted_profile)

##### 4.4.3 Clustering Analysis for Mixed (Numerical + Categorical) Data

##### K-Prototypes Clustering Analysis (Mixed Features - FAMD)
We applied **K-Prototypes clustering** to segment the dataset based on a mix of **categorical and numerical features**, using **FAMD** for dimensionality reduction. The process included:

- **Dimensionality reduction** using **FAMD (Factor Analysis of Mixed Data)** to project both categorical and numerical features into a shared numerical space for visualization.
- **Optional grid search** to identify the optimal number of clusters based on **Silhouette scores**.
- **Clustering** using the **K-Prototypes algorithm**, which handles mixed data types efficiently.
- **Cluster profiling** to interpret segment characteristics by analyzing the distribution of original features across clusters.

In [None]:
# # Initialize the KPrototypesClustering class
# kproto_analyser = KPrototypesClustering(
#                 cleaned_df, cat_features_without_target, numerical_features,
#                  n_clusters=3, init='Huang', n_init=10, verbose=0,
#                  random_state=42, 
#                  tune_mode=None, # Optional: None or 'grid'
#                  cluster_range=None,
#                  init_methods=['Huang', 'Cao'],
#                  scale=True, scaling_method='zscore',
#                  handle_skew=True, skew_method='log', skew_threshold=1.0)


# # Fit and predict cluster labels
# labels = kproto_analyser.fit_predict()

# # Show grid search results (if any)
# print(kproto_analyser.grid_search_results)

# # Get 2D FAMD projection
# famd_components = kproto_analyser.project_clusters_famd()

# # Get cluster centers in FAMD space
# centroids_famd = kproto_analyser.get_cluster_means_in_2d_space(famd_components)

# # Visualize clusters in 2D using the utility class
# ClusterVisualisation.plot_clusters_2d(
#     components=famd_components,
#     labels=labels,
#     centroids=centroids_famd,
#     title='K-Prototypes Clustering (FAMD Projection)',
#     xlabel="FAMD Component 1", ylabel="FAMD Component 2"
# )

# # Profile clusters and pivot
# pivoted_profile = kproto_analyser.profile_clusters()

# # Save the profiling results
# save_utils.save_dataframe_to_csv(
#     pivoted_profile,
#     os.path.join(output_path, "kprototypes_analysis.csv"),
#     overwrite=True,
#     index=True
# )

# # Optionally print
# print(pivoted_profile)


##### Hierarchical Clustering Analysis with Gower Distance (Mixed Features - FAMD Visualization)
To segment the dataset based on its combination of **categorical and numerical features**, we employed **Hierarchical Agglomerative Clustering** coupled with the **Gower distance** metric.
Visualization of these clusters was facilitated using **FAMD**. The analytical process encompassed:

- **Similarity Measurement**: Computation of a Gower distance matrix to effectively quantify similarities between data points across mixed data types, appropriately handling numerical and categorical attributes.
- **Clustering Algorithm**: Application of Hierarchical Agglomerative Clustering on the Gower distance matrix. This method builds a hierarchy of clusters (dendrogram) without requiring a pre-specified number of clusters initially.
- **Optimal Cluster Determination**: Optional tuning to identify the most suitable number of clusters (by selecting a level to cut the dendrogram) and the optimal linkage method (e.g., 'average', 'complete') by evaluating Silhouette scores.
- **Dimensionality Reduction for Visualization**: Utilization of FAMD (Factor Analysis of Mixed Data) to project the original dataset (both categorical and numerical features) into a lower-dimensional space, enabling 2D visualization of the derived cluster structures.
- **Cluster Profiling**: Interpretation of the resulting segments by analyzing the distribution and central tendencies (e.g., means for numerical, modes for categorical) of the original features within each identified cluster.


In [None]:
# # Initialize the HierarchicalGowerClustering class
# hierarchical_analyser = HierarchicalGowerClustering(
#     df=cleaned_df,
#     categorical_features=cat_features_without_target,
#     numerical_features=numerical_features,
#     n_clusters=3,  # Example: specify desired k, or use tune_mode
#     linkage_method='average', # Common linkage method
#     verbose=1, # Set to 1 or higher for more output
#     random_state=42,
#     tune_mode=None, # Optional: 'silhouette' or None
#     cluster_range=list(range(2, 7)), # Range of k to test if tune_mode='silhouette'
#     linkage_methods_to_tune=['average', 'complete'], # Linkage methods to test if tune_mode='silhouette'
#     scale=True,
#     scaling_method='zscore',
#     handle_skew=True,
#     skew_method='log',
#     skew_threshold=1.0
# )

# labels_hierarchical = hierarchical_analyser.labels_

# # Show tuning results (if any)
# if hierarchical_analyser.tuning_results_ is not None:
#     print("\n--- Hierarchical Clustering Tuning Results ---")
#     print(hierarchical_analyser.tuning_results_)
#     print(f"Best n_clusters found: {hierarchical_analyser.n_clusters}")
#     print(f"Best linkage method found: {hierarchical_analyser.linkage_method}")

# # Get 2D FAMD projection
# famd_components_hierarchical = hierarchical_analyser.project_clusters_famd()

# # Get cluster centers in FAMD space
# centroids_famd_hierarchical = hierarchical_analyser.get_cluster_means_in_2d_space(famd_components_hierarchical)

# # Visualize clusters in 2D using the utility class
# ClusterVisualisation.plot_clusters_2d(
#     components=famd_components_hierarchical, # DataFrame with FAMD components and 'Cluster' column
#     labels=labels_hierarchical, # Pass the labels explicitly or ensure components DF has 'Cluster'
#     centroids=centroids_famd_hierarchical, # DataFrame of 2D centroids, indexed by cluster label
#     title='Hierarchical Clustering with Gower Distance (FAMD Projection)',
#     xlabel="FAMD Component 1",
#     ylabel="FAMD Component 2"
# )

# # Profile clusters
# profile_hierarchical = hierarchical_analyser.profile_clusters()

# # Save the profiling results
# save_utils.save_dataframe_to_csv(
#     profile_hierarchical,
#     os.path.join(output_path, "hierarchical_gower_analysis.csv"),
#     overwrite=True,
#     index=True # Cluster profiles are typically indexed by cluster label
# )

# # Optionally print
# print("\n--- Hierarchical Gower Clustering Profile ---")
# print(profile_hierarchical)

#### Step 5: Feature Engineering

##### 5.1 Feature Importance
This section implements methods to evaluate feature importance for predicting loan default (PD).
It includes both traditional statistical and machine learning-based approaches:
- Logistic Regression using Weight of Evidence (WoE) and Information Value (IV)
- Tree-based methods such as Random Forest for model-based feature importance

These methods guide variable selection by identifying the most predictive features.


##### 5.1.1 Logistic Regression
To evaluate the predictive power of features collectively, we use Logistic Regression — a statistical model well-suited for binary classification tasks such as default prediction.

##### 5.1.1.1 Binning

Applies binning to numerical and categorical variables as a preprocessing step for WoE transformation.
Supports manual and algorithmic methods such as equal-width, quantile, decile, decision trees, K-means, and optimal binning.

Binning transforms continuous variables into discrete intervals, improving interpretability and performance in scorecard models.

In [None]:
# Step 1: Define binning configuration
# Each key is a column of `cleaned_df`; each value describes how to bin it.
testing_binning_config = {
    "person_age": {
        "type": "numerical",
        "method": "optimal",  # can change this to 'quantile', 'decile', etc.
        "target": "loan_status",  # only needed for supervised methods like 'optimal'
    },
    "cb_person_cred_hist_length": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
    "person_income": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
    "person_emp_length": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
    "loan_amnt": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
    "loan_int_rate": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
    "loan_percent_income": {
        "type": "numerical",
        "method": "optimal",
        "target": "loan_status",
    },
}

# Step 2: Initialize the Binner class
binner = Binner(df=cleaned_df, config=testing_binning_config)

# Step 3: Suggest bins only (apply=False does not apply the bins to the DataFrame).
# The features to bin are exactly the keys of the config above (dicts keep
# insertion order), so the list is derived from it rather than retyped.
binner.suggest_and_apply_bins(features_to_bin=list(testing_binning_config), apply=False)

# Step 4: Access the suggested bin edges for review.
# Iterate the same config the Binner was built from, so every feature looked
# up here is one the Binner actually knows about.
print("\n--- Suggested Bin Edges ---")
for feature in testing_binning_config:
    bin_edges = binner.get_bin_edges(feature)
    print(f"{feature}: {bin_edges}")

# To see all binning info
print(binner.get_binning_info())

# Step 5: Apply the bins and create new columns
df_binned = binner.apply_bins()
print(df_binned.head(10))
print(df_binned.columns.tolist())

##### Applying Manual Binning from a Data Source (CSV, Excel, SQL, etc.)
This section explains how to use manually defined binning rules (e.g., bin edges or category mappings) stored in an external data source like a CSV, Excel, or SQL table to transform features accordingly.

In [None]:
# Rebuild the Binner from `binning_config` (defined outside this cell) and
# suggest + apply the bins in a single call.
binner = Binner(
    df=cleaned_df,
    config=binning_config,
)
df_binned = binner.suggest_and_apply_bins(apply=True)

# Preview the first rows and the resulting column names.
print(df_binned.head(10))
print(df_binned.columns.tolist())

Save the new DataFrame and visualise the bins.

In [None]:
# Write the binned DataFrame to a CSV file in the output folder,
# replacing any existing file of the same name.
save_utils = SaveUtils()
binned_csv_path = os.path.join(output_path, "loan_data_binned.csv")
save_utils.save_dataframe_to_csv(df_binned, binned_csv_path, overwrite=True)


In [None]:
# Plot how the observations are distributed over the bins of one feature.
binned_feature_to_plot = "cb_person_cred_hist_length"
viz = Visualisation(df_binned, display_names)
viz.plot_binned_distribution(binned_feature_to_plot)

##### 5.1.1.2 Calculating Weight of Evidence (WoE)
Weight of Evidence (WoE) transforms binned features into interpretable numerical values that reflect the strength and direction of their relationship with the target variable. It is especially useful in credit risk modeling as it helps improve model interpretability and supports monotonic relationships with default likelihood.

In [None]:

# Step 1: Binned feature columns to encode, and the target column
binned_features = [
    'person_age_binned',
    'person_income_binned',
    'person_emp_length_binned',
    'loan_amnt_binned',
    'loan_int_rate_binned',
    'loan_percent_income_binned',
    'cb_person_cred_hist_length_binned',
]
target = target_variable

# Step 2 + 3: Create the encoder on the binned data and fit it on the binned features
woe_encoder = WOEEncoder(df_binned, target)
woe_encoder.fit(binned_features)

# Optional: inspect the WOE mapping of a single feature
woe_map = woe_encoder.get_woe_mapping('cb_person_cred_hist_length_binned')
print(woe_map)

# Step 4: Apply the WOE transformation
# (pass a list such as ['loan_amnt_binned'] to transform only a subset)
woe_encoder.transform()

# Step 5: Retrieve the transformed DataFrame and preview it
woe_df = woe_encoder.get_transformed_data()
print(woe_df.head(10))
print(woe_df.columns.tolist())

# Optional: save the DataFrame
woe_csv_path = os.path.join(output_path, "loan_data_woe.csv")
save_utils.save_dataframe_to_csv(woe_df, woe_csv_path, overwrite=True)

# `woe_df` can now be used for:
# - a correlation matrix
# - training a logistic regression
# - plotting WOE vs default rate


Visualising the Weight of Evidence

In [None]:
# Called without a feature name to fetch the mapping used for plotting.
woe_map = woe_encoder.get_woe_mapping()

# NOTE: this rebinds `viz` (previously the general Visualisation object)
# to the feature-importance visualiser.
viz = FeatureImportanceVisualiser(woe_map=woe_map)

# WOE trend for a single feature
viz.plot_woe_trend("loan_amnt_binned")

# WOE trends for all features
# viz.plot_all_woe_trends()

##### 5.1.1.3 Calculating Information Value (IV)
Information Value (IV) is used to measure the predictive power of each binned feature in relation to the binary target variable. Higher IV indicates stronger separation between good and bad outcomes.

- **Steps**:
  1. Initialize the `IVCalculator` with the DataFrame and target column.
  2. Compute IV for all binned features using either precomputed WoE values or calculating WoE internally.
  3. Convert results to a DataFrame and visualize.

- **Interpretation Guide**:
  - IV < 0.02: Not useful
  - 0.02 ≤ IV < 0.1: Weak predictor
  - 0.1 ≤ IV < 0.3: Medium predictor
  - 0.3 ≤ IV < 0.5: Strong predictor
  - IV ≥ 0.5: Suspiciously strong (may indicate overfitting or data leakage)

The resulting IV scores help in selecting the most informative features for credit risk modeling.

In [None]:
# Binned features to score, in the order they are passed to the calculator
iv_features = [
    'loan_amnt_binned',
    'person_age_binned',
    'person_income_binned',
    'person_emp_length_binned',
    'loan_int_rate_binned',
    'loan_percent_income_binned',
    'cb_person_cred_hist_length_binned',
]

# Step 1: Initialize
iv_calc = IVCalculator(woe_df, target=target)

# Step 2: Run IV calculation using the precomputed WOE columns.
# Each WOE column is named after its binned feature with a "_woe" suffix.
iv_calc.calculate_iv(
    binned_features=iv_features,
    woe_column_map={feature: f"{feature}_woe" for feature in iv_features},
    use_precomputed_woe=True,
)

# Step 3: Get IV scores
iv_scores_df = iv_calc.as_dataframe()
print(iv_scores_df)

# `viz` is the FeatureImportanceVisualiser created in the WOE visualisation cell
viz.plot_iv_scores(iv_scores_df)

##### 5.1.1.4 Encoding Nominal and Ordinal Variables
To prepare categorical variables for logistic regression and other ML models, we applied appropriate encoding techniques:

Nominal variables (e.g., loan_intent, person_home_ownership) were encoded using one-hot encoding. No ordinal variables are encoded in this step.

In [None]:
# Encode the nominal columns; no ordinal features are configured here.
encoder = DataEncoder(
    nominal_features=['person_home_ownership', 'loan_intent', 'loan_grade'],
    ordinal_features=None,
    drop_original=False,
)
encoded_df = encoder.fit_transform(woe_df)

# Recode "cb_person_default_on_file" from 'Y'/'N' to 1/0
default_flag_codes = {'Y': 1, 'N': 0}
encoded_df['cb_person_default_on_file'] = (
    encoded_df['cb_person_default_on_file'].map(default_flag_codes)
)

print(encoded_df.head(10))
print(encoded_df.columns.tolist())

##### 5.1.1.5 Calculate Feature Importance with Logistic Regression (Logit)

Unlike Information Value (IV), which evaluates each variable's relationship with the target in isolation, logistic regression estimates all coefficients jointly, so each coefficient reflects a feature's effect conditional on the other features (and is therefore sensitive to multicollinearity).

In this step:

- We train a logistic regression model on the preprocessed dataset (with WoE-transformed numerical features and one-hot encoded categorical variables).

- We analyze the resulting coefficients:

  - Magnitude and sign of each coefficient indicate the direction and strength of influence on the target.

  - Larger absolute values imply stronger predictive contribution.

- We evaluate models beyond coefficients by calculating permutation importance

In [None]:

# Model inputs, grouped by origin; the concatenation order is the column order
# handed to the model.
features = (
    # WoE-encoded binned numerical features
    [
        'loan_amnt_binned_woe',
        'person_age_binned_woe',
        'person_income_binned_woe',
        'person_emp_length_binned_woe',
        'loan_int_rate_binned_woe',
        'loan_percent_income_binned_woe',
        'cb_person_cred_hist_length_binned_woe',
    ]
    # One-hot columns for home ownership
    + [
        'person_home_ownership_MORTGAGE',
        'person_home_ownership_OTHER',
        'person_home_ownership_OWN',
        'person_home_ownership_RENT',
    ]
    # One-hot columns for loan intent
    + [
        'loan_intent_DEBTCONSOLIDATION',
        'loan_intent_EDUCATION',
        'loan_intent_HOMEIMPROVEMENT',
        'loan_intent_MEDICAL',
        'loan_intent_PERSONAL',
        'loan_intent_VENTURE',
    ]
    # One-hot columns for loan grade
    + [
        'loan_grade_A',
        'loan_grade_B',
        'loan_grade_C',
        'loan_grade_D',
        'loan_grade_E',
        'loan_grade_F',
        'loan_grade_G',
    ]
    # Binary prior-default flag
    + ['cb_person_default_on_file']
)

# Step 1: Initialize the model
model = LogisticModel(
    df=encoded_df,
    features=features,
    target=target,
    id_column=None,              # or 'id' if we have one
    test_size=0.2,
    eval_size=0.0,               # or set to 0.1 if we want an eval set
    random_state=42,
    balance_method='none',       # or 'smote', 'undersample', 'oversample'
    tune_hyperparameters=False,  # Set True if we want GridSearch
    scoring='roc_auc',
    solver='liblinear',          # Or 'saga', 'lbfgs', etc.
    scale=False,
    scaling_method='zscore',
    handle_skew=True,
    skew_method='log',
    skew_threshold=1.0,
)

# Step 2: Train the model
model.fit_model()

# Step 3: Get sorted feature coefficients
coeff_df = model.get_coefficients()
print("🔹 Logit Coefficient:\n", coeff_df.head(30))

# Step 4: Get permutation importance on the test split
importance_df = model.get_permutation_importance(dataset='test', scoring='roc_auc')
print("🔹 Permutation Importance:\n", importance_df)

##### 5.1.2 Calculate Feature Importance with Random Forest
Random Forest is a powerful ensemble learning method that captures complex patterns, interactions, and non-linear relationships across features without requiring explicit transformations.

In this step:

- We train a Random Forest classifier on the preprocessed dataset (with original numerical features and one-hot encoded categorical variables, without WoE transformation).

- We evaluate feature importance using three complementary methods:

    - Gini Importance (Mean Decrease in Impurity):
    Measures how much each feature decreases node impurity across all trees in the forest. Features with higher scores are considered more important. However, this method can be biased toward features with many unique values.

    - Permutation Importance:
    Assesses the drop in model performance (e.g., ROC AUC) when the values of a feature are randomly shuffled. This helps reveal the true predictive contribution of each feature in the trained model and avoids structural bias.

    - SHAP Values (SHapley Additive exPlanations):
    Based on cooperative game theory, SHAP provides local and global explanations by quantifying the average contribution of each feature to predictions. It captures both magnitude and direction, offering the most detailed and model-consistent interpretation.

Using all three methods allows us to evaluate importance from different angles: impurity reduction, model dependence, and individual feature attribution — leading to a more comprehensive understanding of feature behavior.

In [None]:
# WoE features are not used for random forest; the original values are used instead.
# NOTE(review): RandomForestModel is not among the imports visible in the
# notebook's first cell — confirm it is imported before running this cell.

original_features = (
    # Original (untransformed) columns plus the binary prior-default flag
    [
        'person_age',
        'person_income',
        'person_emp_length',
        'loan_int_rate',
        'loan_amnt',
        'loan_percent_income',
        'cb_person_default_on_file',
        'cb_person_cred_hist_length',
    ]
    # One-hot columns for home ownership
    + [
        'person_home_ownership_MORTGAGE',
        'person_home_ownership_OTHER',
        'person_home_ownership_OWN',
        'person_home_ownership_RENT',
    ]
    # One-hot columns for loan intent
    + [
        'loan_intent_DEBTCONSOLIDATION',
        'loan_intent_EDUCATION',
        'loan_intent_HOMEIMPROVEMENT',
        'loan_intent_MEDICAL',
        'loan_intent_PERSONAL',
        'loan_intent_VENTURE',
    ]
    # One-hot columns for loan grade
    + [
        'loan_grade_A',
        'loan_grade_B',
        'loan_grade_C',
        'loan_grade_D',
        'loan_grade_E',
        'loan_grade_F',
        'loan_grade_G',
    ]
)

# Step 1: Initialize the model (split/evaluation settings kept apart from the data arguments)
rf_settings = {
    'test_size': 0.2,
    'eval_size': 0.0,
    'random_state': 42,
    'scoring': 'roc_auc',
    'balance_method': 'none',
    'tune_hyperparameters': False,
    'n_estimators': 200,
    'max_depth': 10,
    'criterion': 'entropy',
}
rf_model = RandomForestModel(
    df=encoded_df,
    features=original_features,
    target=target,
    **rf_settings,
)

# Step 2: Train the model
rf_model.fit_model()

# Step 3: Get Gini Importance
gini_df = rf_model.get_feature_importance_gini()
print("🔹 Gini Importance:\n", gini_df)


# # Step 4: Get Permutation Importance
# permutation_df = rf_model.get_permutation_importance(dataset='test', scoring='roc_auc')
# print("🔹 Permutation Importance:\n", permutation_df)

# # Step 5: Get SHAP Importance
# shap_df = rf_model.get_feature_importance_shap(max_display=30)
# print("🔹 SHAP Importance:\n", shap_df)

##### 5.1.3 Calculate Feature Importance with XGBoost
XGBoost (Extreme Gradient Boosting) is a high-performance gradient boosting framework known for its accuracy, speed, and regularization capabilities, making it ideal for structured/tabular data.

In this step:

- We train an XGBoost classifier on the preprocessed dataset (with original numerical features and one-hot encoded categorical variables, without WoE transformation).

- We evaluate feature importance using three complementary methods:

    - Gain Importance:
    Gain measures the average improvement in the loss function brought by a feature when it is used in a split across all boosting rounds. Features with higher gain are considered more important. Unlike Gini importance, gain is based on how much each feature contributes to reducing the prediction error.

    - Permutation Importance

    - SHAP Values (SHapley Additive exPlanations)

In [None]:
# NOTE(review): XGBoostModel is not among the imports visible in the
# notebook's first cell — confirm it is imported before running this cell.

# Step 1: Initialize the model on the same original-value features as the random forest
xgb_settings = {
    'test_size': 0.2,
    'eval_size': 0.0,
    'random_state': 42,
    'scoring': 'roc_auc',
    'balance_method': 'none',
    'tune_hyperparameters': False,
    'n_estimators': 200,
    'max_depth': 10,
    'learning_rate': 0.1,
}
xgb_model = XGBoostModel(
    df=encoded_df,
    features=original_features,
    target=target,
    **xgb_settings,
)

# Step 2: Train the model
xgb_model.fit_model()

# Step 3: Get Gain Importance
gain_df = xgb_model.get_feature_importance_gain()
print("🔹 Gain Importance:\n", gain_df)

# Step 4: Get Permutation Importance on the test split
xgb_permutation_df = xgb_model.get_permutation_importance(dataset='test', scoring='roc_auc')
print("🔹 Permutation Importance:\n", xgb_permutation_df)

# Step 5: Get SHAP Importance
xgb_shap_df = xgb_model.get_feature_importance_shap(max_display=30)
print("🔹 SHAP Importance:\n", xgb_shap_df)