In [33]:
import os
from utils.core.config import initialize_daanish, load_project_config
from utils.data_io import load_data
from utils.core.feature_manager import FeatureManager
from utils.eda.descriptive import DescriptiveAnalysis
from utils.viz.display import DisplayUtils
from utils.core.save_manager import SaveUtils
from utils.core.format_utils import FormatUtils

#### Step 1: Project Initialization and Data Loading 
In this step, we:
- Initialize the Daanish core setup
- Access global and project-specific configuration values
- Construct input and output paths based on project settings
- Load the main dataset for modeling
- Load the list of model features along with their attributes

- Initialize the Daanish core setup

In [34]:
global_config = initialize_daanish()


Daanish core setup complete.


- Access global and project-specific configuration values

In [35]:

project_root = os.getcwd()
project_config = load_project_config(project_root)

input_data_folder = project_config.get('paths', 'input_data_folder')
output_data_folder = project_config.get('paths', 'output_data_folder')
main_dataset = project_config.get('input_files', 'main_dataset')
model_features = project_config.get('input_files', 'features_attributes')
source_type = project_config.get('datasource_type', 'source_type')
excel_sheet_name = project_config.get('datasource_type', 'excel_sheet_name')
main_dataset_query = project_config.get('db_queries', 'main_dataset_query')
model_features_query = project_config.get('db_queries', 'model_features_query')

INFO:utils.core.config:Loading project configuration from c:\Data Science Projects\Daanish\projects\probability_of_default\project_config.ini
INFO:utils.core.config:Project configuration loaded successfully


- Construct input and output paths based on project settings

In [36]:

input_path = os.path.join(project_root, input_data_folder)
output_path = os.path.join(project_root, output_data_folder)

- Load the main dataset for modeling

In [37]:
main_df = load_data(
    source_type=source_type,
    input_path=os.path.join(input_path, main_dataset),
    query=main_dataset_query,
    global_config=global_config
)

Connected to SQL Server.
Connection closed.


  return pd.read_sql(query, connection)


- Load the list of model features along with their attributes

In [38]:
feature_manager = FeatureManager(
    source_type=source_type,
    input_path=os.path.join(input_path, model_features),
    global_config=global_config,
    query=model_features_query
)

nominal_features = feature_manager.get_nominal_features()
ordinal_features = feature_manager.get_ordinal_features()
numerical_features = feature_manager.get_numerical_features()
target_variable = feature_manager.get_target_variable()
all_features = feature_manager.get_all_features()
missing_value_strategies = feature_manager.get_missing_value_strategies()
missing_fill_values = feature_manager.get_missing_fill_values()
display_names = feature_manager.get_display_names()

# print("Nominal Features:", nominal_features)
# print("Ordinal Features:", ordinal_features)
# print("Numerical Features:", numerical_features)
# print("Target Variable:", target_variable)
# print("All Features:", all_features)
# print("Missing Value Strategies:", missing_value_strategies)
# print("Missing Fill Values:", missing_fill_values)
# print("Display Names:", display_names)

Connected to SQL Server.
Connection closed.


#### Step 2: Preliminary Exploratory Data Analysis (EDA)

In this step, we explore the raw dataset to understand its structure, identify potential issues (e.g., missing values, outliers, inconsistent types), and gain initial insights into data distributions. This provides the foundation for informed preprocessing and feature engineering decisions later.

- initializing the `DescriptiveAnalysis` class with our main dataset

In [39]:
eda_desc = DescriptiveAnalysis(main_df)

- Print summaries of data samples

In [40]:
sample_data = eda_desc.get_data_samples(5)
DisplayUtils.show_dataframe_notebook(sample_data)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
23195,29,16800,RENT,4.0,HOMEIMPROVEMENT,A,5000,6.92,True,0.3,False,10
6876,21,44000,MORTGAGE,5.0,MEDICAL,D,6000,12.61,True,0.14,False,3
11801,26,76000,MORTGAGE,10.0,HOMEIMPROVEMENT,A,5425,8.49,False,0.07,False,3
4859,23,36100,RENT,2.0,EDUCATION,A,5000,7.49,False,0.14,False,4
22696,28,57000,MORTGAGE,12.0,HOMEIMPROVEMENT,A,13000,9.63,False,0.23,False,5


- Print dataset summary

In [41]:
dataset_summary = eda_desc.get_dataset_summary()
DisplayUtils.show_summary_console(dataset_summary)


=== Dataset Summary ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Columns: 12 entries, person_age to cb_person_cred_hist_length
dtypes: bool(2), float64(3), int64(4), object(3)

Duplicate Count: 165

Missing Values:
Feature                       Count
--------------------------  -------
person_age                        0
person_income                     0
person_home_ownership             0
person_emp_length               895
loan_intent                       0
loan_grade                        0
loan_amnt                         0
loan_int_rate                  3116
loan_status                       0
loan_percent_income               9
cb_person_default_on_file         0
cb_person_cred_hist_length        0

Missing Percentages:
Feature                     Percentage
--------------------------  ------------
person_age                  0.00%
person_income               0.00%
person_home_ownership       0.00%
person_emp_length           2.75%
loan_in

- Display/Save summaries of feature(s)

In [42]:
# single_feature_summary = eda_desc.get_feature_summary("loan_amnt")
# DisplayUtils.print_feature_summary("loan_amnt", single_feature_summary)

All_features_summary = eda_desc.get_all_feature_summaries()
DisplayUtils.print_high_level_summary(All_features_summary)


=== High-Level Feature Summary ===
+---------------------+------------+---------------+-----------------------+-------------------+-------------+------------+--------------+---------------+-------------+---------------------+---------------------------+----------------------------+
|      Statistic      | person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade |  loan_amnt   | loan_int_rate | loan_status | loan_percent_income | cb_person_default_on_file | cb_person_cred_hist_length |
+---------------------+------------+---------------+-----------------------+-------------------+-------------+------------+--------------+---------------+-------------+---------------------+---------------------------+----------------------------+
|        count        |  32581.00  |   32581.00    |          N/A          |     31686.00      |     N/A     |    N/A     |   32581.00   |   29465.00    |     N/A     |      32572.00       |            N/A            |  

- Save Descriptive Analysis Summary to CSV

This cell formats the high-level feature summaries into a structured DataFrame and saves it as a CSV or Excel file. The formatting is handled by `FormatUtils`, which extracts selected statistics for each feature, and the output is saved using `SaveUtils`.

In [43]:
# Format the summary
df_summary = FormatUtils.high_level_summary_to_dataframe(All_features_summary)

save_utils = SaveUtils()

# Save as a CSV file
save_utils.save_dataframe_to_csv(df_summary, os.path.join(output_path, "descriptive_summary.csv"), overwrite=True)

# Save as an Excel file
save_utils.save_dataframe_to_excel(df_summary, os.path.join(output_path, "descriptive_summary.xlsx"), sheet_name='Descriptive Summary')

Data saved to c:\Data Science Projects\Daanish\projects\probability_of_default\data/output/descriptive_summary.csv successfully.


AttributeError: 'SaveUtils' object has no attribute 'save_dataframe_to_excel'