---
# **Property Assessment CCAO:** FIN 550 Final Project

This R script contains the full workflow to:
   - Load and process (ETL) Cook County property datasets
   - Perform Exploratory Data Analysis (EDA)
   - Develop and apply models for predicting residential property values
 
The script produces the final 'assessed_value.csv' file for project submission.


---
### **Extract, Transform, Load**

##### Install & Load Required Packages

In [89]:
# Install and load required packages

# Function to install packages if not already installed
install_if_missing <- function(package) {
  if (!require(package, character.only = TRUE, quietly = TRUE)) {
    install.packages(package, dependencies = TRUE)
  }
}

# Data manipulation and ETL
install_if_missing("readr")
install_if_missing("dplyr")
install_if_missing("tidyr")
install_if_missing("data.table")

# EDA and visualization
install_if_missing("ggplot2")
install_if_missing("corrplot")
install_if_missing("GGally")
install_if_missing("skimr")

# Modeling - Linear Models
install_if_missing("caret")
install_if_missing("glmnet")

# Modeling - Tree-based methods
install_if_missing("rpart")
install_if_missing("rpart.plot")
install_if_missing("randomForest")
install_if_missing("xgboost")

# Model evaluation and tuning
install_if_missing("Metrics")
install_if_missing("MLmetrics")

# Missing data imputation
install_if_missing("mice")

# Load libraries
library(readr)       # Reading CSV files
library(dplyr)       # Data manipulation
library(tidyr)       # Data tidying
library(data.table)  # Fast data manipulation

library(ggplot2)     # Visualization
library(corrplot)    # Correlation plots
library(GGally)      # Pairwise plots
library(skimr)       # Summary statistics

library(caret)       # Model training and evaluation
library(glmnet)      # Regularized regression (Ridge, Lasso, Elastic Net)

library(rpart)       # Decision trees
library(rpart.plot)  # Decision tree visualization
library(randomForest) # Random Forest
library(xgboost)     # XGBoost

library(Metrics)     # Model evaluation metrics
library(MLmetrics)   # Additional ML metrics

library(mice)        # Multiple imputation by chained equations

# Set random seed for reproducibility
set.seed(550)

cat("All required packages loaded successfully.\n")


All required packages loaded successfully.


##### Load the Datasets

In [90]:
# Load the datasets
historic_data <- read_csv('data/historic_property_data.csv')
predict_set <- read_csv('data/predict_property_data.csv')

# Display basic information about the datasets
cat("Historic Data Shape:", dim(historic_data), "\n")
cat("Predict Set Shape:", dim(predict_set), "\n")


[1mRows: [22m[34m50000[39m [1mColumns: [22m[34m63[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (8): meta_cdu, meta_deed_type, geo_property_city, geo_property_zip, geo...
[32mdbl[39m (52): sale_price, meta_class, meta_town_code, meta_nbhd, meta_certified_...
[33mlgl[39m  (3): ind_large_home, ind_garage, ind_arms_length

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m10000[39m [1mColumns: [22m[34m63[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (8): meta_cdu, meta_deed_type, geo_property_city, geo_property_zip, geo...
[32mdbl[39m (52): pid, meta_class, meta_town_code, meta_nbhd, meta_certified_est_bld...
[33mlgl

Historic Data Shape: 50000 63 
Predict Set Shape: 10000 63 


##### Observe Missing Values

In [91]:
# Check for missing values in historic_data
cat("\n=== Missing Values in Historic Data ===\n")
missing_historic <- colSums(is.na(historic_data))
missing_historic_pct <- (missing_historic / nrow(historic_data)) * 100

# Display columns with missing values
missing_historic_df <- data.frame(
  Variable = names(missing_historic),
  Missing_Count = missing_historic,
  Missing_Percent = round(missing_historic_pct, 2)
) %>%
  filter(Missing_Count > 0) %>%
  arrange(desc(Missing_Count))

if(nrow(missing_historic_df) > 0) {
  print(missing_historic_df, row.names = FALSE)
  cat("\nTotal columns with missing values:", nrow(missing_historic_df), "\n")
} else {
  cat("No missing values found in historic data.\n")
}

# Check for missing values in predict_set
cat("\n=== Missing Values in Predict Set ===\n")
missing_predict <- colSums(is.na(predict_set))
missing_predict_pct <- (missing_predict / nrow(predict_set)) * 100

# Display columns with missing values
missing_predict_df <- data.frame(
  Variable = names(missing_predict),
  Missing_Count = missing_predict,
  Missing_Percent = round(missing_predict_pct, 2)
) %>%
  filter(Missing_Count > 0) %>%
  arrange(desc(Missing_Count))

if(nrow(missing_predict_df) > 0) {
  print(missing_predict_df, row.names = FALSE)
  cat("\nTotal columns with missing values:", nrow(missing_predict_df), "\n")
} else {
  cat("No missing values found in predict set.\n")
}

# Summary of missing values
cat("\n=== Summary ===\n")
cat("Historic Data - Total missing values:", sum(missing_historic), "\n")
cat("Predict Set - Total missing values:", sum(missing_predict), "\n")



=== Missing Values in Historic Data ===
                    Variable Missing_Count Missing_Percent
             char_renovation         49741           99.48
                    meta_cdu         47172           94.34
                   char_apts         43093           86.19
                  char_porch         40725           81.45
             char_attic_fnsh         33453           66.91
                char_tp_dsgn         27173           54.35
                char_tp_plan         14394           28.79
              char_gar1_area          7090           14.18
              char_gar1_cnst          7088           14.18
               char_gar1_att          7088           14.18
                    geo_fips          1103            2.21
            geo_municipality          1103            2.21
                  char_oheat           172            0.34
               geo_tract_pop           162            0.32
              geo_white_perc           162            0.32
              g

##### Missing Values by Data Type *(Nominal, Ordinal, Integer, Continuous, Booleen)*

In [92]:
# Analyze missing values by codebook-based data type
cat("\n=== Missing Values by Data Type (Codebook-Informed) ===\n")

# Load codebook if not already loaded
if(!exists("codebook_df")) {
  codebook_df <- read.csv("data/codebook.csv", stringsAsFactors = FALSE)
}

# Create a lookup table: name -> type (using codebook var_type and var_data_type for more granularity)
get_codebook_types <- function() {
  # Try to get a clean set of variable names mapping to their type and data type
  out <- codebook_df
  # All possible codebook variable names (columns in datasets) to possible types
  # Favor var_name_standard (it's primary for standardization)
  out <- out[!is.na(out$var_name_standard) & nzchar(out$var_name_standard), ]
  
  # Check if var_description column exists
  if("var_description" %in% names(out)) {
    var_description_map <- setNames(out$var_description, out$var_name_standard)
  } else {
    # If var_description doesn't exist, create empty map
    var_description_map <- setNames(rep(NA_character_, nrow(out)), out$var_name_standard)
  }
  
  var_type_map <- setNames(out$var_type, out$var_name_standard)
  var_data_type_map <- setNames(out$var_data_type, out$var_name_standard)
  
  return(list(type=var_type_map, datatype=var_data_type_map, description=var_description_map))
}

cb_types <- get_codebook_types()

# Function to group codebook data types into our analysis buckets
codebook_col_type <- function(var, cb_types) {
  # var: column name

  # Check if variable exists in codebook
  if(!var %in% names(cb_types$type)) return("Unknown")

  type1 <- cb_types$type[[var]]
  type2 <- cb_types$datatype[[var]]
  desc <- cb_types$description[[var]]
  
  if (is.null(type1) && is.null(type2)) return("Unknown")
  
  # Map codebook types to user buckets:
  # If it's explicit logical in codebook, call it 'Boolean'
  if(!is.null(type2) && tolower(type2) == "logical") return("Boolean")
  if(!is.null(type1) && tolower(type1) == "ind") return("Boolean")
  
  # Check for ordinal indicators in description or type
  if(!is.null(desc) && !is.na(desc)) {
    desc_lower <- tolower(desc)
    if(grepl("ordinal|order|rank|level|grade|quality", desc_lower)) {
      return("Ordinal")
    }
  }
  
  # Categorical can be nominal or ordinal: if type2 is "categorical" or type1 is "char"
  if(!is.null(type2) && tolower(type2) == "categorical") {
    # Try to determine if ordinal or nominal
    if(!is.null(desc) && !is.na(desc)) {
      desc_lower <- tolower(desc)
      if(grepl("ordinal|order|rank|level|grade|quality", desc_lower)) {
        return("Ordinal")
      }
    }
    return("Nominal")
  }
  
  if(!is.null(type1) && type1 %in% c("char", "meta", "geo", "econ")) {
    # meta/geo/econ might include character or codes
    if(!is.null(type2) && tolower(type2) == "categorical") {
      # Check for ordinal
      if(!is.null(desc) && !is.na(desc)) {
        desc_lower <- tolower(desc)
        if(grepl("ordinal|order|rank|level|grade|quality", desc_lower)) {
          return("Ordinal")
        }
      }
      return("Nominal")
    }
    # If it's character type without categorical designation
    if(!is.null(type2) && tolower(type2) == "character") return("Character")
    # Default for char/meta/geo/econ without more info
    return("Nominal")
  }
  
  # Numeric types
  if(!is.null(type2) && tolower(type2) %in% c("numeric", "integer", "double")) return("Numeric")
  
  # Default fallback
  return("Unknown")
}

# Calculate missing by codebook-based type
analyze_missing_by_codebook_type <- function(data, dataset_name, cb_types) {
  cat("\n", dataset_name, ":\n", sep = "")
  cols_with_missing <- names(data)[colSums(is.na(data)) > 0]
  if(length(cols_with_missing) == 0) {
    cat("  No missing values found.\n")
    return(NULL)
  }
  # Assign each col to codebook type
  cb_type_per_col <- sapply(cols_with_missing, codebook_col_type, cb_types=cb_types)
  types_for_summary <- c("Numeric", "Nominal", "Ordinal", "Character", "Boolean", "Unknown")
  total_missing <- sum(is.na(data))
  missing_vals_by_type <- sapply(types_for_summary, function(btype) {
    these_cols <- cols_with_missing[cb_type_per_col == btype]
    if(length(these_cols)>0) sum(is.na(data[these_cols])) else 0
  })
  # Calculate percentages
  missing_pct_by_type <- if(total_missing>0) missing_vals_by_type/total_missing*100 else rep(0, length(missing_vals_by_type))
  names(missing_vals_by_type) <- types_for_summary
  names(missing_pct_by_type) <- types_for_summary
  
  # Print results
  for(type_str in types_for_summary) {
    cat(sprintf("  %-11s: %.2f%% (%d missing values)\n", 
                type_str,
                round(missing_pct_by_type[type_str], 2),
                missing_vals_by_type[type_str]))
  }
  cat(sprintf("  Total:       100.00%% (%d missing values)\n", total_missing))
}

# Apply to both datasets
analyze_missing_by_codebook_type(historic_data, "Historic Data", cb_types)
analyze_missing_by_codebook_type(predict_set, "Predict Set", cb_types)



=== Missing Values by Data Type (Codebook-Informed) ===

Historic Data:
  Numeric    : 0.00% (0 missing values)
  Nominal    : 82.08% (231857 missing values)
  Ordinal    : 0.00% (0 missing values)
  Character  : 17.69% (49964 missing values)
  Boolean    : 0.24% (672 missing values)
  Unknown    : 0.00% (0 missing values)
  Total:       100.00% (282493 missing values)

Predict Set:
  Numeric    : 0.00% (0 missing values)
  Nominal    : 82.23% (45708 missing values)
  Ordinal    : 0.00% (0 missing values)
  Character  : 17.76% (9874 missing values)
  Boolean    : 0.01% (3 missing values)
  Unknown    : 0.00% (0 missing values)
  Total:       100.00% (55585 missing values)


In [93]:
# Print col names and their missing % for Character dtypes in historic_data
cat("\nCharacter columns and their percent missing (historic_data):\n")
char_cols <- names(historic_data)[sapply(historic_data, is.character)]
if(length(char_cols) == 0) {
  cat("  (No character columns found)\n")
} else {
  pct_missing <- sapply(char_cols, function(col) {
    mean(is.na(historic_data[[col]])) * 100
  })
  result_df <- data.frame(Column = char_cols, Percent_Missing = round(pct_missing, 2), row.names = NULL)
  print(result_df)
}



Character columns and their percent missing (historic_data):
                    Column Percent_Missing
1                 meta_cdu           94.34
2           meta_deed_type            0.00
3        geo_property_city            0.26
4         geo_property_zip            0.26
5                 geo_fips            2.21
6         geo_municipality            2.21
7 geo_school_elem_district            0.32
8   geo_school_hs_district            0.32


##### Missing Value Handling *(Drop High-Missing Columns)*

- Drop any column with >50% missing values from both datasets

In [94]:
# Identify columns with >50% missing in historic data
high_missing_historic <- names(missing_historic_pct[missing_historic_pct > 50])
cat("\nColumns with >50% missing in historic data:", length(high_missing_historic), "\n")
if(length(high_missing_historic) > 0) {
  print(high_missing_historic)
}

# Identify columns with >50% missing in predict set
high_missing_predict <- names(missing_predict_pct[missing_predict_pct > 50])
cat("\nColumns with >50% missing in predict set:", length(high_missing_predict), "\n")
if(length(high_missing_predict) > 0) {
  print(high_missing_predict)
}

# Get union of columns to drop (present in either dataset)
cols_to_drop <- unique(c(high_missing_historic, high_missing_predict))
cat("\nTotal unique columns to drop:", length(cols_to_drop), "\n")

# Drop high-missing columns from both datasets
if(length(cols_to_drop) > 0) {
  cols_to_drop_historic <- intersect(cols_to_drop, names(historic_data))
  cols_to_drop_predict <- intersect(cols_to_drop, names(predict_set))
  
  if(length(cols_to_drop_historic) > 0) {
    historic_data <- historic_data %>% select(-all_of(cols_to_drop_historic))
    cat("Dropped", length(cols_to_drop_historic), "columns from historic data\n")
  }
  
  if(length(cols_to_drop_predict) > 0) {
    predict_set <- predict_set %>% select(-all_of(cols_to_drop_predict))
    cat("Dropped", length(cols_to_drop_predict), "columns from predict set\n")
  }
}

# Check remaining missing values
remaining_missing_historic <- sum(is.na(historic_data))
remaining_missing_predict <- sum(is.na(predict_set))

cat("\nRemaining missing values in historic data:", remaining_missing_historic, "\n")
cat("Remaining missing values in predict set:", remaining_missing_predict, "\n")


Columns with >50% missing in historic data: 6 
[1] "meta_cdu"        "char_apts"       "char_tp_dsgn"    "char_attic_fnsh"
[5] "char_renovation" "char_porch"     

Columns with >50% missing in predict set: 6 
[1] "meta_cdu"        "char_apts"       "char_tp_dsgn"    "char_attic_fnsh"
[5] "char_renovation" "char_porch"     

Total unique columns to drop: 6 
Dropped 6 columns from historic data
Dropped 6 columns from predict set

Remaining missing values in historic data: 41136 
Remaining missing values in predict set: 7319 


In [95]:

# Re-run the missing analysis at this stage, but only use code that isn’t already defined above
cat("\n=== Missing Values by Data Type (Codebook-Informed; after drop) ===\n")
analyze_missing_by_codebook_type(historic_data, "Historic Data (post-drop)", cb_types)
analyze_missing_by_codebook_type(predict_set, "Predict Set (post-drop)", cb_types)


=== Missing Values by Data Type (Codebook-Informed; after drop) ===

Historic Data (post-drop):
  Numeric    : 0.00% (0 missing values)
  Nominal    : 91.58% (37672 missing values)
  Ordinal    : 0.00% (0 missing values)
  Character  : 6.79% (2792 missing values)
  Boolean    : 1.63% (672 missing values)
  Unknown    : 0.00% (0 missing values)
  Total:       100.00% (41136 missing values)

Predict Set (post-drop):
  Numeric    : 0.00% (0 missing values)
  Nominal    : 93.73% (6860 missing values)
  Ordinal    : 0.00% (0 missing values)
  Character  : 6.23% (456 missing values)
  Boolean    : 0.04% (3 missing values)
  Unknown    : 0.00% (0 missing values)
  Total:       100.00% (7319 missing values)


##### Missing Value Handling *(Analysis & Decision Support)*

- Analyze missing data patterns to decide on imputation strategy

In [96]:
# Function to analyze missing values by detailed codebook type (updated to match get_codebook_types output: "Numeric", "Nominal", "Ordinal", "Character", "Boolean", "Unknown")
analyze_missing_by_codebook_type_detailed <- function(data, dataset_name, cb_types) {
  cat("Analysis for", dataset_name, "\n")
  
  # Find columns with missing values
  cols_with_missing <- names(data)[colSums(is.na(data)) > 0]
  
  if(length(cols_with_missing) == 0) {
    cat("No missing values found.\n")
    return(NULL)
  }
  
  # Assign codebook-based type to each column with missing values
  cb_types_col <- sapply(
    cols_with_missing, 
    function(var) codebook_col_type(var, cb_types), 
    USE.NAMES = FALSE
  )
  
  type_levels <- c("Numeric", "Nominal", "Ordinal", "Character", "Boolean", "Unknown")
  
  # Create detailed analysis dataframe
  missing_analysis <- data.frame(
    Variable = cols_with_missing,
    Codebook_Type = cb_types_col,
    Missing_Count = colSums(is.na(data[cols_with_missing])),
    Total_Rows = nrow(data),
    stringsAsFactors = FALSE
  ) %>%
    mutate(
      Missing_Percent = round((Missing_Count / Total_Rows) * 100, 2),
      Data_Category = factor(Codebook_Type, levels = type_levels)
    ) %>%
    arrange(desc(Missing_Percent))
  
  # Print percent missing by data type/category (summary)
  summary_by_type <- missing_analysis %>%
    group_by(Data_Category) %>%
    summarise(
      Vars = n(),
      Total_Missing = sum(Missing_Count),
      Percent_Missing = round(100 * sum(Missing_Count) / (Total_Rows[1] * Vars), 2)
    )
  for (i in seq_len(nrow(summary_by_type))) {
    cat(
      sprintf(
        "  %-10s: %5.2f%% (%d missing values)\n",
        as.character(summary_by_type$Data_Category[i]),
        summary_by_type$Percent_Missing[i],
        summary_by_type$Total_Missing[i]
      )
    )
  }
  
  # Analyze detailed distributions for each codebook-based type
  for(type_lbl in type_levels) {
    n_vars <- sum(missing_analysis$Data_Category == type_lbl)
    cat(sprintf("\n--- %s Variable Distributions ---\n", type_lbl))
    these_vars <- missing_analysis %>% filter(Data_Category == type_lbl) %>% pull(Variable)
    if(length(these_vars) > 0) {
      for(var in these_vars) {
        cat(var, " (", missing_analysis$Missing_Percent[missing_analysis$Variable == var], "% missing)\n", sep = "")
      }
    } else {
      cat(sprintf("No %s variables with missing values.\n", tolower(type_lbl)))
    }
  }
  
  return(missing_analysis)
}

# Analyze both datasets using codebook-derived types (with updated levels)
missing_analysis_historic <- analyze_missing_by_codebook_type_detailed(historic_data, "Historic Data", cb_types)
cat("\n")
missing_analysis_predict <- analyze_missing_by_codebook_type_detailed(predict_set, "Predict Set", cb_types)

# Row-wise missing analysis (allow categorization by updated type; default to "Nominal")
cat("\n=== Row-wise Missing Value Analysis ===\n")

analyze_row_completeness_cb <- function(data, dataset_name, missing_analysis_ref = NULL, type_target = "Nominal") {
  cat("\n--- Row completeness for", dataset_name, "---\n")
  
  missing_per_row <- rowSums(is.na(data))
  
  cat("Rows with 0 missing values:", sum(missing_per_row == 0), 
      sprintf("(%.2f%%)", sum(missing_per_row == 0) / nrow(data) * 100), "\n")
  cat("Rows with 1-2 missing values:", sum(missing_per_row >= 1 & missing_per_row <= 2), 
      sprintf("(%.2f%%)", sum(missing_per_row >= 1 & missing_per_row <= 2) / nrow(data) * 100), "\n")
  cat("Rows with 3-5 missing values:", sum(missing_per_row >= 3 & missing_per_row <= 5), 
      sprintf("(%.2f%%)", sum(missing_per_row >= 3 & missing_per_row <= 5) / nrow(data) * 100), "\n")
  cat("Rows with >5 missing values:", sum(missing_per_row > 5), 
      sprintf("(%.2f%%)", sum(missing_per_row > 5) / nrow(data) * 100), "\n")
  
  # If we have type-targeted analysis (e.g. "Nominal", "Boolean"), show effect of dropping those rows
  if(!is.null(missing_analysis_ref)) {
    type_cols <- missing_analysis_ref %>% 
      filter(Data_Category == type_target) %>% 
      pull(Variable)
    type_cols <- intersect(type_cols, names(data))
    
    if(length(type_cols) > 0) {
      rows_with_type_missing <- rowSums(is.na(data[type_cols])) > 0
      cat(sprintf("\nRows with missing %s values: %d (%.2f%% of dataset)\n",
                  tolower(type_target), sum(rows_with_type_missing), 
                  sum(rows_with_type_missing) / nrow(data) * 100))
      cat(sprintf("Data remaining if these rows dropped: %d (%.2f%% retention)\n",
                  nrow(data) - sum(rows_with_type_missing),
                  (nrow(data) - sum(rows_with_type_missing)) / nrow(data) * 100))
    }
  }
  
  return(missing_per_row)
}

row_missing_historic <- analyze_row_completeness_cb(historic_data, "Historic Data", missing_analysis_historic, "Nominal")
row_missing_predict <- analyze_row_completeness_cb(predict_set, "Predict Set", missing_analysis_predict, "Nominal")

Analysis for Historic Data 
  Nominal   :  2.69% (37672 missing values)
  Character :  0.93% (2792 missing values)
  Boolean   :  0.27% (672 missing values)

--- Numeric Variable Distributions ---
No numeric variables with missing values.

--- Nominal Variable Distributions ---
char_tp_plan (28.79% missing)
char_gar1_cnst (14.18% missing)
char_gar1_att (14.18% missing)
char_gar1_area (14.18% missing)
char_oheat (0.34% missing)
geo_tract_pop (0.32% missing)
geo_white_perc (0.32% missing)
geo_black_perc (0.32% missing)
geo_asian_perc (0.32% missing)
geo_his_perc (0.32% missing)
geo_other_perc (0.32% missing)
geo_fs_flood_factor (0.32% missing)
geo_fs_flood_risk_direction (0.32% missing)
econ_midincome (0.32% missing)
char_cnst_qlty (0.11% missing)
char_ext_wall (0.05% missing)
char_roof_cnst (0.05% missing)
char_bsmt (0.05% missing)
char_bsmt_fin (0.05% missing)
char_heat (0.05% missing)
char_air (0.05% missing)
char_frpl (0.05% missing)
char_attic_type (0.05% missing)
char_site (0.05% m

##### Missing Value Handling *(MICE Protocol)*

- **MICE Imputation** — Apply multivariate imputation by chained equations to remaining missing values:

   - ***PMM*** *(Predictive Mean Matching)* *{Numeric variables}*: Predicts missing values by matching to observed values with similar predicted means, preserving the original distribution
   - ***CART*** *(Classification and Regression Trees)* *{Categorical Variables}*: Uses decision tree models to predict missing categories based on other variables
   - ***LogReg*** *(Logistic Regression)* *{Logical/Binary Variables}*: Models the probability of TRUE/FALSE outcomes using logistic regression

 - MICE preserves relationships between variables and maintains distributional properties

In [97]:
# =====================================================
# PRE-IMPUTATION: CONVERT CATEGORICALS TO FACTORS (updated get_codebook_types)
# =====================================================

cat("\n=== Converting Categorical Variables to Factors ===\n")
flush.console()

# Get codebook types (now a data.frame or tibble with columns: Variable, Type, etc.)
cb_types <- get_codebook_types()

# Create a lookup named vector for efficiency (type by variable name)
cb_type_lookup <- setNames(cb_types$Type, cb_types$Variable)

# Convert categorical columns in historic_data using updated lookup
for (col in names(historic_data)) {
  if (!col %in% names(cb_type_lookup)) next
  col_type <- cb_type_lookup[[col]]

  # For codebook conventions: treat "Nominal" and "Ordinal" as categorical
  if ((col_type %in% c("Nominal", "Ordinal")) && !is.factor(historic_data[[col]])) {
    historic_data[[col]] <- as.factor(historic_data[[col]])
    cat("  Converted historic_data$", col, " to factor (",
        length(levels(historic_data[[col]])), " levels)\n", sep = "")
  } else if (col_type == "Boolean" && !is.logical(historic_data[[col]])) {
    historic_data[[col]] <- as.logical(historic_data[[col]])
    cat("  Converted historic_data$", col, " to logical\n", sep = "")
  }
}

# Convert categorical columns in predict_set using updated lookup
for (col in names(predict_set)) {
  if (!col %in% names(cb_type_lookup)) next
  col_type <- cb_type_lookup[[col]]

  if ((col_type %in% c("Nominal", "Ordinal")) && !is.factor(predict_set[[col]])) {
    predict_set[[col]] <- as.factor(predict_set[[col]])
    cat("  Converted predict_set$", col, " to factor (",
        length(levels(predict_set[[col]])), " levels)\n", sep = "")
  } else if (col_type == "Boolean" && !is.logical(predict_set[[col]])) {
    predict_set[[col]] <- as.logical(predict_set[[col]])
    cat("  Converted predict_set$", col, " to logical\n", sep = "")
  }
}

cat("Categorical conversion complete.\n")
cat(sprintf("Timestamp: %s\n", Sys.time()))
flush.console()


=== Converting Categorical Variables to Factors ===
Categorical conversion complete.
Timestamp: 2025-11-25 19:38:45.417135


In [98]:
# =====================================================
# BASELINE SNAPSHOTS (Pre-MICE)
# =====================================================
cat("\n[BASELINE] Creating pre-MICE snapshots...\n")
flush.console()

# Create baseline snapshots before any MICE imputation
historic_snapshot_pre_mice <- historic_data
predict_snapshot_pre_mice  <- predict_set

cat("  Baseline snapshots created:\n")
cat(sprintf("    historic_snapshot_pre_mice: %d rows, %d cols\n", 
            nrow(historic_snapshot_pre_mice), ncol(historic_snapshot_pre_mice)))
cat(sprintf("    predict_snapshot_pre_mice:  %d rows, %d cols\n", 
            nrow(predict_snapshot_pre_mice), ncol(predict_snapshot_pre_mice)))
cat(sprintf("Timestamp: %s\n", Sys.time()))
flush.console()



[BASELINE] Creating pre-MICE snapshots...
  Baseline snapshots created:
    historic_snapshot_pre_mice: 50000 rows, 57 cols
    predict_snapshot_pre_mice:  10000 rows, 57 cols
Timestamp: 2025-11-25 19:38:45.428224


In [99]:
# =====================================================
# BLOCK 1: NUMERIC IMPUTATION (PMM ONLY)
# =====================================================
cat("\n[NUMERIC BLOCK] Starting numeric-only MICE run...\n")
flush.console()
set.seed(550)

# Get codebook types
cb_types <- get_codebook_types()

# Get numeric columns by checking each column's type via codebook_col_type(), with tryCatch
numeric_cols_hist <- names(historic_data)[
  sapply(names(historic_data), function(col) {
    tryCatch(
      codebook_col_type(col, cb_types) == "Numeric",
      error = function(e) {
        cat(sprintf("  [Warning] codebook_col_type error for column '%s': %s\n", col, as.character(e)))
        FALSE
      }
    )
  })
]

numeric_cols_pred <- names(predict_set)[
  sapply(names(predict_set), function(col) {
    tryCatch(
      codebook_col_type(col, cb_types) == "Numeric",
      error = function(e) {
        cat(sprintf("  [Warning] codebook_col_type error for column '%s': %s\n", col, as.character(e)))
        FALSE
      }
    )
  })
]

# Find columns with missing values
numeric_missing_hist <- numeric_cols_hist[colSums(is.na(historic_data[numeric_cols_hist])) > 0]
numeric_missing_pred <- numeric_cols_pred[colSums(is.na(predict_set[numeric_cols_pred])) > 0]

cat(sprintf("  Found %d numeric variables in historic_data\n", length(numeric_cols_hist)))
cat(sprintf("  Found %d numeric variables in predict_set\n", length(numeric_cols_pred)))
cat(sprintf("  %d numeric vars with NAs in historic_data\n", length(numeric_missing_hist)))
cat(sprintf("  %d numeric vars with NAs in predict_set\n", length(numeric_missing_pred)))
flush.console()

impute_numeric <- function(df, dataset_name, numeric_cols) {
  target_cols <- numeric_cols[colSums(is.na(df[numeric_cols])) > 0]
  if (!length(target_cols)) {
    cat(sprintf("  [%s] No numeric NAs detected, skipping.\n", dataset_name))
    return(df)
  }
  cat(sprintf("  [%s] Imputing %d numeric vars via PMM...\n", dataset_name, length(target_cols)))
  flush.console()
  
  start_time <- Sys.time()
  methods <- rep("", ncol(df)); names(methods) <- names(df)
  methods[target_cols] <- "pmm"
  pred_matrix <- quickpred(df, mincor = 0.4, minpuc = 0.5)
  
  mice_obj <- mice(df,
                   m = 3,
                   method = methods,
                   predictorMatrix = pred_matrix,
                   ridge = 1e-5,
                   maxit = 2,
                   seed = 550,
                   printFlag = TRUE)
  elapsed <- difftime(Sys.time(), start_time, units = "mins")
  cat(sprintf("  [%s] Numeric MICE finished in %.2f minutes\n", dataset_name, elapsed))
  flush.console()
  complete(mice_obj, 1)
}

historic_data <- impute_numeric(historic_data, "Historic", numeric_cols_hist)
predict_set   <- impute_numeric(predict_set, "Predict", numeric_cols_pred)

cat("[NUMERIC BLOCK] Completed. Remaining numeric NAs:",
    sum(is.na(historic_data[numeric_cols_hist])), "(historic),",
    sum(is.na(predict_set[numeric_cols_pred])), "(predict)\n")


[NUMERIC BLOCK] Starting numeric-only MICE run...
  Found 0 numeric variables in historic_data
  Found 0 numeric variables in predict_set
  0 numeric vars with NAs in historic_data
  0 numeric vars with NAs in predict_set
  [Historic] No numeric NAs detected, skipping.
  [Predict] No numeric NAs detected, skipping.
[NUMERIC BLOCK] Completed. Remaining numeric NAs: 0 (historic), 0 (predict)


In [100]:
# =====================================================
# BLOCK 2: BOOLEAN IMPUTATION (LOGREG)
# =====================================================
cat("\n[BOOLEAN BLOCK] Starting boolean-only MICE run...\n")
flush.console()
set.seed(551)

# Get boolean columns by checking each column's type via codebook_col_type()
boolean_cols_hist <- names(historic_data)[sapply(names(historic_data), function(col) {
  tryCatch({
    codebook_col_type(col, cb_types) == "Boolean"
  }, error = function(e) {
    FALSE
  })
})]

boolean_cols_pred <- names(predict_set)[sapply(names(predict_set), function(col) {
  tryCatch({
    codebook_col_type(col, cb_types) == "Boolean"
  }, error = function(e) {
    FALSE
  })
})]

cat(sprintf("  Found %d boolean variables in historic_data\n", length(boolean_cols_hist)))
cat(sprintf("  Found %d boolean variables in predict_set\n", length(boolean_cols_pred)))
flush.console()

boolean_missing_hist <- boolean_cols_hist[colSums(is.na(historic_data[boolean_cols_hist])) > 0]
boolean_missing_pred <- boolean_cols_pred[colSums(is.na(predict_set[boolean_cols_pred])) > 0]

cat(sprintf("  %d boolean vars with NAs in historic_data\n", length(boolean_missing_hist)))
cat(sprintf("  %d boolean vars with NAs in predict_set\n", length(boolean_missing_pred)))
flush.console()

# Enhanced conversion function that handles both logical and already-factor columns
convert_bool_to_factor <- function(df, cols) {
  for (col in cols) {
    if (is.logical(df[[col]])) {
      df[[col]] <- factor(df[[col]], levels = c(FALSE, TRUE))
      cat(sprintf("    Converted %s from logical to factor\n", col))
    } else if (is.factor(df[[col]])) {
      # Already a factor, ensure it has correct levels
      if (!all(levels(df[[col]]) %in% c("FALSE", "TRUE", "0", "1"))) {
        # Try to coerce to logical first, then to factor
        df[[col]] <- factor(as.logical(df[[col]]), levels = c(FALSE, TRUE))
        cat(sprintf("    Re-converted %s to boolean factor\n", col))
      }
    } else if (is.numeric(df[[col]]) || is.character(df[[col]])) {
      # Convert numeric (0/1) or character ("TRUE"/"FALSE") to logical then factor
      df[[col]] <- factor(as.logical(df[[col]]), levels = c(FALSE, TRUE))
      cat(sprintf("    Converted %s from %s to boolean factor\n", col, class(df[[col]])[1]))
    }
  }
  df
}

historic_data <- convert_bool_to_factor(historic_data, boolean_cols_hist)
predict_set   <- convert_bool_to_factor(predict_set, boolean_cols_pred)

impute_boolean <- function(df, dataset_name, boolean_cols) {
  # Find boolean factor columns with missing values
  target_cols <- boolean_cols[colSums(is.na(df[boolean_cols])) > 0]
  if (!length(target_cols)) {
    cat(sprintf("  [%s] No boolean NAs detected, skipping.\n", dataset_name))
    return(df)
  }
  cat(sprintf("  [%s] Imputing %d boolean vars via logistic regression...\n", dataset_name, length(target_cols)))
  flush.console()
  
  start_time <- Sys.time()
  methods <- rep("", ncol(df)); names(methods) <- names(df)
  methods[target_cols] <- "logreg"
  pred_matrix <- quickpred(df, mincor = 0.3, minpuc = 0.4)
  
  mice_obj <- mice(df,
                   m = 3,
                   method = methods,
                   predictorMatrix = pred_matrix,
                   ridge = 1e-5,
                   maxit = 2,
                   seed = 551,
                   printFlag = TRUE)
  elapsed <- difftime(Sys.time(), start_time, units = "mins")
  cat(sprintf("  [%s] Boolean MICE finished in %.2f minutes\n", dataset_name, elapsed))
  flush.console()
  complete(mice_obj, 1)
}

historic_data <- impute_boolean(historic_data, "Historic", boolean_cols_hist)
predict_set   <- impute_boolean(predict_set, "Predict", boolean_cols_pred)

cat("[BOOLEAN BLOCK] Completed. Remaining boolean NAs:",
    sum(is.na(historic_data[boolean_cols_hist])), "(historic),",
    sum(is.na(predict_set[boolean_cols_pred])), "(predict)\n")


[BOOLEAN BLOCK] Starting boolean-only MICE run...
  Found 6 boolean variables in historic_data
  Found 6 boolean variables in predict_set
  5 boolean vars with NAs in historic_data
  1 boolean vars with NAs in predict_set
    Converted geo_ohare_noise from factor to boolean factor
    Converted geo_floodplain from factor to boolean factor
    Converted geo_withinmr100 from factor to boolean factor
    Converted geo_withinmr101300 from factor to boolean factor
    Converted ind_large_home from logical to factor
    Converted ind_garage from logical to factor
    Converted geo_ohare_noise from factor to boolean factor
    Converted geo_floodplain from factor to boolean factor
    Converted geo_withinmr100 from factor to boolean factor
    Converted geo_withinmr101300 from factor to boolean factor
    Converted ind_large_home from logical to factor
    Converted ind_garage from logical to factor
  [Historic] Imputing 5 boolean vars via logistic regression...

 iter imp variable
  1   1  

“Number of logged events: 6”


  [Historic] Boolean MICE finished in 0.03 minutes
  [Predict] Imputing 1 boolean vars via logistic regression...

 iter imp variable
  1   1  ind_garage
  1   2  ind_garage
  1   3  ind_garage
  2   1  ind_garage
  2   2  ind_garage
  2   3  ind_garage


“Number of logged events: 6”


  [Predict] Boolean MICE finished in 0.00 minutes
[BOOLEAN BLOCK] Completed. Remaining boolean NAs: 0 (historic), 0 (predict)


In [101]:
# =====================================================
# BLOCK 3.1: NOMINAL IMPUTATION - ONE AT A TIME
# =====================================================
cat("\n[NOMINAL BLOCK] Starting sequential nominal imputation...\n")
cat("Strategy: MICE 'sample' for ≤10 categories, Direct sampling for >10 categories\n")
flush.console()
set.seed(552)

# Helper function for direct sampling
impute_sample_direct <- function(x) {
  if(sum(is.na(x)) == 0) return(x)
  obs_vals <- x[!is.na(x)]
  if(length(obs_vals) == 0) return(x)
  x[is.na(x)] <- sample(obs_vals, sum(is.na(x)), replace=TRUE)
  return(x)
}

# Get nominal columns by checking each column's type via codebook_col_type()
nominal_cols_hist <- names(historic_data)[sapply(names(historic_data), function(col) {
  codebook_col_type(col, cb_types) == "Nominal"
})]

nominal_cols_pred <- names(predict_set)[sapply(names(predict_set), function(col) {
  codebook_col_type(col, cb_types) == "Nominal"
})]

# Find columns with missing values and sort by missingness (ASCENDING)
nominal_missing_hist <- nominal_cols_hist[colSums(is.na(historic_data[nominal_cols_hist])) > 0]
nominal_missing_pred <- nominal_cols_pred[colSums(is.na(predict_set[nominal_cols_pred])) > 0]

# Order by missingness (least to greatest)
if(length(nominal_missing_hist) > 0) {
  miss_counts_hist <- colSums(is.na(historic_data[nominal_missing_hist]))
  nominal_missing_hist <- names(sort(miss_counts_hist))
}

if(length(nominal_missing_pred) > 0) {
  miss_counts_pred <- colSums(is.na(predict_set[nominal_missing_pred]))
  nominal_missing_pred <- names(sort(miss_counts_pred))
}

cat(sprintf("Nominal columns with missing (historic): %d\n", length(nominal_missing_hist)))
cat(sprintf("Nominal columns with missing (predict): %d\n\n", length(nominal_missing_pred)))
flush.console()

# ============ HISTORIC DATA ============
if(length(nominal_missing_hist) > 0) {
  cat("[NOMINAL] Processing historic data...\n")
  flush.console()
  
  for(i in seq_along(nominal_missing_hist)) {
    col <- nominal_missing_hist[i]
    n_missing_before <- sum(is.na(historic_data[[col]]))
    n_levels <- length(levels(historic_data[[col]]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d levels -> ", 
                i, length(nominal_missing_hist), col, n_missing_before, n_levels))
    flush.console()
    
    if(n_levels <= 10) {
      # Use MICE sample method
      cat("using MICE 'sample'... ")
      flush.console()
      
      tryCatch({
        methods <- rep("", ncol(historic_data))
        names(methods) <- names(historic_data)
        methods[col] <- "sample"
        
        pred_matrix <- quickpred(historic_data, mincor = 0.3, minpuc = 0.5)
        
        mice_obj <- mice(historic_data,
                        m = 1,
                        method = methods,
                        predictorMatrix = pred_matrix,
                        maxit = 1,
                        seed = 552 + i,
                        printFlag = FALSE)
        
        historic_data <- complete(mice_obj, 1)
        n_missing_after <- sum(is.na(historic_data[[col]]))
        
        # If MICE didn't fully impute, use direct sampling as fallback
        if(n_missing_after > 0) {
          cat("MICE incomplete, applying direct sampling... ")
          flush.console()
          historic_data[[col]] <- impute_sample_direct(historic_data[[col]])
          n_missing_after <- sum(is.na(historic_data[[col]]))
        }
        
        cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
        flush.console()
      }, error = function(e) {
        cat("MICE failed, using direct sampling... ")
        flush.console()
        historic_data[[col]] <<- impute_sample_direct(historic_data[[col]])
        n_missing_after <- sum(is.na(historic_data[[col]]))
        cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
        flush.console()
      })
    } else {
      # Direct sampling for high-cardinality
      cat("using direct sampling... ")
      flush.console()
      historic_data[[col]] <- impute_sample_direct(historic_data[[col]])
      n_missing_after <- sum(is.na(historic_data[[col]]))
      cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
      flush.console()
    }
  }
  
  # Final pass: ensure all nominal columns are fully imputed
  remaining_nominal_na <- nominal_cols_hist[colSums(is.na(historic_data[nominal_cols_hist])) > 0]
  if(length(remaining_nominal_na) > 0) {
    cat("\n[NOMINAL] Final pass: imputing remaining NAs via direct sampling...\n")
    for(col in remaining_nominal_na) {
      n_before <- sum(is.na(historic_data[[col]]))
      historic_data[[col]] <- impute_sample_direct(historic_data[[col]])
      n_after <- sum(is.na(historic_data[[col]]))
      cat(sprintf("  %s: imputed %d values\n", col, n_before - n_after))
    }
  }
  
  cat("\n[NOMINAL] Historic data imputation COMPLETE\n")
  cat(sprintf("  Remaining nominal NAs: %d\n", sum(is.na(historic_data[nominal_cols_hist]))))
  cat(sprintf("  Total remaining NAs: %d\n\n", sum(is.na(historic_data))))
  flush.console()
}

# ============ PREDICT SET ============
if(length(nominal_missing_pred) > 0) {
  cat("[NOMINAL] Processing predict set...\n")
  flush.console()
  
  for(i in seq_along(nominal_missing_pred)) {
    col <- nominal_missing_pred[i]
    n_missing_before <- sum(is.na(predict_set[[col]]))
    n_levels <- length(levels(predict_set[[col]]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d levels -> ", 
                i, length(nominal_missing_pred), col, n_missing_before, n_levels))
    flush.console()
    
    if(n_levels <= 10) {
      # Use MICE sample method
      cat("using MICE 'sample'... ")
      flush.console()
      
      tryCatch({
        methods <- rep("", ncol(predict_set))
        names(methods) <- names(predict_set)
        methods[col] <- "sample"
        
        pred_matrix <- quickpred(predict_set, mincor = 0.3, minpuc = 0.5)
        
        mice_obj <- mice(predict_set,
                        m = 1,
                        method = methods,
                        predictorMatrix = pred_matrix,
                        maxit = 1,
                        seed = 552 + i,
                        printFlag = FALSE)
        
        predict_set <- complete(mice_obj, 1)
        n_missing_after <- sum(is.na(predict_set[[col]]))
        
        # If MICE didn't fully impute, use direct sampling as fallback
        if(n_missing_after > 0) {
          cat("MICE incomplete, applying direct sampling... ")
          flush.console()
          predict_set[[col]] <- impute_sample_direct(predict_set[[col]])
          n_missing_after <- sum(is.na(predict_set[[col]]))
        }
        
        cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
        flush.console()
      }, error = function(e) {
        cat("MICE failed, using direct sampling... ")
        flush.console()
        predict_set[[col]] <<- impute_sample_direct(predict_set[[col]])
        n_missing_after <- sum(is.na(predict_set[[col]]))
        cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
        flush.console()
      })
    } else {
      # Direct sampling for high-cardinality
      cat("using direct sampling... ")
      flush.console()
      predict_set[[col]] <- impute_sample_direct(predict_set[[col]])
      n_missing_after <- sum(is.na(predict_set[[col]]))
      cat(sprintf("COMPLETE (imputed %d values, %d remaining)\n", n_missing_before - n_missing_after, n_missing_after))
      flush.console()
    }
  }
  
  # Final pass: ensure all nominal columns are fully imputed
  remaining_nominal_na <- nominal_cols_pred[colSums(is.na(predict_set[nominal_cols_pred])) > 0]
  if(length(remaining_nominal_na) > 0) {
    cat("\n[NOMINAL] Final pass: imputing remaining NAs via direct sampling...\n")
    for(col in remaining_nominal_na) {
      n_before <- sum(is.na(predict_set[[col]]))
      predict_set[[col]] <- impute_sample_direct(predict_set[[col]])
      n_after <- sum(is.na(predict_set[[col]]))
      cat(sprintf("  %s: imputed %d values\n", col, n_before - n_after))
    }
  }
  
  cat("\n[NOMINAL] Predict set imputation COMPLETE\n")
  cat(sprintf("  Remaining nominal NAs: %d\n", sum(is.na(predict_set[nominal_cols_pred]))))
  cat(sprintf("  Total remaining NAs: %d\n\n", sum(is.na(predict_set))))
  flush.console()
}

cat("[BLOCK 3.1 NOMINAL] COMPLETED\n")
cat("Final missing values:\n")
cat("  Historic:", sum(is.na(historic_data)), "\n")
cat("  Predict:", sum(is.na(predict_set)), "\n\n")
flush.console()


[NOMINAL BLOCK] Starting sequential nominal imputation...
Strategy: MICE 'sample' for ≤10 categories, Direct sampling for >10 categories
Nominal columns with missing (historic): 28
Nominal columns with missing (predict): 19

[NOMINAL] Processing historic data...
[1/28] char_air: 23 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


MICE incomplete, applying direct sampling... COMPLETE (imputed 23 values, 0 remaining)
[2/28] char_type_resd: 23 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 23 values, 0 remaining)
[3/28] char_gar1_size: 24 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 24 values, 0 remaining)
[4/28] char_use: 24 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 24 values, 0 remaining)
[5/28] char_bsmt: 25 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 25 values, 0 remaining)
[6/28] char_bsmt_fin: 25 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 25 values, 0 remaining)
[7/28] char_frpl: 25 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


MICE incomplete, applying direct sampling... COMPLETE (imputed 25 values, 0 remaining)
[8/28] char_ext_wall: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[9/28] char_roof_cnst: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[10/28] char_heat: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[11/28] char_attic_type: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[12/28] char_site: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[13/28] char_repair_cnd: 26 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 26 values, 0 remaining)
[14/28] char_cnst_qlty: 57 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 57 values, 0 remaining)
[15/28] geo_tract_pop: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[16/28] geo_white_perc: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


MICE incomplete, applying direct sampling... COMPLETE (imputed 162 values, 0 remaining)
[17/28] geo_black_perc: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[18/28] geo_asian_perc: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[19/28] geo_his_perc: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[20/28] geo_other_perc: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[21/28] geo_fs_flood_factor: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[22/28] geo_fs_flood_risk_direction: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 162 values, 0 remaining)
[23/28] econ_midincome: 162 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


MICE incomplete, applying direct sampling... COMPLETE (imputed 162 values, 0 remaining)
[24/28] char_oheat: 172 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 172 values, 0 remaining)
[25/28] char_gar1_cnst: 7088 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 7”


COMPLETE (imputed 7088 values, 0 remaining)
[26/28] char_gar1_att: 7088 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 7”


COMPLETE (imputed 7088 values, 0 remaining)
[27/28] char_gar1_area: 7090 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 5”


COMPLETE (imputed 7090 values, 0 remaining)
[28/28] char_tp_plan: 14394 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 4”


COMPLETE (imputed 14394 values, 0 remaining)

[NOMINAL] Historic data imputation COMPLETE
  Remaining nominal NAs: 0
  Total remaining NAs: 2792

[NOMINAL] Processing predict set...
[1/19] char_ext_wall: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[2/19] char_roof_cnst: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[3/19] char_bsmt: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[4/19] char_bsmt_fin: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[5/19] char_heat: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[6/19] char_air: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[7/19] char_frpl: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[8/19] char_attic_type: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[9/19] char_site: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[10/19] char_gar1_size: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[11/19] char_repair_cnd: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[12/19] char_use: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[13/19] char_type_resd: 3 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 3 values, 0 remaining)
[14/19] char_oheat: 5 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 5 values, 0 remaining)
[15/19] char_cnst_qlty: 10 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 10 values, 0 remaining)
[16/19] char_gar1_cnst: 1383 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 7”


COMPLETE (imputed 1383 values, 0 remaining)
[17/19] char_gar1_att: 1383 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 7”


COMPLETE (imputed 1383 values, 0 remaining)
[18/19] char_gar1_area: 1383 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 7”


COMPLETE (imputed 1383 values, 0 remaining)
[19/19] char_tp_plan: 2657 missing, 0 levels -> using MICE 'sample'... 

“Number of logged events: 6”


COMPLETE (imputed 2657 values, 0 remaining)

[NOMINAL] Predict set imputation COMPLETE
  Remaining nominal NAs: 0
  Total remaining NAs: 456

[BLOCK 3.1 NOMINAL] COMPLETED
Final missing values:
  Historic: 2792 
  Predict: 456 



In [102]:
# =====================================================
# BLOCK 3.2: CHARACTER IMPUTATION - DIRECT SAMPLING ONLY
# =====================================================
cat("\n[CHARACTER BLOCK] Starting character variable imputation...\n")
cat("Strategy: Direct sampling (character variables are high-cardinality IDs/codes)\n")
flush.console()
set.seed(553)

# Get character columns by checking each column's type via codebook_col_type()
character_cols_hist <- names(historic_data)[sapply(names(historic_data), function(col) {
  codebook_col_type(col, cb_types) == "Character"
})]

character_cols_pred <- names(predict_set)[sapply(names(predict_set), function(col) {
  codebook_col_type(col, cb_types) == "Character"
})]

# Find columns with missing values
character_missing_hist <- character_cols_hist[colSums(is.na(historic_data[character_cols_hist])) > 0]
character_missing_pred <- character_cols_pred[colSums(is.na(predict_set[character_cols_pred])) > 0]

cat(sprintf("Character columns with missing (historic): %d\n", length(character_missing_hist)))
cat(sprintf("Character columns with missing (predict): %d\n\n", length(character_missing_pred)))
flush.console()

# ============ HISTORIC DATA ============
if(length(character_missing_hist) > 0) {
  cat("[CHARACTER] Processing historic data...\n")
  for(i in seq_along(character_missing_hist)) {
    col <- character_missing_hist[i]
    n_missing <- sum(is.na(historic_data[[col]]))
    n_unique <- length(unique(historic_data[[col]][!is.na(historic_data[[col]])]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d unique values -> ", 
                i, length(character_missing_hist), col, n_missing, n_unique))
    flush.console()
    
    historic_data[[col]] <- impute_sample_direct(historic_data[[col]])
    cat("COMPLETE\n")
    flush.console()
  }
  cat("\n[CHARACTER] Historic data imputation COMPLETE\n")
  cat(sprintf("  Remaining NAs: %d\n\n", sum(is.na(historic_data))))
  flush.console()
}

# ============ PREDICT SET ============
if(length(character_missing_pred) > 0) {
  cat("[CHARACTER] Processing predict set...\n")
  for(i in seq_along(character_missing_pred)) {
    col <- character_missing_pred[i]
    n_missing <- sum(is.na(predict_set[[col]]))
    n_unique <- length(unique(predict_set[[col]][!is.na(predict_set[[col]])]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d unique values -> ", 
                i, length(character_missing_pred), col, n_missing, n_unique))
    flush.console()
    
    predict_set[[col]] <- impute_sample_direct(predict_set[[col]])
    cat("COMPLETE\n")
    flush.console()
  }
  cat("\n[CHARACTER] Predict set imputation COMPLETE\n")
  cat(sprintf("  Remaining NAs: %d\n\n", sum(is.na(predict_set))))
  flush.console()
}

cat("[BLOCK 3.2 CHARACTER] COMPLETED\n")
cat("Final missing values:\n")
cat("  Historic:", sum(is.na(historic_data)), "\n")
cat("  Predict:", sum(is.na(predict_set)), "\n\n")
flush.console()


[CHARACTER BLOCK] Starting character variable imputation...
Strategy: Direct sampling (character variables are high-cardinality IDs/codes)
Character columns with missing (historic): 6
Character columns with missing (predict): 4

[CHARACTER] Processing historic data...
[1/6] geo_property_city: 131 missing, 132 unique values -> COMPLETE
[2/6] geo_property_zip: 131 missing, 169 unique values -> COMPLETE
[3/6] geo_fips: 1103 missing, 126 unique values -> COMPLETE
[4/6] geo_municipality: 1103 missing, 126 unique values -> COMPLETE
[5/6] geo_school_elem_district: 162 missing, 475 unique values -> COMPLETE
[6/6] geo_school_hs_district: 162 missing, 79 unique values -> COMPLETE

[CHARACTER] Historic data imputation COMPLETE
  Remaining NAs: 0

[CHARACTER] Processing predict set...
[1/4] geo_property_city: 15 missing, 130 unique values -> COMPLETE
[2/4] geo_property_zip: 15 missing, 166 unique values -> COMPLETE
[3/4] geo_fips: 213 missing, 125 unique values -> COMPLETE
[4/4] geo_municipality:

In [103]:
# =====================================================
# BLOCK 3.3: ORDINAL IMPUTATION - ONE AT A TIME
# =====================================================
cat("\n[ORDINAL BLOCK] Starting sequential ordinal imputation...\n")
cat("Strategy: MICE 'sample' for ≤10 categories, Direct sampling for >10 categories\n")
flush.console()
set.seed(554)

# Get ordinal columns by checking each column's type via codebook_col_type()
ordinal_cols_hist <- names(historic_data)[sapply(names(historic_data), function(col) {
  codebook_col_type(col, cb_types) == "Ordinal"
})]

ordinal_cols_pred <- names(predict_set)[sapply(names(predict_set), function(col) {
  codebook_col_type(col, cb_types) == "Ordinal"
})]

# Find columns with missing values and sort by missingness (ASCENDING)
ordinal_missing_hist <- ordinal_cols_hist[colSums(is.na(historic_data[ordinal_cols_hist])) > 0]
ordinal_missing_pred <- ordinal_cols_pred[colSums(is.na(predict_set[ordinal_cols_pred])) > 0]

# Order by missingness (least to greatest)
if(length(ordinal_missing_hist) > 0) {
  miss_counts_hist <- colSums(is.na(historic_data[ordinal_missing_hist]))
  ordinal_missing_hist <- names(sort(miss_counts_hist))
}

if(length(ordinal_missing_pred) > 0) {
  miss_counts_pred <- colSums(is.na(predict_set[ordinal_missing_pred]))
  ordinal_missing_pred <- names(sort(miss_counts_pred))
}

cat(sprintf("Ordinal columns with missing (historic): %d\n", length(ordinal_missing_hist)))
cat(sprintf("Ordinal columns with missing (predict): %d\n\n", length(ordinal_missing_pred)))
flush.console()

# ============ HISTORIC DATA ============
if(length(ordinal_missing_hist) > 0) {
  cat("[ORDINAL] Processing historic data...\n")
  flush.console()
  
  for(i in seq_along(ordinal_missing_hist)) {
    col <- ordinal_missing_hist[i]
    n_missing <- sum(is.na(historic_data[[col]]))
    n_levels <- length(levels(historic_data[[col]]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d levels -> ", 
                i, length(ordinal_missing_hist), col, n_missing, n_levels))
    flush.console()
    
    if(n_levels <= 10) {
      cat("using MICE 'sample'... ")
      flush.console()
      
      tryCatch({
        methods <- rep("", ncol(historic_data))
        names(methods) <- names(historic_data)
        methods[col] <- "sample"
        
        pred_matrix <- quickpred(historic_data, mincor = 0.3, minpuc = 0.5)
        
        mice_obj <- mice(historic_data,
                        m = 1,
                        method = methods,
                        predictorMatrix = pred_matrix,
                        maxit = 1,
                        seed = 554 + i,
                        printFlag = FALSE)
        
        historic_data <- complete(mice_obj, 1)
        cat("COMPLETE\n")
        flush.console()
      }, error = function(e) {
        cat("MICE failed, using direct sampling... ")
        flush.console()
        historic_data[[col]] <<- impute_sample_direct(historic_data[[col]])
        cat("COMPLETE\n")
        flush.console()
      })
    } else {
      cat("using direct sampling... ")
      flush.console()
      historic_data[[col]] <- impute_sample_direct(historic_data[[col]])
      cat("COMPLETE\n")
      flush.console()
    }
  }
  
  cat("\n[ORDINAL] Historic data imputation COMPLETE\n")
  cat(sprintf("  Remaining NAs: %d\n\n", sum(is.na(historic_data))))
  flush.console()
}

# ============ PREDICT SET ============
if(length(ordinal_missing_pred) > 0) {
  cat("[ORDINAL] Processing predict set...\n")
  flush.console()
  
  for(i in seq_along(ordinal_missing_pred)) {
    col <- ordinal_missing_pred[i]
    n_missing <- sum(is.na(predict_set[[col]]))
    n_levels <- length(levels(predict_set[[col]]))
    
    cat(sprintf("[%d/%d] %s: %d missing, %d levels -> ", 
                i, length(ordinal_missing_pred), col, n_missing, n_levels))
    flush.console()
    
    if(n_levels <= 10) {
      cat("using MICE 'sample'... ")
      flush.console()
      
      tryCatch({
        methods <- rep("", ncol(predict_set))
        names(methods) <- names(predict_set)
        methods[col] <- "sample"
        
        pred_matrix <- quickpred(predict_set, mincor = 0.3, minpuc = 0.5)
        
        mice_obj <- mice(predict_set,
                        m = 1,
                        method = methods,
                        predictorMatrix = pred_matrix,
                        maxit = 1,
                        seed = 554 + i,
                        printFlag = FALSE)
        
        predict_set <- complete(mice_obj, 1)
        cat("COMPLETE\n")
        flush.console()
      }, error = function(e) {
        cat("MICE failed, using direct sampling... ")
        flush.console()
        predict_set[[col]] <<- impute_sample_direct(predict_set[[col]])
        cat("COMPLETE\n")
        flush.console()
      })
    } else {
      cat("using direct sampling... ")
      flush.console()
      predict_set[[col]] <- impute_sample_direct(predict_set[[col]])
      cat("COMPLETE\n")
      flush.console()
    }
  }
  
  cat("\n[ORDINAL] Predict set imputation COMPLETE\n")
  cat(sprintf("  Remaining NAs: %d\n\n", sum(is.na(predict_set))))
  flush.console()
}

cat("[BLOCK 3.3 ORDINAL] COMPLETED\n")
cat("Final missing values:\n")
cat("  Historic:", sum(is.na(historic_data)), "\n")
cat("  Predict:", sum(is.na(predict_set)), "\n\n")
flush.console()


[ORDINAL BLOCK] Starting sequential ordinal imputation...
Strategy: MICE 'sample' for ≤10 categories, Direct sampling for >10 categories
Ordinal columns with missing (historic): 0
Ordinal columns with missing (predict): 0

[BLOCK 3.3 ORDINAL] COMPLETED
Final missing values:
  Historic: 0 
  Predict: 0 



In [109]:
# =====================================================
# BLOCK 4: DATA-INTEGRITY VISUALIZATIONS & CHECKS
# =====================================================
cat("\n[INTEGRITY BLOCK] Verifying statistical fidelity...\n")
flush.console()

if(!exists("historic_snapshot_pre_mice") || !exists("predict_snapshot_pre_mice")) {
  stop(paste(
    "Missing baseline snapshots. Before running Blocks 1-3, create:",
    "  historic_snapshot_pre_mice <- historic_data",
    "  predict_snapshot_pre_mice  <- predict_set",
    sep = "\n"
  ))
}

percent_diff <- function(before, after) {
  diff <- (after - before)
  denom <- ifelse(abs(before) < .Machine$double.eps, NA_real_, abs(before))
  (diff / denom) * 100
}

# Helper: get variables of a type
get_vars_by_type <- function(type, cb_types) {
  cb_types$Variable[which(cb_types$Codebook_Type == type)]
}

# Get codebook types
cb_types <- get_codebook_types()

# Numeric columns: use only Codebook_Type == "Numeric"
numeric_vars_historic <- intersect(
  get_vars_by_type("Numeric", cb_types),
  names(historic_data)
)
numeric_vars_predict <- intersect(
  get_vars_by_type("Numeric", cb_types),
  names(predict_set)
)

# Nominal, Ordinal, Boolean separately for categorical work
nominal_vars_historic <- intersect(get_vars_by_type("Nominal", cb_types), names(historic_data))
nominal_vars_predict <- intersect(get_vars_by_type("Nominal", cb_types), names(predict_set))
ordinal_vars_historic <- intersect(get_vars_by_type("Ordinal", cb_types), names(historic_data))
ordinal_vars_predict <- intersect(get_vars_by_type("Ordinal", cb_types), names(predict_set))
boolean_vars_historic <- intersect(get_vars_by_type("Boolean", cb_types), names(historic_data))
boolean_vars_predict <- intersect(get_vars_by_type("Boolean", cb_types), names(predict_set))

# Combine for categorical integrity check - EXCLUDE numeric/character/unknown
categorical_vars_historic <- unique(c(nominal_vars_historic, ordinal_vars_historic))
categorical_vars_predict <- unique(c(nominal_vars_predict, ordinal_vars_predict))

compare_numeric_stats <- function(before_df, after_df, cols, dataset_name) {
  cols <- cols[cols %in% names(before_df) & cols %in% names(after_df)]
  if(!length(cols)) {
    cat(sprintf("  [%s] No numeric columns found for comparison.\n", dataset_name))
    return(NULL)
  }
  
  # Filter to only columns that are actually numeric in both datasets
  numeric_cols <- cols[sapply(cols, function(col) {
    is.numeric(before_df[[col]]) && is.numeric(after_df[[col]])
  })]
  
  if(!length(numeric_cols)) {
    cat(sprintf("  [%s] No numeric columns are actually numeric type in both datasets.\n", dataset_name))
    return(NULL)
  }
  
  before_stats <- data.frame(
    Variable = numeric_cols,
    Mean_Before   = sapply(before_df[numeric_cols], mean,   na.rm = TRUE),
    Median_Before = sapply(before_df[numeric_cols], median, na.rm = TRUE),
    Var_Before    = sapply(before_df[numeric_cols], var,    na.rm = TRUE),
    row.names = NULL
  )
  after_stats <- data.frame(
    Variable = numeric_cols,
    Mean_After   = sapply(after_df[numeric_cols], mean,   na.rm = TRUE),
    Median_After = sapply(after_df[numeric_cols], median, na.rm = TRUE),
    Var_After    = sapply(after_df[numeric_cols], var,    na.rm = TRUE),
    row.names = NULL
  )
  merged <- before_stats %>%
    inner_join(after_stats, by = "Variable") %>%
    mutate(
      Mean_Pct   = percent_diff(Mean_Before,   Mean_After),
      Median_Pct = percent_diff(Median_Before, Median_After),
      Var_Pct    = percent_diff(Var_Before,    Var_After)
    )
  merged
}

plot_numeric_integrity <- function(df, dataset_name) {
  if(is.null(df) || nrow(df) == 0) {
    cat(sprintf("  [%s] No numeric comparison data available.\n", dataset_name))
    return(NULL)
  }
  long_df <- df %>%
    select(Variable, Mean_Pct, Median_Pct, Var_Pct) %>%
    pivot_longer(-Variable, names_to = "Statistic", values_to = "Percent_Diff") %>%
    mutate(Statistic = gsub("_Pct", "", Statistic))
  
  ggplot(long_df, aes(x = Variable, y = Percent_Diff, fill = Statistic)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_hline(yintercept = 0, linetype = "dashed") +
    facet_wrap(~Statistic, ncol = 1, scales = "free_y") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(
      title = sprintf("%s Numeric Integrity (%% Difference)", dataset_name),
      x = "Variable", y = "Percent Difference", fill = "Statistic"
    ) +
    scale_fill_manual(values = c("Mean" = "#E69F00",
                                 "Median" = "#56B4E9",
                                 "Var" = "#009E73"))
}

compare_categorical_dist <- function(before_df, after_df, cols, dataset_name, max_levels_plot = 10) {
  cols <- cols[cols %in% names(before_df) & cols %in% names(after_df)]
  if(!length(cols)) return(NULL)
  
  all_results <- list()
  plot_ready <- data.frame()
  
  for(col in cols) {
    # Convert to character to ensure consistent comparison
    before_vec <- as.character(before_df[[col]])
    after_vec <- as.character(after_df[[col]])
    
    before_tbl <- prop.table(table(before_vec, useNA = "no"))
    after_tbl  <- prop.table(table(after_vec,  useNA = "no"))
    all_lvls <- union(names(before_tbl), names(after_tbl))
    
    aligned <- data.frame(
      Variable = col,
      Category = all_lvls,
      Before   = sapply(all_lvls, function(l) ifelse(l %in% names(before_tbl), before_tbl[[l]], 0)),
      After    = sapply(all_lvls, function(l) ifelse(l %in% names(after_tbl),  after_tbl[[l]], 0))
    ) %>%
      mutate(Abs_Diff = abs(After - Before))
    
    all_results[[col]] <- aligned
    
    if(length(all_lvls) <= max_levels_plot) {
      plot_ready <- bind_rows(
        plot_ready,
        aligned %>%
          pivot_longer(c(Before, After), names_to = "Distribution", values_to = "Proportion") %>%
          mutate(Distribution = ifelse(Distribution == "Before", "Pre-Impute", "Post-Impute"))
      )
    }
  }
  
  summary_df <- bind_rows(all_results)
  list(summary = summary_df, plot_data = plot_ready)
}

plot_cat_integrity <- function(plot_df, dataset_name) {
  if(is.null(plot_df) || nrow(plot_df) == 0) {
    cat(sprintf("  [%s] No low-cardinality categorical vars to plot (all >10 levels).\n", dataset_name))
    return(NULL)
  }
  ggplot(plot_df,
         aes(x = Category, y = Proportion, fill = Distribution)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_wrap(~Variable, scales = "free_x", ncol = 2) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
          legend.position = "bottom") +
    labs(title = sprintf("%s Categorical Distribution Preservation", dataset_name),
         x = "Category", y = "Proportion", fill = "")
}

# -------- Historic Data Integrity --------
cat("\n[INTEGRITY] Historic dataset\n")

# Numeric integrity: restrict to Codebook_Type == "Numeric"
hist_numeric_cmp <- compare_numeric_stats(
  historic_snapshot_pre_mice, historic_data, numeric_vars_historic, "Historic"
)
if(!is.null(hist_numeric_cmp)) {
  print(hist_numeric_cmp)
  print(plot_numeric_integrity(hist_numeric_cmp, "Historic"))
}

# Categorical integrity (Nominal + Ordinal)
hist_cat_cmp <- compare_categorical_dist(
  historic_snapshot_pre_mice, historic_data, categorical_vars_historic, "Historic"
)

if(!is.null(hist_cat_cmp)) {
  cat(sprintf("  Historic categorical mean abs diff: %.4f\n",
              mean(hist_cat_cmp$summary$Abs_Diff)))
  cat(sprintf("  Historic categorical max abs diff: %.4f\n",
              max(hist_cat_cmp$summary$Abs_Diff)))
  print(plot_cat_integrity(hist_cat_cmp$plot_data, "Historic"))
}

# Boolean integrity
hist_bool_cmp <- compare_categorical_dist(
  historic_snapshot_pre_mice, historic_data, boolean_vars_historic, "Historic", max_levels_plot = 2
)
if(!is.null(hist_bool_cmp)) {
  cat(sprintf("  Historic boolean mean abs diff: %.4f\n",
              mean(hist_bool_cmp$summary$Abs_Diff)))
  print(plot_cat_integrity(hist_bool_cmp$plot_data, "Historic (Boolean)"))
}

# -------- Predict Set Integrity --------
cat("\n[INTEGRITY] Predict dataset\n")

# Numeric
pred_numeric_cmp <- compare_numeric_stats(
  predict_snapshot_pre_mice, predict_set, numeric_vars_predict, "Predict"
)
if(!is.null(pred_numeric_cmp)) {
  print(pred_numeric_cmp)
  print(plot_numeric_integrity(pred_numeric_cmp, "Predict"))
}

# Categorical (Nominal + Ordinal)
pred_cat_cmp <- compare_categorical_dist(
  predict_snapshot_pre_mice, predict_set, categorical_vars_predict, "Predict"
)
if(!is.null(pred_cat_cmp)) {
  cat(sprintf("  Predict categorical mean abs diff: %.4f\n",
              mean(pred_cat_cmp$summary$Abs_Diff)))
  cat(sprintf("  Predict categorical max abs diff: %.4f\n",
              max(pred_cat_cmp$summary$Abs_Diff)))
  print(plot_cat_integrity(pred_cat_cmp$plot_data, "Predict"))
}

# Boolean
pred_bool_cmp <- compare_categorical_dist(
  predict_snapshot_pre_mice, predict_set, boolean_vars_predict, "Predict", max_levels_plot = 2
)
if(!is.null(pred_bool_cmp)) {
  cat(sprintf("  Predict boolean mean abs diff: %.4f\n",
              mean(pred_bool_cmp$summary$Abs_Diff)))
  print(plot_cat_integrity(pred_bool_cmp$plot_data, "Predict (Boolean)"))
}

# Final NA audit
cat("\n[INTEGRITY] Final missing-value audit\n")
cat("Historic remaining NAs:", sum(is.na(historic_data)), "\n")
cat("Predict remaining NAs:", sum(is.na(predict_set)), "\n")
cat("[INTEGRITY BLOCK] Complete.\n")


[INTEGRITY BLOCK] Verifying statistical fidelity...

[INTEGRITY] Historic dataset
  [Historic] No numeric columns found for comparison.

[INTEGRITY] Predict dataset
  [Predict] No numeric columns found for comparison.

[INTEGRITY] Final missing-value audit
Historic remaining NAs: 0 
Predict remaining NAs: 0 
[INTEGRITY BLOCK] Complete.


---
### Exploratory Data Analysis

In [106]:
# =====================================================
# ENHANCED VARIABLE CLASSIFICATION
# =====================================================

cat("\n=== Classifying Variables by Data Type ===\n")

# Get codebook types
cb_types <- get_codebook_types()

# Classification function with granular types
classify_variable <- function(var, data, cb_types) {
  # Get codebook info
  cb_type <- codebook_col_type(var, cb_types)
  
  # Get R data type
  r_type <- class(data[[var]])[1]
  
  # Determine granular classification
  if(cb_type == "Numeric") {
    # Check if it's integer-like (whole numbers, counts) or continuous
    if(r_type %in% c("integer", "factor")) {
      return("Integer")
    } else if(is.numeric(data[[var]])) {
      # Check if values are mostly whole numbers (counts/discrete)
      non_na_vals <- data[[var]][!is.na(data[[var]])]
      if(length(non_na_vals) > 0) {
        whole_number_pct <- mean(non_na_vals == floor(non_na_vals))
        # If variable name suggests count (beds, rooms, baths) or >95% whole numbers
        is_count <- grepl("beds|rooms|bath|apts|frpl", var, ignore.case = TRUE)
        if(is_count || whole_number_pct > 0.95) {
          return("Integer")
        } else {
          return("Continuous")
        }
      }
      return("Continuous")
    }
  } else if(cb_type == "Categorical") {
    # Check if ordinal (quality, condition, size ratings)
    is_ordinal <- grepl("qlty|quality|cnd|condition|size|rating|grade", var, ignore.case = TRUE)
    if(is_ordinal) {
      return("Ordinal")
    } else {
      return("Nominal")
    }
  } else if(cb_type == "Boolean") {
    return("Boolean")
  }
  
  return("Unknown")
}

# Classify all variables
variable_classifications <- data.frame(
  Variable = names(historic_data),
  Classification = sapply(names(historic_data), function(v) {
    classify_variable(v, historic_data, cb_types)
  }),
  stringsAsFactors = FALSE
)

# Group variables by classification
integer_vars <- variable_classifications %>% filter(Classification == "Integer") %>% pull(Variable)
continuous_vars <- variable_classifications %>% filter(Classification == "Continuous") %>% pull(Variable)
ordinal_vars <- variable_classifications %>% filter(Classification == "Ordinal") %>% pull(Variable)
nominal_vars <- variable_classifications %>% filter(Classification == "Nominal") %>% pull(Variable)
boolean_vars <- variable_classifications %>% filter(Classification == "Boolean") %>% pull(Variable)

cat("\nVariable Classification Summary:\n")
cat("  Integer Variables:", length(integer_vars), "\n")
cat("  Continuous Variables:", length(continuous_vars), "\n")
cat("  Ordinal Variables:", length(ordinal_vars), "\n")
cat("  Nominal Variables:", length(nominal_vars), "\n")
cat("  Boolean Variables:", length(boolean_vars), "\n")


=== Classifying Variables by Data Type ===

Variable Classification Summary:
  Integer Variables: 0 
  Continuous Variables: 0 
  Ordinal Variables: 0 
  Nominal Variables: 0 
  Boolean Variables: 6 


In [107]:
# =====================================================
# SUMMARY STATISTICS BY CLASSIFICATION
# =====================================================

cat("\n=== Summary Statistics ===\n\n")

# Integer variables
if(length(integer_vars) > 0) {
  cat("--- Integer Variables ---\n")
  integer_stats <- historic_data %>%
    select(all_of(integer_vars)) %>%
    summarise(across(everything(), 
                     list(mean = ~mean(.x, na.rm = TRUE),
                          median = ~median(.x, na.rm = TRUE),
                          sd = ~sd(.x, na.rm = TRUE),
                          variance = ~var(.x, na.rm = TRUE),
                          min = ~min(.x, na.rm = TRUE),
                          max = ~max(.x, na.rm = TRUE)),
                     .names = "{.col}_{.fn}")) %>%
    pivot_longer(everything(),
                 names_to = c("Variable", "Statistic"),
                 names_pattern = "(.*)_(mean|median|sd|variance|min|max)",
                 values_to = "Value") %>%
    pivot_wider(names_from = Statistic, values_from = Value)
  
  print(integer_stats, n = Inf)
}

# Continuous variables
if(length(continuous_vars) > 0) {
  cat("\n--- Continuous Variables ---\n")
  continuous_stats <- historic_data %>%
    select(all_of(continuous_vars)) %>%
    summarise(across(everything(), 
                     list(mean = ~mean(.x, na.rm = TRUE),
                          median = ~median(.x, na.rm = TRUE),
                          sd = ~sd(.x, na.rm = TRUE),
                          variance = ~var(.x, na.rm = TRUE),
                          min = ~min(.x, na.rm = TRUE),
                          max = ~max(.x, na.rm = TRUE)),
                     .names = "{.col}_{.fn}")) %>%
    pivot_longer(everything(),
                 names_to = c("Variable", "Statistic"),
                 names_pattern = "(.*)_(mean|median|sd|variance|min|max)",
                 values_to = "Value") %>%
    pivot_wider(names_from = Statistic, values_from = Value)
  
  print(continuous_stats, n = Inf)
}

# Ordinal variables - NUMERIC TREATMENT
if(length(ordinal_vars) > 0) {
  cat("\n--- Ordinal Variables (Numeric Statistics) ---\n")
  
  # Convert ordinal to numeric for statistics
  ordinal_numeric <- historic_data %>%
    select(all_of(ordinal_vars)) %>%
    mutate(across(everything(), ~as.numeric(as.factor(.x))))
  
  ordinal_stats <- ordinal_numeric %>%
    summarise(across(everything(), 
                     list(mean = ~mean(.x, na.rm = TRUE),
                          median = ~median(.x, na.rm = TRUE),
                          sd = ~sd(.x, na.rm = TRUE),
                          variance = ~var(.x, na.rm = TRUE),
                          min = ~min(.x, na.rm = TRUE),
                          max = ~max(.x, na.rm = TRUE)),
                     .names = "{.col}_{.fn}")) %>%
    pivot_longer(everything(),
                 names_to = c("Variable", "Statistic"),
                 names_pattern = "(.*)_(mean|median|sd|variance|min|max)",
                 values_to = "Value") %>%
    pivot_wider(names_from = Statistic, values_from = Value)
  
  print(ordinal_stats, n = Inf)
  
  cat("\n--- Ordinal Variables (Category Distributions) ---\n")
  for(var in head(ordinal_vars, 8)) {
    cat("\n", var, ":\n", sep = "")
    freq_table <- table(historic_data[[var]], useNA = "no")
    freq_pct <- prop.table(freq_table) * 100
    
    for(i in seq_along(freq_pct)) {
      cat(sprintf("  %s: %d (%.1f%%)\n", names(freq_pct)[i], freq_table[i], freq_pct[i]))
    }
  }
}

# Nominal summaries
if(length(nominal_vars) > 0) {
  cat("\n--- Nominal Variables (Top Categories) ---\n")
  for(var in head(nominal_vars, 5)) {
    cat("\n", var, ":\n", sep = "")
    freq_table <- table(historic_data[[var]], useNA = "no")
    cat("  Unique values:", length(freq_table), "\n")
    freq_pct <- prop.table(freq_table) * 100
    top_3 <- head(sort(freq_pct, decreasing = TRUE), 3)
    for(i in seq_along(top_3)) {
      cat(sprintf("  %s: %d (%.1f%%)\n", names(top_3)[i], freq_table[names(top_3)[i]], top_3[i]))
    }
  }
}

# Boolean summaries
if(length(boolean_vars) > 0) {
  cat("\n--- Boolean Variables ---\n")
  for(var in boolean_vars) {
    if(var %in% names(historic_data)) {
      true_count <- sum(historic_data[[var]], na.rm = TRUE)
      total_count <- sum(!is.na(historic_data[[var]]))
      true_pct <- (true_count / total_count) * 100
      cat(sprintf("  %s: TRUE=%d (%.1f%%), FALSE=%d (%.1f%%)\n",
                  var, true_count, true_pct, total_count - true_count, 100 - true_pct))
    }
  }
}


=== Summary Statistics ===


--- Boolean Variables ---


ERROR: Error in Summary.factor(structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, : ‘sum’ not meaningful for factors


In [None]:
# =====================================================
# INTEGER VARIABLES: HISTOGRAMS
# =====================================================

cat("\n=== Integer Variable Distributions (Histograms) ===\n")

if(length(integer_vars) > 0) {
  
  for(var in integer_vars[1:min(8, length(integer_vars))]) {
    
    p <- ggplot(historic_data, aes(x = .data[[var]])) +
      geom_histogram(bins = 10, fill = "#2E86AB", color = "#023E73", alpha = 0.85) +
      geom_vline(aes(xintercept = mean(.data[[var]], na.rm = TRUE)),
                 color = "#E63946", linetype = "dashed", size = 1.2) +
      geom_vline(aes(xintercept = median(.data[[var]], na.rm = TRUE)),
                 color = "#06FFA5", linetype = "dashed", size = 1.2) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray30"),
        axis.title = element_text(size = 10, face = "bold"),
        axis.text.x = element_text(size = 9),
        panel.grid.minor = element_blank()
      ) +
      labs(
        title = paste("Distribution:", var),
        subtitle = sprintf("Mean=%.1f | Median=%.1f | SD=%.1f (Integer)",
                          mean(historic_data[[var]], na.rm = TRUE),
                          median(historic_data[[var]], na.rm = TRUE),
                          sd(historic_data[[var]], na.rm = TRUE)),
        x = var,
        y = "Frequency"
      )
    
    print(p)
  }
}

In [None]:
# =====================================================
# ORDINAL VARIABLES: HISTOGRAMS
# =====================================================

cat("\n=== Ordinal Variable Distributions (Histograms) ===\n")

if(length(ordinal_vars) > 0) {
  
  for(var in ordinal_vars[1:min(8, length(ordinal_vars))]) {
    
    freq_data <- as.data.frame(table(historic_data[[var]])) %>%
      arrange(Var1)
    
    colnames(freq_data) <- c("Category", "Count")
    
    # Calculate numeric stats for subtitle
    numeric_vals <- as.numeric(as.factor(historic_data[[var]]))
    mean_val <- mean(numeric_vals, na.rm = TRUE)
    median_val <- median(numeric_vals, na.rm = TRUE)
    sd_val <- sd(numeric_vals, na.rm = TRUE)
    
    p <- ggplot(freq_data, aes(x = Category, y = Count)) +
      geom_bar(stat = "identity", fill = "#6A4C93", color = "#1A1423", alpha = 0.85) +
      geom_text(aes(label = Count), vjust = -0.5, size = 3.5, fontface = "bold") +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray30"),
        axis.title = element_text(size = 10, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 9)
      ) +
      labs(
        title = paste("Distribution:", var),
        subtitle = sprintf("Mean=%.2f | Median=%.2f | SD=%.2f (Ordinal)", 
                          mean_val, median_val, sd_val),
        x = "Category",
        y = "Frequency"
      )
    
    print(p)
  }
}

In [None]:
# =====================================================
# CONTINUOUS VARIABLES: BOX & WHISKER PLOTS
# =====================================================

cat("\n=== Continuous Variable Distributions (Box & Whisker Plots) ===\n")

if(length(continuous_vars) > 0) {
  
  for(var in continuous_vars[1:min(10, length(continuous_vars))]) {
    
    # Calculate outlier boundaries
    Q1 <- quantile(historic_data[[var]], 0.25, na.rm = TRUE)
    Q3 <- quantile(historic_data[[var]], 0.75, na.rm = TRUE)
    IQR_val <- Q3 - Q1
    lower_bound <- Q1 - 1.5 * IQR_val
    upper_bound <- Q3 + 1.5 * IQR_val
    
    # Identify outliers
    outliers <- historic_data %>%
      filter(!is.na(.data[[var]])) %>%
      filter(.data[[var]] < lower_bound | .data[[var]] > upper_bound) %>%
      select(!!sym(var))
    
    n_outliers <- nrow(outliers)
    
    p <- ggplot(historic_data, aes(y = .data[[var]])) +
      geom_boxplot(aes(x = ""), 
                   fill = "#4ECDC4", 
                   color = "#023E73", 
                   alpha = 0.7,
                   outlier.colour = "#E63946",
                   outlier.shape = 16,
                   outlier.size = 2,
                   outlier.alpha = 0.6,
                   width = 0.5) +
      stat_boxplot(aes(x = ""), geom = "errorbar", width = 0.3, color = "#023E73") +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray30"),
        axis.title = element_text(size = 10, face = "bold"),
        axis.text = element_text(size = 9),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        panel.grid.major.x = element_blank()
      ) +
      labs(
        title = paste("Distribution:", var),
        subtitle = sprintf("Median=%.2f | IQR=%.2f | Outliers=%d (Continuous)",
                          median(historic_data[[var]], na.rm = TRUE),
                          IQR_val, n_outliers),
        y = var
      ) +
      scale_y_continuous(labels = scales::comma)
    
    print(p)
  }
  
  # Multi-panel box plot overview
  cat("\n--- Multi-Panel Box Plot Overview ---\n")
  
  continuous_sample_vars <- continuous_vars[1:min(9, length(continuous_vars))]
  
  if(length(continuous_sample_vars) >= 3) {
    
    plot_data_long <- historic_data %>%
      select(all_of(continuous_sample_vars)) %>%
      pivot_longer(everything(), names_to = "Variable", values_to = "Value")
    
    p_multi <- ggplot(plot_data_long, aes(x = "", y = Value)) +
      geom_boxplot(fill = "#38B000", 
                   color = "#1A5E00", 
                   alpha = 0.7,
                   outlier.colour = "#E63946",
                   outlier.shape = 16,
                   outlier.size = 1,
                   outlier.alpha = 0.5) +
      facet_wrap(~Variable, scales = "free_y", ncol = 3) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        strip.text = element_text(size = 9, face = "bold"),
        axis.text.y = element_text(size = 7),
        axis.text.x = element_blank(),
        axis.title = element_text(size = 9),
        axis.title.x = element_blank(),
        panel.grid.major.x = element_blank()
      ) +
      labs(
        title = "Continuous Variables - Box & Whisker Plot Overview",
        y = "Value"
      ) +
      scale_y_continuous(labels = scales::comma)
    
    print(p_multi)
  }
  
  # Horizontal box plots for comparison
  cat("\n--- Horizontal Comparison Box Plots ---\n")
  
  if(length(continuous_sample_vars) >= 3) {
    
    # Normalize data for better comparison
    plot_data_long_scaled <- historic_data %>%
      select(all_of(continuous_sample_vars)) %>%
      mutate(across(everything(), ~scale(.x)[,1])) %>%
      pivot_longer(everything(), names_to = "Variable", values_to = "Scaled_Value")
    
    p_horizontal <- ggplot(plot_data_long_scaled, 
                           aes(x = reorder(Variable, Scaled_Value, FUN = median), 
                               y = Scaled_Value)) +
      geom_boxplot(fill = "#FF6B6B", 
                   color = "#C92A2A", 
                   alpha = 0.7,
                   outlier.colour = "#E63946",
                   outlier.shape = 16,
                   outlier.size = 1.5,
                   outlier.alpha = 0.6) +
      coord_flip() +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray30"),
        axis.title = element_text(size = 10, face = "bold"),
        axis.text = element_text(size = 9)
      ) +
      labs(
        title = "Continuous Variables - Standardized Comparison",
        subtitle = "Scaled to mean=0, SD=1 for comparison",
        x = "Variable",
        y = "Standardized Value"
      ) +
      geom_hline(yintercept = 0, linetype = "dashed", color = "gray40", size = 0.8)
    
    print(p_horizontal)
  }
}

In [None]:
# =====================================================
# NOMINAL VARIABLES: PIE CHARTS
# =====================================================

cat("\n=== Nominal Variable Distributions (Pie Charts) ===\n")

if(length(nominal_vars) > 0) {
  
  for(var in nominal_vars[1:min(8, length(nominal_vars))]) {
    
    # Get frequency data (top 8 categories + "Other")
    freq_table <- table(historic_data[[var]], useNA = "no")
    freq_df <- as.data.frame(freq_table) %>%
      arrange(desc(Freq))
    
    colnames(freq_df) <- c("Category", "Count")
    
    # Keep top 7, group rest as "Other"
    if(nrow(freq_df) > 7) {
      top_cats <- freq_df[1:7, ]
      other_count <- sum(freq_df$Count[8:nrow(freq_df)])
      freq_df <- rbind(top_cats, data.frame(Category = "Other", Count = other_count))
    }
    
    freq_df <- freq_df %>%
      mutate(
        Percentage = round(Count / sum(Count) * 100, 1),
        Label = ifelse(Percentage >= 3, paste0(Percentage, "%"), "")
      )
    
    p_pie <- ggplot(freq_df, aes(x = "", y = Count, fill = Category)) +
      geom_bar(stat = "identity", width = 1, color = "white", size = 1.5) +
      coord_polar("y", start = 0) +
      geom_text(aes(label = Label), 
                position = position_stack(vjust = 0.5),
                size = 3.5, fontface = "bold", color = "black") +
      theme_void() +
      theme(
        plot.title = element_text(size = 13, face = "bold", hjust = 0.5, margin = margin(b = 10)),
        plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray40", margin = margin(b = 15)),
        legend.position = "right",
        legend.title = element_text(size = 10, face = "bold"),
        legend.text = element_text(size = 8)
      ) +
      labs(
        title = paste("Distribution:", var),
        subtitle = paste("(Nominal -", length(freq_table), "unique values)"),
        fill = "Category"
      ) +
      scale_fill_brewer(palette = "Set3")
    
    print(p_pie)
  }
}

In [None]:
# =====================================================
# BOOLEAN VARIABLES: BAR CHARTS
# =====================================================

cat("\n=== Boolean Variable Distributions (Bar Charts) ===\n")

if(length(boolean_vars) > 0) {
  
  for(var in boolean_vars) {
    
    if(var %in% names(historic_data)) {
      
      freq_data <- data.frame(
        Value = c("TRUE", "FALSE"),
        Count = c(
          sum(historic_data[[var]], na.rm = TRUE),
          sum(!historic_data[[var]], na.rm = TRUE)
        )
      ) %>%
        mutate(
          Percentage = round(Count / sum(Count) * 100, 1),
          Label = paste0(Count, "\n(", Percentage, "%)")
        )
      
      p_bar <- ggplot(freq_data, aes(x = Value, y = Count, fill = Value)) +
        geom_bar(stat = "identity", color = "black", alpha = 0.85, width = 0.6) +
        geom_text(aes(label = Label), vjust = -0.5, size = 4, fontface = "bold") +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 13, face = "bold", hjust = 0.5),
          plot.subtitle = element_text(size = 9, hjust = 0.5, color = "gray30"),
          axis.title = element_text(size = 10, face = "bold"),
          axis.text = element_text(size = 10),
          legend.position = "none",
          panel.grid.major.x = element_blank()
        ) +
        labs(
          title = paste("Distribution:", var),
          subtitle = "(Boolean Variable)",
          x = "Value",
          y = "Count"
        ) +
        scale_fill_manual(values = c("TRUE" = "#4CAF50", "FALSE" = "#F44336")) +
        scale_y_continuous(expand = expansion(mult = c(0, 0.15)))
      
      print(p_bar)
    }
  }
}

In [None]:
# =====================================================
# CORRELATION ANALYSIS
# =====================================================

cat("\n=== Correlation Analysis ===\n")

# Combine integer, continuous, and ordinal (as numeric) for correlation
numeric_for_corr <- c(integer_vars, continuous_vars)
numeric_for_corr <- intersect(numeric_for_corr, names(historic_data))

# Add ordinal as numeric
if(length(ordinal_vars) > 0) {
  ordinal_in_data <- intersect(ordinal_vars, names(historic_data))
  numeric_for_corr <- c(numeric_for_corr, ordinal_in_data)
}

if(length(numeric_for_corr) >= 2) {
  
  # Prepare data - convert ordinal to numeric
  cor_data <- historic_data %>%
    select(all_of(numeric_for_corr))
  
  # Convert ordinal columns to numeric
  for(col in ordinal_in_data) {
    if(col %in% names(cor_data)) {
      cor_data[[col]] <- as.numeric(as.factor(cor_data[[col]]))
    }
  }
  
  # Keep only numeric columns
  cor_data <- cor_data %>%
    select(where(is.numeric))
  
  # Remove zero-variance columns
  cor_data <- cor_data %>%
    select(where(~var(.x, na.rm = TRUE) > 0))
  
  # Calculate correlation matrix
  cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")
  
  # Full correlation heatmap
  cat("\n--- Creating Correlation Heatmap ---\n")
  
  corrplot(cor_matrix, 
           method = "color",
           type = "upper",
           order = "hclust",
           tl.col = "black",
           tl.srt = 45,
           tl.cex = 0.7,
           cl.cex = 0.8,
           col = colorRampPalette(c("#D32F2F", "#FFFFFF", "#1976D2"))(200),
           title = "Correlation Matrix - Numeric Variables (incl. Ordinal)",
           mar = c(0, 0, 2, 0))
  
  # Sale price correlations
  if("sale_price" %in% colnames(cor_matrix)) {
    cat("\n--- Top Correlations with Sale Price ---\n")
    
    price_cors <- cor_matrix["sale_price", ]
    price_cors <- price_cors[order(abs(price_cors), decreasing = TRUE)]
    price_cors <- price_cors[names(price_cors) != "sale_price"]
    
    top_15 <- head(price_cors, 15)
    for(i in seq_along(top_15)) {
      cat(sprintf("  %s: %.3f\n", names(top_15)[i], top_15[i]))
    }
    
    # Bar chart of correlations
    price_cor_df <- data.frame(
      Variable = names(top_15),
      Correlation = as.numeric(top_15)
    )
    
    p_cor <- ggplot(price_cor_df, aes(x = reorder(Variable, abs(Correlation)), 
                                       y = Correlation, fill = Correlation > 0)) +
      geom_bar(stat = "identity", color = "black", alpha = 0.85) +
      geom_text(aes(label = sprintf("%.2f", Correlation)),
                hjust = ifelse(price_cor_df$Correlation > 0, -0.2, 1.2),
                size = 3.5, fontface = "bold") +
      coord_flip() +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        axis.title = element_text(size = 11, face = "bold"),
        legend.position = "none"
      ) +
      labs(
        title = "Top 15 Correlations with Sale Price",
        x = "Variable",
        y = "Correlation Coefficient"
      ) +
      scale_fill_manual(values = c("TRUE" = "#2E7D32", "FALSE" = "#C62828")) +
      geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.8)
    
    print(p_cor)
  }
}

cat("\n=== EDA Complete ===\n")
cat("\nVisualization Summary:")
cat("\n  - Integer: Histograms")
cat("\n  - Continuous: Box & Whisker Plots (red outliers)")
cat("\n  - Ordinal: Numeric stats + Histograms")
cat("\n  - Nominal: Pie Charts")
cat("\n  - Boolean: Bar Charts")
cat("\n  - Correlation: Heatmap (Integer + Continuous + Ordinal)\n\n")