In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ===== Load Data =====
folder_path = Path("data")  # Make sure there's a 'data/' folder next to this script

building_years = []
for year in range(2015, 2020):
    df = pd.read_csv(folder_path / f'building_details_{year}.csv')  # FIXED
    df['year'] = year
    building_years.append(df)

building_all = pd.concat(building_years, ignore_index=True)
building_all = building_all.drop_duplicates(subset=['acct', 'year'], keep='first')
pivoted = building_all.pivot(index='acct', columns='year')
pivoted.columns = [f'{col}_{year}' for col, year in pivoted.columns]
pivoted = pivoted.reset_index()

train = pd.read_csv(folder_path / 'assessment_history_train.csv')  # FIXED
test = pd.read_csv(folder_path / 'assessment_history_test.csv')    # FIXED

train_merged = train.merge(pivoted, on='acct', how='left')
test_merged = test.merge(pivoted, on='acct', how='left')


  df = pd.read_csv(folder_path + f'building_details_{year}.csv')
  test = pd.read_csv(folder_path + 'assessment_history_test.csv')


In [None]:
# Step 1: Identify all '2019' columns in train
train_2019_cols = [col for col in train_merged.columns if '2019' in col]

# Step 2: Find which 2019 columns are missing in test
train_only_2019_cols = [col for col in train_2019_cols if col not in test_merged.columns]

# Step 3: Always drop 'protested_2019' due to leakage risk
train_only_2019_cols.append('protested_2019')

# Step 4: Drop the identified columns from train
train_merged = train_merged.drop(columns=train_only_2019_cols, errors='ignore')

# Step 5: Optional logging
print(f" Dropped {len(train_only_2019_cols)} columns from train (including protested_2019 if present):")
print(train_only_2019_cols)


✅ Dropped 4 columns from train (including protested_2019 if present):
['building_value_2019', 'land_value_2019', 'assessed_2019', 'protested_2019']


In [None]:
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'protested_2019' in df.columns:
        df.drop(columns='protested_2019', inplace=True)
        print(f" Dropped 'protested_2019' from {df_name}")
    else:
        print(f" 'protested_2019' not found in {df_name}")


ℹ️ 'protested_2019' not found in train_merged
✅ Dropped 'protested_2019' from test_merged


In [None]:
## Get all '2019' columns in train and test
train_2019_cols = sorted([col for col in train_merged.columns if '2019' in col])
test_2019_cols = sorted([col for col in test_merged.columns if '2019' in col])

# Print train columns
print(" 2019 Columns in train_merged:")
for col in train_2019_cols:
    print(f" - {col}")

print("\n 2019 Columns in test_merged:")
for col in test_2019_cols:
    print(f" - {col}")



📘 2019 Columns in train_merged:
 - bedrooms_2019
 - building_area_2019
 - building_condition_2019
 - deck_area_2019
 - elevator_2019
 - exterior_walls_2019
 - fireplaces_2019
 - floor_area_lower_2019
 - floor_area_primary_2019
 - floor_area_upper_2019
 - floors_2019
 - foundation_type_2019
 - full_bath_2019
 - garage_area_2019
 - grade_2019
 - half_bath_2019
 - has_cooling_2019
 - has_heat_2019
 - land_area_2019
 - mobile_home_area_2019
 - physical_condition_2019
 - porch_area_2019
 - quality_2019
 - quality_description_2019
 - total_rooms_2019
 - year_built_2019
 - year_remodeled_2019

📙 2019 Columns in test_merged:
 - bedrooms_2019
 - building_area_2019
 - building_condition_2019
 - deck_area_2019
 - elevator_2019
 - exterior_walls_2019
 - fireplaces_2019
 - floor_area_lower_2019
 - floor_area_primary_2019
 - floor_area_upper_2019
 - floors_2019
 - foundation_type_2019
 - full_bath_2019
 - garage_area_2019
 - grade_2019
 - half_bath_2019
 - has_cooling_2019
 - has_heat_2019
 - land_a

In [None]:
# Compare sets
train_only_2019 = sorted(list(set(train_2019_cols) - set(test_2019_cols)))
test_only_2019 = sorted(list(set(test_2019_cols) - set(train_2019_cols)))

# Print comparison result
if not train_only_2019 and not test_only_2019:
    print("\n All 2019 columns match between train_merged and test_merged.")
else:
    print("\n Mismatched 2019 columns found:")

    if train_only_2019:
        print(" In train_merged but not in test_merged:")
        for col in train_only_2019:
            print(f"   - {col}")

    if test_only_2019:
        print(" In test_merged but not in train_merged:")
        for col in test_only_2019:
            print(f"   - {col}")


✅ All 2019 columns match between train_merged and test_merged.


In [6]:
# Step 1: Store target separately
y_train = train_merged["TARGET"].values

# Step 4: Confirm sizes
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")


Train shape: (628287, 37)
Test shape: (418858, 33)


In [7]:
# Calculate % of missing values in each column of TRAIN only
train_na = (train_merged.isnull().sum() / train_merged.shape[0]) * 100

# Drop columns with no missing values, sort the rest
train_na = train_na[train_na > 0].sort_values(ascending=False)

# Optional: Show top 200 missing features
missing_data = pd.DataFrame({'Missing Ratio (%)': train_na})

# Display
print("Top columns with missing data (based on training set only):")
missing_data.head(20)



Top columns with missing data (based on training set only):


Unnamed: 0,Missing Ratio (%)
building_area_2015,5.954126
land_value_2015,5.954126
land_area_2015,5.954126
protested_2015,5.954126
assessed_2015,5.954126
building_value_2015,5.954126
quality_2015,5.74228
quality_description_2015,5.74228
year_built_2015,5.74228
year_remodeled_2015,5.74228


In [None]:
def create_floor_area_totals(df, years):
    for y in years:
        primary_col = f'floor_area_primary_{y}'
        upper_col   = f'floor_area_upper_{y}'
        lower_col   = f'floor_area_lower_{y}'
        total_col   = f'floor_area_total_{y}'

        if all(col in df.columns for col in [primary_col, upper_col, lower_col]):
            df[total_col] = (
                df[primary_col].fillna(0) +
                df[upper_col].fillna(0) +
                df[lower_col].fillna(0)
            )
            print(f" Created: {total_col}")
        else:
            print(f" Skipping {total_col} — one or more components missing.")
    return df

# Apply for years 2015–2019
years = ['2015', '2016', '2017', '2018', '2019']
train_merged = create_floor_area_totals(train_merged, years)
test_merged = create_floor_area_totals(test_merged, years)






✅ Created: floor_area_total_2015
✅ Created: floor_area_total_2016
✅ Created: floor_area_total_2017
✅ Created: floor_area_total_2018
✅ Created: floor_area_total_2019
✅ Created: floor_area_total_2015
✅ Created: floor_area_total_2016
✅ Created: floor_area_total_2017
✅ Created: floor_area_total_2018
✅ Created: floor_area_total_2019


In [None]:
zero_pct = (train_merged['mobile_home_area_2015'] == 0).mean() * 100
print(f" mobile_home_area_2015 is 0 in {zero_pct:.2f}% of training rows")


📊 mobile_home_area_2015 is 0 in 94.42% of training rows


In [None]:
cols_to_drop = [col for col in train_merged.columns if col.startswith("mobile_home_area")]

# Drop from both sets
train_merged.drop(columns=cols_to_drop, inplace=True, errors='ignore')
test_merged.drop(columns=cols_to_drop, inplace=True, errors='ignore')

print(f" Dropped columns from train/test: {cols_to_drop}")



🗑️ Dropped columns from train/test: ['mobile_home_area_2015', 'mobile_home_area_2016', 'mobile_home_area_2017', 'mobile_home_area_2018', 'mobile_home_area_2019']


In [None]:
zero_pct = (train_merged['deck_area_2015'] == 0).mean() * 100
print(f" deck_area_2015 is 0 in {zero_pct:.2f}% of training rows")



📊 deck_area_2015 is 0 in 90.66% of training rows


In [None]:
# Use training columns to identify which deck_area columns exist
cols_to_drop = [col for col in train_merged.columns if col.startswith("deck_area")]

# Drop those columns from both datasets
train_merged.drop(columns=cols_to_drop, inplace=True)
test_merged.drop(columns=cols_to_drop, inplace=True)

print(f" Dropped columns from train/test: {cols_to_drop}")



🗑️ Dropped columns from train/test: ['deck_area_2015', 'deck_area_2016', 'deck_area_2017', 'deck_area_2018', 'deck_area_2019']


In [None]:
def backfill_yearly_features(df, base_features, years):
    for feature in base_features:
        cols = [f"{feature}_{y}" for y in years if f"{feature}_{y}" in df.columns]
        if len(cols) >= 2:
            # Backfill all relevant year columns in-place
            df[cols] = df[cols].bfill(axis=1)
            print(f" Backfilled: {feature} across {cols}")
        else:
            print(f" Skipping {feature} — not enough year columns found.")
    return df

# Years in reverse so that bfill works from most recent (2019) to oldest (2015)
years = ['2019', '2018', '2017', '2016', '2015']
features = ['garage_area', 'porch_area']

# Apply to both train and test
train_merged = backfill_yearly_features(train_merged, features, years)
test_merged = backfill_yearly_features(test_merged, features, years)



✅ Backfilled: garage_area across ['garage_area_2019', 'garage_area_2018', 'garage_area_2017', 'garage_area_2016', 'garage_area_2015']
✅ Backfilled: porch_area across ['porch_area_2019', 'porch_area_2018', 'porch_area_2017', 'porch_area_2016', 'porch_area_2015']
✅ Backfilled: garage_area across ['garage_area_2019', 'garage_area_2018', 'garage_area_2017', 'garage_area_2016', 'garage_area_2015']
✅ Backfilled: porch_area across ['porch_area_2019', 'porch_area_2018', 'porch_area_2017', 'porch_area_2016', 'porch_area_2015']


In [None]:
def backfill_yearly_features(df, features, years):
    for feature in features:
        year_cols = [f"{feature}_{y}" for y in years if f"{feature}_{y}" in df.columns]
        if len(year_cols) >= 2:
            df[year_cols] = df[year_cols].bfill(axis=1)
            print(f" Backfilled: {feature} across {year_cols}")
        else:
            print(f" Skipped: Not enough year columns for '{feature}'")
    return df

# Use years in reverse for proper backfill (newest to oldest)
years = ['2019', '2018', '2017', '2016', '2015']
features = ['floors', 'half_bath', 'full_bath', 'total_rooms', 'bedrooms']

# Apply to both train and test
train_merged = backfill_yearly_features(train_merged, features, years)
test_merged = backfill_yearly_features(test_merged, features, years)




✅ Backfilled: floors across ['floors_2019', 'floors_2018', 'floors_2017', 'floors_2016', 'floors_2015']
✅ Backfilled: half_bath across ['half_bath_2019', 'half_bath_2018', 'half_bath_2017', 'half_bath_2016', 'half_bath_2015']
✅ Backfilled: full_bath across ['full_bath_2019', 'full_bath_2018', 'full_bath_2017', 'full_bath_2016', 'full_bath_2015']
✅ Backfilled: total_rooms across ['total_rooms_2019', 'total_rooms_2018', 'total_rooms_2017', 'total_rooms_2016', 'total_rooms_2015']
✅ Backfilled: bedrooms across ['bedrooms_2019', 'bedrooms_2018', 'bedrooms_2017', 'bedrooms_2016', 'bedrooms_2015']
✅ Backfilled: floors across ['floors_2019', 'floors_2018', 'floors_2017', 'floors_2016', 'floors_2015']
✅ Backfilled: half_bath across ['half_bath_2019', 'half_bath_2018', 'half_bath_2017', 'half_bath_2016', 'half_bath_2015']
✅ Backfilled: full_bath across ['full_bath_2019', 'full_bath_2018', 'full_bath_2017', 'full_bath_2016', 'full_bath_2015']
✅ Backfilled: total_rooms across ['total_rooms_2019', 

In [None]:
# Identify elevator-related columns from training set
elevator_cols = [col for col in train_merged.columns if col.startswith("elevator")]

# Drop from both train and test
train_merged.drop(columns=elevator_cols, inplace=True)
test_merged.drop(columns=elevator_cols, inplace=True)

print(f" Dropped elevator-related columns from train/test: {elevator_cols}")


🗑️ Dropped elevator-related columns from train/test: ['elevator_2015', 'elevator_2016', 'elevator_2017', 'elevator_2018', 'elevator_2019']


In [None]:
def backfill_year_features(df, features, years):
    for feature in features:
        cols = [f"{feature}_{y}" for y in years if f"{feature}_{y}" in df.columns]
        if len(cols) >= 2:
            df[cols] = df[cols].bfill(axis=1)
            print(f" Backfilled: {feature} across {cols}")
        else:
            print(f" Skipped {feature} — not enough year-based columns found.")
    return df

def create_year_built_final(df):
    year_cols = [f"year_built_{y}" for y in ['2019', '2018', '2017', '2016', '2015'] if f"year_built_{y}" in df.columns]
    if year_cols:
        df['year_built_final'] = df[year_cols].bfill(axis=1).iloc[:, 0]
        print(f" Created year_built_final from: {year_cols}")
    else:
        print(" Skipped: no year_built_* columns found.")
    return df

# Define reverse years for backfill (latest → oldest)
years = ['2019', '2018', '2017', '2016', '2015']
features_to_backfill = ['fireplaces', 'quality', 'quality_description']

# Apply to both train and test
train_merged = backfill_year_features(train_merged, features_to_backfill, years)
train_merged = create_year_built_final(train_merged)

test_merged = backfill_year_features(test_merged, features_to_backfill, years)
test_merged = create_year_built_final(test_merged)



✅ Backfilled: fireplaces across ['fireplaces_2019', 'fireplaces_2018', 'fireplaces_2017', 'fireplaces_2016', 'fireplaces_2015']
✅ Backfilled: quality across ['quality_2019', 'quality_2018', 'quality_2017', 'quality_2016', 'quality_2015']
✅ Backfilled: quality_description across ['quality_description_2019', 'quality_description_2018', 'quality_description_2017', 'quality_description_2016', 'quality_description_2015']
✅ Created year_built_final from: ['year_built_2019', 'year_built_2018', 'year_built_2017', 'year_built_2016', 'year_built_2015']
✅ Backfilled: fireplaces across ['fireplaces_2019', 'fireplaces_2018', 'fireplaces_2017', 'fireplaces_2016', 'fireplaces_2015']
✅ Backfilled: quality across ['quality_2019', 'quality_2018', 'quality_2017', 'quality_2016', 'quality_2015']
✅ Backfilled: quality_description across ['quality_description_2019', 'quality_description_2018', 'quality_description_2017', 'quality_description_2016', 'quality_description_2015']
✅ Created year_built_final from

In [17]:
## Identify columns to drop from training data
cols_to_drop = [col for col in train_merged.columns if col.startswith("year_remodeled")]

# Drop from both train and test
train_merged.drop(columns=cols_to_drop, inplace=True)
test_merged.drop(columns=cols_to_drop, inplace=True)

print(f"🗑️ Dropped year_remodeled-related columns from train/test: {cols_to_drop}")



🗑️ Dropped year_remodeled-related columns from train/test: ['year_remodeled_2015', 'year_remodeled_2016', 'year_remodeled_2017', 'year_remodeled_2018', 'year_remodeled_2019']


In [None]:
def backfill_categorical_year_features(df, features, years):
    for feature in features:
        year_cols = [f"{feature}_{y}" for y in years if f"{feature}_{y}" in df.columns]
        if len(year_cols) >= 2:
            df[year_cols] = df[year_cols].bfill(axis=1)
            print(f" Backfilled: {feature} across {year_cols}")
        else:
            print(f" Skipped: {feature} — not enough year-based columns.")
    return df

# Backfill from most recent year to oldest
years = ['2019', '2018', '2017', '2016', '2015']
features = ['building_condition', 'foundation_type', 'grade', 'has_cooling', 
            'has_heat', 'physical_condition', 'exterior_walls']

# Apply to train and test
train_merged = backfill_categorical_year_features(train_merged, features, years)
test_merged = backfill_categorical_year_features(test_merged, features, years)




✅ Backfilled: building_condition across ['building_condition_2019', 'building_condition_2018', 'building_condition_2017', 'building_condition_2016', 'building_condition_2015']
✅ Backfilled: foundation_type across ['foundation_type_2019', 'foundation_type_2018', 'foundation_type_2017', 'foundation_type_2016', 'foundation_type_2015']
✅ Backfilled: grade across ['grade_2019', 'grade_2018', 'grade_2017', 'grade_2016', 'grade_2015']
✅ Backfilled: has_cooling across ['has_cooling_2019', 'has_cooling_2018', 'has_cooling_2017', 'has_cooling_2016', 'has_cooling_2015']
✅ Backfilled: has_heat across ['has_heat_2019', 'has_heat_2018', 'has_heat_2017', 'has_heat_2016', 'has_heat_2015']
✅ Backfilled: physical_condition across ['physical_condition_2019', 'physical_condition_2018', 'physical_condition_2017', 'physical_condition_2016', 'physical_condition_2015']
✅ Backfilled: exterior_walls across ['exterior_walls_2019', 'exterior_walls_2018', 'exterior_walls_2017', 'exterior_walls_2016', 'exterior_wal

In [None]:
def backfill_year_columns(df, features, years):
    for feature in features:
        cols = [f"{feature}_{y}" for y in years if f"{feature}_{y}" in df.columns]
        if len(cols) >= 2:
            df[cols] = df[cols].bfill(axis=1)
            print(f" Backfilled: {feature} across {cols}")
        else:
            print(f" Skipped: {feature} — not enough year-based columns found.")
    return df

# Define year ranges
area_years = ['2019', '2018', '2017', '2016', '2015']
value_years = ['2018', '2017', '2016', '2015']

# Define feature groups
area_features = ['building_area', 'land_area']
value_features = ['building_value', 'land_value', 'assessed']

# Apply to train and test
train_merged = backfill_year_columns(train_merged, area_features, area_years)
train_merged = backfill_year_columns(train_merged, value_features, value_years)

test_merged = backfill_year_columns(test_merged, area_features, area_years)
test_merged = backfill_year_columns(test_merged, value_features, value_years)




✅ Backfilled: building_area across ['building_area_2019', 'building_area_2018', 'building_area_2017', 'building_area_2016', 'building_area_2015']
✅ Backfilled: land_area across ['land_area_2019', 'land_area_2018', 'land_area_2017', 'land_area_2016', 'land_area_2015']
✅ Backfilled: building_value across ['building_value_2018', 'building_value_2017', 'building_value_2016', 'building_value_2015']
✅ Backfilled: land_value across ['land_value_2018', 'land_value_2017', 'land_value_2016', 'land_value_2015']
✅ Backfilled: assessed across ['assessed_2018', 'assessed_2017', 'assessed_2016', 'assessed_2015']
✅ Backfilled: building_area across ['building_area_2019', 'building_area_2018', 'building_area_2017', 'building_area_2016', 'building_area_2015']
✅ Backfilled: land_area across ['land_area_2019', 'land_area_2018', 'land_area_2017', 'land_area_2016', 'land_area_2015']
✅ Backfilled: building_value across ['building_value_2018', 'building_value_2017', 'building_value_2016', 'building_value_2015'

In [None]:
def backfill_protested_columns(df, years):
    cols = [f"protested_{y}" for y in years if f"protested_{y}" in df.columns]
    if len(cols) >= 2:
        df[cols] = df[cols].bfill(axis=1)
        print(f" Backfilled: protested across {cols}")
    else:
        print(" Skipped protested — not enough year-based columns found.")
    return df

# Only use pre-2019 years to avoid leakage
years = ['2018', '2017', '2016', '2015']

# Apply to both datasets
train_merged = backfill_protested_columns(train_merged, years)
test_merged = backfill_protested_columns(test_merged, years)




✅ Backfilled: protested across ['protested_2018', 'protested_2017', 'protested_2016', 'protested_2015']
✅ Backfilled: protested across ['protested_2018', 'protested_2017', 'protested_2016', 'protested_2015']


In [None]:
# Save 'acct' from test_merged only
acct_test = test_merged[['acct']].copy() if 'acct' in test_merged.columns else None

# Drop 'acct' from both train and test
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'acct' in df.columns:
        df.drop(columns='acct', inplace=True)
        print(f" Dropped 'acct' from {df_name}")




🗑️ Dropped 'acct' from train_merged
🗑️ Dropped 'acct' from test_merged


In [None]:
for col in ['zone', 'subneighborhood', 'neighborhood', 'region']:
    if col in train_merged.columns:
        print(f"{col}: {train_merged[col].nunique()} unique values in training set")
    else:
        print(f" {col} not found in training set")


zone: 1589 unique values in training set
subneighborhood: 6161 unique values in training set
neighborhood: 959 unique values in training set
region: 87 unique values in training set


cols_to_drop = ['zone', 'subneighborhood']

for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    drop_cols = [col for col in cols_to_drop if col in df.columns]
    if drop_cols:
        df.drop(columns=drop_cols, inplace=True)
        print(f"🗑️ Dropped columns from {df_name}: {drop_cols}")


In [None]:
for col in ['neighborhood', 'region','zone','subneighborhood']:
    if col in train_merged.columns:
        # Step 1: Compute frequency from training data
        freq_map = train_merged[col].value_counts(normalize=True)

        # Step 2: Apply to both datasets
        train_merged[f'{col}_freq'] = train_merged[col].map(freq_map)
        test_merged[f'{col}_freq'] = test_merged[col].map(freq_map)

        print(f" Frequency encoded: {col} → {col}_freq (based on training set)")
    else:
        print(f" Column '{col}' not found in training set")


✅ Frequency encoded: neighborhood → neighborhood_freq (based on training set)


  train_merged[f'{col}_freq'] = train_merged[col].map(freq_map)
  test_merged[f'{col}_freq'] = test_merged[col].map(freq_map)
  train_merged[f'{col}_freq'] = train_merged[col].map(freq_map)
  test_merged[f'{col}_freq'] = test_merged[col].map(freq_map)


✅ Frequency encoded: region → region_freq (based on training set)
✅ Frequency encoded: zone → zone_freq (based on training set)


  train_merged[f'{col}_freq'] = train_merged[col].map(freq_map)
  test_merged[f'{col}_freq'] = test_merged[col].map(freq_map)
  train_merged[f'{col}_freq'] = train_merged[col].map(freq_map)


✅ Frequency encoded: subneighborhood → subneighborhood_freq (based on training set)


  test_merged[f'{col}_freq'] = test_merged[col].map(freq_map)


In [None]:
if all(col in train_merged.columns for col in ['land_area_2019', 'land_area_2015']):
    growth = train_merged['land_area_2019'] - train_merged['land_area_2015']
    changed_pct = (growth != 0).mean() * 100
    print(f" Percentage of homes with land area change (2015→2019): {changed_pct:.2f}%")
else:
    print(" One or both columns ('land_area_2015', 'land_area_2019') not found in training set")




🧪 Percentage of homes with land area change (2015→2019): 6.42%


In [None]:
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'year_built_final' in df.columns:
        df['year_built_final'] = df['year_built_final'].astype(str)
        print(f" Converted 'year_built_final' to string in {df_name}")
    else:
        print(f" 'year_built_final' not found in {df_name}")


🔤 Converted 'year_built_final' to string in train_merged
🔤 Converted 'year_built_final' to string in test_merged


In [None]:
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'floor_area_total_final' in df.columns and 'year_built_final' in df.columns:
        df.loc[df['floor_area_total_final'] == 0, 'year_built_final'] = 'None'
        print(f" Set 'year_built_final' to 'None' where 'floor_area_total_final' == 0 in {df_name}")
    else:
        print(f" Required columns missing in {df_name}")


⚠️ Required columns missing in train_merged
⚠️ Required columns missing in test_merged


In [None]:
# Define year ranges per feature type
value_years = range(2015, 2019)   # 2015–2018 for value columns
area_years  = range(2015, 2020)   # 2015–2019 for area columns

# Define base columns
base_cols_year_map = {
    'building_value': value_years,
    'land_value': value_years,
    'building_area': area_years,
    'land_area': area_years,
}

# Generate full list of columns to impute
cols_to_impute = []
for base, years in base_cols_year_map.items():
    for year in years:
        cols_to_impute.append(f"{base}_{year}")

# Impute using neighborhood → region strategy
for col in cols_to_impute:
    # First, fill by neighborhood (train only)
    if 'neighborhood' in train_merged.columns:
        medians_by_neigh = train_merged.groupby('neighborhood')[col].median()
        train_merged[col] = train_merged.apply(
            lambda row: medians_by_neigh[row['neighborhood']] if pd.isna(row[col]) else row[col], axis=1)
        test_merged[col] = test_merged.apply(
            lambda row: medians_by_neigh.get(row['neighborhood'], np.nan) if pd.isna(row[col]) else row[col], axis=1)

    # Then, fill remaining by region (train only)
    if 'region' in train_merged.columns:
        medians_by_region = train_merged.groupby('region')[col].median()
        train_merged[col] = train_merged.apply(
            lambda row: medians_by_region[row['region']] if pd.isna(row[col]) else row[col], axis=1)
        test_merged[col] = test_merged.apply(
            lambda row: medians_by_region.get(row['region'], np.nan) if pd.isna(row[col]) else row[col], axis=1)

    print(f" Imputed '{col}' using group medians (neighborhood → region) from training data")



✅ Imputed 'building_value_2015' using group medians (neighborhood → region) from training data
✅ Imputed 'building_value_2016' using group medians (neighborhood → region) from training data
✅ Imputed 'building_value_2017' using group medians (neighborhood → region) from training data
✅ Imputed 'building_value_2018' using group medians (neighborhood → region) from training data
✅ Imputed 'land_value_2015' using group medians (neighborhood → region) from training data
✅ Imputed 'land_value_2016' using group medians (neighborhood → region) from training data
✅ Imputed 'land_value_2017' using group medians (neighborhood → region) from training data
✅ Imputed 'land_value_2018' using group medians (neighborhood → region) from training data
✅ Imputed 'building_area_2015' using group medians (neighborhood → region) from training data
✅ Imputed 'building_area_2016' using group medians (neighborhood → region) from training data
✅ Imputed 'building_area_2017' using group medians (neighborhood → r

In [None]:
from pandas.api.types import is_numeric_dtype

# Step 1: Coerce non-numeric to NaN in both sets
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'year_built_final' in df.columns:
        df['year_built_final'] = pd.to_numeric(df['year_built_final'], errors='coerce')
        print(f" Coerced 'year_built_final' to numeric in {df_name}")

# Step 2: Group-based imputation (use training data only)
if 'year_built_final' in train_merged.columns and 'neighborhood' in train_merged.columns:
    # Neighborhood-based median from train
    neigh_medians = train_merged.groupby('neighborhood')['year_built_final'].median()

    # Apply to train
    train_merged['year_built_final'] = train_merged.apply(
        lambda row: neigh_medians[row['neighborhood']] if pd.isna(row['year_built_final']) else row['year_built_final'], axis=1)

    # Apply to test
    test_merged['year_built_final'] = test_merged.apply(
        lambda row: neigh_medians.get(row['neighborhood'], np.nan) if pd.isna(row['year_built_final']) else row['year_built_final'], axis=1)

if 'region' in train_merged.columns:
    # Region-based fallback median from train
    region_medians = train_merged.groupby('region')['year_built_final'].median()

    train_merged['year_built_final'] = train_merged.apply(
        lambda row: region_medians[row['region']] if pd.isna(row['year_built_final']) else row['year_built_final'], axis=1)

    test_merged['year_built_final'] = test_merged.apply(
        lambda row: region_medians.get(row['region'], np.nan) if pd.isna(row['year_built_final']) else row['year_built_final'], axis=1)

print(" Imputed 'year_built_final' using neighborhood → region medians from training set")


🔢 Coerced 'year_built_final' to numeric in train_merged
🔢 Coerced 'year_built_final' to numeric in test_merged
✅ Imputed 'year_built_final' using neighborhood → region medians from training set


In [None]:
# List of all assessed columns to impute
assessed_cols = ['assessed_2015', 'assessed_2016', 'assessed_2017', 'assessed_2018']

for col in assessed_cols:
    if col not in train_merged.columns:
        continue

    # Step 1: Compute medians from training data only
    neigh_medians = train_merged.groupby('neighborhood')[col].median()
    region_medians = train_merged.groupby('region')[col].median()
    global_median = train_merged[col].median()

    # Step 2: Train set imputation
    train_merged[col] = train_merged.apply(
        lambda row: neigh_medians[row['neighborhood']]
        if pd.isna(row[col]) and row['neighborhood'] in neigh_medians else
        region_medians[row['region']]
        if pd.isna(row[col]) and row['region'] in region_medians else
        global_median
        if pd.isna(row[col]) else
        row[col],
        axis=1
    )

    # Step 3: Test set imputation (using train medians only)
    test_merged[col] = test_merged.apply(
        lambda row: neigh_medians.get(row['neighborhood'], np.nan)
        if pd.isna(row[col]) else row[col],
        axis=1
    )
    test_merged[col] = test_merged.apply(
        lambda row: region_medians.get(row['region'], np.nan)
        if pd.isna(row[col]) else row[col],
        axis=1
    )
    test_merged[col].fillna(global_median, inplace=True)

    print(f" Imputed '{col}' using neighborhood → region → global medians (from training data)")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_merged[col].fillna(global_median, inplace=True)


✅ Imputed 'assessed_2015' using neighborhood → region → global medians (from training data)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_merged[col].fillna(global_median, inplace=True)


✅ Imputed 'assessed_2016' using neighborhood → region → global medians (from training data)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_merged[col].fillna(global_median, inplace=True)


✅ Imputed 'assessed_2017' using neighborhood → region → global medians (from training data)
✅ Imputed 'assessed_2018' using neighborhood → region → global medians (from training data)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_merged[col].fillna(global_median, inplace=True)


In [None]:
# === Step 1: Compute neighborhood-level stats ===
neigh_stats = train_merged.groupby('neighborhood')['assessed_2018'].agg([
    ('neigh_assess_mean', 'mean'),
    ('neigh_assess_median', 'median'),
    ('neigh_assess_std', 'std'),
    ('neigh_assess_q1', lambda x: x.quantile(0.25)),
    ('neigh_assess_q3', lambda x: x.quantile(0.75)),
]).reset_index()
neigh_stats['neigh_assess_iqr'] = neigh_stats['neigh_assess_q3'] - neigh_stats['neigh_assess_q1']

# === Step 2: Compute region-level stats ===
region_stats = train_merged.groupby('region')['assessed_2018'].agg([
    ('region_assess_mean', 'mean'),
    ('region_assess_median', 'median'),
    ('region_assess_std', 'std'),
    ('region_assess_q1', lambda x: x.quantile(0.25)),
    ('region_assess_q3', lambda x: x.quantile(0.75)),
]).reset_index()
region_stats['region_assess_iqr'] = region_stats['region_assess_q3'] - region_stats['region_assess_q1']

# === Step 3: Fallback std maps from training data ===
# For neighborhood fallback, group region medians of neighborhood std
neigh_std_by_region = neigh_stats.merge(train_merged[['neighborhood', 'region']], on='neighborhood', how='left') \
                                  .groupby('region')['neigh_assess_std'].median()
global_neigh_std = neigh_stats['neigh_assess_std'].median()

region_std_by_neigh = region_stats.merge(train_merged[['neighborhood', 'region']], on='region', how='left') \
                                   .groupby('neighborhood')['region_assess_std'].median()
global_region_std = region_stats['region_assess_std'].median()

# === Step 4: Merge into train/test and compute features ===
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    df = df.merge(neigh_stats, on='neighborhood', how='left')
    df = df.merge(region_stats, on='region', how='left')

    # Fill missing std values via fallback
    df['neigh_assess_std'] = df['neigh_assess_std'].fillna(
        df['region'].map(neigh_std_by_region)
    ).fillna(global_neigh_std)

    df['region_assess_std'] = df['region_assess_std'].fillna(
        df['neighborhood'].map(region_std_by_neigh)
    ).fillna(global_region_std)

    # Compute derived features
    df['assess_minus_neigh_mean'] = df['assessed_2018'] - df['neigh_assess_mean']
    df['assess_ratio_neigh_mean'] = df['assessed_2018'] / (df['neigh_assess_mean'] + 1e-6)
    df['z_score_assess_neigh'] = df['assess_minus_neigh_mean'] / (df['neigh_assess_std'] + 1e-6)

    df['assess_minus_region_mean'] = df['assessed_2018'] - df['region_assess_mean']
    df['assess_ratio_region_mean'] = df['assessed_2018'] / (df['region_assess_mean'] + 1e-6)
    df['z_score_assess_region'] = df['assess_minus_region_mean'] / (df['region_assess_std'] + 1e-6)

    # Save back
    if df_name == 'train_merged':
        train_merged = df
    else:
        test_merged = df

print(" Completed: Stats merge + std fallback + z-score computation.")


✅ Completed: Stats merge + std fallback + z-score computation.


In [None]:
cols_to_drop = ['neighborhood', 'region','zone','subneighborhood']

for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    drop_cols = [col for col in cols_to_drop if col in df.columns]
    if drop_cols:
        df.drop(columns=drop_cols, inplace=True)
        print(f" Dropped columns from {df_name}: {drop_cols}")


🗑️ Dropped columns from train_merged: ['neighborhood', 'region', 'zone', 'subneighborhood']
🗑️ Dropped columns from test_merged: ['neighborhood', 'region', 'zone', 'subneighborhood']


In [None]:
growth_pairs = {
    'building_value_growth': ('building_value_2018', 'building_value_2015'),
    'land_value_growth':     ('land_value_2018', 'land_value_2015'),
    'assessed_growth':       ('assessed_2018', 'assessed_2015')
}

for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    for new_col, (final_col, base_col) in growth_pairs.items():
        if final_col in df.columns and base_col in df.columns:
            df[new_col] = df[final_col] - df[base_col]
            print(f" Created {new_col} in {df_name}")
        else:
            print(f" Skipped {new_col} in {df_name}: missing {final_col} or {base_col}")

    # === Building Age ===
    if 'year_built_final' in df.columns:
        df['year_built_final'] = pd.to_numeric(df['year_built_final'], errors='coerce')  # handle 'Vacant', etc.
        df['building_age'] = 2018 - df['year_built_final']
        print(f" Created building_age in {df_name}")
    else:
        print(f" Skipped building_age in {df_name}: missing year_built_final")


✅ Created building_value_growth in train_merged
✅ Created land_value_growth in train_merged
✅ Created assessed_growth in train_merged
✅ Created building_age in train_merged
✅ Created building_value_growth in test_merged
✅ Created land_value_growth in test_merged
✅ Created assessed_growth in test_merged
✅ Created building_age in test_merged


In [None]:
if 'school_dist' in train_merged.columns:
    missing_pct = train_merged['school_dist'].isna().mean() * 100
    missing_count = train_merged['school_dist'].isna().sum()

    print(f" 'school_dist' has {missing_count} missing values ({missing_pct:.2f}%) in training set")
else:
    print(" 'school_dist' not found in training set")


🔍 'school_dist' has 3 missing values (0.00%) in training set


In [None]:
if 'school_dist' in train_merged.columns:
    school_dist_median = train_merged['school_dist'].median()

    for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
        if 'school_dist' in df.columns:
            df['school_dist'] = df['school_dist'].fillna(school_dist_median)
            print(f" Filled missing 'school_dist' in {df_name} using train median ({school_dist_median})")
else:
    print(" 'school_dist' not found in train_merged")


✅ Filled missing 'school_dist' in train_merged using train median (8.0)
✅ Filled missing 'school_dist' in test_merged using train median (8.0)


In [None]:
if 'year_built_final' in train_merged.columns:
    missing_pct = train_merged['year_built_final'].isna().mean() * 100
    missing_count = train_merged['year_built_final'].isna().sum()

    print(f" 'year_built_final' has {missing_count} missing values ({missing_pct:.2f}%) in training set")
else:
    print(" 'year_built_final' not found in training set")


🔍 'year_built_final' has 0 missing values (0.00%) in training set


In [None]:
# === STEP 0: Define base feature names ===
numeric_bases = [
    'garage_area', 'porch_area', 'floors', 'half_bath', 'full_bath',
    'total_rooms', 'bedrooms', 'fireplaces', 'building_area', 'building_value'
]

categorical_fill_map = {
    'quality': 'None',
    'quality_description': 'None',
    'building_condition': 'None',
    'foundation_type': 'None',
    'grade': 'None',
    'has_cooling': False,
    'has_heat': False,
    'physical_condition': 'None',
    'exterior_walls': 'None',
    'protested': False
}

# Generate full list of columns (2015–2019 only, no final columns)
numeric_cols_to_zero = [
    f'{base}_{year}' for base in numeric_bases for year in range(2015, 2020)
] + ['building_value_growth']

categorical_cols_to_fill = {
    f'{base}_{year}': val
    for base, val in categorical_fill_map.items()
    for year in range(2015, 2020)
}

# === STEP 1: Apply imputation if floor_area_total_2019 == 0 ===
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'floor_area_total_2019' in df.columns:
        zero_floor_mask = df['floor_area_total_2019'] == 0

        # Fill numeric columns with 0
        for col in numeric_cols_to_zero:
            if col in df.columns:
                df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(0)

        # Fill categorical/boolean columns
        for col, fill_val in categorical_cols_to_fill.items():
            if col in df.columns:
                df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)

        print(f" Filled structure-dependent missing values in {df_name} for {zero_floor_mask.sum()} rows")
    else:
        print(f" 'floor_area_total_2019' not found in {df_name}")



  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_m

✅ Filled structure-dependent missing values in train_merged for 218 rows
✅ Filled structure-dependent missing values in test_merged for 150 rows


  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_mask, col].fillna(fill_val)
  df.loc[zero_floor_mask, col] = df.loc[zero_floor_m

In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold

# Clear specific variables
for var in ['ordinal_cols_all', 'bool_cols_all']:
    if var in locals():
        del globals()[var]

# === STEP 1: Boolean Encoding (2015–2019 only) ===
bool_bases = ['has_cooling', 'has_heat', 'protested']
bool_cols_all = [f"{base}_{year}" for base in bool_bases for year in range(2015, 2020)]

for col in bool_cols_all:
    if col in train_merged.columns:
        mode_val = train_merged[col].mode(dropna=True)[0]
        train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
        test_merged[col] = test_merged[col].fillna(mode_val).astype(int)

# === STEP 2: Ordinal Cleaning and Encoding (2015–2019 only) ===
ordinal_bases = [
    'quality', 'quality_description', 'grade',
    'building_condition', 'physical_condition'
]

ordinal_cols_all = [f"{base}_{year}" for base in ordinal_bases for year in range(2015, 2020)]

# Column-specific replacements
replacement_maps = {
    'quality': {'E': 'D', 'F': 'D', 'X': np.nan, 'None': np.nan},
    'quality_description': {'Poor': 'Very Low', 'None': np.nan},
    'grade': {'X': 'F', 'X-': 'F', 'X+': 'F', 'E': 'D', 'E-': 'D-', 'E+': 'D+', 'None': np.nan},
    'building_condition': {'Very Poor': 'Poor', 'Unsound': 'Poor', 'None': np.nan},
    'physical_condition': {'Very Poor': 'Poor', 'Unsound': 'Poor', 'None': np.nan}
}

# Ordinal category order
ord_categories = {
    'quality': ['D', 'C', 'B', 'A'],
    'quality_description': ['Very Low', 'Low', 'Average', 'Good', 'Excellent', 'Superior'],
    'grade': ['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
    'building_condition': ['Poor', 'Fair', 'Average', 'Good', 'Very Good', 'Excellent'],
    'physical_condition': ['Poor', 'Fair', 'Average', 'Good', 'Very Good', 'Excellent']
}

# Clean and encode
for base in ordinal_bases:
    for year in range(2015, 2020):
        col = f"{base}_{year}"
        if col in train_merged.columns:
            replacements = replacement_maps.get(base, {})
            train_merged[col] = train_merged[col].replace(replacements)
            test_merged[col] = test_merged[col].replace(replacements)

            mode_val = train_merged[col].mode(dropna=True)[0]
            train_merged[col] = train_merged[col].fillna(mode_val)
            test_merged[col] = test_merged[col].fillna(mode_val)

            encoder = OrdinalEncoder(categories=[ord_categories[base]], handle_unknown='use_encoded_value', unknown_value=-1)
            train_merged[[col]] = encoder.fit_transform(train_merged[[col]])
            test_merged[[col]] = encoder.transform(test_merged[[col]])



  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(mode_val).astype(int)
  test_merged[col] = test_merged[col].fillna(mode_val).astype(int)
  train_merged[col] = train_merged[col].fillna(m

In [None]:
# === STEP 3: Target Encoding (2015–2019 only) ===
def group_and_target_encode_cv(train_df, test_df, target_name, column, rare_threshold=0.001, smoothing=10, n_splits=5):
    freq = train_df[column].value_counts(normalize=True)
    rare_cats = freq[freq < rare_threshold].index
    train_df[column] = train_df[column].replace(rare_cats, 'Other')
    test_df[column] = test_df[column].replace(rare_cats, 'Other')

    global_mean = train_df[target_name].mean()
    oof_encoded = pd.Series(index=train_df.index, dtype='float64')

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, val_idx in kf.split(train_df):
        X_tr, X_val = train_df.iloc[train_idx], train_df.iloc[val_idx]
        stats = X_tr.groupby(column)[target_name].agg(['mean', 'count'])
        smooth = (stats['mean'] * stats['count'] + global_mean * smoothing) / (stats['count'] + smoothing)
        oof_encoded.iloc[val_idx] = X_val[column].map(smooth).fillna(global_mean)

    final_stats = train_df.groupby(column)[target_name].agg(['mean', 'count'])
    final_smooth = (final_stats['mean'] * final_stats['count'] + global_mean * smoothing) / (final_stats['count'] + smoothing)
    test_encoded = test_df[column].map(final_smooth).fillna(global_mean)

    return oof_encoded, test_encoded

# Target-encodable nominal columns
target_encodable_bases = ['foundation_type', 'exterior_walls']
target_encodable_cols_all = [f"{base}_{year}" for base in target_encodable_bases for year in range(2015, 2020)]

# Apply target encoding
for col in target_encodable_cols_all:
    if col in train_merged.columns:
        mode_val = train_merged[col].mode(dropna=True)[0]
        train_merged[col] = train_merged[col].fillna(mode_val)
        test_merged[col] = test_merged[col].fillna(mode_val)

        train_merged[f'{col}_te'], test_merged[f'{col}_te'] = group_and_target_encode_cv(
            train_merged, test_merged, target_name='assessed_2018', column=col,
            rare_threshold=0.001, smoothing=10, n_splits=5
        )

        train_merged.drop(columns=[col], inplace=True)
        test_merged.drop(columns=[col], inplace=True)

print(" Done: Boolean, Ordinal, and Target Encoding for 2015–2019 features only.")

✅ Done: Boolean, Ordinal, and Target Encoding for 2015–2019 features only.


In [None]:
# Step 1: Get growth columns from training set
growth_cols = [col for col in train_merged.columns if '_growth' in col]

# Step 2: Compute medians from train_merged only
growth_medians = {col: train_merged[col].median() for col in growth_cols}

# Step 3: Apply to both train and test
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    for col in growth_cols:
        if col in df.columns:
            df[f'{col}_missing'] = df[col].isna().astype(int)
            df[col].fillna(growth_medians[col], inplace=True)
    print(f" Filled and flagged missing values in {df_name} for: {growth_cols}")


✅ Filled and flagged missing values in train_merged for: ['building_value_growth', 'land_value_growth', 'assessed_growth']
✅ Filled and flagged missing values in test_merged for: ['building_value_growth', 'land_value_growth', 'assessed_growth']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(growth_medians[col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(growth_medians[col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [None]:

# === Step 1: List your growth features ===
growth_features = ['land_value_growth', 'building_value_growth', 'assessed_growth']

# === Step 2: Binning Function (train-based binning) ===
def bin_growth_feature_safe(train_df, test_df, feature, bins=4):
    try:
        # Quantile binning on train only
        train_df[f'{feature}_bin'], bin_edges = pd.qcut(train_df[feature], q=bins, labels=False, retbins=True, duplicates='drop')
        test_df[f'{feature}_bin'] = pd.cut(test_df[feature], bins=bin_edges, labels=False, include_lowest=True)
    except ValueError:
        # Fallback: Equal-width binning
        min_val = train_df[feature].min()
        max_val = train_df[feature].max()
        bin_edges = np.linspace(min_val, max_val, bins + 1)
        train_df[f'{feature}_bin'] = pd.cut(train_df[feature], bins=bin_edges, labels=False, include_lowest=True)
        test_df[f'{feature}_bin'] = pd.cut(test_df[feature], bins=bin_edges, labels=False, include_lowest=True)

    # Convert to category
    train_df[f'{feature}_bin'] = train_df[f'{feature}_bin'].astype('category')
    test_df[f'{feature}_bin'] = test_df[f'{feature}_bin'].astype('category')
    return train_df, test_df

# === Step 3: Apply to train_merged and test_merged ===
for feature in growth_features:
    train_merged, test_merged = bin_growth_feature_safe(train_merged, test_merged, feature)

# === Step 4: Bin year_built_final using train-based quantiles ===
train_merged['year_built_bin'], bin_edges = pd.qcut(
    train_merged['year_built_final'], q=5, retbins=True, labels=False, duplicates='drop'
)
test_merged['year_built_bin'] = pd.cut(
    test_merged['year_built_final'], bins=bin_edges, labels=False, include_lowest=True
)

# Convert to category
train_merged['year_built_bin'] = train_merged['year_built_bin'].astype('category')
test_merged['year_built_bin'] = test_merged['year_built_bin'].astype('category')


print(" Binned growth & year_built features safely with no leakage.")



✅ Binned growth & year_built features safely with no leakage.


In [41]:
# === Step 5: Drop original continuous columns ===
cols_to_drop = growth_features + ['year_built_final']
train_merged=train_merged.drop(columns=cols_to_drop)
test_merged=test_merged.drop(columns=cols_to_drop)

In [None]:
rare_threshold = 0.001  # 0.1%

cat_cols = [
    "quality_description_final",
    "foundation_type_final_te",
    "physical_condition_final",
    "exterior_walls_final_te",
    "region_freq",
    "neighborhood_freq"
]

for col in cat_cols:
    if col in train_merged.columns:
        freq = train_merged[col].value_counts(normalize=True)
        rare = freq[freq < rare_threshold]
        if not rare.empty:
            print(f"\n Rare categories in '{col}' (less than 0.1% of training data):\n{rare}")
    else:
        print(f" Column '{col}' not found in train_merged")




⚠️ Column 'quality_description_final' not found in train_merged
⚠️ Column 'foundation_type_final_te' not found in train_merged
⚠️ Column 'physical_condition_final' not found in train_merged
⚠️ Column 'exterior_walls_final_te' not found in train_merged

⚠️ Rare categories in 'region_freq' (less than 0.1% of training data):
region_freq
0.000963    0.000963
0.000641    0.000641
0.000347    0.000347
0.000159    0.000159
0.000083    0.000083
0.000010    0.000019
0.000002    0.000019
0.000008    0.000016
0.000005    0.000005
Name: proportion, dtype: float64

⚠️ Rare categories in 'neighborhood_freq' (less than 0.1% of training data):
neighborhood_freq
0.000500    0.001000
0.000996    0.000996
0.000988    0.000988
0.000984    0.000984
0.000492    0.000984
              ...   
0.000008    0.000024
0.000021    0.000021
0.000014    0.000014
0.000002    0.000006
0.000003    0.000006
Name: proportion, Length: 321, dtype: float64


In [None]:
# Define frequency columns and threshold
freq_cols = ['region_freq', 'neighborhood_freq', 'zone_freq', 'subneighborhood_freq']
rare_thresh = 0.001

# Apply rare value replacement for each frequency column
for col in freq_cols:
    if col in train_merged.columns:
        rare_vals = train_merged[col].value_counts(normalize=True)[lambda x: x < rare_thresh].index
        train_merged[col] = train_merged[col].replace(rare_vals, 0)
        test_merged[col] = test_merged[col].replace(rare_vals, 0)
        print(f" Replaced rare values in {col} using train_merged threshold < {rare_thresh}")
    else:
        print(f" Column {col} not found in train_merged — skipping.")



✅ Replaced rare values in region_freq using train_merged threshold < 0.001
✅ Replaced rare values in neighborhood_freq using train_merged threshold < 0.001
✅ Replaced rare values in zone_freq using train_merged threshold < 0.001
✅ Replaced rare values in subneighborhood_freq using train_merged threshold < 0.001


In [None]:
import pandas as pd

# === Compute skewness for numeric columns ===
numeric_cols = train_merged.select_dtypes(include=[np.number])
skew_values = numeric_cols.skew().sort_values(ascending=False)

# === Save skewness to CSV ===
skew_df = skew_values.reset_index()
skew_df.columns = ['feature', 'skewness']
skew_df.to_csv("feature_skewness.csv", index=False)

print(" Saved skewness stats to 'feature_skewness.csv'")


📁 Saved skewness stats to 'feature_skewness.csv'


In [None]:
import pandas as pd

# Load your skewness report
skew_df = pd.read_csv("feature_skewness.csv")  # Update path if needed

# === Step 1: Categorize features by skew level ===
ultra_skewed = []
moderately_skewed = []

for _, row in skew_df.iterrows():
    feature = row['feature']
    skew = row['skewness']
    
    if feature not in train_merged.columns:
        continue

    unique_vals = train_merged[feature].nunique()
    is_binary = set(train_merged[feature].dropna().unique()).issubset({0, 1})

    if unique_vals > 10 and not is_binary and not feature.endswith('_te'):
        if skew > 100:
            ultra_skewed.append(feature)
        elif 2< skew <= 100:
            moderately_skewed.append(feature)

print(f" {len(ultra_skewed)} ultra-skewed features to clip at 0.995.")
print(f" {len(moderately_skewed)} moderately-skewed features to clip at 0.999.")

# === Step 2: Compute quantile clipping bounds ===
clip_bounds = {}

for col in ultra_skewed:
    clip_bounds[col] = (
        train_merged[col].quantile(0.005),
        train_merged[col].quantile(0.995)
    )

for col in moderately_skewed:
    clip_bounds[col] = (
        train_merged[col].quantile(0.001),
        train_merged[col].quantile(0.999)
    )

# === Step 3: Apply clipping to both train and test ===
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    for col, (lower, upper) in clip_bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower, upper)

print(" Adaptive clipping applied: 0.995 for ultra-skewed, 0.999 for moderately-skewed features.")


✅ 5 ultra-skewed features to clip at 0.995.
✅ 76 moderately-skewed features to clip at 0.999.
✂️ Adaptive clipping applied: 0.995 for ultra-skewed, 0.999 for moderately-skewed features.


In [46]:
def add_features(df):
    df = df.copy()
    
    # === Ratio features ===
    df['area_ratio'] = df['building_area_2019'] / (df['land_area_2019'] + 1)
    df['porch_ratio'] = df['porch_area_2019'] / (df['building_area_2019'] + 1)
    df['floor_density'] = df['floor_area_total_2019'] / (df['land_area_2019'] + 1)
    
    df['value_ratio'] = df['building_value_2018'] / (df['land_value_2018'] + 1)
    df['value_per_sqft'] = df['building_value_2018'] / (df['building_area_2019'] + 1)
    df['price_per_sqft'] = df['assessed_2018'] / (df['building_area_2019'] + 1)

    # === Bathroom & room structure ===
    df['bathroom_score'] = df['full_bath_2019'] + 0.5 * df['half_bath_2019']
    df['bathroom_density'] = df['bathroom_score'] / (df['total_rooms_2019'] + 1)
    df['bedroom_ratio'] = df['bedrooms_2019'] / (df['total_rooms_2019'] + 1)
    df['rooms_per_floor'] = df['total_rooms_2019'] / (df['floors_2019'] + 1)

    # === Core interactions ===
    df['bedrooms_x_floors'] = df['bedrooms_2019'] * df['floors_2019']
    df['rooms_x_quality'] = df['total_rooms_2019'] * df['quality_2019']
    df['assess_x_age'] = df['assessed_2018'] * df['building_age']
    df['grade_quality_index'] = df['grade_2019'] * df['quality_2019']

    # === Selected high-signal interactions ===
    df['area_x_quality'] = df['building_area_2019'] * df['quality_2019']
    df['floor_area_x_grade'] = df['floor_area_total_2019'] * df['grade_2019']
    df['value_to_neigh_median'] = df['building_value_2018'] / (df['neigh_assess_median'] + 1)
    df['assess_to_neigh_mean'] = df['assessed_2018'] / (df['neigh_assess_mean'] + 1)
    df['value_per_age'] = df['building_value_2018'] / (df['building_age'] + 1)

    # === Clean up ===
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    
    return df

# === Apply to train and test sets ===
train_merged = add_features(train_merged)
test_merged = add_features(test_merged)



In [47]:
# Select numeric columns only
numeric_features = train_merged.select_dtypes(include=['int64', 'float64'])

# Calculate skewness
skewness = numeric_features.skew(numeric_only=True)

# Filter and sort features with skewness > 30
highly_skewed = skewness[skewness > 5].sort_values(ascending=False)

print("📊 Features with skewness > 5:")
print(highly_skewed)

📊 Features with skewness > 5:
building_value_growth_missing    354.478490
land_value_growth_missing        354.478490
assessed_growth_missing           33.631805
value_ratio                       22.718640
foundation_type_2017_te           19.398895
foundation_type_2018_te           19.209417
foundation_type_2016_te           19.088215
foundation_type_2019_te           18.622057
foundation_type_2015_te           17.140939
floor_density                     14.857656
area_ratio                        14.266433
floor_area_lower_2015              9.470069
floor_area_lower_2016              9.001662
assess_x_age                       8.945820
floor_area_lower_2017              8.676705
neigh_assess_std                   8.407189
floor_area_lower_2018              8.338474
floor_area_lower_2019              7.911224
neigh_assess_iqr                   6.953338
value_per_age                      6.361554
land_area_2019                     6.145570
land_area_2018                     6.130493
la

In [None]:
import numpy as np
import pandas as pd

# === Step 1: Drop dummy _missing columns ===
drop_cols = [
    'building_value_growth_missing',
    'land_value_growth_missing',
    'assessed_growth_missing'
]
train_merged = train_merged.drop(columns=drop_cols, errors='ignore')
test_merged = test_merged.drop(columns=drop_cols, errors='ignore')
print("🗑️ Dropped dummy _missing columns from train and test.")

# === Step 2: Compute skewness from train_merged only ===
numeric_cols = train_merged.select_dtypes(include=[np.number])
skew_series = numeric_cols.skew()

# === Step 3: Categorize by skew level (excluding _te and _missing) ===
ultra_skewed = []
moderately_skewed = []

for col, skew_val in skew_series.items():
    if col.endswith('_te') or col.endswith('_missing'):
        continue
    if skew_val > 100:
        ultra_skewed.append(col)
    elif skew_val > 2:
        moderately_skewed.append(col)

print(f" {len(ultra_skewed)} ultra-skewed features (clip at 0.5%–99.5%).")
print(f" {len(moderately_skewed)} moderately-skewed features (clip at 0.1%–99.9%).")

# === Step 4: Compute clipping bounds ===
clip_bounds = {}

for col in ultra_skewed:
    clip_bounds[col] = (
        train_merged[col].quantile(0.005),
        train_merged[col].quantile(0.995)
    )

for col in moderately_skewed:
    clip_bounds[col] = (
        train_merged[col].quantile(0.001),
        train_merged[col].quantile(0.999)
    )

# === Step 5: Apply clipping to both train and test sets ===
for df in [train_merged, test_merged]:
    for col, (lower, upper) in clip_bounds.items():
        if col in df.columns:
            df[col] = df[col].clip(lower, upper)

print(" Adaptive clipping complete: 0.995 for ultra-skewed, 0.999 for moderately skewed.")


🗑️ Dropped dummy _missing columns from train and test.
✂️ 0 ultra-skewed features (clip at 0.5%–99.5%).
✂️ 57 moderately-skewed features (clip at 0.1%–99.9%).
✅ Adaptive clipping complete: 0.995 for ultra-skewed, 0.999 for moderately skewed.


In [None]:
# Define columns to drop
year_built_cols = [f'year_built_{year}' for year in range(2015, 2020)]

# Drop if columns exist
train_merged.drop(columns=[col for col in year_built_cols if col in train_merged.columns], inplace=True)
test_merged.drop(columns=[col for col in year_built_cols if col in test_merged.columns], inplace=True)

print(" Dropped year_built_2015 to year_built_2019 from both train and test sets.")

✅ Dropped year_built_2015 to year_built_2019 from both train and test sets.


In [50]:
if "TARGET" in train_merged.columns:
    train_merged.drop("TARGET", axis=1,inplace=True)
else:
    print("TARGET not found in columns:", train_merged.columns.tolist())


In [None]:
# Drop floor_area_total_final if it exists
for df_name, df in [('train_merged', train_merged), ('test_merged', test_merged)]:
    if 'floor_area_total_final' in df.columns:
        df.drop(columns='floor_area_total_final', inplace=True)
        print(f" Dropped 'floor_area_total_final' from {df_name}")
    else:
        print(f"ℹ 'floor_area_total_final' not found in {df_name}")


ℹ️ 'floor_area_total_final' not found in train_merged
ℹ️ 'floor_area_total_final' not found in test_merged


In [None]:
train_cols = set(train_merged.columns)
test_cols = set(test_merged.columns)

missing_in_test = train_cols - test_cols
missing_in_train = test_cols - train_cols

print(" Columns in train but not in test:")
print(sorted(missing_in_test))

print("\n Columns in test but not in train:")
print(sorted(missing_in_train))


✅ Columns in train but not in test:
[]

❌ Columns in test but not in train:
[]


In [53]:
# Check if column sets match
print(set(train_merged.columns) == set(test_merged.columns))  # Should be True

# Check if column order matches
print(list(train_merged.columns) == list(test_merged.columns))  # Must also be True


True
True


In [54]:
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
import shap
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from optuna.integration import LightGBMPruningCallback
from optuna.pruners import SuccessiveHalvingPruner
from lightgbm import log_evaluation, early_stopping

# === STEP 0: Setup Data ===
X_full = train_merged.copy()
y_full = pd.Series(y_train)
X_test = test_merged.copy()

# Detect categorical columns
cat_cols = X_full.select_dtypes(include=['category', 'object']).columns.tolist()
for col in cat_cols:
    X_full[col] = X_full[col].astype("category")
    X_test[col] = X_test[col].astype("category")

global_oof_preds = np.zeros(len(X_full))
best_score = float('inf')

# === STEP 1: Define Optuna Objective ===
def objective(trial):
    global global_oof_preds, best_score

    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.025, 0.04, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 160, 220),
        "max_depth": trial.suggest_int("max_depth", 7, 11),
        "min_child_samples": trial.suggest_int("min_child_samples", 18, 30),
        "subsample": trial.suggest_float("subsample", 0.65, 0.88),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.75),
        "reg_alpha": trial.suggest_float("reg_alpha", 1.0, 5.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 4.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.15, 0.25),
        "verbose": -1,
        "n_jobs": -1,
    }

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    val_rmse = []
    oof_preds = np.zeros(len(X_full))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_full)):
        X_train, X_val = X_full.iloc[train_idx], X_full.iloc[val_idx]
        y_train_fold, y_val = y_full.iloc[train_idx], y_full.iloc[val_idx]

        dtrain = lgb.Dataset(X_train, label=y_train_fold, categorical_feature=cat_cols)
        dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            num_boost_round=1000,
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=100),
                LightGBMPruningCallback(trial, "rmse")
            ]
        )

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        oof_preds[val_idx] = val_pred
        val_rmse.append(root_mean_squared_error(y_val, val_pred))

    mean_rmse = np.mean(val_rmse)
    trial.set_user_attr("cv_rmse", mean_rmse)

    if mean_rmse < best_score:
        best_score = mean_rmse
        global_oof_preds[:] = oof_preds

    print(f" Trial {trial.number} | CV RMSE: {mean_rmse:,.2f}")
    return mean_rmse

# === STEP 2: Run Optuna ===
study = optuna.create_study(
    direction='minimize',
    study_name='lgbm_study_final_with_shap',
    storage='sqlite:///lgbm_study_final_with_shap.db',
    load_if_exists=True,
    pruner=SuccessiveHalvingPruner(min_resource=100, reduction_factor=2)
)
study.optimize(objective, n_trials=25, show_progress_bar=True)

print(" Best RMSE:", study.best_value)
print(" Best Parameters:", study.best_params)
np.save("oof_preds_lgbm.npy", global_oof_preds)
print(" Saved: oof_preds_lgbm.npy")

# === STEP 3: SHAP + GAIN Feature Selection ===
kf = KFold(n_splits=3, shuffle=True, random_state=42)
selected_feature_sets = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full)):
    X_train_raw, y_train_fold = X_full.iloc[train_idx], y_full.iloc[train_idx]

    train_dataset = lgb.Dataset(X_train_raw, label=y_train_fold, categorical_feature=cat_cols)
    model_temp = lgb.train(
        study.best_params,
        train_dataset,
        num_boost_round=200,
        valid_sets=[train_dataset],
        callbacks=[log_evaluation(period=100)] 
    )

    # SHAP importance
    explainer = shap.TreeExplainer(model_temp)
    shap_values = explainer.shap_values(X_train_raw)
    shap_df = pd.DataFrame(np.abs(shap_values), columns=X_train_raw.columns)
    shap_importance = shap_df.mean().sort_values(ascending=False)
    shap_cumsum = shap_importance.cumsum() / shap_importance.sum()
    top_shap = shap_cumsum[shap_cumsum <= 0.95].index.tolist()

    # Gain importance
    gain_importance = pd.Series(model_temp.feature_importance(importance_type='gain'), index=X_train_raw.columns)
    gain_sorted = gain_importance.sort_values(ascending=False)
    gain_cumsum = gain_sorted.cumsum() / gain_sorted.sum()
    top_gain = gain_cumsum[gain_cumsum <= 0.95].index.tolist()

    selected_features = list(set(top_shap).union(set(top_gain)))
    selected_feature_sets.append(selected_features)

# === STEP 4: Final Feature Union ===
final_union_features = list(set().union(*selected_feature_sets))
print(" Final Union Feature Count:", len(final_union_features))

# Filter only those categorical columns that are in final features
filtered_cat_cols = [col for col in cat_cols if col in final_union_features]

# === STEP 5: Final Model on Selected Features ===
X_full_selected = X_full[final_union_features]
X_test_selected = X_test[final_union_features]


final_dataset = lgb.Dataset(X_full_selected, label=y_full, categorical_feature=filtered_cat_cols)
final_model = lgb.train(
    study.best_params,
    final_dataset,
    num_boost_round=1000,
    valid_sets=[final_dataset],
    valid_names=["train"],
    callbacks=[log_evaluation(period=100)]
)

# === STEP 6: Predict on Test Set ===
test_preds = final_model.predict(X_test_selected, num_iteration=final_model.best_iteration)
np.save("test_preds_lgbm_shap.npy", test_preds)
print(" Saved: test_preds_lgbm_shap.npy")

# === STEP 7: Save Submission ===
submission = pd.DataFrame({
    'ACCOUNT': acct_test.values.ravel(),  # Replace with your ID col
    'TARGET': test_preds
})
submission.to_csv("submission_lgbm_shap.csv", index=False)
print(" Submission saved: submission_lgbm_shap.csv")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-05-21 06:12:57,615] A new study created in RDB with name: lgbm_study_final_with_shap
  0%|          | 0/25 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 42644
[200]	valid_0's rmse: 39953.4
[300]	valid_0's rmse: 39753.4
[400]	valid_0's rmse: 39603.1
[500]	valid_0's rmse: 39490.7
[600]	valid_0's rmse: 39387.7
[700]	valid_0's rmse: 39298.3
[800]	valid_0's rmse: 39229.7
[900]	valid_0's rmse: 39154.3
[1000]	valid_0's rmse: 39088.6
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 39088




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 41882.8




[200]	valid_0's rmse: 39216.4




[300]	valid_0's rmse: 38732.9




[400]	valid_0's rmse: 38406




[500]	valid_0's rmse: 38265.1




[600]	valid_0's rmse: 38116




[700]	valid_0's rmse: 38001.7




[800]	valid_0's rmse: 37886.1




[900]	valid_0's rmse: 37757.2




[1000]	valid_0's rmse: 37650.2
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 37650.2




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 51644.4




[200]	valid_0's rmse: 48048.1




[300]	valid_0's rmse: 47512.4




[400]	valid_0's rmse: 47183.4




[500]	valid_0's rmse: 47003.3




[600]	valid_0's rmse: 46891.3




[700]	valid_0's rmse: 46911.6




Early stopping, best iteration is:
[614]	valid_0's rmse: 46874.7


Best trial: 0. Best value: 41204.3:   4%|▍         | 1/25 [05:57<2:22:51, 357.13s/it]

📉 Trial 0 | CV RMSE: 41,204.31
[I 2025-05-21 06:18:54,716] Trial 0 finished with value: 41204.31262134961 and parameters: {'learning_rate': 0.039730220495763816, 'num_leaves': 210, 'max_depth': 11, 'min_child_samples': 18, 'subsample': 0.8602930077632327, 'colsample_bytree': 0.6962834149319597, 'reg_alpha': 4.731216779813845, 'reg_lambda': 1.7984216897061351, 'min_split_gain': 0.24840445491683466}. Best is trial 0 with value: 41204.31262134961.
Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:   4%|▍         | 1/25 [06:15<2:22:51, 357.13s/it]

[100]	valid_0's rmse: 48109.1
[I 2025-05-21 06:19:13,324] Trial 1 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:   8%|▊         | 2/25 [06:15<1:00:35, 158.06s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:   8%|▊         | 2/25 [06:29<1:00:35, 158.06s/it]

[100]	valid_0's rmse: 44653.6
[I 2025-05-21 06:19:27,537] Trial 2 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  12%|█▏        | 3/25 [06:30<33:53, 92.42s/it]   

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  12%|█▏        | 3/25 [06:45<33:53, 92.42s/it]

[100]	valid_0's rmse: 48168.8
[I 2025-05-21 06:19:43,459] Trial 3 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  16%|█▌        | 4/25 [06:45<21:45, 62.18s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  16%|█▌        | 4/25 [07:03<21:45, 62.18s/it]

[100]	valid_0's rmse: 50664.8
[I 2025-05-21 06:20:01,104] Trial 4 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  20%|██        | 5/25 [07:03<15:22, 46.12s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 46779
[200]	valid_0's rmse: 39954
[300]	valid_0's rmse: 39393.5
[400]	valid_0's rmse: 39019.3
[500]	valid_0's rmse: 38754.5
[600]	valid_0's rmse: 38611.1
[700]	valid_0's rmse: 38468.1
[800]	valid_0's rmse: 38331.5
[900]	valid_0's rmse: 38217.9
[1000]	valid_0's rmse: 38107.8
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 38107.8




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 45969.4




[200]	valid_0's rmse: 40471.3




[300]	valid_0's rmse: 40000.3




[400]	valid_0's rmse: 39683.3




[500]	valid_0's rmse: 39472.1




[600]	valid_0's rmse: 39346.6




[700]	valid_0's rmse: 39177.8




[800]	valid_0's rmse: 39116.8




[900]	valid_0's rmse: 39034.7




[1000]	valid_0's rmse: 38937.5
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 38937.5




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 57939.7




[200]	valid_0's rmse: 51638




[300]	valid_0's rmse: 50640.9




[400]	valid_0's rmse: 49994.2




[500]	valid_0's rmse: 49444.2




[600]	valid_0's rmse: 49069.8




[700]	valid_0's rmse: 48742.3




[800]	valid_0's rmse: 48512.8




[900]	valid_0's rmse: 48333.9




[1000]	valid_0's rmse: 48205.7
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 48205.7


Best trial: 0. Best value: 41204.3:  20%|██        | 5/25 [12:44<15:22, 46.12s/it]

📉 Trial 5 | CV RMSE: 41,750.35
[I 2025-05-21 06:25:41,920] Trial 5 finished with value: 41750.34968844729 and parameters: {'learning_rate': 0.031050034671392383, 'num_leaves': 180, 'max_depth': 10, 'min_child_samples': 26, 'subsample': 0.8519732948009487, 'colsample_bytree': 0.7457454713540842, 'reg_alpha': 3.444561895683996, 'reg_lambda': 1.0530887955789128, 'min_split_gain': 0.16435650675011576}. Best is trial 0 with value: 41204.31262134961.


Best trial: 0. Best value: 41204.3:  24%|██▍       | 6/25 [12:44<46:19, 146.31s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43599.3


Best trial: 0. Best value: 41204.3:  24%|██▍       | 6/25 [13:04<46:19, 146.31s/it]

[200]	valid_0's rmse: 40976
[I 2025-05-21 06:26:02,495] Trial 6 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  28%|██▊       | 7/25 [13:05<31:33, 105.21s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  28%|██▊       | 7/25 [13:19<31:33, 105.21s/it]

[100]	valid_0's rmse: 50843.4
[I 2025-05-21 06:26:17,304] Trial 7 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  32%|███▏      | 8/25 [13:19<21:39, 76.43s/it] 

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  32%|███▏      | 8/25 [13:34<21:39, 76.43s/it]

[100]	valid_0's rmse: 47492.1
[I 2025-05-21 06:26:32,564] Trial 8 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  36%|███▌      | 9/25 [13:35<15:16, 57.31s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43310.9
[200]	valid_0's rmse: 39854.5
[300]	valid_0's rmse: 39305.5
[400]	valid_0's rmse: 39008.9
[500]	valid_0's rmse: 38790.1
[600]	valid_0's rmse: 38664.2
[700]	valid_0's rmse: 38542.3
[800]	valid_0's rmse: 38515


Best trial: 0. Best value: 41204.3:  40%|████      | 10/25 [14:54<15:59, 63.99s/it]

[I 2025-05-21 06:27:51,610] Trial 9 pruned. Trial was pruned at iteration 800.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 44308.7


Best trial: 0. Best value: 41204.3:  40%|████      | 10/25 [15:23<15:59, 63.99s/it]

[200]	valid_0's rmse: 39949.1
[I 2025-05-21 06:28:20,946] Trial 10 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  44%|████▍     | 11/25 [15:23<12:27, 53.42s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 45718.3


Best trial: 0. Best value: 41204.3:  44%|████▍     | 11/25 [15:49<12:27, 53.42s/it]

[200]	valid_0's rmse: 40118.6
[I 2025-05-21 06:28:46,851] Trial 11 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  48%|████▊     | 12/25 [15:49<09:45, 45.04s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  48%|████▊     | 12/25 [16:06<09:45, 45.04s/it]

[100]	valid_0's rmse: 46641.4
[I 2025-05-21 06:29:04,528] Trial 12 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  52%|█████▏    | 13/25 [16:07<07:21, 36.79s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43968.1


Best trial: 0. Best value: 41204.3:  56%|█████▌    | 14/25 [16:30<06:00, 32.75s/it]

[200]	valid_0's rmse: 40089.5
[I 2025-05-21 06:29:28,135] Trial 13 pruned. Trial was pruned at iteration 200.
Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  56%|█████▌    | 14/25 [16:48<06:00, 32.75s/it]

[100]	valid_0's rmse: 48958.6
[I 2025-05-21 06:29:45,722] Trial 14 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  60%|██████    | 15/25 [16:48<04:42, 28.21s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 46592.7


Best trial: 0. Best value: 41204.3:  60%|██████    | 15/25 [17:15<04:42, 28.21s/it]

[200]	valid_0's rmse: 40556.1
[I 2025-05-21 06:30:13,366] Trial 15 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  64%|██████▍   | 16/25 [17:15<04:12, 28.04s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  64%|██████▍   | 16/25 [17:32<04:12, 28.04s/it]

[100]	valid_0's rmse: 52892.7
[I 2025-05-21 06:30:30,192] Trial 16 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  68%|██████▊   | 17/25 [17:32<03:17, 24.66s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 44889.2


Best trial: 0. Best value: 41204.3:  68%|██████▊   | 17/25 [17:56<03:17, 24.66s/it]

[200]	valid_0's rmse: 40527.4
[I 2025-05-21 06:30:54,158] Trial 17 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  72%|███████▏  | 18/25 [17:56<02:51, 24.48s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 45418.4


Best trial: 0. Best value: 41204.3:  72%|███████▏  | 18/25 [18:22<02:51, 24.48s/it]

[200]	valid_0's rmse: 40308
[I 2025-05-21 06:31:19,807] Trial 18 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  76%|███████▌  | 19/25 [18:22<02:28, 24.80s/it]

Training until validation scores don't improve for 100 rounds


Best trial: 0. Best value: 41204.3:  76%|███████▌  | 19/25 [18:37<02:28, 24.80s/it]

[100]	valid_0's rmse: 49693.8
[I 2025-05-21 06:31:34,962] Trial 19 pruned. Trial was pruned at iteration 100.


Best trial: 0. Best value: 41204.3:  80%|████████  | 20/25 [18:37<01:49, 21.91s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 46319.1


Best trial: 0. Best value: 41204.3:  80%|████████  | 20/25 [19:04<01:49, 21.91s/it]

[200]	valid_0's rmse: 40395.6
[I 2025-05-21 06:32:01,720] Trial 20 pruned. Trial was pruned at iteration 200.


Best trial: 0. Best value: 41204.3:  84%|████████▍ | 21/25 [19:04<01:33, 23.36s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43192.9
[200]	valid_0's rmse: 39786
[300]	valid_0's rmse: 39210.6
[400]	valid_0's rmse: 38782.7
[500]	valid_0's rmse: 38613.4
[600]	valid_0's rmse: 38489.6
[700]	valid_0's rmse: 38404.4
[800]	valid_0's rmse: 38309.5
[900]	valid_0's rmse: 38228
[1000]	valid_0's rmse: 38130.1
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 38128.8




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 42647.7




[200]	valid_0's rmse: 39991.5




[300]	valid_0's rmse: 39612.9




[400]	valid_0's rmse: 39301.1




[500]	valid_0's rmse: 39086.5




[600]	valid_0's rmse: 38961.1




[700]	valid_0's rmse: 38848.9




[800]	valid_0's rmse: 38759.2




[900]	valid_0's rmse: 38683.2




[1000]	valid_0's rmse: 38616.4
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 38616.4




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 54960.4




[200]	valid_0's rmse: 51292.8




[300]	valid_0's rmse: 50309.5




[400]	valid_0's rmse: 49821.3




[500]	valid_0's rmse: 49437.6




[600]	valid_0's rmse: 49113.3




[700]	valid_0's rmse: 48916.9




[800]	valid_0's rmse: 48766.7




[900]	valid_0's rmse: 48677.4




[1000]	valid_0's rmse: 48606
Did not meet early stopping. Best iteration is:
[994]	valid_0's rmse: 48596.9


Best trial: 0. Best value: 41204.3:  88%|████████▊ | 22/25 [25:35<06:41, 133.91s/it]

📉 Trial 21 | CV RMSE: 41,780.70
[I 2025-05-21 06:38:33,532] Trial 21 finished with value: 41780.69537889775 and parameters: {'learning_rate': 0.03936596074205405, 'num_leaves': 200, 'max_depth': 11, 'min_child_samples': 27, 'subsample': 0.6740600552468192, 'colsample_bytree': 0.7469353665132124, 'reg_alpha': 1.6497734351015796, 'reg_lambda': 1.2008202347841108, 'min_split_gain': 0.1852351138395164}. Best is trial 0 with value: 41204.31262134961.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43465.6
[200]	valid_0's rmse: 39654.8
[300]	valid_0's rmse: 39097.3
[400]	valid_0's rmse: 38807.9
[500]	valid_0's rmse: 38543.8
[600]	valid_0's rmse: 38426.8
[700]	valid_0's rmse: 38290.3
[800]	valid_0's rmse: 38189.5
[900]	valid_0's rmse: 38076.6
[1000]	valid_0's rmse: 37994.8
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 37994.8




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 42924.6




[200]	valid_0's rmse: 40042.6




[300]	valid_0's rmse: 39640.3




[400]	valid_0's rmse: 39413.9




[500]	valid_0's rmse: 39215.2




[600]	valid_0's rmse: 39056.3




[700]	valid_0's rmse: 38953.5




[800]	valid_0's rmse: 38866.4




[900]	valid_0's rmse: 38776.6




[1000]	valid_0's rmse: 38720.7
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 38718.9




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 54696.8




[200]	valid_0's rmse: 51251.4




[300]	valid_0's rmse: 50441.2




[400]	valid_0's rmse: 49867.5




[500]	valid_0's rmse: 49299.4




[600]	valid_0's rmse: 49004.7




[700]	valid_0's rmse: 48739.7




[800]	valid_0's rmse: 48604.7




[900]	valid_0's rmse: 48530.4




[1000]	valid_0's rmse: 48434.7
Did not meet early stopping. Best iteration is:
[980]	valid_0's rmse: 48418.5


Best trial: 0. Best value: 41204.3:  88%|████████▊ | 22/25 [31:55<06:41, 133.91s/it]

📉 Trial 22 | CV RMSE: 41,710.74
[I 2025-05-21 06:44:53,582] Trial 22 finished with value: 41710.74337886497 and parameters: {'learning_rate': 0.03794571053612395, 'num_leaves': 211, 'max_depth': 11, 'min_child_samples': 27, 'subsample': 0.698911598076823, 'colsample_bytree': 0.7356004187004775, 'reg_alpha': 1.4034349123008893, 'reg_lambda': 1.3402359339427115, 'min_split_gain': 0.178602440913176}. Best is trial 0 with value: 41204.31262134961.


Best trial: 0. Best value: 41204.3:  92%|█████████▏| 23/25 [31:56<06:55, 207.78s/it]

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 43823.4
[200]	valid_0's rmse: 39886
[300]	valid_0's rmse: 39263.3
[400]	valid_0's rmse: 38957.9
[500]	valid_0's rmse: 38774.3
[600]	valid_0's rmse: 38555.8
[700]	valid_0's rmse: 38435
[800]	valid_0's rmse: 38305.7
[900]	valid_0's rmse: 38158
[1000]	valid_0's rmse: 38068.8
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 38068.8




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 43284.4




[200]	valid_0's rmse: 40279.7




[300]	valid_0's rmse: 39836.7




[400]	valid_0's rmse: 39662.6




[500]	valid_0's rmse: 39451.2




[600]	valid_0's rmse: 39295.2




[700]	valid_0's rmse: 39198.9




[800]	valid_0's rmse: 39104.5




[900]	valid_0's rmse: 39062.5




[1000]	valid_0's rmse: 39027.5
Did not meet early stopping. Best iteration is:
[998]	valid_0's rmse: 39025.1




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 55541.9




[200]	valid_0's rmse: 51840.3




[300]	valid_0's rmse: 50935.1




[400]	valid_0's rmse: 50399.3




[500]	valid_0's rmse: 49911.8




[600]	valid_0's rmse: 49537.2




[700]	valid_0's rmse: 49239.4




[800]	valid_0's rmse: 49114.7




[900]	valid_0's rmse: 48968.2




[1000]	valid_0's rmse: 48879.2
Did not meet early stopping. Best iteration is:
[991]	valid_0's rmse: 48875.1


Best trial: 0. Best value: 41204.3:  96%|█████████▌| 24/25 [38:50<04:29, 269.91s/it]

📉 Trial 23 | CV RMSE: 41,989.67
[I 2025-05-21 06:51:48,442] Trial 23 finished with value: 41989.67380768543 and parameters: {'learning_rate': 0.03758933717860998, 'num_leaves': 212, 'max_depth': 10, 'min_child_samples': 28, 'subsample': 0.7022070452273712, 'colsample_bytree': 0.7356001603162485, 'reg_alpha': 1.329282705386652, 'reg_lambda': 1.350620425637288, 'min_split_gain': 0.16641477085509726}. Best is trial 0 with value: 41204.31262134961.
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 44218.6
[200]	valid_0's rmse: 39570.6
[300]	valid_0's rmse: 39145.5
[400]	valid_0's rmse: 38907.8
[500]	valid_0's rmse: 38645.8
[600]	valid_0's rmse: 38493.6
[700]	valid_0's rmse: 38375.9
[800]	valid_0's rmse: 38274.8
[900]	valid_0's rmse: 38194.4
[1000]	valid_0's rmse: 38106.6
Did not meet early stopping. Best iteration is:
[985]	valid_0's rmse: 38099.9




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 43593




[200]	valid_0's rmse: 39806.6




[300]	valid_0's rmse: 39408




[400]	valid_0's rmse: 39200.9




[500]	valid_0's rmse: 39019.7




[600]	valid_0's rmse: 38880.6




[700]	valid_0's rmse: 38809.9




[800]	valid_0's rmse: 38691.8




[900]	valid_0's rmse: 38615.3




[1000]	valid_0's rmse: 38526.4
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 38526.4




Training until validation scores don't improve for 100 rounds




[100]	valid_0's rmse: 55410.3




[200]	valid_0's rmse: 50517.1




[300]	valid_0's rmse: 49689




[400]	valid_0's rmse: 49166




[500]	valid_0's rmse: 48654.1




[600]	valid_0's rmse: 48229.6




[700]	valid_0's rmse: 47986.6




[800]	valid_0's rmse: 47760.4




[900]	valid_0's rmse: 47578




[1000]	valid_0's rmse: 47482.3
Did not meet early stopping. Best iteration is:
[999]	valid_0's rmse: 47480


Best trial: 0. Best value: 41204.3: 100%|██████████| 25/25 [46:27<00:00, 111.49s/it]

📉 Trial 24 | CV RMSE: 41,368.74
[I 2025-05-21 06:59:24,844] Trial 24 finished with value: 41368.737424700535 and parameters: {'learning_rate': 0.03523781685901837, 'num_leaves': 220, 'max_depth': 11, 'min_child_samples': 26, 'subsample': 0.8403536627344244, 'colsample_bytree': 0.7141580570208755, 'reg_alpha': 2.045336154849232, 'reg_lambda': 1.0784119087472819, 'min_split_gain': 0.17069174630549644}. Best is trial 0 with value: 41204.31262134961.





✅ Best RMSE: 41204.31262134961
✅ Best Parameters: {'learning_rate': 0.039730220495763816, 'num_leaves': 210, 'max_depth': 11, 'min_child_samples': 18, 'subsample': 0.8602930077632327, 'colsample_bytree': 0.6962834149319597, 'reg_alpha': 4.731216779813845, 'reg_lambda': 1.7984216897061351, 'min_split_gain': 0.24840445491683466}
📁 Saved: oof_preds_lgbm.npy
[100]	training's l2: 1.4004e+09
[200]	training's l2: 8.47778e+08
[100]	training's l2: 1.34942e+09
[200]	training's l2: 8.13923e+08
[100]	training's l2: 1.18705e+09
[200]	training's l2: 7.70703e+08
✅ Final Union Feature Count: 60
[100]	train's l2: 1.29808e+09
[200]	train's l2: 8.24013e+08
[300]	train's l2: 6.53762e+08
[400]	train's l2: 5.48705e+08
[500]	train's l2: 4.75295e+08
[600]	train's l2: 4.1779e+08
[700]	train's l2: 3.76047e+08
[800]	train's l2: 3.38444e+08
[900]	train's l2: 3.08479e+08
[1000]	train's l2: 2.84088e+08
📁 Saved: test_preds_lgbm_shap.npy
📤 Submission saved: submission_lgbm_shap.csv


In [55]:
import numpy as np
import pandas as pd
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
from optuna.integration import XGBoostPruningCallback
from shap import TreeExplainer

# === STEP 0: Prepare Data ===
X_full = train_merged.copy()
y_full = pd.Series(y_train)
X_test = test_merged.copy()

bin_cols = [
    'building_value_growth_bin',
    'assessed_growth_bin',
    'land_value_growth_bin','year_built_bin'
]

for col in bin_cols:
    X_full[col] = X_full[col].cat.codes
    X_test[col] = X_test[col].cat.codes

categorical_cols = X_full.select_dtypes(include='object').columns.tolist()
X_full[categorical_cols] = X_full[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')

# === Global OOF Tracker ===
global_oof_preds = np.zeros(len(X_full))
best_score = float("inf")

# === STEP 1: Optuna Objective Function (No SHAP during tuning) ===
def objective(trial):
    global global_oof_preds, best_score

    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "learning_rate": trial.suggest_float("learning_rate", 0.047, 0.05, log=True),
        "max_depth": 6,
        "min_child_weight": trial.suggest_int("min_child_weight", 11, 12),
        "subsample": trial.suggest_float("subsample", 0.87, 0.89),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.74),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.30, 0.56, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.05, 0.11, log=True),
        "gamma": trial.suggest_float("gamma", 1.1, 4.3),
        "n_estimators": 1000,
        "n_jobs": -1,
        "enable_categorical": True,
    }

    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X_full))
    fold_rmse = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_full)):
        X_train, y_train_fold = X_full.iloc[train_idx], y_full.iloc[train_idx]
        X_val, y_val = X_full.iloc[val_idx], y_full.iloc[val_idx]

        model = XGBRegressor(
            **params,
            early_stopping_rounds=100,
            callbacks=[XGBoostPruningCallback(trial, "validation_0-rmse"),
                       ]
        )
        model.fit(X_train, y_train_fold, eval_set=[(X_val, y_val)], verbose=100)

        val_pred = model.predict(X_val)
        oof_preds[val_idx] = val_pred
        fold_rmse.append(root_mean_squared_error(y_val, val_pred))

    mean_rmse = np.mean(fold_rmse)
    trial.set_user_attr("cv_rmse", mean_rmse)

    if mean_rmse < best_score:
        best_score = mean_rmse
        global_oof_preds[:] = oof_preds

    print(f" Trial {trial.number} | CV RMSE: {mean_rmse:,.2f}")
    return mean_rmse

# === STEP 2: Run Optuna ===
study = optuna.create_study(
    direction='minimize',
    study_name='xgbreg_optuna_final_no_shap',
    pruner=optuna.pruners.SuccessiveHalvingPruner(min_resource=100, reduction_factor=2)
)
study.optimize(objective, n_trials=25, show_progress_bar=True)

print(" Best RMSE:", study.best_value)
print(" Best Parameters:", study.best_params)
np.save("oof_preds_xgbreg.npy", global_oof_preds)
print(" Saved: oof_preds_xgbreg.npy")

# === STEP 3: Post-Optuna SHAP + Gain Feature Selection ===
kf = KFold(n_splits=3, shuffle=True, random_state=42)
selected_feature_sets = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full)):
    X_train_raw, y_train_fold = X_full.iloc[train_idx], y_full.iloc[train_idx]

    model_temp = XGBRegressor(**study.best_params, n_estimators=200)
    model_temp.fit(X_train_raw, y_train_fold)

    # === SHAP Importance ===
    explainer = TreeExplainer(model_temp)
    shap_values = explainer.shap_values(X_train_raw)
    shap_df = pd.DataFrame(np.abs(shap_values), columns=X_train_raw.columns)
    shap_importance = shap_df.mean().sort_values(ascending=False)
    shap_cumsum = shap_importance.cumsum() / shap_importance.sum()
    top_shap = shap_cumsum[shap_cumsum <= 0.95].index.tolist()

    # === Gain Importance ===
    gain_importance = pd.Series(model_temp.feature_importances_, index=X_train_raw.columns)
    gain_sorted = gain_importance.sort_values(ascending=False)
    gain_cumsum = gain_sorted.cumsum() / gain_sorted.sum()
    top_gain = gain_cumsum[gain_cumsum <= 0.95].index.tolist()

    selected_features = list(set(top_shap).union(set(top_gain)))
    selected_feature_sets.append(selected_features)

# === STEP 4: Final Feature Union ===
final_union_features = list(set().union(*selected_feature_sets))
print(" Final Union Feature Count:", len(final_union_features))

# === STEP 5: Train Final Model with Early Stopping on Union Features ===
X_full_selected = X_full[final_union_features]
X_test_selected = X_test[final_union_features]

final_model = XGBRegressor(**study.best_params)
final_model.set_params(n_estimators=1000, verbosity=1, early_stopping_rounds=100)
final_model.fit(X_full_selected, y_full, eval_set=[(X_full_selected, y_full)], verbose=100)

# === STEP 6: Predict on Test Set ===
test_preds = final_model.predict(X_test_selected)
np.save("test_preds_xgbreg.npy", test_preds)
print(" Saved: test_preds_xgbreg.npy")

# === STEP 7: Create Submission File ===
account_ids = acct_test.values.ravel()  # Replace with actual ID column
submission = pd.DataFrame({
    'ACCOUNT': account_ids,
    'TARGET': test_preds
})
submission.to_csv("submission_xgbreg.csv", index=False)
print(" Submission saved: submission_xgbreg.csv")


[I 2025-05-21 08:49:11,193] A new study created in memory with name: xgbreg_optuna_final_no_shap
  0%|          | 0/25 [00:00<?, ?it/s]

[0]	validation_0-rmse:298008.09165
[100]	validation_0-rmse:43291.66107
[200]	validation_0-rmse:41955.74571
[300]	validation_0-rmse:41342.78856
[400]	validation_0-rmse:40694.30373
[500]	validation_0-rmse:40257.10117
[600]	validation_0-rmse:39941.51384
[700]	validation_0-rmse:39659.15918
[800]	validation_0-rmse:39490.57387
[900]	validation_0-rmse:39309.40731
[999]	validation_0-rmse:39178.99984
[0]	validation_0-rmse:301343.55001




[100]	validation_0-rmse:43633.89491




[200]	validation_0-rmse:42121.58462




[300]	validation_0-rmse:41172.17645




[400]	validation_0-rmse:40406.67757




[500]	validation_0-rmse:39897.28876




[600]	validation_0-rmse:39393.62353




[700]	validation_0-rmse:39012.23096




[800]	validation_0-rmse:38743.93992




[900]	validation_0-rmse:38482.59508




[999]	validation_0-rmse:38318.32360




[0]	validation_0-rmse:306099.73562




[100]	validation_0-rmse:50000.96847




[200]	validation_0-rmse:48882.94213




[300]	validation_0-rmse:48277.67689




[400]	validation_0-rmse:47772.75817




[500]	validation_0-rmse:47194.99235




[600]	validation_0-rmse:46630.32761




[700]	validation_0-rmse:46375.34017




[800]	validation_0-rmse:46072.81374




[900]	validation_0-rmse:45828.59745




[999]	validation_0-rmse:45678.48864


Best trial: 0. Best value: 41058.6:   4%|▍         | 1/25 [09:30<3:48:12, 570.53s/it]

 Trial 0 | CV RMSE: 41,058.60
[I 2025-05-21 08:58:41,716] Trial 0 finished with value: 41058.604166666664 and parameters: {'learning_rate': 0.04869675093251221, 'min_child_weight': 11, 'subsample': 0.8857951150868569, 'colsample_bytree': 0.7374817458794671, 'reg_alpha': 0.3152961170624885, 'reg_lambda': 0.06301071787244443, 'gamma': 1.4782920262535209}. Best is trial 0 with value: 41058.604166666664.
[0]	validation_0-rmse:298388.82953
[100]	validation_0-rmse:43632.89920


Best trial: 0. Best value: 41058.6:   8%|▊         | 2/25 [09:55<1:35:46, 249.83s/it]

[I 2025-05-21 08:59:07,064] Trial 1 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298439.19875


Best trial: 0. Best value: 41058.6:  12%|█▏        | 3/25 [10:20<53:58, 147.21s/it]  

[I 2025-05-21 08:59:32,160] Trial 2 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298195.27940
[100]	validation_0-rmse:43860.08307


Best trial: 0. Best value: 41058.6:  16%|█▌        | 4/25 [10:45<34:36, 98.87s/it] 

[I 2025-05-21 08:59:56,916] Trial 3 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:297656.79112
[100]	validation_0-rmse:43657.46969


Best trial: 0. Best value: 41058.6:  20%|██        | 5/25 [11:11<24:11, 72.57s/it]

[I 2025-05-21 09:00:22,843] Trial 4 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298299.96979


Best trial: 0. Best value: 41058.6:  24%|██▍       | 6/25 [11:36<17:51, 56.41s/it]

[I 2025-05-21 09:00:47,902] Trial 5 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298418.05635
[100]	validation_0-rmse:43613.99989
[200]	validation_0-rmse:42129.13212


Best trial: 0. Best value: 41058.6:  28%|██▊       | 7/25 [12:19<15:36, 52.04s/it]

[I 2025-05-21 09:01:30,950] Trial 6 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298475.38298
[100]	validation_0-rmse:43325.46059
[200]	validation_0-rmse:41877.19686
[300]	validation_0-rmse:41134.46557
[400]	validation_0-rmse:40644.02103
[500]	validation_0-rmse:40183.99235
[600]	validation_0-rmse:39859.09795
[700]	validation_0-rmse:39625.19067
[800]	validation_0-rmse:39413.51210
[900]	validation_0-rmse:39247.20538
[999]	validation_0-rmse:39095.49060
[0]	validation_0-rmse:301796.32780




[100]	validation_0-rmse:44104.29825




[200]	validation_0-rmse:42123.79447




[300]	validation_0-rmse:41206.93334




[400]	validation_0-rmse:40423.92597




[500]	validation_0-rmse:39872.75975




[600]	validation_0-rmse:39426.94347




[700]	validation_0-rmse:39052.35720




[800]	validation_0-rmse:38808.22977




[900]	validation_0-rmse:38574.15905




[999]	validation_0-rmse:38352.44667




[0]	validation_0-rmse:306540.65998




[100]	validation_0-rmse:49761.06708




[200]	validation_0-rmse:48584.13069




[300]	validation_0-rmse:48074.08250




[400]	validation_0-rmse:47615.65540




[500]	validation_0-rmse:47038.38206




[600]	validation_0-rmse:46567.64522




[700]	validation_0-rmse:46224.88176




[800]	validation_0-rmse:45919.10568




[900]	validation_0-rmse:45671.61614




[999]	validation_0-rmse:45499.46488


Best trial: 7. Best value: 40980.9:  32%|███▏      | 8/25 [22:17<1:03:57, 225.72s/it]

 Trial 7 | CV RMSE: 40,980.93
[I 2025-05-21 09:11:28,541] Trial 7 finished with value: 40980.930989583336 and parameters: {'learning_rate': 0.04720857261312059, 'min_child_weight': 11, 'subsample': 0.8886675210033114, 'colsample_bytree': 0.7269521202249652, 'reg_alpha': 0.3033016426842381, 'reg_lambda': 0.07450056363162388, 'gamma': 1.9947747256925998}. Best is trial 7 with value: 40980.930989583336.
[0]	validation_0-rmse:298350.46133
[100]	validation_0-rmse:43734.82880


Best trial: 7. Best value: 40980.9:  36%|███▌      | 9/25 [22:41<43:24, 162.77s/it]  

[I 2025-05-21 09:11:52,885] Trial 8 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298417.92154


Best trial: 7. Best value: 40980.9:  40%|████      | 10/25 [23:07<30:07, 120.48s/it]

[I 2025-05-21 09:12:18,672] Trial 9 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:298165.92951
[100]	validation_0-rmse:43408.56298


Best trial: 7. Best value: 40980.9:  44%|████▍     | 11/25 [23:52<22:42, 97.31s/it] 

[I 2025-05-21 09:13:03,442] Trial 10 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298017.23390
[100]	validation_0-rmse:43436.16303


Best trial: 7. Best value: 40980.9:  48%|████▊     | 12/25 [24:35<17:29, 80.72s/it]

[I 2025-05-21 09:13:46,215] Trial 11 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298124.76390
[100]	validation_0-rmse:43215.33082
[200]	validation_0-rmse:41942.40371
[300]	validation_0-rmse:41204.32419
[400]	validation_0-rmse:40681.36497


Best trial: 7. Best value: 40980.9:  52%|█████▏    | 13/25 [25:55<16:06, 80.52s/it]

[I 2025-05-21 09:15:06,291] Trial 12 pruned. Trial was pruned at iteration 400.
[0]	validation_0-rmse:297953.68628
[100]	validation_0-rmse:43326.60915


Best trial: 7. Best value: 40980.9:  56%|█████▌    | 14/25 [26:37<12:39, 69.03s/it]

[I 2025-05-21 09:15:48,749] Trial 13 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298189.77001
[100]	validation_0-rmse:43220.71401
[200]	validation_0-rmse:41951.24734
[300]	validation_0-rmse:41192.58903
[400]	validation_0-rmse:40637.10472
[500]	validation_0-rmse:40121.78753
[600]	validation_0-rmse:39782.28487
[700]	validation_0-rmse:39508.38039
[800]	validation_0-rmse:39305.76908
[900]	validation_0-rmse:39141.08464
[999]	validation_0-rmse:38996.65355
[0]	validation_0-rmse:301525.73128




[100]	validation_0-rmse:43520.85700




[200]	validation_0-rmse:41756.03744




[300]	validation_0-rmse:40804.85231




[400]	validation_0-rmse:40125.41177




[500]	validation_0-rmse:39587.52759




[600]	validation_0-rmse:39148.86749




[700]	validation_0-rmse:38865.82938




[800]	validation_0-rmse:38590.82301




[900]	validation_0-rmse:38346.66845




[999]	validation_0-rmse:38166.58734




[0]	validation_0-rmse:306291.79949




[100]	validation_0-rmse:50113.06954




[200]	validation_0-rmse:48855.12295




[300]	validation_0-rmse:48460.83678




[400]	validation_0-rmse:48125.79375




[500]	validation_0-rmse:47667.79681




[600]	validation_0-rmse:47242.73868




[700]	validation_0-rmse:46885.53296




[800]	validation_0-rmse:46665.99360




[900]	validation_0-rmse:46488.15741




[999]	validation_0-rmse:46293.14752


Best trial: 7. Best value: 40980.9:  60%|██████    | 15/25 [35:56<36:06, 216.68s/it]

 Trial 14 | CV RMSE: 41,152.13
[I 2025-05-21 09:25:07,615] Trial 14 finished with value: 41152.127604166664 and parameters: {'learning_rate': 0.04809801404702377, 'min_child_weight': 11, 'subsample': 0.8827518292391843, 'colsample_bytree': 0.7219966846776295, 'reg_alpha': 0.34761835495727206, 'reg_lambda': 0.08116693084448152, 'gamma': 2.2251467315358364}. Best is trial 7 with value: 40980.930989583336.
[0]	validation_0-rmse:297838.22286
[100]	validation_0-rmse:43053.25935
[200]	validation_0-rmse:41909.97448
[300]	validation_0-rmse:41324.32506


Best trial: 7. Best value: 40980.9:  64%|██████▍   | 16/25 [37:16<26:18, 175.41s/it]

[I 2025-05-21 09:26:27,203] Trial 15 pruned. Trial was pruned at iteration 400.
[0]	validation_0-rmse:298243.25997
[100]	validation_0-rmse:43157.96836
[200]	validation_0-rmse:41736.58304
[300]	validation_0-rmse:40990.70294
[400]	validation_0-rmse:40487.62251
[500]	validation_0-rmse:40064.07732
[600]	validation_0-rmse:39682.52840
[700]	validation_0-rmse:39403.37084
[800]	validation_0-rmse:39232.24348
[900]	validation_0-rmse:39085.63289
[999]	validation_0-rmse:38933.86272
[0]	validation_0-rmse:301579.35377




[100]	validation_0-rmse:43930.66712




[200]	validation_0-rmse:42220.89088




[300]	validation_0-rmse:41165.25430




[400]	validation_0-rmse:40416.73355




[500]	validation_0-rmse:39858.82492




[600]	validation_0-rmse:39370.68034




[700]	validation_0-rmse:39011.28266




[800]	validation_0-rmse:38749.79879




[900]	validation_0-rmse:38540.12210




[999]	validation_0-rmse:38345.32415




[0]	validation_0-rmse:306329.19988




[100]	validation_0-rmse:50038.53526




[200]	validation_0-rmse:48900.47724




[300]	validation_0-rmse:48621.77605




[400]	validation_0-rmse:48039.57604




[500]	validation_0-rmse:47538.66067




[600]	validation_0-rmse:47102.69284




[700]	validation_0-rmse:46827.45648




[800]	validation_0-rmse:46541.02322




[900]	validation_0-rmse:46275.20764




[999]	validation_0-rmse:46108.16210


Best trial: 7. Best value: 40980.9:  68%|██████▊   | 17/25 [46:56<39:37, 297.23s/it]

 Trial 16 | CV RMSE: 41,129.12
[I 2025-05-21 09:36:07,718] Trial 16 finished with value: 41129.115885416664 and parameters: {'learning_rate': 0.04791773402057432, 'min_child_weight': 11, 'subsample': 0.8831280659402007, 'colsample_bytree': 0.7326036611500717, 'reg_alpha': 0.3314633519862176, 'reg_lambda': 0.07924203984629284, 'gamma': 2.234305686215955}. Best is trial 7 with value: 40980.930989583336.
[0]	validation_0-rmse:298034.03555
[100]	validation_0-rmse:43102.85154
[200]	validation_0-rmse:41904.06535
[300]	validation_0-rmse:41098.14690
[400]	validation_0-rmse:40521.89540
[500]	validation_0-rmse:40107.88188
[600]	validation_0-rmse:39869.08631
[700]	validation_0-rmse:39611.40133
[800]	validation_0-rmse:39389.95421


Best trial: 7. Best value: 40980.9:  72%|███████▏  | 18/25 [49:38<29:56, 256.70s/it]

[I 2025-05-21 09:38:50,083] Trial 17 pruned. Trial was pruned at iteration 800.
[0]	validation_0-rmse:298494.84382


Best trial: 7. Best value: 40980.9:  76%|███████▌  | 19/25 [50:03<18:41, 186.97s/it]

[I 2025-05-21 09:39:14,620] Trial 18 pruned. Trial was pruned at iteration 100.
[0]	validation_0-rmse:297904.69490
[100]	validation_0-rmse:43110.03401
[200]	validation_0-rmse:41777.64879
[300]	validation_0-rmse:41150.28459
[400]	validation_0-rmse:40622.58295
[500]	validation_0-rmse:40137.38302
[600]	validation_0-rmse:39872.00910
[700]	validation_0-rmse:39606.86207


Best trial: 7. Best value: 40980.9:  80%|████████  | 20/25 [52:38<14:47, 177.41s/it]

[I 2025-05-21 09:41:49,733] Trial 19 pruned. Trial was pruned at iteration 800.
[0]	validation_0-rmse:297756.60063
[100]	validation_0-rmse:43401.72496


Best trial: 7. Best value: 40980.9:  84%|████████▍ | 21/25 [53:23<09:10, 137.50s/it]

[I 2025-05-21 09:42:34,198] Trial 20 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298258.82744
[100]	validation_0-rmse:43135.97314
[200]	validation_0-rmse:41899.69652
[300]	validation_0-rmse:41136.82774
[400]	validation_0-rmse:40655.96349


Best trial: 7. Best value: 40980.9:  88%|████████▊ | 22/25 [54:42<06:00, 120.11s/it]

[I 2025-05-21 09:43:53,748] Trial 21 pruned. Trial was pruned at iteration 400.
[0]	validation_0-rmse:298231.78725
[100]	validation_0-rmse:43163.01262
[200]	validation_0-rmse:41780.89193
[300]	validation_0-rmse:40993.59481
[400]	validation_0-rmse:40410.82802
[500]	validation_0-rmse:39975.92285
[600]	validation_0-rmse:39667.51813
[700]	validation_0-rmse:39430.66891
[800]	validation_0-rmse:39239.04460
[900]	validation_0-rmse:39098.28936
[999]	validation_0-rmse:38956.33639
[0]	validation_0-rmse:301567.50508




[100]	validation_0-rmse:44145.80393




[200]	validation_0-rmse:42170.60777




[300]	validation_0-rmse:41167.11120




[400]	validation_0-rmse:40414.05901




[500]	validation_0-rmse:39866.97169




[600]	validation_0-rmse:39396.59127




[700]	validation_0-rmse:39098.02530




[800]	validation_0-rmse:38866.49053




[900]	validation_0-rmse:38629.84523




[999]	validation_0-rmse:38418.65690




[0]	validation_0-rmse:306317.64758




[100]	validation_0-rmse:50342.56307




[200]	validation_0-rmse:49190.82811




[300]	validation_0-rmse:48721.98557




[400]	validation_0-rmse:48172.84712




[500]	validation_0-rmse:47610.18921




[600]	validation_0-rmse:47196.09894




[700]	validation_0-rmse:46939.99339




[800]	validation_0-rmse:46703.33968




[900]	validation_0-rmse:46465.66973




[999]	validation_0-rmse:46288.54979


Best trial: 7. Best value: 40980.9:  92%|█████████▏| 23/25 [1:04:13<08:30, 255.28s/it]

 Trial 22 | CV RMSE: 41,221.12
[I 2025-05-21 09:53:24,296] Trial 22 finished with value: 41221.122395833336 and parameters: {'learning_rate': 0.04795844764338198, 'min_child_weight': 11, 'subsample': 0.8843044148774571, 'colsample_bytree': 0.7321968578539245, 'reg_alpha': 0.3419528694410741, 'reg_lambda': 0.07810443878106536, 'gamma': 2.45588745781989}. Best is trial 7 with value: 40980.930989583336.
[0]	validation_0-rmse:298103.99696
[100]	validation_0-rmse:43244.57292
[200]	validation_0-rmse:42065.85709


Best trial: 7. Best value: 40980.9:  96%|█████████▌| 24/25 [1:04:55<03:11, 191.53s/it]

[I 2025-05-21 09:54:07,110] Trial 23 pruned. Trial was pruned at iteration 200.
[0]	validation_0-rmse:298290.09849
[100]	validation_0-rmse:43226.69084
[200]	validation_0-rmse:41752.71323
[300]	validation_0-rmse:41085.52337
[400]	validation_0-rmse:40469.25339
[500]	validation_0-rmse:40023.69821
[600]	validation_0-rmse:39751.47708
[700]	validation_0-rmse:39490.67199
[800]	validation_0-rmse:39294.72189
[900]	validation_0-rmse:39112.44497
[999]	validation_0-rmse:38960.93966
[0]	validation_0-rmse:301625.81619




[100]	validation_0-rmse:43861.84812




[200]	validation_0-rmse:42223.12222




[300]	validation_0-rmse:41249.17322




[400]	validation_0-rmse:40417.83394




[500]	validation_0-rmse:39888.76973




[600]	validation_0-rmse:39460.26270




[700]	validation_0-rmse:39100.90504




[800]	validation_0-rmse:38837.66781




[900]	validation_0-rmse:38603.68688




[999]	validation_0-rmse:38400.95640




[0]	validation_0-rmse:306375.08520




[100]	validation_0-rmse:50063.32655




[200]	validation_0-rmse:48963.97715




[300]	validation_0-rmse:48397.59245




[400]	validation_0-rmse:47928.17547




[500]	validation_0-rmse:47470.69584




[600]	validation_0-rmse:46994.83782




[700]	validation_0-rmse:46723.80273




[800]	validation_0-rmse:46477.73816




[900]	validation_0-rmse:46197.97602




[999]	validation_0-rmse:46006.11736


Best trial: 7. Best value: 40980.9: 100%|██████████| 25/25 [1:14:33<00:00, 178.95s/it]

 Trial 24 | CV RMSE: 41,122.67
[I 2025-05-21 10:03:44,905] Trial 24 finished with value: 41122.671875 and parameters: {'learning_rate': 0.04775969184158978, 'min_child_weight': 11, 'subsample': 0.8864709507691806, 'colsample_bytree': 0.7287450806475548, 'reg_alpha': 0.3330835373597742, 'reg_lambda': 0.06730794122465343, 'gamma': 2.119827910421352}. Best is trial 7 with value: 40980.930989583336.
 Best RMSE: 40980.930989583336
 Best Parameters: {'learning_rate': 0.04720857261312059, 'min_child_weight': 11, 'subsample': 0.8886675210033114, 'colsample_bytree': 0.7269521202249652, 'reg_alpha': 0.3033016426842381, 'reg_lambda': 0.07450056363162388, 'gamma': 1.9947747256925998}
 Saved: oof_preds_xgbreg.npy





 Final Union Feature Count: 112
[0]	validation_0-rmse:302184.48445
[100]	validation_0-rmse:38617.83791
[200]	validation_0-rmse:34073.80065
[300]	validation_0-rmse:31526.93363
[400]	validation_0-rmse:29586.13995
[500]	validation_0-rmse:28104.49179
[600]	validation_0-rmse:26872.00274
[700]	validation_0-rmse:25883.70149
[800]	validation_0-rmse:24981.54000
[900]	validation_0-rmse:24182.99681
[999]	validation_0-rmse:23498.29818
 Saved: test_preds_xgbreg.npy
 Submission saved: submission_xgbreg.csv


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# === Compute Residuals ===
residuals = y_full - global_oof_preds
res_df = X_full.copy()
res_df["actual"] = y_full
res_df["predicted"] = global_oof_preds
res_df["residual"] = residuals

# === Protest Count (2015–2018) ===
protest_cols = [f"protested_{year}" for year in range(2015, 2019)]
if all(col in res_df.columns for col in protest_cols):
    res_df["protest_count"] = res_df[protest_cols].sum(axis=1)

# === Plot 1: Residuals vs Actual Value ===
plt.figure(figsize=(8, 6))
sns.scatterplot(data=res_df, x="actual", y="residual", alpha=0.3)
plt.axhline(0, color="red", linestyle="--")
plt.title("Residuals vs. Actual Value")
plt.xlabel("Actual Value")
plt.ylabel("Residual (Actual - Predicted)")
plt.tight_layout()
plt.savefig("residuals_vs_actual_value.png", dpi=300)
plt.close()

# === Plot 2: Residuals by Protest Count ===
if "protest_count" in res_df.columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x="protest_count", y="residual", data=res_df)
    plt.axhline(0, color="red", linestyle="--")
    plt.title("Residuals by Protest Count (2015–2018)")
    plt.xlabel("Number of Years Protested")
    plt.ylabel("Residual")
    plt.tight_layout()
    plt.savefig("residuals_by_protest_count.png", dpi=300)
    plt.close()

# === Plot 3: Residuals vs. Neighborhood Frequency ===
if "neighborhood_freq" in res_df.columns:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x="neighborhood_freq", y="residual", data=res_df, alpha=0.3)
    plt.axhline(0, color="red", linestyle="--")
    plt.title("Residuals vs. Neighborhood Frequency")
    plt.xlabel("Neighborhood Frequency")
    plt.ylabel("Residual")
    plt.tight_layout()
    plt.savefig("residuals_vs_neighborhood_freq.png", dpi=300)
    plt.close()


In [None]:
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt

# === SHAP Setup ===
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_full_selected)

# === SHAP DataFrame ===
shap_df = pd.DataFrame(np.abs(shap_values), columns=X_full_selected.columns)
shap_means = shap_df.mean().sort_values(ascending=False)
shap_importance_df = shap_means.reset_index()
shap_importance_df.columns = ["feature", "mean_shap"]

# === SHAP Summary Plot: Top 30 ===
plt.figure()
shap.summary_plot(shap_values, X_full_selected, max_display=30, show=False)
plt.tight_layout()
plt.savefig("shap_summary_top30_union.png", dpi=300)
plt.close()

# === SHAP Summary Plot: Bottom 30 ===
bottom_features = shap_importance_df.tail(30)["feature"].tolist()
shap.summary_plot(
    shap_values[:, [X_full_selected.columns.get_loc(f) for f in bottom_features]],
    X_full_selected[bottom_features],
    show=False
)
plt.tight_layout()
plt.savefig("shap_summary_bottom30_union.png", dpi=300)
plt.close()


AttributeError: 'XGBRegressor' object has no attribute 'feature_importance'