In [5]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '1'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES = 500, [0.1, 0.5, 0.9]

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","total_population","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segment(row, df_q):
    """Robust segmentation that handles missing columns gracefully"""
    # Size tier - use available size metrics
    size = 'medium'
    if 'living_sqft' in df_q.columns and 'living_sqft' in row.index:
        sqft = row.get('living_sqft',0)
        if sqft > 0:
            q25,q50,q75 = df_q['living_sqft'].quantile([.25,.5,.75])
            size = 'compact' if sqft<q25 else 'medium' if sqft<q50 else 'large' if sqft<q75 else 'xlarge'

    # Age tier
    age_class = 'mature'
    if 'property_age' in row.index:
        age = row.get('property_age',50)
        age_class = 'new' if age<=5 else 'recent' if age<=15 else 'mature' if age<=30 else 'aging' if age<=50 else 'old'
    elif 'year_built' in row.index:
        yb = row.get('year_built',1980)
        age = 2024 - yb
        age_class = 'new' if age<=5 else 'recent' if age<=15 else 'mature' if age<=30 else 'aging' if age<=50 else 'old'

    # Location quality
    loc = 'mid'
    if INCLUDE_CENSUS and 'median_household_income' in df_q.columns and 'median_household_income' in row.index:
        inc,edu = row.get('median_household_income',0), row.get('pct_bachelors_degree',0)
        iq33,iq67 = df_q['median_household_income'].quantile([.33,.67])
        eq33,eq67 = df_q['pct_bachelors_degree'].quantile([.33,.67]) if 'pct_bachelors_degree' in df_q.columns else (0,0)
        loc = 'prime' if inc>iq67 or edu>eq67 else 'basic' if inc<iq33 and edu<eq33 else 'mid'

    # Luxury tier
    lux_tier = 'mid'
    if 'luxury_score' in df_q.columns and 'luxury_score' in row.index:
        lux = row.get('luxury_score',0)
        lq33,lq67 = df_q['luxury_score'].quantile([.33,.67])
        lux_tier = 'upscale' if lux>lq67 else 'standard' if lux<lq33 else 'mid'

    return f"{size}_{age_class}_{loc}_{lux_tier}"

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror',quantile_alpha=q,n_estimators=N_EST,learning_rate=.05,max_depth=6,min_child_weight=3,subsample=.8,colsample_bytree=.8,random_state=RAND_STATE,n_jobs=N_JOBS,tree_method='hist').fit(X,y,verbose=False)

def feature_importance(models, feat_names, metrics):
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","importance"])
    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(tg=("wg","sum")).sort_values("tg",ascending=False)
    out["importance"] = out["tg"]/out["tg"].sum()
    return out[["feature","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")
    df['seg'] = df.apply(lambda r: assign_segment(r,df), axis=1)
    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(10).items(): print(f"  {seg}: {cnt:,}")
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")
    models, metrics, preds_list = {}, {}, []
    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models
        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))
    return {'models':models,'metrics':metrics,'predictions':pd.concat(preds_list),'feature_importance':feature_importance(models,feats,metrics),'feature_names':feats}

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/importance_{ts}.csv",index=False)
    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f}")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available

Training segmented models on 127,258 properties



KeyboardInterrupt



In [13]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","total_population","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Segment based on PROPERTY CHARACTERISTICS only - NO PRICE"""

    # Primary: SIZE (living sqft)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        q33, q67 = sqft.quantile([.33, .67])
        size_tier = pd.Series(['medium'] * len(df), index=df.index)
        size_tier[sqft < q33] = 'small'
        size_tier[sqft > q67] = 'large'
    else:
        size_tier = pd.Series(['medium'] * len(df), index=df.index)

    # Secondary: BEDROOMS (property complexity)
    if 'bedrooms' in df.columns:
        beds = df['bedrooms'].fillna(df['bedrooms'].median())
        beds_median = beds.median()
        room_tier = pd.Series(['standard'] * len(df), index=df.index)
        room_tier[beds <= 2] = 'compact'
        room_tier[beds >= 4] = 'family'
    else:
        room_tier = pd.Series(['standard'] * len(df), index=df.index)

    # Tertiary (optional): AGE
    if 'year_built' in df.columns:
        age = 2024 - df['year_built'].fillna(1980)
        age_tier = pd.Series(['mature'] * len(df), index=df.index)
        age_tier[age <= 10] = 'new'
        age_tier[age > 40] = 'older'
    else:
        age_tier = pd.Series(['mature'] * len(df), index=df.index)

    # Combine into segments
    # Format: size_rooms_age (e.g., "small_compact_new", "large_family_mature")
    segments = size_tier + '_' + room_tier + '_' + age_tier

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","importance"])
    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(tg=("wg","sum")).sort_values("tg",ascending=False)
    out["importance"] = out["tg"]/out["tg"].sum()
    return out[["feature","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Segment based on property characteristics (NO PRICE)
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'], ws['A1'].font = 'Global Feature Importance (Weighted)', Font(bold=True,size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance (one sheet per segment)
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]  # Excel sheet name limit
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:  # Header row
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/global_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/importance_{seg_name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available

Training segmented models on 127,258 properties
27 segments created
  large_family_mature: 19,246
  medium_family_older: 17,021
  large_family_older: 12,118
  small_standard_older: 11,606
  medium_family_mature: 10,980
  small_family_older: 7,384
  small_compact_mature: 6,931
Consolidated to 7 segments maximum
  other: 48,903→38,254 (21.8% filtered)
  other: 11,476 test | MAE:$620,127 | MAPE:26.91% | R²:0.212
  large_family_older: 12,118→8,747 (27.8% filtered)
  large_family_older: 2,624 test | MAE:$393,518 | MAPE:24.01% | R²:0.445
  small_family_older: 7,384→5,453 (26.2% filtered)
  small_family_older: 1,636 test | MAE:$471,420 | MAPE:21.50% | R²:0.104
  small_standard_older: 11,606→8,712 (24.9% filtered)
  small_standard_olde

In [15]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","total_population","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Segment based on PROPERTY CHARACTERISTICS only - NO PRICE"""

    # Primary: SIZE (living sqft)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        q33, q67 = sqft.quantile([.33, .67])
        size_tier = pd.Series(['medium'] * len(df), index=df.index)
        size_tier[sqft < q33] = 'small'
        size_tier[sqft > q67] = 'large'
    else:
        size_tier = pd.Series(['medium'] * len(df), index=df.index)

    # Secondary: BEDROOMS (property complexity)
    if 'bedrooms' in df.columns:
        beds = df['bedrooms'].fillna(df['bedrooms'].median())
        beds_median = beds.median()
        room_tier = pd.Series(['standard'] * len(df), index=df.index)
        room_tier[beds <= 2] = 'compact'
        room_tier[beds >= 4] = 'family'
    else:
        room_tier = pd.Series(['standard'] * len(df), index=df.index)

    # Tertiary (optional): AGE
    if 'year_built' in df.columns:
        age = 2024 - df['year_built'].fillna(1980)
        age_tier = pd.Series(['mature'] * len(df), index=df.index)
        age_tier[age <= 10] = 'new'
        age_tier[age > 40] = 'older'
    else:
        age_tier = pd.Series(['mature'] * len(df), index=df.index)

    # Combine into segments
    # Format: size_rooms_age (e.g., "small_compact_new", "large_family_mature")
    segments = size_tier + '_' + room_tier + '_' + age_tier

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size - NOW INCLUDES GAIN"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate total gain per feature (summing across segments)
    total_gain_per_feature = df.groupby("feature")["gain"].sum().reset_index()
    total_gain_per_feature.columns = ["feature", "total_gain"]

    # Calculate weighted gain and importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain", "sum"),
        weighted_gain=("wg","sum")
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Segment based on property characteristics (NO PRICE)
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance (one sheet per segment)
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]  # Excel sheet name limit
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:  # Header row
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/global_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/importance_{seg_name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available

Training segmented models on 127,258 properties
27 segments created
  large_family_mature: 19,246
  medium_family_older: 17,021
  large_family_older: 12,118
  small_standard_older: 11,606
  medium_family_mature: 10,980
  small_family_older: 7,384
  small_compact_mature: 6,931
Consolidated to 7 segments maximum
  other: 48,903→38,254 (21.8% filtered)
  other: 11,476 test | MAE:$620,127 | MAPE:26.91% | R²:0.212
  large_family_older: 12,118→8,747 (27.8% filtered)
  large_family_older: 2,624 test | MAE:$393,518 | MAPE:24.01% | R²:0.445
  small_family_older: 7,384→5,453 (26.2% filtered)
  small_family_older: 1,636 test | MAE:$471,420 | MAPE:21.50% | R²:0.104
  small_standard_older: 11,606→8,712 (24.9% filtered)
  small_standard_olde

In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES = 100, [0.1, 0.5, 0.9]  # REDUCED from 500 to 100

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","total_population","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segment(row, df_q):
    """Robust segmentation with tighter thresholds to create more segments"""
    # Size tier - use tighter quantiles
    size = 'medium'
    if 'living_sqft' in df_q.columns and 'living_sqft' in row.index:
        sqft = row.get('living_sqft',0)
        if sqft > 0:
            q20,q40,q60,q80 = df_q['living_sqft'].quantile([.2,.4,.6,.8])
            if sqft < q20: size = 'tiny'
            elif sqft < q40: size = 'small'
            elif sqft < q60: size = 'medium'
            elif sqft < q80: size = 'large'
            else: size = 'xlarge'

    # Age tier - more granular
    age_class = 'mature'
    if 'property_age' in row.index:
        age = row.get('property_age',50)
        if age <= 3: age_class = 'new'
        elif age <= 10: age_class = 'recent'
        elif age <= 25: age_class = 'mature'
        elif age <= 45: age_class = 'aging'
        else: age_class = 'old'
    elif 'year_built' in row.index:
        yb = row.get('year_built',1980)
        age = 2024 - yb
        if age <= 3: age_class = 'new'
        elif age <= 10: age_class = 'recent'
        elif age <= 25: age_class = 'mature'
        elif age <= 45: age_class = 'aging'
        else: age_class = 'old'

    # Location quality - use tighter quantiles
    loc = 'mid'
    if INCLUDE_CENSUS and 'median_household_income' in df_q.columns and 'median_household_income' in row.index:
        inc = row.get('median_household_income',0)
        iq25,iq75 = df_q['median_household_income'].quantile([.25,.75])
        if inc > iq75: loc = 'prime'
        elif inc < iq25: loc = 'basic'
        else: loc = 'mid'

    # Luxury tier - use tighter quantiles
    lux_tier = 'mid'
    if 'luxury_score' in df_q.columns and 'luxury_score' in row.index:
        lux = row.get('luxury_score',0)
        lq25,lq75 = df_q['luxury_score'].quantile([.25,.75])
        if lux > lq75: lux_tier = 'upscale'
        elif lux < lq25: lux_tier = 'standard'
        else: lux_tier = 'mid'

    return f"{size}_{age_class}_{loc}_{lux_tier}"

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    # FIXED: Optimized parameters for speed
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,  # Now 100 instead of 500
        learning_rate=.1,     # INCREASED from .05 to .1 for faster convergence
        max_depth=5,          # REDUCED from 6 to 5
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def feature_importance(models, feat_names, metrics):
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","importance"])
    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(tg=("wg","sum")).sort_values("tg",ascending=False)
    out["importance"] = out["tg"]/out["tg"].sum()
    return out[["feature","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")
    df['seg'] = df.apply(lambda r: assign_segment(r,df), axis=1)
    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(10).items(): print(f"  {seg}: {cnt:,}")
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")
    models, metrics, preds_list = {}, {}, []
    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models
        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))
    return {'models':models,'metrics':metrics,'predictions':pd.concat(preds_list),'feature_importance':feature_importance(models,feats,metrics),'feature_names':feats}

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/importance_{ts}.csv",index=False)
    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f}")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available

Training segmented models on 127,258 properties


KeyboardInterrupt: 

In [9]:
pd.read_csv('/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv')

Unnamed: 0,PROPERTY_ID,SALE_PRICE,SALE_DATE,LATITUDE,LONGITUDE,STATE,CITY,ZIP,CENSUS_TRACT,YEAR_BUILT,...,VOTES_GOP,VOTES_DEM,TOTAL_VOTES,PER_GOP,PER_DEM,PER_POINT_DIFF,DEM_MARGIN,REP_MARGIN,POLITICAL_LEAN_STRENGTH,STATE_FIPS
0,173029632,100000000,2024-07-22,39.321025,-84.639359,oh,fairfield,45014.0,10800.0,1990.0,...,114392,69613,186737,0.612583,0.372786,0.239797,-0.127214,0.112583,0.239797,39
1,91476555,100000000,2021-04-19,40.744511,-73.637589,ny,mineola,11501.0,303600.0,2017.0,...,326716,396504,732756,0.445873,0.541113,-0.095240,0.041113,-0.054127,0.095240,36
2,94792198,100000000,2023-06-21,40.930592,-73.893787,ny,yonkers,10701.0,300.0,1965.0,...,144713,312371,462122,0.313149,0.675949,-0.362800,0.175949,-0.186851,0.362800,36
3,107312767,99000000,2021-10-05,39.332883,-84.222479,oh,maineville,45039.0,32203.0,2005.0,...,87988,46069,136100,0.646495,0.338494,0.308001,-0.161506,0.146495,0.308001,39
4,101158806,98750000,2023-07-19,40.747580,-82.212181,oh,jeromesville,44840.0,970900.0,2005.0,...,19407,6541,26405,0.734974,0.247718,0.487256,-0.252282,0.234974,0.487256,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127321,93412006,10000,2022-09-08,44.181778,-75.069692,ny,oswegatchie,13670.0,492501.0,1890.0,...,24608,19361,44907,0.547977,0.431135,0.116841,-0.068865,0.047977,0.116841,36
127322,106448552,10000,2024-08-13,38.745783,-82.975813,oh,portsmouth,45662.0,3400.0,1919.0,...,22609,9080,32047,0.705495,0.283334,0.422161,-0.216666,0.205495,0.422161,39
127323,105735114,10000,2016-07-22,39.747802,-84.266273,oh,dayton,45417.0,4400.0,1932.0,...,129034,135064,268505,0.480565,0.503022,-0.022458,0.003022,-0.019435,0.022458,39
127324,93659705,10000,2014-08-25,42.468624,-76.761039,ny,trumansburg,14886.0,950100.0,2017.0,...,5621,3903,9766,0.575568,0.399652,0.175916,-0.100348,0.075568,0.175916,36


In [11]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","total_population","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Simple 5-7 segment strategy based on price and size"""
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Primary: Price tier (3 segments)
    if 'currentsalesprice' in df.columns or 'sale_price' in df.columns:
        price_col = 'currentsalesprice' if 'currentsalesprice' in df.columns else 'sale_price'
        price = df[price_col].fillna(df[price_col].median())
        q33, q67 = price.quantile([.33, .67])
        price_tier = pd.Series(['mid'] * len(df), index=df.index)
        price_tier[price < q33] = 'budget'
        price_tier[price > q67] = 'premium'
    else:
        price_tier = pd.Series(['mid'] * len(df), index=df.index)

    # Secondary: Size (2 segments: small vs large)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine: price_size format (e.g., "budget_small", "premium_large")
    segments = price_tier + '_' + size_tier

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def feature_importance(models, feat_names, metrics):
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","importance"])
    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(tg=("wg","sum")).sort_values("tg",ascending=False)
    out["importance"] = out["tg"]/out["tg"].sum()
    return out[["feature","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Simple 5-7 segment strategy
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list = {}, {}, []
    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models
        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))
    return {'models':models,'metrics':metrics,'predictions':pd.concat(preds_list),'feature_importance':feature_importance(models,feats,metrics),'feature_names':feats}

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/importance_{ts}.csv",index=False)
    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available

Training segmented models on 127,258 properties
6 segments created
  budget_small: 23,028
  mid_large: 22,880
  premium_large: 21,935
  mid_small: 20,638
  premium_small: 19,967
  budget_large: 18,810
  premium_small: 19,967→15,479 (22.5% filtered)
  premium_small: 4,644 test | MAE:$1,693,499 | MAPE:34.70% | R²:0.463
  premium_large: 21,935→16,873 (23.1% filtered)
  premium_large: 5,062 test | MAE:$635,936 | MAPE:21.32% | R²:0.395
  mid_large: 22,880→17,671 (22.8% filtered)
  mid_large: 5,301 test | MAE:$119,768 | MAPE:10.16% | R²:0.117
  mid_small: 20,638→15,752 (23.7% filtered)
  mid_small: 4,726 test | MAE:$113,631 | MAPE:9.51% | R²:0.169
  budget_large: 18,810→14,526 (22.8% filtered)
  budget_large: 4,358 test | MAE:$37,205

In [18]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Simple 5-7 segment strategy based on price and size"""
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Primary: Price tier (3 segments)
    if 'currentsalesprice' in df.columns or 'sale_price' in df.columns:
        price_col = 'currentsalesprice' if 'currentsalesprice' in df.columns else 'sale_price'
        price = df[price_col].fillna(df[price_col].median())
        q33, q67 = price.quantile([.33, .67])
        price_tier = pd.Series(['mid'] * len(df), index=df.index)
        price_tier[price < q33] = 'budget'
        price_tier[price > q67] = 'premium'
    else:
        price_tier = pd.Series(['mid'] * len(df), index=df.index)

    # Secondary: Size (2 segments: small vs large)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine: price_size format (e.g., "budget_small", "premium_large")
    segments = price_tier + '_' + size_tier

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0; return df
    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS: df['geo_cluster'] = 0; return df
    df['geo_cluster'] = 0
    kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
    df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    return df

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size - NOW INCLUDES TOTAL_GAIN"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate both total_gain and weighted importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),      # Sum of gain across all segments
        weighted_gain=("wg","sum")      # Weighted by segment size
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col):
    print(f"\nPreparing data...")
    df = df[df[y_col]>=MIN_PRICE]
    print(f"{len(df):,} records after price filter")
    df = engineer(geo_cluster(df), y_col)
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    return df.dropna(subset=[y_col]), feats

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Simple 5-7 segment strategy
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def save_results(results, out_dir):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance (one sheet per segment)
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]  # Excel sheet name limit
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:  # Header row
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Predictions
    ws = wb.create_sheet("Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/global_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/importance_{seg_name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)
    df, y_col, id_col, state_col = load_data(INPUT_PATH)
    df, feats = prepare_data(df, y_col, id_col, state_col)
    results = train_segments(df, feats, y_col, id_col, state_col)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR)
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ COMPLETE in {time.time()-t0:.1f}s")
    print(f"  {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
48/57 features available

Training segmented models on 127,258 properties
6 segments created
  budget_small: 23,028
  mid_large: 22,880
  premium_large: 21,935
  mid_small: 20,638
  premium_small: 19,967
  budget_large: 18,810
  premium_small: 19,967→15,479 (22.5% filtered)
  premium_small: 4,644 test | MAE:$1,651,245 | MAPE:35.23% | R²:0.494
  premium_large: 21,935→16,873 (23.1% filtered)
  premium_large: 5,062 test | MAE:$632,201 | MAPE:21.25% | R²:0.404
  mid_large: 22,880→17,671 (22.8% filtered)
  mid_large: 5,301 test | MAE:$119,653 | MAPE:10.14% | R²:0.115
  mid_small: 20,638→15,752 (23.7% filtered)
  mid_small: 4,726 test | MAE:$113,877 | MAPE:9.52% | R²:0.165
  budget_large: 18,810→14,526 (22.8% filtered)
  budget_large: 4,358 test | MAE:$37,307

In [19]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

# INPUT PATHS
TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"  # Set to None to skip
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Simple 5-7 segment strategy based on price and size"""
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Primary: Price tier (3 segments)
    if 'currentsalesprice' in df.columns or 'sale_price' in df.columns:
        price_col = 'currentsalesprice' if 'currentsalesprice' in df.columns else 'sale_price'
        price = df[price_col].fillna(df[price_col].median())
        q33, q67 = price.quantile([.33, .67])
        price_tier = pd.Series(['mid'] * len(df), index=df.index)
        price_tier[price < q33] = 'budget'
        price_tier[price > q67] = 'premium'
    else:
        price_tier = pd.Series(['mid'] * len(df), index=df.index)

    # Secondary: Size (2 segments: small vs large)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine: price_size format (e.g., "budget_small", "premium_large")
    segments = price_tier + '_' + size_tier

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df, kmeans_model=None):
    """Apply geo clustering - can use existing model for prediction"""
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans_model

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum() < N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans_model

    df['geo_cluster'] = 0

    if kmeans_model is None:
        # Training: fit new model
        kmeans_model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans_model.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        # Prediction: use existing model
        df.loc[valid,'geo_cluster'] = kmeans_model.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans_model

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size - NOW INCLUDES TOTAL_GAIN"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate both total_gain and weighted importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),      # Sum of gain across all segments
        weighted_gain=("wg","sum")      # Weighted by segment size
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data...")
    if for_training:
        df = df[df[y_col]>=MIN_PRICE]
        print(f"{len(df):,} records after price filter")
    df, kmeans_model = geo_cluster(engineer(df, y_col))
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    if for_training:
        return df.dropna(subset=[y_col]), feats, kmeans_model
    else:
        return df, feats, kmeans_model

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Simple 5-7 segment strategy
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def predict_new_properties(pred_df, models, feats, y_col, id_col, state_col, kmeans_model, train_cluster_stats):
    """Generate predictions for new properties using trained models"""
    print(f"\n{'='*60}")
    print("GENERATING PREDICTIONS FOR NEW PROPERTIES")
    print(f"{'='*60}")
    print(f"Input properties: {len(pred_df):,}")

    # Engineer features (no price filtering for prediction)
    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans_model)

    # Add cluster features from training data
    if 'geo_cluster' in pred_df.columns and train_cluster_stats is not None:
        pred_df = pred_df.merge(train_cluster_stats, on='geo_cluster', how='left')
        median_price = train_cluster_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(median_price)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(median_price)

    # Fill missing features with median or 0
    for feat in feats:
        if feat not in pred_df.columns:
            pred_df[feat] = 0
        else:
            pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum() > 0 else 0)

    # Assign segments
    pred_df['seg'] = assign_segments_simple(pred_df)

    # Generate predictions
    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: Segment '{seg}' not in trained models, using 'mid_large' as fallback")
            seg = 'mid_large' if 'mid_large' in models else list(models.keys())[0]

        X = seg_df[feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        # Get predictions from quantile models
        pred_lower = models[seg]['q10'].predict(X)
        pred_mid = models[seg]['q50'].predict(X)
        pred_upper = models[seg]['q90'].predict(X)

        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id': ids,
            'state': states,
            'actual': actual,
            'predicted': pred_mid,
            'pred_lower': pred_lower,
            'pred_upper': pred_upper,
            'segment': seg,
            'error': [actual[i] - pred_mid[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error': [100 * (actual[i] - pred_mid[i]) / actual[i] if not np.isnan(actual[i]) and actual[i] != 0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} properties predicted")

    result_df = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result_df):,} predictions")

    # Calculate metrics if actuals are available
    valid_actuals = result_df['actual'].notna().sum()
    if valid_actuals > 0:
        valid_preds = result_df[result_df['actual'].notna()].copy()
        mae = mean_absolute_error(valid_preds['actual'], valid_preds['predicted'])
        mape = np.mean(np.abs((valid_preds['actual'] - valid_preds['predicted']) / valid_preds['actual'])) * 100
        r2 = r2_score(valid_preds['actual'], valid_preds['predicted'])
        print(f"  Validation metrics ({valid_actuals} properties):")
        print(f"  MAE: ${mae:,.0f} | MAPE: {mape:.2f}% | R²: {r2:.4f}")

    return result_df

def save_results(results, out_dir, new_predictions=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'GRANULAR SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    if new_predictions is not None:
        data.append(['New Predictions', len(new_predictions)])
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance (one sheet per segment)
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]  # Excel sheet name limit
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:  # Header row
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Test Set Predictions
    ws = wb.create_sheet("Test_Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # New Predictions (if provided)
    if new_predictions is not None:
        ws = wb.create_sheet("New_Predictions")
        for i,h in enumerate(new_predictions.columns,1):
            c = ws.cell(1,i,h)
            c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='4472C4',end_color='4472C4',fill_type='solid')
        for i,row in enumerate(new_predictions.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/granular_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/test_predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/global_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/importance_{seg_name}_{ts}.csv", index=False)

    # Save new predictions
    if new_predictions is not None:
        new_predictions.to_csv(f"{out_dir}/new_predictions_{ts}.csv", index=False)
        print(f"✓ New predictions CSV: {out_dir}/new_predictions_{ts}.csv")

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("GRANULAR SEGMENTED AVM")
    print("="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans_model = prepare_data(df, y_col, id_col, state_col, for_training=True)

    # Train models
    results = train_segments(df, feats, y_col, id_col, state_col)

    # Get cluster stats for prediction data
    train_cluster_stats = None
    if 'geo_cluster' in df.columns:
        train_cluster_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        train_cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    # Generate predictions for new properties if input file provided
    new_predictions = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_predictions = predict_new_properties(
            pred_df, results['models'], feats, y_col, id_col, state_col,
            kmeans_model, train_cluster_stats
        )

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_predictions)

    # Print summary
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ TRAINING COMPLETE in {time.time()-t0:.1f}s")
    print(f"  Test set: {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    if new_predictions is not None:
        print(f"  New predictions: {len(new_predictions):,} properties")
    print(f"{'='*60}")

if __name__ == "__main__": main()

GRANULAR SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
48/57 features available

Training segmented models on 127,258 properties
6 segments created
  budget_small: 23,028
  mid_large: 22,880
  premium_large: 21,935
  mid_small: 20,638
  premium_small: 19,967
  budget_large: 18,810
  premium_small: 19,967→15,479 (22.5% filtered)
  premium_small: 4,644 test | MAE:$1,651,245 | MAPE:35.23% | R²:0.494
  premium_large: 21,935→16,873 (23.1% filtered)
  premium_large: 5,062 test | MAE:$632,201 | MAPE:21.25% | R²:0.404
  mid_large: 22,880→17,671 (22.8% filtered)
  mid_large: 5,301 test | MAE:$119,653 | MAPE:10.14% | R²:0.115
  mid_small: 20,638→15,752 (23.7% filtered)
  mid_small: 4,726 test | MAE:$113,877 | MAPE:9.52% | R²:0.165
  budget_large: 18,810→14,526 (22.8% filtered)
  budget_large: 4,358 test | MAE:$37,307

In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - LOCATION-FOCUSED VERSION
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 16  # INCREASED clusters from 8 to 16
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7
LOCATION_WEIGHT = 3.0  # NEW: Weight multiplier for location features

# INPUT PATHS
TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

# NEW: Location-based features to create
LOCATION_FEATS = ["lat_long_interaction", "distance_to_center", "neighborhood_density"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """Simple 5-7 segment strategy based on price and size"""
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Primary: Price tier (3 segments)
    if 'currentsalesprice' in df.columns or 'sale_price' in df.columns:
        price_col = 'currentsalesprice' if 'currentsalesprice' in df.columns else 'sale_price'
        price = df[price_col].fillna(df[price_col].median())
        q33, q67 = price.quantile([.33, .67])
        price_tier = pd.Series(['mid'] * len(df), index=df.index)
        price_tier[price < q33] = 'budget'
        price_tier[price > q67] = 'premium'
    else:
        price_tier = pd.Series(['mid'] * len(df), index=df.index)

    # Secondary: Size (2 segments: small vs large)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine: price_size format (e.g., "budget_small", "premium_large")
    segments = price_tier + '_' + size_tier

    return segments

def engineer(df, y_col, with_price=False):
    # Standard features
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0
    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    # ====================================================================
    # NEW: LOCATION-ENHANCED FEATURES
    # ====================================================================
    if all(c in df.columns for c in ['latitude', 'longitude']):
        # Interaction term
        df['lat_long_interaction'] = df['latitude'] * df['longitude']

        # Distance to geographic center of dataset
        lat_center = df['latitude'].median()
        lon_center = df['longitude'].median()
        df['distance_to_center'] = np.sqrt(
            (df['latitude'] - lat_center)**2 + (df['longitude'] - lon_center)**2
        )

        # Neighborhood density (properties within ~0.01 degree radius)
        # Approximation: count nearby properties
        df['neighborhood_density'] = 0
        if len(df) < 10000:  # Only for reasonable dataset sizes
            try:
                from scipy.spatial import cKDTree
                coords = df[['latitude', 'longitude']].values
                tree = cKDTree(coords)
                # Count properties within ~0.5 mile (roughly 0.01 degrees)
                counts = tree.query_ball_point(coords, r=0.01, return_length=True)
                df['neighborhood_density'] = counts
            except:
                pass

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df, kmeans_model=None):
    """Apply geo clustering with MORE clusters for finer location granularity"""
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans_model

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum() < N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans_model

    df['geo_cluster'] = 0

    if kmeans_model is None:
        # Training: fit new model with MORE clusters
        kmeans_model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans_model.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        # Prediction: use existing model
        df.loc[valid,'geo_cluster'] = kmeans_model.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans_model

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def apply_feature_weights(X, feat_names, location_weight=LOCATION_WEIGHT):
    """
    Apply higher weights to location features by duplicating them in the feature matrix.
    This gives XGBoost more opportunities to split on location features.
    """
    X_weighted = X.copy()
    location_feature_indices = []

    # Identify location-related features
    location_keywords = ['latitude', 'longitude', 'geo_cluster', 'cluster_avg_price',
                        'cluster_med_price', 'lat_long_interaction', 'distance_to_center',
                        'neighborhood_density']

    for i, feat in enumerate(feat_names):
        if any(keyword in feat for keyword in location_keywords):
            location_feature_indices.append(i)

    # If using sample weights approach (alternative method)
    # We'll return the original X but track which features are location-based
    return X_weighted, location_feature_indices

def train_model(X, y, q, feat_names):
    """
    Train model with emphasis on location features via feature_weights parameter
    """
    # Create feature weights: higher for location features
    feature_weights = np.ones(X.shape[1])

    location_keywords = ['latitude', 'longitude', 'geo_cluster', 'cluster_avg_price',
                        'cluster_med_price', 'lat_long_interaction', 'distance_to_center',
                        'neighborhood_density']

    for i, feat in enumerate(feat_names):
        if any(keyword in feat for keyword in location_keywords):
            feature_weights[i] = LOCATION_WEIGHT

    # XGBoost with feature weights
    model = XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist',
        feature_weights=feature_weights  # NEW: Apply location weights
    )

    return model.fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate both total_gain and weighted importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),
        weighted_gain=("wg","sum")
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data...")
    if for_training:
        df = df[df[y_col]>=MIN_PRICE]
        print(f"{len(df):,} records after price filter")
    df, kmeans_model = geo_cluster(engineer(df, y_col))
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS,LOCATION_FEATS])  # ADDED LOCATION_FEATS
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    print(f"{len(feats)}/{len(all_feats)} features available")

    # Count location features
    location_keywords = ['latitude', 'longitude', 'geo_cluster', 'cluster_avg_price',
                        'cluster_med_price', 'lat_long_interaction', 'distance_to_center',
                        'neighborhood_density']
    location_feat_count = sum(1 for f in feats if any(kw in f for kw in location_keywords))
    print(f"  → {location_feat_count} location-based features (weighted {LOCATION_WEIGHT}x)")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    if for_training:
        return df.dropna(subset=[y_col]), feats, kmeans_model
    else:
        return df, feats, kmeans_model

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining LOCATION-FOCUSED segmented models on {len(df):,} properties")
    print(f"Location weight multiplier: {LOCATION_WEIGHT}x")
    print(f"Geo clusters: {N_CLUSTERS}")

    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items(): print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q,feats)  # Pass feat_names
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def predict_new_properties(pred_df, models, feats, y_col, id_col, state_col, kmeans_model, train_cluster_stats):
    """Generate predictions for new properties using trained models"""
    print(f"\n{'='*60}")
    print("GENERATING PREDICTIONS FOR NEW PROPERTIES")
    print(f"{'='*60}")
    print(f"Input properties: {len(pred_df):,}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans_model)

    if 'geo_cluster' in pred_df.columns and train_cluster_stats is not None:
        pred_df = pred_df.merge(train_cluster_stats, on='geo_cluster', how='left')
        median_price = train_cluster_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(median_price)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(median_price)

    for feat in feats:
        if feat not in pred_df.columns:
            pred_df[feat] = 0
        else:
            pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum() > 0 else 0)

    pred_df['seg'] = assign_segments_simple(pred_df)

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: Segment '{seg}' not in trained models, using 'mid_large' as fallback")
            seg = 'mid_large' if 'mid_large' in models else list(models.keys())[0]

        X = seg_df[feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        pred_lower = models[seg]['q10'].predict(X)
        pred_mid = models[seg]['q50'].predict(X)
        pred_upper = models[seg]['q90'].predict(X)

        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id': ids,
            'state': states,
            'actual': actual,
            'predicted': pred_mid,
            'pred_lower': pred_lower,
            'pred_upper': pred_upper,
            'segment': seg,
            'error': [actual[i] - pred_mid[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error': [100 * (actual[i] - pred_mid[i]) / actual[i] if not np.isnan(actual[i]) and actual[i] != 0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} properties predicted")

    result_df = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result_df):,} predictions")

    valid_actuals = result_df['actual'].notna().sum()
    if valid_actuals > 0:
        valid_preds = result_df[result_df['actual'].notna()].copy()
        mae = mean_absolute_error(valid_preds['actual'], valid_preds['predicted'])
        mape = np.mean(np.abs((valid_preds['actual'] - valid_preds['predicted']) / valid_preds['actual'])) * 100
        r2 = r2_score(valid_preds['actual'], valid_preds['predicted'])
        print(f"  Validation metrics ({valid_actuals} properties):")
        print(f"  MAE: ${mae:,.0f} | MAPE: {mape:.2f}% | R²: {r2:.4f}")

    return result_df

def save_results(results, out_dir, new_predictions=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'LOCATION-FOCUSED SEGMENTED AVM', Font(bold=True,size=14)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],
            ['Properties',len(preds)],
            ['Segments',len(metrics)],
            ['R²',f'{r2:.4f}'],
            ['MAE',f'${mae:,.0f}'],
            ['MAPE%',f'{mape:.2f}%'],
            ['Location Weight',f'{LOCATION_WEIGHT}x'],
            ['Geo Clusters',N_CLUSTERS]]
    if new_predictions is not None:
        data.append(['New Predictions', len(new_predictions)])
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted) - LOCATION EMPHASIZED'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Test Set Predictions
    ws = wb.create_sheet("Test_Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # New Predictions
    if new_predictions is not None:
        ws = wb.create_sheet("New_Predictions")
        for i,h in enumerate(new_predictions.columns,1):
            c = ws.cell(1,i,h)
            c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='4472C4',end_color='4472C4',fill_type='solid')
        for i,row in enumerate(new_predictions.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/location_focused_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/location_test_predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/location_segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/location_importance_{ts}.csv",index=False)

    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/location_importance_{seg_name}_{ts}.csv", index=False)

    if new_predictions is not None:
        new_predictions.to_csv(f"{out_dir}/location_new_predictions_{ts}.csv", index=False)
        print(f"✓ New predictions CSV: {out_dir}/location_new_predictions_{ts}.csv")

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("LOCATION-FOCUSED SEGMENTED AVM")
    print(f"Location weight: {LOCATION_WEIGHT}x | Geo clusters: {N_CLUSTERS}")
    print("="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans_model = prepare_data(df, y_col, id_col, state_col, for_training=True)

    results = train_segments(df, feats, y_col, id_col, state_col)

    train_cluster_stats = None
    if 'geo_cluster' in df.columns:
        train_cluster_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        train_cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_predictions = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_predictions = predict_new_properties(
            pred_df, results['models'], feats, y_col, id_col, state_col,
            kmeans_model, train_cluster_stats
        )

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_predictions)

    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ LOCATION-FOCUSED TRAINING COMPLETE in {time.time()-t0:.1f}s")
    print(f"  Test set: {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    if new_predictions is not None:
        print(f"  New predictions: {len(new_predictions):,} properties")
    print(f"{'='*60}")

if __name__ == "__main__": main()

LOCATION-FOCUSED SEGMENTED AVM
Location weight: 3.0x | Geo clusters: 16
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
127,258 records after price filter
51/60 features available
  → 6 location-based features (weighted 3.0x)

Training LOCATION-FOCUSED segmented models on 127,258 properties
Location weight multiplier: 3.0x
Geo clusters: 16
6 segments created
  budget_small: 23,028
  mid_large: 22,880
  premium_large: 21,935
  mid_small: 20,638
  premium_small: 19,967
  budget_large: 18,810
  premium_small: 19,967→15,479 (22.5% filtered)
  premium_small: 4,644 test | MAE:$1,584,320 | MAPE:33.86% | R²:0.535
  premium_large: 21,935→16,873 (23.1% filtered)
  premium_large: 5,062 test | MAE:$632,527 | MAPE:21.30% | R²:0.410
  mid_large: 22,880→17,671 (22.8% filtered)
  mid_large: 5,301 test | MAE:$119,969 | MAPE:10.17% | R²:0.112
  mid_small: 20,638→15,752 (23.7

In [21]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS
MIN_ULTRA_HIGH_ASSESSED = 3000000  # $3M assessed value = ultra-high (roughly $5M+ market)

# INPUT PATHS
TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_simple(df):
    """
    REFACTORED: Use ASSESSED_TOTAL_VALUE (lagged) instead of current sale price
    This avoids data leakage since assessed value is from prior year
    """
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Primary: Use ASSESSED VALUE for tier (lagged, no leakage)
    if 'assessed_total_value' in df.columns:
        assessed = df['assessed_total_value'].fillna(df['assessed_total_value'].median())

        # Identify ultra-high properties first
        ultra_high_mask = assessed >= MIN_ULTRA_HIGH_ASSESSED

        # For non-ultra properties, calculate standard tiers
        if ultra_high_mask.sum() >= 50:  # Enough ultra-high for separate segment
            non_ultra = assessed[~ultra_high_mask]
            if len(non_ultra) > 0:
                q33, q67 = non_ultra.quantile([.33, .67])
                price_tier = pd.Series(['mid'] * len(df), index=df.index)
                price_tier[assessed < q33] = 'budget'
                price_tier[(assessed >= q33) & (assessed < q67)] = 'mid'
                price_tier[(assessed >= q67) & (~ultra_high_mask)] = 'premium'
                price_tier[ultra_high_mask] = 'ultra_high'
            else:
                price_tier = pd.Series(['mid'] * len(df), index=df.index)
        else:
            # Not enough ultra-high, use standard quartiles
            q33, q67 = assessed.quantile([.33, .67])
            price_tier = pd.Series(['mid'] * len(df), index=df.index)
            price_tier[assessed < q33] = 'budget'
            price_tier[(assessed >= q33) & (assessed < q67)] = 'mid'
            price_tier[assessed >= q67] = 'premium'
    else:
        print("  WARNING: assessed_total_value not found, using fallback segmentation")
        price_tier = pd.Series(['mid'] * len(df), index=df.index)

    # Secondary: Size (2 segments: small vs large)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine segments
    # For ultra_high, ignore size tier (single segment)
    segments = price_tier + '_' + size_tier
    if 'assessed_total_value' in df.columns:
        ultra_high_mask = df['assessed_total_value'].fillna(0) >= MIN_ULTRA_HIGH_ASSESSED
        if ultra_high_mask.sum() >= 50:
            segments[ultra_high_mask] = 'ultra_high'

    return segments

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560
    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2
    if 'garage_spaces' in df.columns: df['has_garage'] = (df['garage_spaces']>0).astype('int8')
    if 'living_sqft' in df.columns: df['log_sqft'] = np.log1p(df['living_sqft'])
    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    # NEW: Assessed value ratios (no leakage - these are lagged)
    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        df['assessed_per_sqft'] = df['assessed_total_value'] / (df['living_sqft'] + 1)
    if 'assessed_land_value' in df.columns and 'assessed_total_value' in df.columns:
        df['land_to_total_ratio'] = df['assessed_land_value'] / (df['assessed_total_value'] + 1)
    if 'assessed_improvement_value' in df.columns and 'living_sqft' in df.columns:
        df['improvement_per_sqft'] = df['assessed_improvement_value'] / (df['living_sqft'] + 1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']
    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)
    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')
    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()
    if 'years_since_last_sale' in df.columns: df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)
    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)
    return df

def geo_cluster(df, kmeans_model=None):
    """Apply geo clustering - can use existing model for prediction"""
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans_model

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum() < N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans_model

    df['geo_cluster'] = 0

    if kmeans_model is None:
        # Training: fit new model
        kmeans_model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans_model.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        # Prediction: use existing model
        df.loc[valid,'geo_cluster'] = kmeans_model.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans_model

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]: d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test
    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]
    if 'sqft_per_dollar' in df.columns: df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]
    if 'price_per_sqft' in df.columns: df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]
    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass
    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0: print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size - NOW INCLUDES TOTAL_GAIN"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names): rows.append((feat_names[idx],v,w))
    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate both total_gain and weighted importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),      # Sum of gain across all segments
        weighted_gain=("wg","sum")      # Weighted by segment size
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data...")
    print(f"Segmentation basis: ASSESSED_TOTAL_VALUE (lagged, no leakage)")
    if for_training:
        df = df[df[y_col]>=MIN_PRICE]
        print(f"{len(df):,} records after price filter")
    df, kmeans_model = geo_cluster(engineer(df, y_col))
    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)
    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]

    # Add new assessed-value features if they exist
    assessed_feats = ['assessed_per_sqft', 'land_to_total_ratio', 'improvement_per_sqft']
    for af in assessed_feats:
        if af in df.columns and af not in feats:
            feats.append(af)

    print(f"{len(feats)}/{len(all_feats)} features available")
    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    if for_training:
        return df.dropna(subset=[y_col]), feats, kmeans_model
    else:
        return df, feats, kmeans_model

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")

    # Segment based on ASSESSED VALUE (not sale price)
    df['seg'] = assign_segments_simple(df)

    seg_cnts = df['seg'].value_counts()
    print(f"{len(seg_cnts)} segments created")
    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items():
        assessed_range = ""
        if 'assessed_total_value' in df.columns and seg != 'other':
            seg_df = df[df['seg']==seg]
            if len(seg_df) > 0:
                min_assessed = seg_df['assessed_total_value'].min()
                max_assessed = seg_df['assessed_total_value'].max()
                assessed_range = f" (assessed: ${min_assessed:,.0f}-${max_assessed:,.0f})"
        print(f"  {seg}: {cnt:,}{assessed_range}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Merged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue
        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue
        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)
        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))
        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")
        preds_list.append(pd.DataFrame({'property_id':ids_te,'state':states_te,'actual':y_te,'predicted':y_pred,'pred_lower':seg_preds[0],'pred_upper':seg_preds[2],'segment':seg}))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def predict_new_properties(pred_df, models, feats, y_col, id_col, state_col, kmeans_model, train_cluster_stats):
    """Generate predictions for new properties using trained models"""
    print(f"\n{'='*60}")
    print("GENERATING PREDICTIONS FOR NEW PROPERTIES")
    print(f"{'='*60}")
    print(f"Input properties: {len(pred_df):,}")

    # Engineer features (no price filtering for prediction)
    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans_model)

    # Add cluster features from training data
    if 'geo_cluster' in pred_df.columns and train_cluster_stats is not None:
        pred_df = pred_df.merge(train_cluster_stats, on='geo_cluster', how='left')
        median_price = train_cluster_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(median_price)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(median_price)

    # Fill missing features with median or 0
    for feat in feats:
        if feat not in pred_df.columns:
            pred_df[feat] = 0
        else:
            pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum() > 0 else 0)

    # Assign segments based on ASSESSED VALUE
    pred_df['seg'] = assign_segments_simple(pred_df)

    # Generate predictions
    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: Segment '{seg}' not in trained models, using fallback")
            # Use the closest segment as fallback
            if 'premium' in seg and 'premium_large' in models:
                seg = 'premium_large'
            elif 'ultra_high' in seg and 'ultra_high' in models:
                seg = 'ultra_high'
            else:
                seg = list(models.keys())[0]

        X = seg_df[feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        # Get predictions from quantile models
        pred_lower = models[seg]['q10'].predict(X)
        pred_mid = models[seg]['q50'].predict(X)
        pred_upper = models[seg]['q90'].predict(X)

        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        # Add assessed value to output for reference
        assessed = seg_df['assessed_total_value'].values if 'assessed_total_value' in seg_df.columns else [np.nan]*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id': ids,
            'state': states,
            'assessed_value': assessed,
            'actual': actual,
            'predicted': pred_mid,
            'pred_lower': pred_lower,
            'pred_upper': pred_upper,
            'segment': seg,
            'error': [actual[i] - pred_mid[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error': [100 * (actual[i] - pred_mid[i]) / actual[i] if not np.isnan(actual[i]) and actual[i] != 0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} properties predicted")

    result_df = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result_df):,} predictions")

    # Calculate metrics if actuals are available
    valid_actuals = result_df['actual'].notna().sum()
    if valid_actuals > 0:
        valid_preds = result_df[result_df['actual'].notna()].copy()
        mae = mean_absolute_error(valid_preds['actual'], valid_preds['predicted'])
        mape = np.mean(np.abs((valid_preds['actual'] - valid_preds['predicted']) / valid_preds['actual'])) * 100
        r2 = r2_score(valid_preds['actual'], valid_preds['predicted'])
        print(f"  Validation metrics ({valid_actuals} properties):")
        print(f"  MAE: ${mae:,.0f} | MAPE: {mape:.2f}% | R²: {r2:.4f}")

    return result_df

def save_results(results, out_dir, new_predictions=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'ASSESSED-VALUE SEGMENTED AVM', Font(bold=True,size=14)
    ws['A2'] = 'Segmentation: Based on assessed_total_value (no leakage)'
    ws['A2'].font = Font(italic=True, size=10)
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    data = [['Metric','Value'],['Properties',len(preds)],['Segments',len(metrics)],['R²',f'{r2:.4f}'],['MAE',f'${mae:,.0f}'],['MAPE%',f'{mape:.2f}%']]
    if new_predictions is not None:
        data.append(['New Predictions', len(new_predictions)])
    for i,(k,v) in enumerate(data,5): ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance (one sheet per segment)
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]  # Excel sheet name limit
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:  # Header row
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Test Set Predictions
    ws = wb.create_sheet("Test_Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # New Predictions (if provided)
    if new_predictions is not None:
        ws = wb.create_sheet("New_Predictions")
        for i,h in enumerate(new_predictions.columns,1):
            c = ws.cell(1,i,h)
            c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='4472C4',end_color='4472C4',fill_type='solid')
        for i,row in enumerate(new_predictions.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/assessed_segmented_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/assessed_test_predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/assessed_segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/assessed_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/assessed_importance_{seg_name}_{ts}.csv", index=False)

    # Save new predictions
    if new_predictions is not None:
        new_predictions.to_csv(f"{out_dir}/assessed_new_predictions_{ts}.csv", index=False)
        print(f"✓ New predictions CSV: {out_dir}/assessed_new_predictions_{ts}.csv")

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("ASSESSED-VALUE SEGMENTED AVM (NO LEAKAGE)")
    print("="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans_model = prepare_data(df, y_col, id_col, state_col, for_training=True)

    # Train models
    results = train_segments(df, feats, y_col, id_col, state_col)

    # Get cluster stats for prediction data
    train_cluster_stats = None
    if 'geo_cluster' in df.columns:
        train_cluster_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        train_cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    # Generate predictions for new properties if input file provided
    new_predictions = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_predictions = predict_new_properties(
            pred_df, results['models'], feats, y_col, id_col, state_col,
            kmeans_model, train_cluster_stats
        )

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_predictions)

    # Print summary
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ TRAINING COMPLETE in {time.time()-t0:.1f}s")
    print(f"  Test set: {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    if new_predictions is not None:
        print(f"  New predictions: {len(new_predictions):,} properties")
    print(f"{'='*60}")

if __name__ == "__main__": main()

ASSESSED-VALUE SEGMENTED AVM (NO LEAKAGE)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
Segmentation basis: ASSESSED_TOTAL_VALUE (lagged, no leakage)
127,258 records after price filter
51/57 features available

Training segmented models on 127,258 properties
2 segments created
  mid_small: 63,633
  mid_large: 63,625
  mid_small: 63,633→48,757 (23.4% filtered)
  mid_small: 14,627 test | MAE:$529,126 | MAPE:24.39% | R²:0.200
  mid_large: 63,625→49,085 (22.9% filtered)
  mid_large: 14,725 test | MAE:$310,870 | MAPE:19.92% | R²:0.512
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv
1 records | 0.0MB | Price:sale_price ID:property_id

GENERATING PREDICTIONS FOR NEW PROPERTIES
Input properties: 1
  mid_small: 1 properties predicted

✓ Generated 1 predictions
  Validation metrics (1 properties):
  MAE: $432,836 |

In [22]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS
MIN_ULTRA_HIGH = 5000000  # $5M value indicator threshold for ultra-high segment

# INPUT PATHS
TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_robust(df):
    """
    ROBUST segmentation using MULTIPLE lagged indicators with fallbacks
    Priority: 1) Prior sale price, 2) Assessed value, 3) Median home value (census), 4) Sqft estimate
    """
    segments = pd.Series(['mid'] * len(df), index=df.index)

    # Build a composite "value indicator" from multiple lagged sources
    value_indicator = pd.Series([np.nan] * len(df), index=df.index)
    source_used = pd.Series(['none'] * len(df), index=df.index)

    # Priority 1: Prior sale price (if recent)
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        # Use prior sale if within last 10 years, adjust for appreciation
        recent_prior = (df['years_since_last_sale'] <= 10) & (df['prior_sale_price'] > 0)
        if recent_prior.sum() > 0:
            years = df.loc[recent_prior, 'years_since_last_sale'].fillna(5)
            appreciated_value = df.loc[recent_prior, 'prior_sale_price'] * (1.04 ** years)
            value_indicator[recent_prior] = appreciated_value
            source_used[recent_prior] = 'prior_sale'

    # Priority 2: Assessed value (if reasonable)
    if 'assessed_total_value' in df.columns:
        # Only use assessed value if it seems reasonable (>$10K and <$50M)
        reasonable_assessed = (df['assessed_total_value'] > 10000) & (df['assessed_total_value'] < 50000000)
        missing_indicator = value_indicator.isna()
        use_assessed = reasonable_assessed & missing_indicator
        if use_assessed.sum() > 0:
            # Assessed values are typically 70-90% of market value, so adjust up
            value_indicator[use_assessed] = df.loc[use_assessed, 'assessed_total_value'] * 1.15
            source_used[use_assessed] = 'assessed'

    # Priority 3: Median home value (census tract)
    if 'median_home_value' in df.columns:
        missing_indicator = value_indicator.isna()
        has_census = df['median_home_value'].notna() & (df['median_home_value'] > 0)
        use_census = has_census & missing_indicator
        if use_census.sum() > 0:
            value_indicator[use_census] = df.loc[use_census, 'median_home_value']
            source_used[use_census] = 'census'

    # Priority 4: Fallback - use living sqft * $150/sqft as rough estimate
    if 'living_sqft' in df.columns:
        missing_indicator = value_indicator.isna()
        has_sqft = df['living_sqft'].notna() & (df['living_sqft'] > 0)
        use_sqft = has_sqft & missing_indicator
        if use_sqft.sum() > 0:
            value_indicator[use_sqft] = df.loc[use_sqft, 'living_sqft'] * 150
            source_used[use_sqft] = 'sqft_estimate'

    # Final fallback: use median of all available indicators
    if value_indicator.isna().sum() > 0:
        fallback_value = value_indicator.median()
        value_indicator = value_indicator.fillna(fallback_value)
        source_used[value_indicator.isna()] = 'median_fallback'

    # Now segment based on value_indicator
    ultra_high_mask = value_indicator >= MIN_ULTRA_HIGH

    if ultra_high_mask.sum() >= 50:
        # Enough ultra-high properties for separate segment
        non_ultra = value_indicator[~ultra_high_mask]
        if len(non_ultra) > 0:
            q33, q67 = non_ultra.quantile([.33, .67])
            price_tier = pd.Series(['mid'] * len(df), index=df.index)
            price_tier[value_indicator < q33] = 'budget'
            price_tier[(value_indicator >= q33) & (value_indicator < q67)] = 'mid'
            price_tier[(value_indicator >= q67) & (~ultra_high_mask)] = 'premium'
            price_tier[ultra_high_mask] = 'ultra_high'
        else:
            price_tier = pd.Series(['mid'] * len(df), index=df.index)
    else:
        # Standard quartile segmentation
        q33, q67 = value_indicator.quantile([.33, .67])
        price_tier = pd.Series(['mid'] * len(df), index=df.index)
        price_tier[value_indicator < q33] = 'budget'
        price_tier[(value_indicator >= q33) & (value_indicator < q67)] = 'mid'
        price_tier[value_indicator >= q67] = 'premium'

    # Size tier
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size_tier = pd.Series(['small'] * len(df), index=df.index)
        size_tier[sqft > sqft_median] = 'large'
    else:
        size_tier = pd.Series(['small'] * len(df), index=df.index)

    # Combine segments
    segments = price_tier + '_' + size_tier
    if ultra_high_mask.sum() >= 50:
        segments[ultra_high_mask] = 'ultra_high'

    # Store the value_indicator and source for diagnostics
    df['_value_indicator'] = value_indicator
    df['_value_source'] = source_used

    return segments

def engineer(df, y_col, with_price=False):
    # Standard features
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns:
        df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns:
            df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns:
            df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    if 'living_sqft' in df.columns:
        df['log_sqft'] = np.log1p(df['living_sqft'])

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    # NEW: Composite value features
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price'] * (1.04 ** df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        # Only use if assessed value is reasonable
        reasonable = (df['assessed_total_value'] > 10000) & (df['assessed_total_value'] < 50000000)
        df['assessed_per_sqft'] = 0
        df.loc[reasonable, 'assessed_per_sqft'] = df.loc[reasonable, 'assessed_total_value'] / (df.loc[reasonable, 'living_sqft'] + 1)

    # Ratio of assessed to census median (if both available)
    if 'assessed_total_value' in df.columns and 'median_home_value' in df.columns:
        reasonable_assessed = (df['assessed_total_value'] > 10000)
        reasonable_census = (df['median_home_value'] > 10000)
        reasonable = reasonable_assessed & reasonable_census
        df['assessed_to_census_ratio'] = 1.0
        df.loc[reasonable, 'assessed_to_census_ratio'] = df.loc[reasonable, 'assessed_total_value'] / (df.loc[reasonable, 'median_home_value'] + 1)

    if 'assessed_land_value' in df.columns and 'assessed_total_value' in df.columns:
        reasonable = (df['assessed_total_value'] > 10000) & (df['assessed_land_value'] > 0)
        df['land_to_total_ratio'] = 0
        df.loc[reasonable, 'land_to_total_ratio'] = df.loc[reasonable, 'assessed_land_value'] / (df.loc[reasonable, 'assessed_total_value'] + 1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans_model=None):
    """Apply geo clustering - can use existing model for prediction"""
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans_model

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum() < N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans_model

    df['geo_cluster'] = 0

    if kmeans_model is None:
        # Training: fit new model
        kmeans_model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans_model.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        # Prediction: use existing model
        df.loc[valid,'geo_cluster'] = kmeans_model.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans_model

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]

    if 'sqft_per_dollar' in df.columns:
        df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]

    if 'price_per_sqft' in df.columns:
        df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns:
        df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]

    if 'year_built' in df.columns:
        df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except:
            pass

    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0:
        print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")

    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    # Sort and normalize
    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names):
                rows.append((feat_names[idx],v,w))

    if not rows:
        return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    # Calculate both total_gain and weighted importance
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),
        weighted_gain=("wg","sum")
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data...")
    print(f"Segmentation: ROBUST multi-source value indicator (no leakage)")

    if for_training:
        df = df[df[y_col]>=MIN_PRICE]
        print(f"{len(df):,} records after price filter")

    df, kmeans_model = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS:
        feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS:
        feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD:
        feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE:
        feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]

    # Add new value-based features if they exist
    value_feats = ['prior_appreciated', 'assessed_per_sqft', 'assessed_to_census_ratio', 'land_to_total_ratio']
    for vf in value_feats:
        if vf in df.columns and vf not in feats:
            feats.append(vf)

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    if for_training:
        return df.dropna(subset=[y_col]), feats, kmeans_model
    else:
        return df, feats, kmeans_model

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")
    print("Segmentation using ROBUST multi-source value indicator")

    # Segment using robust approach
    df['seg'] = assign_segments_robust(df)

    # Diagnostic: Show what data sources were used
    if '_value_indicator' in df.columns and '_value_source' in df.columns:
        source_counts = df['_value_source'].value_counts()
        print(f"\nValue indicator sources:")
        for source, count in source_counts.items():
            print(f"  {source}: {count:,} ({100*count/len(df):.1f}%)")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments created")

    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if '_value_indicator' in df.columns and seg != 'other':
            seg_df = df[df['seg']==seg]
            if len(seg_df) > 0:
                min_val = seg_df['_value_indicator'].min()
                max_val = seg_df['_value_indicator'].max()
                median_val = seg_df['_value_indicator'].median()
                print(f"  {seg}: {cnt:,} (value indicator: ${min_val:,.0f}-${max_val:,.0f}, median ${median_val:,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    # Clean up temp columns
    if '_value_indicator' in df.columns:
        df = df.drop(columns=['_value_indicator'])
    if '_value_source' in df.columns:
        df = df.drop(columns=['_value_source'])

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue

        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))

        models[seg] = seg_models

        # Get feature importance for this segment
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids_te,
            'state':states_te,
            'actual':y_te,
            'predicted':y_pred,
            'pred_lower':seg_preds[0],
            'pred_upper':seg_preds[2],
            'segment':seg
        }))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats
    }

def predict_new_properties(pred_df, models, feats, y_col, id_col, state_col, kmeans_model, train_cluster_stats):
    """Generate predictions for new properties using trained models"""
    print(f"\n{'='*60}")
    print("GENERATING PREDICTIONS FOR NEW PROPERTIES")
    print(f"{'='*60}")
    print(f"Input properties: {len(pred_df):,}")

    # Engineer features (no price filtering for prediction)
    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans_model)

    # Add cluster features from training data
    if 'geo_cluster' in pred_df.columns and train_cluster_stats is not None:
        pred_df = pred_df.merge(train_cluster_stats, on='geo_cluster', how='left')
        median_price = train_cluster_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(median_price)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(median_price)

    # Fill missing features with median or 0
    for feat in feats:
        if feat not in pred_df.columns:
            pred_df[feat] = 0
        else:
            pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum() > 0 else 0)

    # Assign segments based on ROBUST value indicator
    pred_df['seg'] = assign_segments_robust(pred_df)

    # Show value indicator sources for prediction data
    if '_value_source' in pred_df.columns:
        source_counts = pred_df['_value_source'].value_counts()
        print(f"\nValue indicator sources (prediction data):")
        for source, count in source_counts.items():
            print(f"  {source}: {count:,} ({100*count/len(pred_df):.1f}%)")

    # Generate predictions
    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: Segment '{seg}' not in trained models, using fallback")
            # Use the closest segment as fallback
            if 'premium' in seg and 'premium_large' in models:
                seg = 'premium_large'
            elif 'ultra_high' in seg and 'ultra_high' in models:
                seg = 'ultra_high'
            elif 'mid_large' in models:
                seg = 'mid_large'
            else:
                seg = list(models.keys())[0]

        X = seg_df[feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        # Get predictions from quantile models
        pred_lower = models[seg]['q10'].predict(X)
        pred_mid = models[seg]['q50'].predict(X)
        pred_upper = models[seg]['q90'].predict(X)

        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        # Add value indicator to output for reference
        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id': ids,
            'state': states,
            'value_indicator': value_ind,
            'value_source': value_src,
            'actual': actual,
            'predicted': pred_mid,
            'pred_lower': pred_lower,
            'pred_upper': pred_upper,
            'segment': seg,
            'error': [actual[i] - pred_mid[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error': [100 * (actual[i] - pred_mid[i]) / actual[i] if not np.isnan(actual[i]) and actual[i] != 0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} properties predicted")

    result_df = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result_df):,} predictions")

    # Clean up temp columns
    if '_value_indicator' in pred_df.columns:
        pred_df = pred_df.drop(columns=['_value_indicator'])
    if '_value_source' in pred_df.columns:
        pred_df = pred_df.drop(columns=['_value_source'])

    # Calculate metrics if actuals are available
    valid_actuals = result_df['actual'].notna().sum()
    if valid_actuals > 0:
        valid_preds = result_df[result_df['actual'].notna()].copy()
        mae = mean_absolute_error(valid_preds['actual'], valid_preds['predicted'])
        mape = np.mean(np.abs((valid_preds['actual'] - valid_preds['predicted']) / valid_preds['actual'])) * 100
        r2 = r2_score(valid_preds['actual'], valid_preds['predicted'])
        print(f"  Validation metrics ({valid_actuals} properties):")
        print(f"  MAE: ${mae:,.0f} | MAPE: {mape:.2f}% | R²: {r2:.4f}")

    return result_df

def save_results(results, out_dir, new_predictions=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'ROBUST MULTI-SOURCE SEGMENTED AVM', Font(bold=True,size=14)
    ws['A2'] = 'Segmentation: Multi-source value indicator (prior sale, assessed, census)'
    ws['A2'].font = Font(italic=True, size=10)

    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [
        ['Metric','Value'],
        ['Properties',len(preds)],
        ['Segments',len(metrics)],
        ['R²',f'{r2:.4f}'],
        ['MAE',f'${mae:,.0f}'],
        ['MAPE%',f'{mape:.2f}%']
    ]

    if new_predictions is not None:
        data.append(['New Predictions', len(new_predictions)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1):
            ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:  # Header row
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Test Set Predictions
    ws = wb.create_sheet("Test_Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1):
            ws.cell(i,j,v)

    # New Predictions
    if new_predictions is not None:
        ws = wb.create_sheet("New_Predictions")
        for i,h in enumerate(new_predictions.columns,1):
            c = ws.cell(1,i,h)
            c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='4472C4',end_color='4472C4',fill_type='solid')
        for i,row in enumerate(new_predictions.itertuples(index=False),2):
            for j,v in enumerate(row,1):
                ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/robust_segmented_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/robust_test_predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/robust_segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/robust_importance_{ts}.csv",index=False)

    # Save per-segment feature importance CSVs
    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/robust_importance_{seg_name}_{ts}.csv", index=False)

    # Save new predictions
    if new_predictions is not None:
        new_predictions.to_csv(f"{out_dir}/robust_new_predictions_{ts}.csv", index=False)
        print(f"✓ New predictions CSV: {out_dir}/robust_new_predictions_{ts}.csv")

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("ROBUST MULTI-SOURCE SEGMENTED AVM")
    print("="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans_model = prepare_data(df, y_col, id_col, state_col, for_training=True)

    # Train models
    results = train_segments(df, feats, y_col, id_col, state_col)

    # Get cluster stats for prediction data
    train_cluster_stats = None
    if 'geo_cluster' in df.columns:
        train_cluster_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        train_cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    # Generate predictions for new properties if input file provided
    new_predictions = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_predictions = predict_new_properties(
            pred_df, results['models'], feats, y_col, id_col, state_col,
            kmeans_model, train_cluster_stats
        )

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_predictions)

    # Print summary
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ TRAINING COMPLETE in {time.time()-t0:.1f}s")
    print(f"  Test set: {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    if new_predictions is not None:
        print(f"  New predictions: {len(new_predictions):,} properties")
    print(f"{'='*60}")

if __name__ == "__main__": main()

ROBUST MULTI-SOURCE SEGMENTED AVM
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
Segmentation: ROBUST multi-source value indicator (no leakage)
127,258 records after price filter
51/57 features available

Training segmented models on 127,258 properties
Segmentation using ROBUST multi-source value indicator

Value indicator sources:
  census: 127,258 (100.0%)

6 segments created
  budget_small: 26,330 (value indicator: $12,000-$383,400, median $157,267)
  premium_large: 25,173 (value indicator: $504,650-$2,000,001, median $878,633)
  mid_large: 22,791 (value indicator: $384,950-$504,225, median $434,750)
  mid_small: 20,420 (value indicator: $384,950-$504,225, median $434,750)
  premium_small: 16,883 (value indicator: $504,650-$2,000,001, median $800,350)
  budget_large: 15,661 (value indicator: $21,250-$383,580, median $260,467)
  budget_small: 26,330→19,7

In [23]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
import time, os

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config - FIXED PARAMETERS
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST, QUANTILES, MAX_SEGMENTS = 100, [0.1, 0.5, 0.9], 7  # MAX 7 SEGMENTS

# INPUT PATHS
TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8') if set(df[c].dropna().unique()).issubset({0,1}) else df[c].astype('int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = pd.read_csv(path, low_memory=False)
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return optimize_dtypes(df), y_col, id_col, state_col

def assign_segments_improved(df, training_quantiles=None):
    """
    IMPROVED: Better handling of assessed values with variable multipliers
    Key changes:
    1. Accept low assessed values (might be legit tear-downs, land value)
    2. Use variable assessment ratio based on value tier
    3. No quality adjustment on census median
    4. Finer segmentation (6 tiers)
    """
    segments = pd.Series(['mid_mid'] * len(df), index=df.index)

    # ====================================================================
    # Build value indicator with IMPROVED priority system
    # ====================================================================
    value_indicator = pd.Series([np.nan] * len(df), index=df.index)
    source_used = pd.Series(['none'] * len(df), index=df.index)

    # Priority 1: Prior sale price (if recent)
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        recent_prior = (df['years_since_last_sale'] <= 10) & (df['prior_sale_price'] > 10000)
        if recent_prior.sum() > 0:
            years = df.loc[recent_prior, 'years_since_last_sale'].fillna(5)
            appreciated_value = df.loc[recent_prior, 'prior_sale_price'] * (1.04 ** years)
            value_indicator[recent_prior] = appreciated_value
            source_used[recent_prior] = 'prior_sale'

    # Priority 2: Assessed value (ACCEPT ALL VALUES >$1K, even low ones)
    if 'assessed_total_value' in df.columns:
        # CHANGED: Don't reject low assessed values - they might be legit!
        # Only reject obviously corrupt data (<$1K or >$100M)
        reasonable_assessed = (df['assessed_total_value'] > 1000) & (df['assessed_total_value'] < 100000000)
        missing_indicator = value_indicator.isna()
        use_assessed = reasonable_assessed & missing_indicator

        if use_assessed.sum() > 0:
            # CHANGED: Use assessment ratio that varies by value
            assessed_vals = df.loc[use_assessed, 'assessed_total_value']

            # Low assessed values (<$50K) likely ARE the market value (tear-downs, vacant lots)
            # High assessed values need typical 1.15x multiplier
            multiplier = pd.Series([1.0] * len(assessed_vals), index=assessed_vals.index)
            multiplier[assessed_vals < 50000] = 1.0   # Very low = use as-is
            multiplier[(assessed_vals >= 50000) & (assessed_vals < 200000)] = 1.1
            multiplier[(assessed_vals >= 200000) & (assessed_vals < 500000)] = 1.15
            multiplier[assessed_vals >= 500000] = 1.2  # High value = higher assessment ratio

            value_indicator[use_assessed] = assessed_vals * multiplier
            source_used[use_assessed] = 'assessed'

    # Priority 3: Census median (use directly, no quality adjustment)
    if 'median_home_value' in df.columns:
        missing_indicator = value_indicator.isna()
        has_census = df['median_home_value'].notna() & (df['median_home_value'] > 0)
        use_census = has_census & missing_indicator
        if use_census.sum() > 0:
            # Use census median directly - it's already neighborhood-level
            value_indicator[use_census] = df.loc[use_census, 'median_home_value']
            source_used[use_census] = 'census'

    # Priority 4: Estimate from sqft + location
    if 'living_sqft' in df.columns:
        missing_indicator = value_indicator.isna()
        has_sqft = df['living_sqft'].notna() & (df['living_sqft'] > 0)
        use_sqft = has_sqft & missing_indicator
        if use_sqft.sum() > 0:
            # Base price per sqft varies by location (use census median if available)
            base_ppsf = pd.Series([150.0] * len(df.loc[use_sqft]), index=df.loc[use_sqft].index)

            if 'median_home_value' in df.columns:
                # Estimate neighborhood price per sqft from census
                census_median = df.loc[use_sqft, 'median_home_value'].fillna(300000)
                estimated_nbhd_sqft = 2000  # Assume typical house
                base_ppsf = (census_median / estimated_nbhd_sqft).clip(50, 500)

            value_indicator[use_sqft] = df.loc[use_sqft, 'living_sqft'] * base_ppsf
            source_used[use_sqft] = 'sqft_estimate'

    # Final fallback
    if value_indicator.isna().sum() > 0:
        fallback_value = value_indicator.median()
        value_indicator = value_indicator.fillna(fallback_value)
        source_used[value_indicator.isna()] = 'median_fallback'

    # ====================================================================
    # Segment based on value indicator with FINER GRANULARITY
    # ====================================================================
    if training_quantiles is None:
        # Training mode: calculate quantiles (6 tiers)
        q20, q40, q60, q80, q95 = value_indicator.quantile([0.20, 0.40, 0.60, 0.80, 0.95])
        training_quantiles = {
            'value_q20': q20,
            'value_q40': q40,
            'value_q60': q60,
            'value_q80': q80,
            'value_q95': q95
        }
        print(f"\nValue indicator quantiles:")
        print(f"  Q20: ${q20:,.0f} | Q40: ${q40:,.0f} | Q60: ${q60:,.0f} | Q80: ${q80:,.0f} | Q95: ${q95:,.0f}")
    else:
        q20, q40, q60, q80, q95 = training_quantiles['value_q20'], training_quantiles['value_q40'], training_quantiles['value_q60'], training_quantiles['value_q80'], training_quantiles['value_q95']

    # Create 6 tiers with finer granularity
    tier = pd.Series(['mid'] * len(df), index=df.index)
    tier[value_indicator < q20] = 'budget'
    tier[(value_indicator >= q20) & (value_indicator < q40)] = 'economy'
    tier[(value_indicator >= q40) & (value_indicator < q60)] = 'mid'
    tier[(value_indicator >= q60) & (value_indicator < q80)] = 'premium'
    tier[(value_indicator >= q80) & (value_indicator < q95)] = 'luxury'
    tier[value_indicator >= q95] = 'ultra'

    # Size category
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        sqft_median = sqft.median()
        size = pd.Series(['small'] * len(df), index=df.index)
        size[sqft > sqft_median] = 'large'
    else:
        size = pd.Series(['small'] * len(df), index=df.index)

    # Combine segments
    segments = tier + '_' + size

    # Store diagnostics
    df['_value_indicator'] = value_indicator
    df['_value_source'] = source_used

    return segments, training_quantiles

def engineer(df, y_col, with_price=False):
    # Standard features
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns:
        df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns:
            df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns:
            df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = (df['property_age']<=5).astype('int8')
        df['age_squared'] = df['property_age']**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    if 'living_sqft' in df.columns:
        df['log_sqft'] = np.log1p(df['living_sqft'])

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    # Value-based features (lagged, no leakage)
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price'] * (1.04 ** df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        reasonable = (df['assessed_total_value'] > 1000) & (df['assessed_total_value'] < 100000000)
        df['assessed_per_sqft'] = 0
        df.loc[reasonable, 'assessed_per_sqft'] = df.loc[reasonable, 'assessed_total_value'] / (df.loc[reasonable, 'living_sqft'] + 1)

    if 'assessed_total_value' in df.columns and 'median_home_value' in df.columns:
        reasonable_assessed = (df['assessed_total_value'] > 1000)
        reasonable_census = (df['median_home_value'] > 10000)
        reasonable = reasonable_assessed & reasonable_census
        df['assessed_to_census_ratio'] = 1.0
        df.loc[reasonable, 'assessed_to_census_ratio'] = df.loc[reasonable, 'assessed_total_value'] / (df.loc[reasonable, 'median_home_value'] + 1)

    if 'assessed_land_value' in df.columns and 'assessed_total_value' in df.columns:
        reasonable = (df['assessed_total_value'] > 1000) & (df['assessed_land_value'] > 0)
        df['land_to_total_ratio'] = 0
        df.loc[reasonable, 'land_to_total_ratio'] = df.loc[reasonable, 'assessed_land_value'] / (df.loc[reasonable, 'assessed_total_value'] + 1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['sqft_per_prior_dollar'] = df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)
        df['sqft_per_dollar'] = df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans_model=None):
    """Apply geo clustering - can use existing model for prediction"""
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans_model

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum() < N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans_model

    df['geo_cluster'] = 0

    if kmeans_model is None:
        # Training: fit new model
        kmeans_model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans_model.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        # Prediction: use existing model
        df.loc[valid,'geo_cluster'] = kmeans_model.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans_model

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    train = train.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({c:train[y_col].median() for c in ['cluster_avg_price','cluster_med_price']})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)]

    if 'sqft_per_dollar' in df.columns:
        df = df[df['sqft_per_dollar']<=df['sqft_per_dollar'].quantile(.95)]

    if 'price_per_sqft' in df.columns:
        df = df.drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns:
        df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]

    if 'year_built' in df.columns:
        df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05,random_state=RAND_STATE,n_jobs=N_JOBS).fit_predict(X)==1]
        except:
            pass

    pct_filt = (orig-len(df))/orig*100 if orig>0 else 0
    if pct_filt > 0:
        print(f"  {name}: {orig:,}→{len(df):,} ({pct_filt:.1f}% filtered)")

    return df

def train_model(X, y, q):
    return XGBRegressor(
        objective='reg:quantileerror',
        quantile_alpha=q,
        n_estimators=N_EST,
        learning_rate=.1,
        max_depth=5,
        min_child_weight=3,
        subsample=.8,
        colsample_bytree=.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X, y, verbose=False)

def get_feature_importance_per_segment(model, feat_names, top_n=20):
    """Extract feature importance for a single segment model"""
    scores = model.get_booster().get_score(importance_type="gain")
    importance_list = []
    for k, v in scores.items():
        idx = int(k[1:])
        if idx < len(feat_names):
            importance_list.append((feat_names[idx], v))

    importance_list.sort(key=lambda x: x[1], reverse=True)
    total_gain = sum(v for _, v in importance_list)

    if total_gain > 0:
        importance_df = pd.DataFrame([
            {'feature': feat, 'gain': gain, 'importance': gain/total_gain}
            for feat, gain in importance_list[:top_n]
        ])
    else:
        importance_df = pd.DataFrame(columns=['feature', 'gain', 'importance'])

    return importance_df

def feature_importance(models, feat_names, metrics):
    """Global feature importance weighted by segment size"""
    rows = []
    for seg, mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        for k,v in scores.items():
            idx = int(k[1:])
            if idx<len(feat_names):
                rows.append((feat_names[idx],v,w))

    if not rows:
        return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])

    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"),
        weighted_gain=("wg","sum")
    ).sort_values("weighted_gain",ascending=False)

    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()

    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data...")
    print(f"Segmentation: IMPROVED multi-source $ value (variable assessment ratios)")

    if for_training:
        df = df[df[y_col]>=MIN_PRICE]
        print(f"{len(df):,} records after price filter")

    df, kmeans_model = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS:
        feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS:
        feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD:
        feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE:
        feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]

    # Add new value-based features if they exist
    value_feats = ['prior_appreciated', 'assessed_per_sqft', 'assessed_to_census_ratio', 'land_to_total_ratio']
    for vf in value_feats:
        if vf in df.columns and vf not in feats:
            feats.append(vf)

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    if for_training:
        return df.dropna(subset=[y_col]), feats, kmeans_model
    else:
        return df, feats, kmeans_model

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining segmented models on {len(df):,} properties")
    print("Segmentation: IMPROVED multi-source $ value (6 tiers)")

    # Segment using improved approach
    df['seg'], training_quantiles = assign_segments_improved(df, training_quantiles=None)

    # Diagnostic: Show what data sources were used
    if '_value_source' in df.columns:
        source_counts = df['_value_source'].value_counts()
        print(f"\nValue source distribution:")
        for source, count in source_counts.items():
            print(f"  {source}: {count:,} ({100*count/len(df):.1f}%)")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments created")

    for seg, cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if '_value_indicator' in df.columns and seg != 'other':
            seg_df = df[df['seg']==seg]
            if len(seg_df) > 0:
                min_val = seg_df['_value_indicator'].min()
                max_val = seg_df['_value_indicator'].max()
                median_val = seg_df['_value_indicator'].median()
                print(f"  {seg}: {cnt:,} (value: ${min_val:,.0f}-${max_val:,.0f}, median ${median_val:,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    # Merge small segments
    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments into 'other'")

    # If still too many segments, merge smallest ones
    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts) > MAX_SEGMENTS:
        keep_segs = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep_segs),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments maximum")

    # Clean up temp columns
    if '_value_indicator' in df.columns:
        df = df.drop(columns=['_value_indicator'])
    if '_value_source' in df.columns:
        df = df.drop(columns=['_value_source'])

    models, metrics, preds_list, segment_importances = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = df[df['seg']==seg].copy()
        if len(seg_df)<50: continue

        seg_df = filter_outliers(seg_df, seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE,random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr,y_tr = train_df[feats].values, train_df[y_col].values
        X_te,y_te = test_df[feats].values, test_df[y_col].values
        ids_te = test_df[id_col].values
        states_te = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models, seg_preds = {}, []
        for q in QUANTILES:
            m = train_model(X_tr,y_tr,q)
            seg_models[f"q{int(q*100)}"] = m
            seg_preds.append(m.predict(X_te))

        models[seg] = seg_models
        segment_importances[seg] = get_feature_importance_per_segment(seg_models['q50'], feats, top_n=20)

        y_pred = seg_preds[1]
        mae,mape = mean_absolute_error(y_te,y_pred), np.mean(np.abs((y_te-y_pred)/y_te))*100
        r2,cov = r2_score(y_te,y_pred), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids_te,
            'state':states_te,
            'actual':y_te,
            'predicted':y_pred,
            'pred_lower':seg_preds[0],
            'pred_upper':seg_preds[2],
            'segment':seg
        }))

    return {
        'models': models,
        'metrics': metrics,
        'predictions': pd.concat(preds_list),
        'feature_importance': feature_importance(models, feats, metrics),
        'segment_importances': segment_importances,
        'feature_names': feats,
        'training_quantiles': training_quantiles  # Pass quantiles for prediction
    }

def predict_new_properties(pred_df, models, feats, y_col, id_col, state_col, kmeans_model, train_cluster_stats, training_quantiles):
    """Generate predictions for new properties using trained models"""
    print(f"\n{'='*60}")
    print("GENERATING PREDICTIONS FOR NEW PROPERTIES")
    print(f"{'='*60}")
    print(f"Input properties: {len(pred_df):,}")

    # Engineer features (no price filtering for prediction)
    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans_model)

    # Add cluster features from training data
    if 'geo_cluster' in pred_df.columns and train_cluster_stats is not None:
        pred_df = pred_df.merge(train_cluster_stats, on='geo_cluster', how='left')
        median_price = train_cluster_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(median_price)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(median_price)

    # Fill missing features
    for feat in feats:
        if feat not in pred_df.columns:
            pred_df[feat] = 0
        else:
            pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum() > 0 else 0)

    # Assign segments using TRAINING quantiles
    pred_df['seg'], _ = assign_segments_improved(pred_df, training_quantiles=training_quantiles)

    # Show diagnostics
    if '_value_source' in pred_df.columns:
        source_counts = pred_df['_value_source'].value_counts()
        print(f"\nValue source distribution (prediction data):")
        for source, count in source_counts.items():
            print(f"  {source}: {count:,} ({100*count/len(pred_df):.1f}%)")

    # Generate predictions
    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: Segment '{seg}' not in trained models, using fallback")
            # Find closest segment
            available_segs = list(models.keys())

            # Try to find a similar segment
            if 'ultra' in seg and any('luxury' in s for s in available_segs):
                seg = [s for s in available_segs if 'luxury' in s][0]
            elif 'luxury' in seg and any('premium' in s for s in available_segs):
                seg = [s for s in available_segs if 'premium' in s][0]
            elif 'premium' in seg and any('mid' in s for s in available_segs):
                seg = [s for s in available_segs if 'mid' in s][0]
            elif 'mid' in seg and any('mid' in s for s in available_segs):
                seg = [s for s in available_segs if 'mid' in s][0]
            elif 'economy' in seg and any('budget' in s for s in available_segs):
                seg = [s for s in available_segs if 'budget' in s][0]
            else:
                seg = available_segs[0]

        X = seg_df[feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        # Get predictions
        pred_lower = models[seg]['q10'].predict(X)
        pred_mid = models[seg]['q50'].predict(X)
        pred_upper = models[seg]['q90'].predict(X)

        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        # Get diagnostics
        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id': ids,
            'state': states,
            'value_indicator': value_ind,
            'value_source': value_src,
            'actual': actual,
            'predicted': pred_mid,
            'pred_lower': pred_lower,
            'pred_upper': pred_upper,
            'segment': seg,
            'error': [actual[i] - pred_mid[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error': [100 * (actual[i] - pred_mid[i]) / actual[i] if not np.isnan(actual[i]) and actual[i] != 0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} properties predicted")

    result_df = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result_df):,} predictions")

    # Clean up temp columns
    if '_value_indicator' in pred_df.columns:
        pred_df = pred_df.drop(columns=['_value_indicator'])
    if '_value_source' in pred_df.columns:
        pred_df = pred_df.drop(columns=['_value_source'])

    # Calculate metrics if actuals available
    valid_actuals = result_df['actual'].notna().sum()
    if valid_actuals > 0:
        valid_preds = result_df[result_df['actual'].notna()].copy()
        mae = mean_absolute_error(valid_preds['actual'], valid_preds['predicted'])
        mape = np.mean(np.abs((valid_preds['actual'] - valid_preds['predicted']) / valid_preds['actual'])) * 100
        r2 = r2_score(valid_preds['actual'], valid_preds['predicted'])
        print(f"  Validation metrics ({valid_actuals} properties):")
        print(f"  MAE: ${mae:,.0f} | MAPE: {mape:.2f}% | R²: {r2:.4f}")

    return result_df

def save_results(results, out_dir, new_predictions=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']
    seg_importances = results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary",0)
    ws['A1'], ws['A1'].font = 'IMPROVED SEGMENTED AVM', Font(bold=True,size=14)
    ws['A2'] = 'Segmentation: Variable assessment ratios + 6 tiers'
    ws['A2'].font = Font(italic=True, size=10)

    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [
        ['Metric','Value'],
        ['Properties',len(preds)],
        ['Segments',len(metrics)],
        ['R²',f'{r2:.4f}'],
        ['MAE',f'${mae:,.0f}'],
        ['MAPE%',f'{mape:.2f}%']
    ]

    if new_predictions is not None:
        data.append(['New Predictions', len(new_predictions)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'],ws[f'A{i}'].font,ws[f'B{i}'] = k,Font(bold=True),v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1):
            ws.cell(i,j,v)

    # Global Feature Importance
    ws = wb.create_sheet("Global_Feature_Importance")
    ws['A1'] = 'Global Feature Importance (Weighted)'
    ws['A1'].font = Font(bold=True, size=12)
    for r_idx, row in enumerate(dataframe_to_rows(fi, index=False, header=True), 2):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 2:
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Per-Segment Feature Importance
    for seg_name, seg_fi in seg_importances.items():
        sheet_name = f"FI_{seg_name}"[:31]
        ws = wb.create_sheet(sheet_name)
        ws['A1'] = f'Feature Importance: {seg_name}'
        ws['A1'].font = Font(bold=True, size=12)

        for r_idx, row in enumerate(dataframe_to_rows(seg_fi, index=False, header=True), 2):
            for c_idx, value in enumerate(row, 1):
                cell = ws.cell(row=r_idx, column=c_idx, value=value)
                if r_idx == 2:
                    cell.font = Font(bold=True, color='FFFFFF')
                    cell.fill = PatternFill(start_color='366092', end_color='366092', fill_type='solid')

    # Test Set Predictions
    ws = wb.create_sheet("Test_Predictions")
    for i,h in enumerate(preds.columns,1):
        c = ws.cell(1,i,h)
        c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(preds.itertuples(index=False),2):
        for j,v in enumerate(row,1):
            ws.cell(i,j,v)

    # New Predictions
    if new_predictions is not None:
        ws = wb.create_sheet("New_Predictions")
        for i,h in enumerate(new_predictions.columns,1):
            c = ws.cell(1,i,h)
            c.font,c.fill = Font(bold=True,color='FFFFFF'),PatternFill(start_color='4472C4',end_color='4472C4',fill_type='solid')
        for i,row in enumerate(new_predictions.itertuples(index=False),2):
            for j,v in enumerate(row,1):
                ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)
    preds.to_csv(f"{out_dir}/improved_test_predictions_{ts}.csv",index=False)
    seg_df.to_csv(f"{out_dir}/improved_segments_{ts}.csv",index=False)
    fi.to_csv(f"{out_dir}/improved_importance_{ts}.csv",index=False)

    for seg_name, seg_fi in seg_importances.items():
        seg_fi.to_csv(f"{out_dir}/improved_importance_{seg_name}_{ts}.csv", index=False)

    if new_predictions is not None:
        new_predictions.to_csv(f"{out_dir}/improved_new_predictions_{ts}.csv", index=False)
        print(f"✓ New predictions CSV: {out_dir}/improved_new_predictions_{ts}.csv")

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")
    print(f"✓ Per-segment feature importance saved for {len(seg_importances)} segments")

def main():
    t0 = time.time()
    print("="*60)
    print("IMPROVED SEGMENTED AVM")
    print("Variable Assessment Ratios + 6 Tiers")
    print("="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans_model = prepare_data(df, y_col, id_col, state_col, for_training=True)

    # Train models
    results = train_segments(df, feats, y_col, id_col, state_col)

    # Get cluster stats and training quantiles
    train_cluster_stats = None
    if 'geo_cluster' in df.columns:
        train_cluster_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        train_cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    training_quantiles = results['training_quantiles']

    # Generate predictions for new properties
    new_predictions = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_predictions = predict_new_properties(
            pred_df, results['models'], feats, y_col, id_col, state_col,
            kmeans_model, train_cluster_stats, training_quantiles
        )

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_predictions)

    # Print summary
    preds = results['predictions']
    r2 = r2_score(preds['actual'],preds['predicted'])
    mae = mean_absolute_error(preds['actual'],preds['predicted'])
    mape = np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}")
    print(f"✓ TRAINING COMPLETE in {time.time()-t0:.1f}s")
    print(f"  Test set: {len(preds):,} properties | {preds['segment'].nunique()} segments")
    print(f"  R²: {r2:.4f} | MAE: ${mae:,.0f} | MAPE: {mape:.2f}%")
    if new_predictions is not None:
        print(f"  New predictions: {len(new_predictions):,} properties")
    print(f"{'='*60}")

if __name__ == "__main__": main()

IMPROVED SEGMENTED AVM
Variable Assessment Ratios + 6 Tiers
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 129.6MB | Price:sale_price ID:property_id

Preparing data...
Segmentation: IMPROVED multi-source $ value (variable assessment ratios)
127,258 records after price filter
51/57 features available

Training segmented models on 127,258 properties
Segmentation: IMPROVED multi-source $ value (6 tiers)

Value indicator quantiles:
  Q20: $225,450 | Q40: $434,750 | Q60: $434,750 | Q80: $741,940 | Q95: $1,445,367

Value source distribution:
  census: 127,258 (100.0%)

10 segments created
  premium_large: 27,925 (value: $434,750-$732,125, median $434,750)
  premium_small: 26,120 (value: $434,750-$732,125, median $434,750)
  budget_small: 19,274 (value: $12,000-$225,433, median $134,133)
  economy_large: 13,496 (value: $225,450-$433,875, median $335,713)
  luxury_large: 11,219 (value: $741,940-$1,426,334, median $926,975)


In [25]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        recent = (df['years_since_last_sale']<=10) & (df['prior_sale_price']>10000)
        if recent.sum()>0:
            yrs = df.loc[recent,'years_since_last_sale'].fillna(5)
            value_ind[recent] = df.loc[recent,'prior_sale_price']*(1.04**yrs)
            source[recent] = 'prior_sale'

    # Priority 2: Assessed
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.0]*len(vals), index=vals.index)
            mult[vals<50000], mult[(vals>=50000)&(vals<200000)] = 1.0, 1.1
            mult[(vals>=200000)&(vals<500000)], mult[vals>=500000] = 1.15, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census (PROPERTY-ADJUSTED)
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>0) & value_ind.isna()
        if valid.sum()>0:
            base_val = df.loc[valid,'median_home_value']
            # Adjust census median by property characteristics
            if 'living_sqft' in df.columns:
                typical_sqft = 2000
                sqft_ratio = (df.loc[valid,'living_sqft']/typical_sqft).clip(0.4, 3.0)
                base_val *= sqft_ratio
            if 'full_baths' in df.columns:
                typical_baths = 2
                bath_adj = 1 + (df.loc[valid,'full_baths']-typical_baths)*0.08
                base_val *= bath_adj.clip(0.85, 1.3)
            if 'garage_spaces' in df.columns:
                garage_adj = 1 + df.loc[valid,'garage_spaces']*0.05
                base_val *= garage_adj.clip(1.0, 1.2)
            value_ind[valid], source[valid] = base_val, 'census_adjusted'

    # Priority 4: Sqft estimate
    if 'living_sqft' in df.columns:
        valid = (df['living_sqft']>0) & value_ind.isna()
        if valid.sum()>0:
            ppsf = pd.Series([150.0]*valid.sum(), index=df[valid].index)
            if 'median_home_value' in df.columns:
                ppsf = (df.loc[valid,'median_home_value'].fillna(300000)/2000).clip(50,500)
            value_ind[valid], source[valid] = df.loc[valid,'living_sqft']*ppsf, 'sqft_estimate'

    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'fallback'

    # Segment
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    df['_value_indicator'], df['_value_source'] = value_ind, source
    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (property-adjusted census segmentation)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (with property-adjusted segmentation)")

    df['seg'], train_q = assign_segments(df)

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if '_value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                print(f"  {seg}: {cnt:,} (${seg_df['_value_indicator'].min():,.0f}-${seg_df['_value_indicator'].max():,.0f}, med ${seg_df['_value_indicator'].median():,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    df = df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in df.columns])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr, y_tr = train_df[feats].values, train_df[y_col].values
        X_te, y_te = test_df[feats].values, test_df[y_col].values
        ids, states = test_df[id_col].values, test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te,seg_preds[1]), np.mean(np.abs((y_te-seg_preds[1])/y_te))*100
        r2, cov = r2_score(y_te,seg_preds[1]), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(med)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(med)

    for feat in feats:
        if feat not in pred_df.columns: pred_df[feat] = 0
        else: pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum()>0 else 0)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        X = seg_df[feats].values
        ids, states = seg_df[id_col].values, seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        preds = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'actual':actual, 'predicted':preds[1], 'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    pred_df = pred_df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in pred_df.columns])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Validated assessments + property-adjusted census + 6 tiers'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Feature Importance
    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    # Predictions
    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nValidated Assessments + Property-Adjusted Census + 6 Tiers\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], feats, y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
Validated Assessments + Property-Adjusted Census + 6 Tiers
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (property-adjusted census segmentation)...
  ⚠️  Filtered 29,431 bad assessed values
51/57 features available

Training on 127,258 properties (with property-adjusted segmentation)

Value quantiles: Q20:$234,441 Q40:$588,227 Q60:$962,426 Q80:$1,552,373 Q95:$3,183,290

Value sources: {'census_adjusted': np.int64(127258)}

12 segments
  budget_small: 24,655 ($4,637-$234,400, med $105,480)
  economy_small: 19,718 ($234,502-$588,222, med $409,100)
  premium_large: 19,457 ($962,428-$1,552,302, med $1,218,987)
  luxury_large: 16,594 ($1,552,423-$3,183,260, med $1,944,059)
  mid_large: 14,703 ($588,240-$962,426, med $801,360)
  mid_small: 10,749 ($588,229-$962,413, med $709,203)
  ultra_large: 6,341 ($3,183,461-$9,360,005, med $4,285,591)

M

In [27]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        recent = (df['years_since_last_sale']<=10) & (df['prior_sale_price']>10000)
        if recent.sum()>0:
            yrs = df.loc[recent,'years_since_last_sale'].fillna(5)
            value_ind[recent] = df.loc[recent,'prior_sale_price']*(1.04**yrs)
            source[recent] = 'prior_sale'

    # Priority 2: COMPOSITE approach (assessed + adjusted census)
    missing_val = value_ind.isna()
    if missing_val.sum()>0:
        composite_vals = []

        # Get assessed value (with relaxed filter for composite)
        assessed_component = pd.Series([np.nan]*len(df), index=df.index)
        if 'assessed_total_value' in df.columns:
            # Use assessed if it passes basic sanity (more lenient than filter_bad_assessed)
            reasonable = (df['assessed_total_value']>50000) & (df['assessed_total_value']<50000000)
            if 'living_sqft' in df.columns:
                has_sqft = (df['living_sqft']>100)
                ppsf = df['assessed_total_value']/df['living_sqft']
                reasonable &= ~(has_sqft & ((ppsf<50)|(ppsf>1500)))  # More lenient than 20-2000

            if reasonable.sum()>0:
                vals = df.loc[reasonable,'assessed_total_value']
                mult = pd.Series([1.0]*len(vals), index=vals.index)
                mult[vals<200000], mult[(vals>=200000)&(vals<500000)] = 1.1, 1.15
                mult[vals>=500000] = 1.2
                assessed_component[reasonable] = vals*mult

        # Get AGGRESSIVELY adjusted census
        census_component = pd.Series([np.nan]*len(df), index=df.index)
        if 'median_home_value' in df.columns:
            has_census = (df['median_home_value']>10000)
            if has_census.sum()>0:
                base = df.loc[has_census,'median_home_value'].copy()

                # MUCH MORE AGGRESSIVE adjustments
                if 'living_sqft' in df.columns:
                    # Sqft adjustment: 0.5x to 4.0x (linear, not clipped hard)
                    sqft_ratio = (df.loc[has_census,'living_sqft']/1800).clip(0.5, 4.0)
                    base *= sqft_ratio

                # Quality multiplier from multiple features
                quality = pd.Series([1.0]*has_census.sum(), index=df[has_census].index)

                if 'full_baths' in df.columns:
                    # Each bath above 2 = +25%, below 2 = -15%
                    bath_diff = df.loc[has_census,'full_baths'].fillna(2) - 2
                    quality *= (1 + bath_diff*0.25).clip(0.7, 2.0)

                if 'bedrooms' in df.columns:
                    # Each bed above 3 = +15%, below 3 = -10%
                    bed_diff = df.loc[has_census,'bedrooms'].fillna(3) - 3
                    quality *= (1 + bed_diff*0.15).clip(0.75, 1.8)

                if 'garage_spaces' in df.columns:
                    # Each garage space = +15%
                    garage = df.loc[has_census,'garage_spaces'].fillna(0)
                    quality *= (1 + garage*0.15).clip(1.0, 1.6)

                if 'year_built' in df.columns:
                    # Newer homes worth more
                    age = (2024 - df.loc[has_census,'year_built']).fillna(50)
                    age_adj = (1 - (age-20)/200).clip(0.8, 1.3)  # Sweet spot at 20 years
                    quality *= age_adj

                census_component[has_census] = base * quality

        # Take the MAXIMUM of assessed and census_adjusted (avoid under-estimation)
        for idx in missing_val[missing_val].index:
            vals_to_compare = []
            if not pd.isna(assessed_component[idx]):
                vals_to_compare.append((assessed_component[idx], 'assessed_composite'))
            if not pd.isna(census_component[idx]):
                vals_to_compare.append((census_component[idx], 'census_composite'))

            if vals_to_compare:
                # Take the higher value to avoid under-segmentation
                best_val, best_source = max(vals_to_compare, key=lambda x: x[0])
                value_ind[idx] = best_val
                source[idx] = best_source

    # Fallback: Sqft estimate
    if 'living_sqft' in df.columns:
        valid = (df['living_sqft']>0) & value_ind.isna()
        if valid.sum()>0:
            ppsf = pd.Series([200.0]*valid.sum(), index=df[valid].index)
            if 'median_home_value' in df.columns:
                ppsf = (df.loc[valid,'median_home_value'].fillna(350000)/1800).clip(80,600)
            value_ind[valid], source[valid] = df.loc[valid,'living_sqft']*ppsf, 'sqft_estimate'

    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'fallback'

    # Segment
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    df['_value_indicator'], df['_value_source'] = value_ind, source
    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (composite valuation segmentation)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (composite valuation)")

    df['seg'], train_q = assign_segments(df)

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if '_value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                print(f"  {seg}: {cnt:,} (${seg_df['_value_indicator'].min():,.0f}-${seg_df['_value_indicator'].max():,.0f}, med ${seg_df['_value_indicator'].median():,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    df = df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in df.columns])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr, y_tr = train_df[feats].values, train_df[y_col].values
        X_te, y_te = test_df[feats].values, test_df[y_col].values
        ids, states = test_df[id_col].values, test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te,seg_preds[1]), np.mean(np.abs((y_te-seg_preds[1])/y_te))*100
        r2, cov = r2_score(y_te,seg_preds[1]), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(med)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(med)

    for feat in feats:
        if feat not in pred_df.columns: pred_df[feat] = 0
        else: pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum()>0 else 0)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        X = seg_df[feats].values
        ids, states = seg_df[id_col].values, seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        preds = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'actual':actual, 'predicted':preds[1], 'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    pred_df = pred_df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in pred_df.columns])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Composite valuation (assessed + aggressively adjusted census) + 6 tiers'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Feature Importance
    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    # Predictions
    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nComposite Valuation (Assessed + Aggressively Adjusted Census)\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], feats, y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
Composite Valuation (Assessed + Aggressively Adjusted Census)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (composite valuation segmentation)...
  ⚠️  Filtered 29,431 bad assessed values
51/57 features available

Training on 127,258 properties (composite valuation)

Value quantiles: Q20:$248,054 Q40:$770,306 Q60:$1,474,192 Q80:$2,762,586 Q95:$6,447,392

Value sources: {'census_composite': np.int64(127258)}

12 segments
  budget_small: 24,857 ($3,519-$248,049, med $99,605)
  economy_small: 20,762 ($248,062-$770,303, med $479,515)
  premium_large: 20,415 ($1,474,224-$2,762,514, med $2,014,426)
  luxury_large: 18,122 ($2,762,634-$6,447,311, med $3,873,393)
  mid_large: 13,442 ($770,306-$1,474,184, med $1,151,100)
  mid_small: 12,010 ($770,327-$1,474,166, med $1,024,507)
  ultra_large: 6,362 ($6,447,852-$41,649,446, med $9,201,787)

Merged

In [28]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × current sqft (property-specific, external - NO LEAKAGE)
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            # Calculate historical PPSF from prior sale
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            # Appreciate the PPSF over time
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            # Apply to current sqft
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value (external data - NO LEAKAGE)
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median (external, neighborhood-level - NO LEAKAGE)
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median price (if available from training)
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback: Global median
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    # Segment
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    df['_value_indicator'], df['_value_source'] = value_ind, source
    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (prior PPSF × sqft segmentation)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (prior PPSF × sqft)")

    df['seg'], train_q = assign_segments(df)

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if '_value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                print(f"  {seg}: {cnt:,} (${seg_df['_value_indicator'].min():,.0f}-${seg_df['_value_indicator'].max():,.0f}, med ${seg_df['_value_indicator'].median():,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    df = df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in df.columns])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr, y_tr = train_df[feats].values, train_df[y_col].values
        X_te, y_te = test_df[feats].values, test_df[y_col].values
        ids, states = test_df[id_col].values, test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te,seg_preds[1]), np.mean(np.abs((y_te-seg_preds[1])/y_te))*100
        r2, cov = r2_score(y_te,seg_preds[1]), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(med)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(med)

    for feat in feats:
        if feat not in pred_df.columns: pred_df[feat] = 0
        else: pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum()>0 else 0)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        X = seg_df[feats].values
        ids, states = seg_df[id_col].values, seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        preds = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'actual':actual, 'predicted':preds[1], 'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    pred_df = pred_df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in pred_df.columns])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Prior PPSF × Current Sqft (property-specific, no leakage) + 6 tiers'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Feature Importance
    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    # Predictions
    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nPrior PPSF × Current Sqft (Property-Specific, No Leakage)\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], feats, y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
Prior PPSF × Current Sqft (Property-Specific, No Leakage)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (prior PPSF × sqft segmentation)...
  ⚠️  Filtered 29,431 bad assessed values
51/57 features available

Training on 127,258 properties (prior PPSF × sqft)

Value quantiles: Q20:$225,450 Q40:$434,750 Q60:$434,750 Q80:$741,940 Q95:$1,445,367

Value sources: {'census': np.int64(127258)}

10 segments
  premium_large: 27,925 ($434,750-$732,125, med $434,750)
  premium_small: 26,120 ($434,750-$732,125, med $434,750)
  budget_small: 19,274 ($12,000-$225,433, med $134,133)
  economy_large: 13,496 ($225,450-$433,875, med $335,713)
  luxury_large: 11,219 ($741,940-$1,426,334, med $926,975)
  economy_small: 8,817 ($225,450-$433,875, med $320,400)
  luxury_small: 7,745 ($741,940-$1,426,334, med $923,550)
Consolidated to 7 segments
  budget_small:

In [29]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster","value_indicator"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared","log_value_indicator"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × current sqft (property-specific, external - NO LEAKAGE)
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            # Calculate historical PPSF from prior sale
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            # Appreciate the PPSF over time
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            # Apply to current sqft
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value (external data - NO LEAKAGE)
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median (external, neighborhood-level - NO LEAKAGE)
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median price (if available from training)
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback: Global median
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    # STORE value_indicator as a feature (for model to use as anchor)
    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)  # Also create log transform
    df['_value_source'] = source

    # Segment based on value indicator
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    return tier+'_'+size, train_q

    # Segment
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    df['_value_indicator'], df['_value_source'] = value_ind, source
    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (value_indicator as anchor feature)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (value_indicator as anchor feature)")

    df['seg'], train_q = assign_segments(df)

    # Add value_indicator features to feature list (created by assign_segments)
    if 'value_indicator' in df.columns and 'value_indicator' not in feats:
        feats = feats + ['value_indicator']
    if 'log_value_indicator' in df.columns and 'log_value_indicator' not in feats:
        feats = feats + ['log_value_indicator']

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if 'value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                print(f"  {seg}: {cnt:,} (${seg_df['value_indicator'].min():,.0f}-${seg_df['value_indicator'].max():,.0f}, med ${seg_df['value_indicator'].median():,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"\nMerged {len(small)} small segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    df = df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in df.columns])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr, y_tr = train_df[feats].values, train_df[y_col].values
        X_te, y_te = test_df[feats].values, test_df[y_col].values
        ids, states = test_df[id_col].values, test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te,seg_preds[1]), np.mean(np.abs((y_te-seg_preds[1])/y_te))*100
        r2, cov = r2_score(y_te,seg_preds[1]), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}  # feats now includes value_indicator

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'] = pred_df['cluster_avg_price'].fillna(med)
        pred_df['cluster_med_price'] = pred_df['cluster_med_price'].fillna(med)

    for feat in feats:
        if feat not in pred_df.columns: pred_df[feat] = 0
        else: pred_df[feat] = pred_df[feat].fillna(pred_df[feat].median() if pred_df[feat].notna().sum()>0 else 0)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        final_feats = list(feats)
        for vf in ['value_indicator', 'log_value_indicator']:
            if vf in pred_df.columns and vf not in final_feats:
                final_feats.append(vf)

        # Fill missing
        for f in final_feats:
            if f not in pred_df.columns: pred_df[f] = 0
            else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

        X = seg_df[final_feats].values
        ids, states = seg_df[id_col].values, seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)

        preds = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)

        value_ind = seg_df['_value_indicator'].values if '_value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'actual':actual, 'predicted':preds[1], 'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    pred_df = pred_df.drop(columns=[c for c in ['_value_indicator','_value_source'] if c in pred_df.columns])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    # Summary
    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Value indicator AS MODEL FEATURE (anchors predictions) + 6 tiers'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    # Segments
    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    # Feature Importance
    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    # Predictions
    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nValue Indicator AS MODEL FEATURE (Anchors Predictions)\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], feats, y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
Value Indicator AS MODEL FEATURE (Anchors Predictions)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (value_indicator as anchor feature)...
  ⚠️  Filtered 29,431 bad assessed values
51/59 features available

Training on 127,258 properties (value_indicator as anchor feature)

Value quantiles: Q20:$225,450 Q40:$434,750 Q60:$434,750 Q80:$741,940 Q95:$1,445,367

Value sources: {'census': np.int64(127258)}

10 segments
  premium_large: 27,925 ($434,750-$732,125, med $434,750)
  premium_small: 26,120 ($434,750-$732,125, med $434,750)
  budget_small: 19,274 ($12,000-$225,433, med $134,133)
  economy_large: 13,496 ($225,450-$433,875, med $335,713)
  luxury_large: 11,219 ($741,940-$1,426,334, med $926,975)
  economy_small: 8,817 ($225,450-$433,875, med $320,400)
  luxury_small: 7,745 ($741,940-$1,426,334, med $923,550)
Consolidated to 7 segments

NameError: name 'feats_with_value' is not defined

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster","value_indicator"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared","log_value_indicator"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)
    
    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)
    
    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))
    
    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))
    
    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))
    
    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")
    
    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)
    
    # Priority 1: Prior sale PPSF × current sqft (property-specific, external - NO LEAKAGE)
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'
    
    # Priority 2: Assessed value (external data - NO LEAKAGE)
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'
    
    # Priority 3: Census median (external, neighborhood-level - NO LEAKAGE)
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'
    
    # Priority 4: Geo cluster median price (if available from training)
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'
    
    # Fallback: Global median
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'
    
    # STORE value_indicator as a feature (for model to use as anchor)
    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)
    df['_value_source'] = source
    
    # Segment based on value indicator
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']
    
    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'
    
    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'
    
    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])
    
    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (value_indicator as anchor feature)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (value_indicator as anchor feature)")

    df['seg'], train_q = assign_segments(df)

    # Add value_indicator features to feature list (created by assign_segments)
    if 'value_indicator' in df.columns and 'value_indicator' not in feats:
        feats = feats + ['value_indicator']
    if 'log_value_indicator' in df.columns and 'log_value_indicator' not in feats:
        feats = feats + ['log_value_indicator']

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if 'value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                print(f"  {seg}: {cnt:,} (${seg_df['value_indicator'].min():,.0f}-${seg_df['value_indicator'].max():,.0f}, med ${seg_df['value_indicator'].median():,.0f})")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    if '_value_source' in df.columns:
        df = df.drop(columns=['_value_source'])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        X_tr, y_tr = train_df[feats].values, train_df[y_col].values
        X_te, y_te = test_df[feats].values, test_df[y_col].values
        ids = test_df[id_col].values
        states = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te,seg_preds[1]), np.mean(np.abs((y_te-seg_preds[1])/y_te))*100
        r2, cov = r2_score(y_te,seg_preds[1]), np.mean((y_te>=seg_preds[0])&(y_te<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'], pred_df['cluster_med_price'] = pred_df['cluster_avg_price'].fillna(med), pred_df['cluster_med_price'].fillna(med)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    # Build complete feature list including value_indicator
    use_feats = list(feats)
    for vf in ['value_indicator', 'log_value_indicator']:
        if vf in pred_df.columns and vf not in use_feats:
            use_feats.append(vf)

    for f in use_feats:
        if f not in pred_df.columns: pred_df[f] = 0
        else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        X = seg_df[use_feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)
        value_ind = seg_df['value_indicator'].values if 'value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        preds = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'actual':actual, 'predicted':preds[1], 'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    if '_value_source' in pred_df.columns: pred_df = pred_df.drop(columns=['_value_source'])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape, r2 = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100, r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Value indicator AS MODEL FEATURE (anchors predictions)'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nValue Indicator AS MODEL FEATURE (Anchors Predictions)\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], results['feature_names'], y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
Value Indicator AS MODEL FEATURE (Anchors Predictions)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (value_indicator as anchor feature)...
  ⚠️  Filtered 29,431 bad assessed values
51/59 features available

Training on 127,258 properties (value_indicator as anchor feature)

Value quantiles: Q20:$225,450 Q40:$434,750 Q60:$434,750 Q80:$741,940 Q95:$1,445,367

Value sources: {'census': np.int64(127258)}

10 segments
  premium_large: 27,925 ($434,750-$732,125, med $434,750)
  premium_small: 26,120 ($434,750-$732,125, med $434,750)
  budget_small: 19,274 ($12,000-$225,433, med $134,133)
  economy_large: 13,496 ($225,450-$433,875, med $335,713)
  luxury_large: 11,219 ($741,940-$1,426,334, med $926,975)
  economy_small: 8,817 ($225,450-$433,875, med $320,400)
  luxury_small: 7,745 ($741,940-$1,426,334, med $923,550)
Consolidated to 7 segments

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
INCLUDE_MLS, INCLUDE_CENSUS, INCLUDE_NEIGHBORHOOD, INCLUDE_IMAGE = True, True, True, False
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS, N_EST, MAX_SEGMENTS = 20000, 0.3, 42, -1, 8, 100, 7
QUANTILES = [0.1, 0.5, 0.9]

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

BASE_FEATS = ["living_sqft","lot_sqft","year_built","effective_year_built","bedrooms","full_baths","half_baths","garage_spaces","fireplace_code","latitude","longitude","geo_cluster","value_indicator"]
ENG_FEATS = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared","log_value_indicator"]
PRIOR_FEATS = ["prior_sale_price","prior_price_per_sqft","sqft_per_prior_dollar","years_since_last_sale","expected_appreciation","has_prior_sale","recently_sold"]
CLUSTER_FEATS = ["cluster_avg_price","cluster_med_price"]
CENSUS_FEATS = ["total_population_25plus","male_bachelors_degree","female_bachelors_degree","pct_bachelors_degree","median_earnings_total","median_earnings_male","median_earnings_female","median_household_income","median_home_value","median_gross_rent","owner_occupied_units","renter_occupied_units","pct_owner_occupied","occupied_units","vacant_units","median_age","civilian_employed","civilian_unemployed","unemployment_rate","income_education_score"]
ELECT_FEATS = ["votes_gop","votes_dem","total_votes","per_gop","per_dem","per_point_diff","dem_margin","rep_margin"]
IMG_FEATS = ["topic_1","topic_2","topic_3","topic_4","topic_5","topic_6","topic_7","topic_8","topic_9","topic_10","gran_c_in","gran_c_ex","gran_c","high_c_in","high_c_ex","high_c"]

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0:
        print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def assign_segments(df, train_q=None):
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × current sqft (property-specific, external - NO LEAKAGE)
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value (external data - NO LEAKAGE)
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median (external, neighborhood-level - NO LEAKAGE)
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median price (if available from training)
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback: Global median
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    # STORE value_indicator as a feature (for model to use as anchor)
    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)
    df['_value_source'] = source

    # Segment based on value indicator
    if train_q is None:
        q20,q40,q60,q80,q95 = value_ind.quantile([0.20,0.40,0.60,0.80,0.95])
        train_q = {'value_q20':q20,'value_q40':q40,'value_q60':q60,'value_q80':q80,'value_q95':q95}
        print(f"\nValue quantiles: Q20:${q20:,.0f} Q40:${q40:,.0f} Q60:${q60:,.0f} Q80:${q80:,.0f} Q95:${q95:,.0f}")
    else:
        q20,q40,q60,q80,q95 = train_q['value_q20'],train_q['value_q40'],train_q['value_q60'],train_q['value_q80'],train_q['value_q95']

    tier = pd.Series(['mid']*len(df), index=df.index)
    tier[value_ind<q20], tier[(value_ind>=q20)&(value_ind<q40)] = 'budget', 'economy'
    tier[(value_ind>=q60)&(value_ind<q80)], tier[(value_ind>=q80)&(value_ind<q95)], tier[value_ind>=q95] = 'premium', 'luxury', 'ultra'

    size = pd.Series(['small']*len(df), index=df.index)
    if 'living_sqft' in df.columns:
        sqft = df['living_sqft'].fillna(df['living_sqft'].median())
        size[sqft>sqft.median()] = 'large'

    return tier+'_'+size, train_q

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'], df['is_new'], df['age_squared'] = 2024-df['year_built'], ((2024-df['year_built'])<=5).astype('int8'), (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))

    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df)
        if 'living_sqft' in df.columns:
            df['assessed_per_sqft'] = 0
            df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)
        if 'median_home_value' in df.columns:
            reasonable = valid & (df['median_home_value']>10000)
            df['assessed_to_census_ratio'] = 1.0
            df.loc[reasonable,'assessed_to_census_ratio'] = df.loc[reasonable,'assessed_total_value']/(df.loc[reasonable,'median_home_value']+1)
        if 'assessed_land_value' in df.columns:
            reasonable = valid & (df['assessed_land_value']>0)
            df['land_to_total_ratio'] = 0
            df.loc[reasonable,'land_to_total_ratio'] = df.loc[reasonable,'assessed_land_value']/(df.loc[reasonable,'assessed_total_value']+1)

    if INCLUDE_CENSUS and all(c in df.columns for c in ['median_household_income','pct_bachelors_degree']):
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_price' in df.columns and 'living_sqft' in df.columns:
        df['prior_price_per_sqft'], df['sqft_per_prior_dollar'] = df['prior_sale_price']/(df['living_sqft']+1), df['living_sqft']/(df['prior_sale_price']+1)

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        if 'prior_sale_price' in df.columns:
            df['expected_appreciation'] = df['prior_sale_price']*(1.04)**df['years_since_last_sale']
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'prior_sale_price' in df.columns:
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df.loc[missing,'median_home_value'] if 'median_home_value' in df.columns and INCLUDE_CENSUS else df['prior_sale_price'].median()

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'], df['sqft_per_dollar'] = df[y_col]/(df['living_sqft']+1), df['living_sqft']/(df[y_col]+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(train, test, y_col):
    if 'geo_cluster' not in train.columns or y_col not in train.columns:
        for d in [train,test]:
            d['cluster_avg_price'] = d['cluster_med_price'] = train[y_col].median() if y_col in train.columns else 0
        return train, test

    stats = train.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
    stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
    med = train[y_col].median()
    train = train.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    test = test.merge(stats, on='geo_cluster', how='left').fillna({'cluster_avg_price':med,'cluster_med_price':med})
    return train, test

def filter_outliers(df, name, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft','sqft_per_dollar'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    if (pct:=(orig-len(df))/orig*100)>0: print(f"  {name}: {orig:,}→{len(df):,} ({pct:.1f}% filtered)")
    return df

def train_model(X, y, q):
    return XGBRegressor(objective='reg:quantileerror', quantile_alpha=q, n_estimators=N_EST, learning_rate=.1,
                       max_depth=5, min_child_weight=3, subsample=.8, colsample_bytree=.8,
                       random_state=RAND_STATE, n_jobs=N_JOBS, tree_method='hist').fit(X, y, verbose=False)

def get_feat_importance(model, feats, top_n=20):
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    return pd.DataFrame([{'feature':f,'gain':g,'importance':g/total} for f,g in imp[:top_n]]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

def feature_importance(models, feats, metrics):
    rows = []
    for seg,mdl in models.items():
        scores = mdl['q50'].get_booster().get_score(importance_type="gain")
        w = metrics[seg]["n_test"]
        rows.extend([(feats[int(k[1:])],v,w) for k,v in scores.items() if int(k[1:])<len(feats)])

    if not rows: return pd.DataFrame(columns=["feature","total_gain","importance"])

    df = pd.DataFrame(rows, columns=["feature","gain","weight"])
    out = df.assign(wg=df["gain"]*df["weight"]).groupby("feature",as_index=False).agg(
        total_gain=("gain","sum"), weighted_gain=("wg","sum")).sort_values("weighted_gain",ascending=False)
    out["importance"] = out["weighted_gain"]/out["weighted_gain"].sum()
    return out[["feature","total_gain","importance"]].head(100)

def prepare_data(df, y_col, id_col, state_col, for_training=True):
    print(f"\nPreparing data (ratio model approach)...")
    if for_training: df = df[df[y_col]>=MIN_PRICE]

    df, kmeans = geo_cluster(engineer(df, y_col))

    feat_groups = []
    if INCLUDE_MLS: feat_groups.extend([BASE_FEATS,ENG_FEATS,PRIOR_FEATS,CLUSTER_FEATS])
    if INCLUDE_CENSUS: feat_groups.append(CENSUS_FEATS)
    if INCLUDE_NEIGHBORHOOD: feat_groups.append(ELECT_FEATS)
    if INCLUDE_IMAGE: feat_groups.append(IMG_FEATS)

    all_feats = [f for g in feat_groups for f in g]
    feats = [f for f in all_feats if f in df.columns]
    feats.extend([vf for vf in ['prior_appreciated','assessed_per_sqft','assessed_to_census_ratio','land_to_total_ratio'] if vf in df.columns and vf not in feats])

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())

    return (df.dropna(subset=[y_col]),feats,kmeans) if for_training else (df,feats,kmeans)

def train_segments(df, feats, y_col, id_col, state_col):
    print(f"\nTraining on {len(df):,} properties (RATIO MODEL - predicts actual/value_indicator)")

    df['seg'], train_q = assign_segments(df)

    # Add value_indicator features to feature list
    if 'value_indicator' in df.columns and 'value_indicator' not in feats:
        feats = feats + ['value_indicator']
    if 'log_value_indicator' in df.columns and 'log_value_indicator' not in feats:
        feats = feats + ['log_value_indicator']

    # CRITICAL: Create ratio target (actual price / value_indicator)
    df['price_ratio'] = df[y_col] / (df['value_indicator'] + 1)

    if '_value_source' in df.columns:
        print(f"\nValue sources: {dict(df['_value_source'].value_counts())}")

    seg_cnts = df['seg'].value_counts()
    print(f"\n{len(seg_cnts)} segments")

    for seg,cnt in seg_cnts.head(MAX_SEGMENTS).items():
        if 'value_indicator' in df.columns and seg!='other':
            seg_df = df[df['seg']==seg]
            if len(seg_df)>0:
                avg_ratio = seg_df['price_ratio'].mean()
                print(f"  {seg}: {cnt:,} (${seg_df['value_indicator'].min():,.0f}-${seg_df['value_indicator'].max():,.0f}, avg ratio: {avg_ratio:.2f}x)")
        else:
            print(f"  {seg}: {cnt:,}")

    small = seg_cnts[seg_cnts<50].index
    if len(small)>0:
        df.loc[df['seg'].isin(small),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    seg_cnts = df['seg'].value_counts()
    if len(seg_cnts)>MAX_SEGMENTS:
        keep = seg_cnts.head(MAX_SEGMENTS-1).index
        df.loc[~df['seg'].isin(keep),'seg'] = 'other'
        print(f"Consolidated to {MAX_SEGMENTS} segments")

    if '_value_source' in df.columns:
        df = df.drop(columns=['_value_source'])

    models, metrics, preds_list, seg_imps = {}, {}, [], {}

    for seg in df['seg'].unique():
        seg_df = filter_outliers(df[df['seg']==seg].copy(), seg, y_col)
        if len(seg_df)<50: continue

        train_idx = seg_df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
        train_df, test_df = seg_df.loc[train_idx].copy(), seg_df.loc[seg_df.index.difference(train_idx)].copy()
        train_df, test_df = add_cluster_feats(train_df, test_df, y_col)

        # Train on RATIO, not absolute price
        X_tr, y_tr = train_df[feats].values, train_df['price_ratio'].values
        X_te, y_te_ratio = test_df[feats].values, test_df['price_ratio'].values
        y_te_actual = test_df[y_col].values
        value_indicators = test_df['value_indicator'].values

        ids = test_df[id_col].values
        states = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)

        seg_models = {f"q{int(q*100)}":train_model(X_tr,y_tr,q) for q in QUANTILES}
        seg_preds_ratio = [seg_models[f"q{int(q*100)}"].predict(X_te) for q in QUANTILES]

        # Convert ratio predictions back to prices
        seg_preds = [seg_preds_ratio[i] * value_indicators for i in range(len(QUANTILES))]

        models[seg] = seg_models
        seg_imps[seg] = get_feat_importance(seg_models['q50'], feats)

        mae, mape = mean_absolute_error(y_te_actual,seg_preds[1]), np.mean(np.abs((y_te_actual-seg_preds[1])/y_te_actual))*100
        r2, cov = r2_score(y_te_actual,seg_preds[1]), np.mean((y_te_actual>=seg_preds[0])&(y_te_actual<=seg_preds[2]))*100
        metrics[seg] = {'n_train':len(X_tr),'n_test':len(X_te),'mae':mae,'mape':mape,'r2':r2,'cov':cov,
                       'p_min':seg_df[y_col].min(),'p_max':seg_df[y_col].max(),'p_med':seg_df[y_col].median()}
        print(f"  {seg}: {len(test_df):,} test | MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.3f}")

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'actual':y_te_actual, 'predicted':seg_preds[1],
            'pred_lower':seg_preds[0], 'pred_upper':seg_preds[2], 'segment':seg
        }))

    return {'models':models, 'metrics':metrics, 'predictions':pd.concat(preds_list),
            'feature_importance':feature_importance(models,feats,metrics), 'segment_importances':seg_imps,
            'feature_names':feats, 'training_quantiles':train_q}

def predict_new(pred_df, models, feats, y_col, id_col, state_col, kmeans, train_stats, train_q):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)

    if 'geo_cluster' in pred_df.columns and train_stats is not None:
        pred_df = pred_df.merge(train_stats, on='geo_cluster', how='left')
        med = train_stats['cluster_avg_price'].median()
        pred_df['cluster_avg_price'], pred_df['cluster_med_price'] = pred_df['cluster_avg_price'].fillna(med), pred_df['cluster_med_price'].fillna(med)

    pred_df['seg'], _ = assign_segments(pred_df, train_q)

    # Build complete feature list including value_indicator
    use_feats = list(feats)
    for vf in ['value_indicator', 'log_value_indicator']:
        if vf in pred_df.columns and vf not in use_feats:
            use_feats.append(vf)

    for f in use_feats:
        if f not in pred_df.columns: pred_df[f] = 0
        else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    preds_list = []
    for seg in pred_df['seg'].unique():
        seg_df = pred_df[pred_df['seg']==seg].copy()

        if seg not in models:
            print(f"  Warning: '{seg}' not in models, using fallback")
            avail = list(models.keys())
            seg = next((s for s in avail if any(x in seg for x in ['ultra','luxury','premium','mid','economy','budget'] if x in s)), avail[0])

        X = seg_df[use_feats].values
        ids = seg_df[id_col].values
        states = seg_df[state_col].values if state_col and state_col in seg_df.columns else ['Unknown']*len(seg_df)
        actual = seg_df[y_col].values if y_col in seg_df.columns else [np.nan]*len(seg_df)
        value_ind = seg_df['value_indicator'].values if 'value_indicator' in seg_df.columns else [np.nan]*len(seg_df)
        value_src = seg_df['_value_source'].values if '_value_source' in seg_df.columns else ['unknown']*len(seg_df)

        # Model predicts RATIO, convert to price
        preds_ratio = [models[seg][f"q{int(q*100)}"].predict(X) for q in QUANTILES]
        preds = [preds_ratio[i] * value_ind for i in range(len(QUANTILES))]

        preds_list.append(pd.DataFrame({
            'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
            'predicted_ratio':preds_ratio[1], 'actual':actual, 'predicted':preds[1],
            'pred_lower':preds[0], 'pred_upper':preds[2], 'segment':seg,
            'error':[actual[i]-preds[1][i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
            'pct_error':[100*(actual[i]-preds[1][i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
        }))

        print(f"  {seg}: {len(seg_df):,} predicted")

    result = pd.concat(preds_list, ignore_index=True)
    print(f"\n✓ Generated {len(result):,} predictions")

    if '_value_source' in pred_df.columns: pred_df = pred_df.drop(columns=['_value_source'])

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae, mape, r2 = mean_absolute_error(valid_df['actual'],valid_df['predicted']), np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100, r2_score(valid_df['actual'],valid_df['predicted'])
        print(f"  Validation ({valid}): MAE:${mae:,.0f} | MAPE:{mape:.2f}% | R²:{r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi, seg_imps = results['predictions'], results['metrics'], results['feature_importance'], results['segment_importances']

    wb = Workbook()
    wb.remove(wb.active)

    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'IMPROVED SEGMENTED AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'RATIO MODEL: predicts (actual/value_indicator) then multiplies back'

    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100

    data = [['Metric','Value'], ['Properties',len(preds)], ['Segments',len(metrics)], ['R²',f'{r2:.4f}'], ['MAE',f'${mae:,.0f}'], ['MAPE%',f'{mape:.2f}%']]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    ws = wb.create_sheet("Segments")
    seg_df = pd.DataFrame([{**{'Segment':s},**m} for s,m in metrics.items()])
    for i,h in enumerate(seg_df.columns,1):
        c = ws.cell(1,i,h)
        c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')
    for i,row in enumerate(seg_df.itertuples(index=False),2):
        for j,v in enumerate(row,1): ws.cell(i,j,v)

    for title,data,name in [('Global Feature Importance',fi,'Global_Feature_Importance')]+[(f'FI: {s}',si,f"FI_{s}"[:31]) for s,si in seg_imps.items()]:
        ws = wb.create_sheet(name)
        ws['A1'].value, ws['A1'].font = title, Font(bold=True,size=12)
        for r_idx,row in enumerate(dataframe_to_rows(data,index=False,header=True),2):
            for c_idx,value in enumerate(row,1):
                cell = ws.cell(row=r_idx,column=c_idx,value=value)
                if r_idx==2: cell.font, cell.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    for sheet_name,data,color in [('Test_Predictions',preds,'366092'), ('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/improved_segmented_{ts}.xlsx"
    wb.save(xl_path)

    for name,data in [('test_predictions',preds), ('segments',seg_df), ('importance',fi)]+[(f'importance_{s}',si) for s,si in seg_imps.items()]+([('new_predictions',new_preds)] if new_preds is not None else []):
        data.to_csv(f"{out_dir}/improved_{name}_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved with timestamp: {ts}")

def main():
    t0 = time.time()
    print("="*60+"\nIMPROVED SEGMENTED AVM\nRATIO MODEL (predicts actual/value_indicator)\n"+"="*60)

    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df, feats, kmeans = prepare_data(df, y_col, id_col, state_col, for_training=True)
    results = train_segments(df, feats, y_col, id_col, state_col)

    train_stats = df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index() if 'geo_cluster' in df.columns else None
    if train_stats is not None: train_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']

    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['models'], results['feature_names'], y_col, id_col, state_col, kmeans, train_stats, results['training_quantiles'])

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    preds = results['predictions']
    r2, mae, mape = r2_score(preds['actual'],preds['predicted']), mean_absolute_error(preds['actual'],preds['predicted']), np.mean(np.abs((preds['actual']-preds['predicted'])/preds['actual']))*100
    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s\n  Test: {len(preds):,} | {preds['segment'].nunique()} segments\n  R²:{r2:.4f} | MAE:${mae:,.0f} | MAPE:{mape:.2f}%")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

IMPROVED SEGMENTED AVM
RATIO MODEL (predicts actual/value_indicator)
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id

Preparing data (ratio model approach)...
  ⚠️  Filtered 29,431 bad assessed values
51/59 features available

Training on 127,258 properties (RATIO MODEL - predicts actual/value_indicator)

Value quantiles: Q20:$225,450 Q40:$434,750 Q60:$434,750 Q80:$741,940 Q95:$1,445,367

Value sources: {'census': np.int64(127258)}

10 segments
  premium_large: 27,925 ($434,750-$732,125, avg ratio: 4.25x)
  premium_small: 26,120 ($434,750-$732,125, avg ratio: 5.21x)
  budget_small: 19,274 ($12,000-$225,433, avg ratio: 35.52x)
  economy_large: 13,496 ($225,450-$433,875, avg ratio: 4.57x)
  luxury_large: 11,219 ($741,940-$1,426,334, avg ratio: 1.95x)
  economy_small: 8,817 ($225,450-$433,875, avg ratio: 6.83x)
  luxury_small: 7,745 ($741,940-$1,426,334, avg ratio: 1.32x)
Consoli

In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST = 200  # More trees for single model

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0: print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def create_value_indicator(df):
    """Create value_indicator from multiple sources - NO LEAKAGE"""
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × sqft
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)
    df['_value_source'] = source

    return df

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = ((2024-df['year_built'])<=5).astype('int8')
        df['age_squared'] = (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns:
        if 'years_since_last_sale' in df.columns:
            df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))
        if 'living_sqft' in df.columns:
            df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')

    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        valid = filter_bad_assessed(df)
        df['assessed_per_sqft'] = 0
        df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)

    if 'median_household_income' in df.columns and 'pct_bachelors_degree' in df.columns:
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if 'prior_sale_price' in df.columns:
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df['prior_sale_price'].median()

    if with_price and y_col in df.columns and 'living_sqft' in df.columns:
        df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(df, cluster_stats):
    if cluster_stats is None or 'geo_cluster' not in df.columns:
        df['cluster_avg_price'] = df['cluster_med_price'] = 0
        return df

    df = df.merge(cluster_stats, on='geo_cluster', how='left')
    med = cluster_stats['cluster_avg_price'].median()
    df['cluster_avg_price'] = df['cluster_avg_price'].fillna(med)
    df['cluster_med_price'] = df['cluster_med_price'].fillna(med)
    return df

def filter_outliers(df, y_col):
    orig = len(df)
    df = engineer(df, y_col, with_price=True)

    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft'])

    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    if len(df)>=100:
        try:
            feats = [c for c in ['living_sqft','lot_sqft',y_col,'year_built'] if c in df.columns]
            if len(feats)>=3:
                X = df[feats].fillna(df[feats].median())
                df = df[IsolationForest(contamination=.05, random_state=RAND_STATE, n_jobs=N_JOBS).fit_predict(X)==1]
        except: pass

    pct = (orig-len(df))/orig*100 if orig>0 else 0
    if pct>0: print(f"  Filtered: {orig:,}→{len(df):,} ({pct:.1f}%)")
    return df

def train_single_model(df, feats, y_col, id_col, state_col):
    print(f"\nTraining SINGLE GLOBAL MODEL on {len(df):,} properties")
    print("Approach: XGBoost learns when to trust value_indicator vs other features")

    # Filter outliers
    df = filter_outliers(df, y_col)

    # Split train/test
    train_idx = df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
    train_df = df.loc[train_idx].copy()
    test_df = df.loc[df.index.difference(train_idx)].copy()

    # Add cluster stats from training data
    cluster_stats = None
    if 'geo_cluster' in train_df.columns:
        cluster_stats = train_df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
        train_df = add_cluster_feats(train_df, cluster_stats)
        test_df = add_cluster_feats(test_df, cluster_stats)

    X_tr, y_tr = train_df[feats].values, train_df[y_col].values
    X_te, y_te = test_df[feats].values, test_df[y_col].values

    print(f"  Training: {len(X_tr):,} | Test: {len(X_te):,}")

    # Train single model
    model = XGBRegressor(
        n_estimators=N_EST,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X_tr, y_tr, verbose=False)

    y_pred = model.predict(X_te)

    mae = mean_absolute_error(y_te, y_pred)
    mape = np.mean(np.abs((y_te-y_pred)/y_te))*100
    r2 = r2_score(y_te, y_pred)

    print(f"  Results: MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    # Feature importance
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    importance_df = pd.DataFrame([
        {'feature':f,'gain':g,'importance':g/total} for f,g in imp[:20]
    ]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

    # Predictions
    ids = test_df[id_col].values
    states = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
    value_ind = test_df['value_indicator'].values if 'value_indicator' in test_df.columns else [np.nan]*len(test_df)
    value_src = test_df['_value_source'].values if '_value_source' in test_df.columns else ['unknown']*len(test_df)

    preds_df = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'actual':y_te, 'predicted':y_pred,
        'error':y_te-y_pred,
        'pct_error':100*(y_te-y_pred)/y_te
    })

    return {
        'model': model,
        'metrics': {'mae':mae, 'mape':mape, 'r2':r2, 'n_train':len(X_tr), 'n_test':len(X_te)},
        'predictions': preds_df,
        'feature_importance': importance_df,
        'cluster_stats': cluster_stats
    }

def predict_new(pred_df, model, feats, y_col, id_col, state_col, kmeans, cluster_stats):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)
    pred_df = create_value_indicator(pred_df)
    pred_df = add_cluster_feats(pred_df, cluster_stats)

    # Fill missing features
    for f in feats:
        if f not in pred_df.columns: pred_df[f] = 0
        else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

    if '_value_source' in pred_df.columns:
        print(f"Value sources: {dict(pred_df['_value_source'].value_counts())}")

    X = pred_df[feats].values
    y_pred = model.predict(X)

    ids = pred_df[id_col].values
    states = pred_df[state_col].values if state_col and state_col in pred_df.columns else ['Unknown']*len(pred_df)
    actual = pred_df[y_col].values if y_col in pred_df.columns else [np.nan]*len(pred_df)
    value_ind = pred_df['value_indicator'].values
    value_src = pred_df['_value_source'].values

    result = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'actual':actual, 'predicted':y_pred,
        'error':[actual[i]-y_pred[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
        'pct_error':[100*(actual[i]-y_pred[i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
    })

    print(f"✓ Generated {len(result):,} predictions")

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae = mean_absolute_error(valid_df['actual'], valid_df['predicted'])
        mape = np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'], valid_df['predicted'])
        print(f"Validation ({valid}): MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']

    wb = Workbook()
    wb.remove(wb.active)

    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'SINGLE GLOBAL MODEL AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'XGBoost learns when to trust value_indicator'

    data = [
        ['Metric','Value'],
        ['Train Properties',metrics['n_train']],
        ['Test Properties',metrics['n_test']],
        ['R²',f"{metrics['r2']:.4f}"],
        ['MAE',f"${metrics['mae']:,.0f}"],
        ['MAPE%',f"{metrics['mape']:.2f}%"]
    ]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    ws = wb.create_sheet("Feature_Importance")
    for r_idx,row in enumerate(dataframe_to_rows(fi,index=False,header=True),1):
        for c_idx,value in enumerate(row,1):
            cell = ws.cell(row=r_idx,column=c_idx,value=value)
            if r_idx==1:
                cell.font = Font(bold=True,color='FFFFFF')
                cell.fill = PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    for sheet_name,data,color in [('Test_Predictions',preds,'366092'),('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/single_model_{ts}.xlsx"
    wb.save(xl_path)

    preds.to_csv(f"{out_dir}/single_test_predictions_{ts}.csv", index=False)
    fi.to_csv(f"{out_dir}/single_importance_{ts}.csv", index=False)
    if new_preds is not None:
        new_preds.to_csv(f"{out_dir}/single_new_predictions_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved")

def main():
    t0 = time.time()
    print("="*60+"\nSINGLE GLOBAL MODEL AVM\nNo Segmentation - Let XGBoost Learn\n"+"="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df = df[df[y_col]>=MIN_PRICE]
    df = engineer(df, y_col)
    df, kmeans = geo_cluster(df)
    df = create_value_indicator(df)

    # Define features
    base = ["living_sqft","lot_sqft","year_built","bedrooms","full_baths","half_baths","garage_spaces",
            "latitude","longitude","geo_cluster","value_indicator","log_value_indicator"]
    eng = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
    prior = ["prior_sale_price","prior_price_per_sqft","prior_appreciated","years_since_last_sale","has_prior_sale","recently_sold"]
    cluster = ["cluster_avg_price","cluster_med_price"]
    census = ["median_household_income","median_home_value","pct_bachelors_degree","income_education_score"]

    all_feats = base + eng + prior + cluster + census
    feats = [f for f in all_feats if f in df.columns]

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    df = df.dropna(subset=[y_col])

    # Train model
    results = train_single_model(df, feats, y_col, id_col, state_col)

    # Predict new properties
    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['model'], feats, y_col, id_col, state_col,
                               kmeans, results['cluster_stats'])

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

SINGLE GLOBAL MODEL AVM
No Segmentation - Let XGBoost Learn
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id
  ⚠️  Filtered 29,431 bad assessed values
  ⚠️  Filtered 29,431 bad assessed values
24/32 features available

Training SINGLE GLOBAL MODEL on 127,258 properties
Approach: XGBoost learns when to trust value_indicator vs other features
  Filtered: 127,258→103,005 (19.1%)
  Training: 72,104 | Test: 30,901
  Results: MAE=$397,134 | MAPE=28.56% | R²=0.4193
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv
1 records | 0.0MB | Price:sale_price ID:property_id

PREDICTING 1 NEW PROPERTIES
  ⚠️  Filtered 1 bad assessed values
  ⚠️  Filtered 1 bad assessed values
Value sources: {'census': np.int64(1)}
✓ Generated 1 predictions
Validation (1): MAE=$921,190 | MAPE=175.13% | R²=nan

Saving results...
✓ Excel: /Users/jenny.lin/BASIS_AVM_

In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST = 200  # More trees for single model

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0: print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def create_value_indicator(df):
    """Create value_indicator from multiple sources - NO LEAKAGE"""
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × sqft
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)
    df['_value_source'] = source

    return df

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = ((2024-df['year_built'])<=5).astype('int8')
        df['age_squared'] = (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns:
        if 'years_since_last_sale' in df.columns:
            df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))
        if 'living_sqft' in df.columns:
            df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')

    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        valid = filter_bad_assessed(df)
        df['assessed_per_sqft'] = 0
        df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)

    if 'median_household_income' in df.columns and 'pct_bachelors_degree' in df.columns:
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if 'prior_sale_price' in df.columns:
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df['prior_sale_price'].median()

    if with_price and y_col in df.columns:
        if 'living_sqft' in df.columns:
            df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(df, cluster_stats):
    if cluster_stats is None or 'geo_cluster' not in df.columns:
        df['cluster_avg_price'] = df['cluster_med_price'] = 0
        return df

    df = df.merge(cluster_stats, on='geo_cluster', how='left')
    med = cluster_stats['cluster_avg_price'].median()
    df['cluster_avg_price'] = df['cluster_avg_price'].fillna(med)
    df['cluster_med_price'] = df['cluster_med_price'].fillna(med)
    return df

def detect_and_normalize_anomalies(df, y_col, for_training=True):
    """
    AGGRESSIVE neighborhood-level anomaly detection and normalization.
    Detects properties with extreme feature combinations relative to their cohort.
    """
    print(f"\n{'='*60}")
    print("PREPROCESSING: Aggressive Anomaly Detection")
    print(f"{'='*60}")

    df['anomaly_flag'] = 0
    df['anomaly_reason'] = 'none'

    # Create cohorts: geo_cluster + value_indicator quartile
    if 'geo_cluster' not in df.columns or 'value_indicator' not in df.columns:
        print("⚠️  Skipping anomaly detection - missing clustering features")
        return df

    # Skip for very small datasets
    if len(df) < 10:
        print(f"⚠️  Skipping cohort-based anomaly detection - only {len(df)} properties (need 10+)")
        return df

    # Handle quartile binning errors gracefully
    try:
        df['value_quartile'] = pd.qcut(
            df['value_indicator'],
            q=4,
            labels=False,
            duplicates='drop'
        )
        df['value_quartile'] = df['value_quartile'].map({
            0: 'Q1', 1: 'Q2', 2: 'Q3', 3: 'Q4'
        }).fillna('Q1')

    except (ValueError, TypeError) as e:
        print(f"⚠️  Could not create quartiles ({str(e)}), using single cohort")
        df['value_quartile'] = 'Q1'

    df['cohort'] = df['geo_cluster'].astype(str) + '_' + df['value_quartile'].astype(str)

    # AGGRESSIVE feature checks - MUCH TIGHTER thresholds
    feature_checks = {
        'living_sqft': ('sqft', 0.6, 1.8),      # Was 0.5-2.5, now 0.6-1.8
        'full_baths': ('baths', 0, 8),           # Was 0-10, now 0-8
        'luxury_score': ('luxury', 0.4, 2.5),    # Was 0.3-3.0, now 0.4-2.5
        'property_age': ('age', -10, 120),       # Was -20-150, now -10-120
        'bedrooms': ('beds', 0, 10),             # NEW: cap bedrooms
        'lot_sqft': ('lot', 0.3, 3.0),           # NEW: lot size relative to cohort
    }

    anomalies_found = 0

    for cohort in df['cohort'].unique():
        cohort_df = df[df['cohort']==cohort]
        if len(cohort_df) < 10:
            continue

        cohort_idx = cohort_df.index

        # Check each feature for anomalies
        for feat, (name, min_mult, max_mult) in feature_checks.items():
            if feat not in df.columns:
                continue

            cohort_vals = cohort_df[feat].dropna()
            if len(cohort_vals) < 5:
                continue

            cohort_median = cohort_vals.median()
            cohort_std = cohort_vals.std()

            if cohort_median == 0 or cohort_std == 0:
                continue

            # Method 1: Ratio-based detection (TIGHTER thresholds)
            if feat in ['living_sqft', 'luxury_score', 'lot_sqft']:
                ratios = df.loc[cohort_idx, feat] / cohort_median
                anomaly_mask = (ratios < min_mult) | (ratios > max_mult)

                if anomaly_mask.sum() > 0:
                    df.loc[cohort_idx[anomaly_mask], feat] = np.clip(
                        df.loc[cohort_idx[anomaly_mask], feat],
                        cohort_median * min_mult,
                        cohort_median * max_mult
                    )
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_flag'] = 1
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_reason'] = f'{name}_extreme'
                    anomalies_found += anomaly_mask.sum()

            # Method 2: Z-score detection (TIGHTER - 2.5 std instead of 3)
            elif feat in ['property_age', 'full_baths', 'bedrooms']:
                z_scores = np.abs((df.loc[cohort_idx, feat] - cohort_median) / (cohort_std + 1))
                anomaly_mask = z_scores > 2.5  # Was 3, now 2.5

                if anomaly_mask.sum() > 0:
                    df.loc[cohort_idx[anomaly_mask], feat] = np.clip(
                        df.loc[cohort_idx[anomaly_mask], feat],
                        cohort_median - 2.5*cohort_std,
                        cohort_median + 2.5*cohort_std
                    )
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_flag'] = 1
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_reason'] = f'{name}_outlier'
                    anomalies_found += anomaly_mask.sum()

    # NEW: Check living_sqft vs bedrooms ratio
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns:
        sqft_per_bed = df['living_sqft'] / (df['bedrooms'] + 1)
        # Typical range: 400-1500 sqft/bedroom
        extreme_ratio = (sqft_per_bed < 250) | (sqft_per_bed > 2000)
        if extreme_ratio.sum() > 0:
            df.loc[extreme_ratio, 'anomaly_flag'] = 1
            df.loc[extreme_ratio, 'anomaly_reason'] = 'sqft_bed_mismatch'
            anomalies_found += extreme_ratio.sum()

    # EXISTING: Cross-feature consistency (TIGHTER)
    if 'living_sqft' in df.columns and 'luxury_score' in df.columns:
        sqft_low = df['living_sqft'] < df['living_sqft'].quantile(0.30)  # Was 0.25
        luxury_high = df['luxury_score'] > df['luxury_score'].quantile(0.70)  # Was 0.75
        inconsistent = sqft_low & luxury_high

        if inconsistent.sum() > 0:
            df.loc[inconsistent, 'luxury_score'] = df.loc[inconsistent, 'luxury_score'] * 0.5  # Was 0.6
            df.loc[inconsistent, 'anomaly_flag'] = 1
            df.loc[inconsistent, 'anomaly_reason'] = 'sqft_luxury_mismatch'
            anomalies_found += inconsistent.sum()

    # NEW: Penalize census-based value indicators (they're often unreliable)
    if '_value_source' in df.columns:
        census_mask = df['_value_source'] == 'census'
        if census_mask.sum() > 0:
            # Reduce impact of census values by dampening extreme features
            if 'luxury_score' in df.columns:
                df.loc[census_mask, 'luxury_score'] = df.loc[census_mask, 'luxury_score'] * 0.85
            if 'living_sqft' in df.columns:
                median_sqft = df['living_sqft'].median()
                extreme_sqft = df.loc[census_mask, 'living_sqft'] > median_sqft * 1.5
                if extreme_sqft.sum() > 0:
                    df.loc[census_mask & extreme_sqft, 'living_sqft'] = df.loc[census_mask & extreme_sqft, 'living_sqft'] * 0.9

            df.loc[census_mask, 'anomaly_flag'] = 1
            df.loc[census_mask, 'anomaly_reason'] = 'census_unreliable'
            anomalies_found += census_mask.sum()

    # TRAINING ONLY: Check price vs value_indicator ratio (TIGHTER)
    if for_training and y_col in df.columns:
        df['price_to_value_ratio'] = df[y_col] / (df['value_indicator'] + 1)

        # MUCH TIGHTER: ratio >2.5x or <0.5x (was 3.0x and 0.4x)
        extreme_ratio = (df['price_to_value_ratio'] > 2.5) | (df['price_to_value_ratio'] < 0.5)

        if extreme_ratio.sum() > 0:
            print(f"  ⚠️  Found {extreme_ratio.sum()} properties with extreme price/value ratios")
            print(f"      These will be EXCLUDED from training (likely data errors)")
            df.loc[extreme_ratio, 'anomaly_flag'] = 2  # Flag 2 = exclude
            df.loc[extreme_ratio, 'anomaly_reason'] = 'extreme_price_ratio'
            anomalies_found += extreme_ratio.sum()

    if anomalies_found > 0:
        cohort_count = df['cohort'].nunique() if 'cohort' in df.columns else 0
        print(f"  ✓ Detected {anomalies_found:,} anomalies across {cohort_count} cohorts")
    else:
        print(f"  ✓ No anomalies detected")

    # For training: exclude extreme anomalies (flag=2)
    if for_training:
        before = len(df)
        df = df[df['anomaly_flag'] != 2].copy()
        if len(df) < before:
            print(f"  ✓ Excluded {before-len(df):,} extreme anomalies from training")

    # Clean up temporary columns
    cols_to_drop = ['value_quartile', 'cohort']
    if 'price_to_value_ratio' in df.columns:
        cols_to_drop.append('price_to_value_ratio')
    df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    return df

def add_prediction_confidence(pred_df):
    """
    Add a confidence score and warning flags for predictions.
    Helps identify predictions that are likely to be unreliable.
    """
    pred_df['confidence_score'] = 100  # Start at 100%
    pred_df['warning_flags'] = ''

    warnings = []

    # Check 1: Predicted vs value_indicator ratio
    if 'predicted' in pred_df.columns and 'value_indicator' in pred_df.columns:
        ratio = pred_df['predicted'] / (pred_df['value_indicator'] + 1)

        # Extreme ratios indicate unreliable predictions
        extreme_high = ratio > 2.0
        extreme_low = ratio < 0.6

        if extreme_high.sum() > 0:
            pred_df.loc[extreme_high, 'confidence_score'] -= 40
            pred_df.loc[extreme_high, 'warning_flags'] = pred_df.loc[extreme_high, 'warning_flags'] + 'PRED_MUCH_HIGHER_THAN_VALUE|'

        if extreme_low.sum() > 0:
            pred_df.loc[extreme_low, 'confidence_score'] -= 30
            pred_df.loc[extreme_low, 'warning_flags'] = pred_df.loc[extreme_low, 'warning_flags'] + 'PRED_MUCH_LOWER_THAN_VALUE|'

    # Check 2: Census-based value indicators are unreliable
    if 'value_source' in pred_df.columns:
        census_mask = pred_df['value_source'] == 'census'
        if census_mask.sum() > 0:
            pred_df.loc[census_mask, 'confidence_score'] -= 25
            pred_df.loc[census_mask, 'warning_flags'] = pred_df.loc[census_mask, 'warning_flags'] + 'CENSUS_VALUE_UNRELIABLE|'

    # Check 3: Properties with anomaly flags
    if 'anomaly_flag' in pred_df.columns:
        anomaly_mask = pred_df['anomaly_flag'] > 0
        if anomaly_mask.sum() > 0:
            pred_df.loc[anomaly_mask, 'confidence_score'] -= 20
            pred_df.loc[anomaly_mask, 'warning_flags'] = pred_df.loc[anomaly_mask, 'warning_flags'] + 'FEATURES_NORMALIZED|'

    # Check 4: Missing key features (if we had them in training)
    if 'prior_sale_price' in pred_df.columns:
        missing_prior = pred_df['prior_sale_price'].isna() | (pred_df['prior_sale_price'] == pred_df['prior_sale_price'].median())
        if missing_prior.sum() > 0:
            pred_df.loc[missing_prior, 'confidence_score'] -= 15
            pred_df.loc[missing_prior, 'warning_flags'] = pred_df.loc[missing_prior, 'warning_flags'] + 'NO_PRIOR_SALE|'

    # Clean up warning flags
    pred_df['warning_flags'] = pred_df['warning_flags'].str.rstrip('|')
    pred_df['warning_flags'] = pred_df['warning_flags'].replace('', 'NONE')

    # Ensure confidence doesn't go below 0
    pred_df['confidence_score'] = pred_df['confidence_score'].clip(lower=0)

    # Add recommendation
    pred_df['recommendation'] = 'USE'
    pred_df.loc[pred_df['confidence_score'] < 40, 'recommendation'] = 'CAUTION'
    pred_df.loc[pred_df['confidence_score'] < 20, 'recommendation'] = 'DO_NOT_USE'

    return pred_df

def train_single_model(df, feats, y_col, id_col, state_col):
    print(f"\nTraining SINGLE GLOBAL MODEL on {len(df):,} properties")

    # Engineer price features for anomaly detection
    df = engineer(df, y_col, with_price=True)

    # PREPROCESSING: Detect and normalize anomalies
    df = detect_and_normalize_anomalies(df, y_col, for_training=True)

    # Basic filtering
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    print(f"  After filtering: {len(df):,} properties")

    # Split train/test
    train_idx = df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
    train_df = df.loc[train_idx].copy()
    test_df = df.loc[df.index.difference(train_idx)].copy()

    # Add cluster stats from training data
    cluster_stats = None
    if 'geo_cluster' in train_df.columns:
        cluster_stats = train_df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
        train_df = add_cluster_feats(train_df, cluster_stats)
        test_df = add_cluster_feats(test_df, cluster_stats)

    X_tr, y_tr = train_df[feats].values, train_df[y_col].values
    X_te, y_te = test_df[feats].values, test_df[y_col].values

    print(f"\n{'='*60}")
    print(f"MODEL TRAINING")
    print(f"{'='*60}")
    print(f"  Training: {len(X_tr):,} | Test: {len(X_te):,}")

    # Train single model
    model = XGBRegressor(
        n_estimators=N_EST,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X_tr, y_tr, verbose=False)

    y_pred = model.predict(X_te)

    mae = mean_absolute_error(y_te, y_pred)
    mape = np.mean(np.abs((y_te-y_pred)/y_te))*100
    r2 = r2_score(y_te, y_pred)

    print(f"  Results: MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    # Feature importance
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    importance_df = pd.DataFrame([
        {'feature':f,'gain':g,'importance':g/total} for f,g in imp[:20]
    ]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

    # Predictions
    ids = test_df[id_col].values
    states = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
    value_ind = test_df['value_indicator'].values if 'value_indicator' in test_df.columns else [np.nan]*len(test_df)
    value_src = test_df['_value_source'].values if '_value_source' in test_df.columns else ['unknown']*len(test_df)
    anomaly_flag = test_df['anomaly_flag'].values if 'anomaly_flag' in test_df.columns else [0]*len(test_df)

    preds_df = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'anomaly_flag':anomaly_flag,
        'actual':y_te, 'predicted':y_pred,
        'error':y_te-y_pred,
        'pct_error':100*(y_te-y_pred)/y_te
    })

    return {
        'model': model,
        'metrics': {'mae':mae, 'mape':mape, 'r2':r2, 'n_train':len(X_tr), 'n_test':len(X_te)},
        'predictions': preds_df,
        'feature_importance': importance_df,
        'cluster_stats': cluster_stats
    }

def predict_new(pred_df, model, feats, y_col, id_col, state_col, kmeans, cluster_stats):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)
    pred_df = create_value_indicator(pred_df)
    pred_df = add_cluster_feats(pred_df, cluster_stats)

    # AGGRESSIVE anomaly detection
    pred_df = detect_and_normalize_anomalies(pred_df, y_col, for_training=False)

    # Fill missing features
    for f in feats:
        if f not in pred_df.columns: pred_df[f] = 0
        else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

    if '_value_source' in pred_df.columns:
        print(f"\nValue sources: {dict(pred_df['_value_source'].value_counts())}")

    if 'anomaly_flag' in pred_df.columns:
        anomaly_cnt = (pred_df['anomaly_flag'] > 0).sum()
        if anomaly_cnt > 0:
            print(f"⚠️  {anomaly_cnt} properties had features normalized due to anomalies")

    X = pred_df[feats].values
    y_pred = model.predict(X)

    ids = pred_df[id_col].values
    states = pred_df[state_col].values if state_col and state_col in pred_df.columns else ['Unknown']*len(pred_df)
    actual = pred_df[y_col].values if y_col in pred_df.columns else [np.nan]*len(pred_df)
    value_ind = pred_df['value_indicator'].values
    value_src = pred_df['_value_source'].values
    anomaly_flag = pred_df['anomaly_flag'].values if 'anomaly_flag' in pred_df.columns else [0]*len(pred_df)
    anomaly_reason = pred_df['anomaly_reason'].values if 'anomaly_reason' in pred_df.columns else ['none']*len(pred_df)

    result = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'anomaly_flag':anomaly_flag, 'anomaly_reason':anomaly_reason,
        'actual':actual, 'predicted':y_pred,
        'error':[actual[i]-y_pred[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
        'pct_error':[100*(actual[i]-y_pred[i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
    })

    # ADD CONFIDENCE SCORING
    result = add_prediction_confidence(result)

    print(f"\n✓ Generated {len(result):,} predictions")

    # Show confidence summary
    if 'confidence_score' in result.columns:
        print(f"\nConfidence Summary:")
        print(f"  High confidence (60-100): {(result['confidence_score'] >= 60).sum()}")
        print(f"  Medium confidence (40-59): {((result['confidence_score'] >= 40) & (result['confidence_score'] < 60)).sum()}")
        print(f"  Low confidence (0-39): {(result['confidence_score'] < 40).sum()}")

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae = mean_absolute_error(valid_df['actual'], valid_df['predicted'])
        mape = np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'], valid_df['predicted'])
        print(f"\nValidation ({valid}): MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']

    wb = Workbook()
    wb.remove(wb.active)

    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'AGGRESSIVE ANOMALY DETECTION + SINGLE MODEL AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Tighter thresholds → census penalty → confidence scoring → XGBoost'

    data = [
        ['Metric','Value'],
        ['Train Properties',metrics['n_train']],
        ['Test Properties',metrics['n_test']],
        ['R²',f"{metrics['r2']:.4f}"],
        ['MAE',f"${metrics['mae']:,.0f}"],
        ['MAPE%',f"{metrics['mape']:.2f}%"]
    ]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    ws = wb.create_sheet("Feature_Importance")
    for r_idx,row in enumerate(dataframe_to_rows(fi,index=False,header=True),1):
        for c_idx,value in enumerate(row,1):
            cell = ws.cell(row=r_idx,column=c_idx,value=value)
            if r_idx==1:
                cell.font = Font(bold=True,color='FFFFFF')
                cell.fill = PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    for sheet_name,data,color in [('Test_Predictions',preds,'366092'),('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/aggressive_avm_{ts}.xlsx"
    wb.save(xl_path)

    preds.to_csv(f"{out_dir}/aggressive_test_predictions_{ts}.csv", index=False)
    fi.to_csv(f"{out_dir}/aggressive_importance_{ts}.csv", index=False)
    if new_preds is not None:
        new_preds.to_csv(f"{out_dir}/aggressive_new_predictions_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved")

def main():
    t0 = time.time()
    print("="*60+"\nAGGRESSIVE ANOMALY DETECTION + SINGLE MODEL AVM\nTighter Thresholds → Census Penalty → Confidence Scoring\n"+"="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df = df[df[y_col]>=MIN_PRICE]
    df = engineer(df, y_col)
    df, kmeans = geo_cluster(df)
    df = create_value_indicator(df)

    # Define features
    base = ["living_sqft","lot_sqft","year_built","bedrooms","full_baths","half_baths","garage_spaces",
            "latitude","longitude","geo_cluster","value_indicator","log_value_indicator"]
    eng = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
    prior = ["prior_sale_price","prior_price_per_sqft","prior_appreciated","years_since_last_sale","has_prior_sale","recently_sold"]
    cluster = ["cluster_avg_price","cluster_med_price"]
    census = ["median_household_income","median_home_value","pct_bachelors_degree","income_education_score"]

    all_feats = base + eng + prior + cluster + census
    feats = [f for f in all_feats if f in df.columns]

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    df = df.dropna(subset=[y_col])

    # Train model
    results = train_single_model(df, feats, y_col, id_col, state_col)

    # Predict new properties
    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['model'], feats, y_col, id_col, state_col,
                               kmeans, results['cluster_stats'])

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

AGGRESSIVE ANOMALY DETECTION + SINGLE MODEL AVM
Tighter Thresholds → Census Penalty → Confidence Scoring
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id
  ⚠️  Filtered 29,431 bad assessed values
  ⚠️  Filtered 29,431 bad assessed values
24/32 features available

Training SINGLE GLOBAL MODEL on 127,258 properties

PREPROCESSING: Aggressive Anomaly Detection
  ⚠️  Found 60831 properties with extreme price/value ratios
      These will be EXCLUDED from training (likely data errors)
  ✓ Detected 148,985 anomalies across 32 cohorts
  ✓ Excluded 60,831 extreme anomalies from training
  After filtering: 56,793 properties

MODEL TRAINING
  Training: 39,755 | Test: 17,038
  Results: MAE=$230,548 | MAPE=16.38% | R²=0.7812
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv
1 records | 0.0MB | Price:sale_price ID:property_id

PREDICTING 1 NE

In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
import warnings, time, os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

# Config
MIN_PRICE, TEST_SIZE, RAND_STATE, N_JOBS, N_CLUSTERS = 20000, 0.3, 42, -1, 8
N_EST = 200  # More trees for single model

TRAINING_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv"
PREDICTION_INPUT_PATH = "/Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv"
OUTPUT_DIR = "/Users/jenny.lin/BASIS_AVM_Onboarding/cate_scenario_analyses/model_outputs"

def optimize_dtypes(df):
    for c in df.select_dtypes(['float64']).columns: df[c] = df[c].astype('float32')
    for c in df.select_dtypes(['int64']).columns:
        df[c] = df[c].astype('int8' if set(df[c].dropna().unique()).issubset({0,1}) else 'int32')
    return df

def load_data(path):
    print(f"Loading {path}")
    df = optimize_dtypes(pd.read_csv(path, low_memory=False))
    df.columns = df.columns.str.lower()
    y_col = next((c for c in ['sale_price','currentsalesprice','price','saleprice'] if c in df.columns), 'currentsalesprice')
    id_col = next((c for c in ['cc_list_id','property_id','propertyid','id'] if c in df.columns), 'cc_list_id')
    state_col = next((c for c in ['sample_state','state','state_code'] if c in df.columns), None)
    print(f"{len(df):,} records | {df.memory_usage(deep=True).sum()/1024**2:.1f}MB | Price:{y_col} ID:{id_col}")
    return df, y_col, id_col, state_col

def filter_bad_assessed(df):
    if 'assessed_total_value' not in df.columns:
        return pd.Series([False]*len(df), index=df.index)

    valid = (df['assessed_total_value']>10000) & (df['assessed_total_value']<100000000)

    if 'living_sqft' in df.columns:
        has_sqft = (df['living_sqft'].notna()) & (df['living_sqft']>100)
        ppsf = df['assessed_total_value']/df['living_sqft']
        valid &= ~(has_sqft & ((ppsf<20)|(ppsf>2000)))

    if 'prior_sale_price' in df.columns and 'years_since_last_sale' in df.columns:
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15)
        valid &= ~(has_prior & ((df['assessed_total_value']/df['prior_sale_price'])<0.10))

    if 'median_home_value' in df.columns:
        has_census = (df['median_home_value']>10000)
        valid &= ~(has_census & ((df['assessed_total_value']/df['median_home_value'])<0.05))

    invalid_cnt = ((df['assessed_total_value'].notna()) & ~valid).sum()
    if invalid_cnt>0: print(f"  ⚠️  Filtered {invalid_cnt:,} bad assessed values")

    return valid

def create_value_indicator(df):
    """Create value_indicator from multiple sources - NO LEAKAGE"""
    value_ind = pd.Series([np.nan]*len(df), index=df.index)
    source = pd.Series(['none']*len(df), index=df.index)

    # Priority 1: Prior sale PPSF × sqft
    if all(c in df.columns for c in ['prior_sale_price','years_since_last_sale','living_sqft']):
        has_prior = (df['prior_sale_price']>10000) & (df['years_since_last_sale']<=15) & (df['living_sqft']>400)
        if has_prior.sum()>0:
            prior_ppsf = df.loc[has_prior,'prior_sale_price'] / (df.loc[has_prior,'living_sqft']+1)
            yrs = df.loc[has_prior,'years_since_last_sale'].fillna(5)
            appreciated_ppsf = prior_ppsf * (1.04 ** yrs)
            value_ind[has_prior] = appreciated_ppsf * df.loc[has_prior,'living_sqft']
            source[has_prior] = 'prior_ppsf'

    # Priority 2: Assessed value
    if 'assessed_total_value' in df.columns:
        valid = filter_bad_assessed(df) & value_ind.isna()
        if valid.sum()>0:
            vals = df.loc[valid,'assessed_total_value']
            mult = pd.Series([1.15]*len(vals), index=vals.index)
            mult[vals<200000], mult[vals>=500000] = 1.1, 1.2
            value_ind[valid], source[valid] = vals*mult, 'assessed'

    # Priority 3: Census median
    if 'median_home_value' in df.columns:
        valid = (df['median_home_value']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'median_home_value'], 'census'

    # Priority 4: Geo cluster median
    if 'cluster_med_price' in df.columns:
        valid = (df['cluster_med_price']>10000) & value_ind.isna()
        if valid.sum()>0:
            value_ind[valid], source[valid] = df.loc[valid,'cluster_med_price'], 'cluster'

    # Fallback
    value_ind = value_ind.fillna(value_ind.median())
    source[value_ind.isna()] = 'global_median'

    df['value_indicator'] = value_ind
    df['log_value_indicator'] = np.log1p(value_ind)
    df['_value_source'] = source

    return df

def engineer(df, y_col, with_price=False):
    if 'living_sqft' in df.columns:
        if 'bedrooms' in df.columns: df['sqft_per_bedroom'] = df['living_sqft']/(df['bedrooms']+1)
        df['log_sqft'] = np.log1p(df['living_sqft'])

    if 'lot_sqft' in df.columns:
        if 'living_sqft' in df.columns: df['lot_to_living_ratio'] = df['lot_sqft']/(df['living_sqft']+1)
        if 'lot_acres' not in df.columns: df['lot_acres'] = df['lot_sqft']/43560

    if 'year_built' in df.columns:
        df['property_age'] = 2024-df['year_built']
        df['is_new'] = ((2024-df['year_built'])<=5).astype('int8')
        df['age_squared'] = (2024-df['year_built'])**2

    if 'garage_spaces' in df.columns:
        df['has_garage'] = (df['garage_spaces']>0).astype('int8')

    lux = [df[c]/1000 if c=='living_sqft' else df[c] for c in ['living_sqft','full_baths','garage_spaces'] if c in df.columns]
    df['luxury_score'] = sum(lux)/len(lux) if lux else 0

    if 'prior_sale_price' in df.columns:
        if 'years_since_last_sale' in df.columns:
            df['prior_appreciated'] = df['prior_sale_price']*(1.04**df['years_since_last_sale'].fillna(5))
        if 'living_sqft' in df.columns:
            df['prior_price_per_sqft'] = df['prior_sale_price']/(df['living_sqft']+1)
        df['has_prior_sale'] = df['prior_sale_price'].notna().astype('int8')

    if 'assessed_total_value' in df.columns and 'living_sqft' in df.columns:
        valid = filter_bad_assessed(df)
        df['assessed_per_sqft'] = 0
        df.loc[valid,'assessed_per_sqft'] = df.loc[valid,'assessed_total_value']/(df.loc[valid,'living_sqft']+1)

    if 'median_household_income' in df.columns and 'pct_bachelors_degree' in df.columns:
        df['income_education_score'] = df['median_household_income']*df['pct_bachelors_degree']

    if 'prior_sale_date' in df.columns:
        df['prior_sale_date'] = pd.to_datetime(df['prior_sale_date'], errors='coerce')
        df['years_since_last_sale'] = (pd.Timestamp('2024-01-01')-df['prior_sale_date']).dt.days/365.25
        df['recently_sold'] = (df['years_since_last_sale']<2).astype('int8')

    if 'years_since_last_sale' in df.columns:
        df['years_since_last_sale'] = df['years_since_last_sale'].fillna(999)

    if 'prior_sale_price' in df.columns:
        missing = df['prior_sale_price'].isna()
        df.loc[missing,'prior_sale_price'] = df['prior_sale_price'].median()

    if with_price and y_col in df.columns:
        if 'living_sqft' in df.columns:
            df['price_per_sqft'] = df[y_col]/(df['living_sqft']+1)

    return df

def geo_cluster(df, kmeans=None):
    if not all(c in df.columns for c in ['latitude','longitude']):
        df['geo_cluster'] = 0
        return df, kmeans

    valid = df[['latitude','longitude']].notna().all(axis=1)
    if valid.sum()<N_CLUSTERS:
        df['geo_cluster'] = 0
        return df, kmeans

    df['geo_cluster'] = 0
    if kmeans is None:
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=RAND_STATE, batch_size=1000, n_init=3)
        df.loc[valid,'geo_cluster'] = kmeans.fit_predict(df.loc[valid,['latitude','longitude']])
    else:
        df.loc[valid,'geo_cluster'] = kmeans.predict(df.loc[valid,['latitude','longitude']])

    return df, kmeans

def add_cluster_feats(df, cluster_stats):
    if cluster_stats is None or 'geo_cluster' not in df.columns:
        df['cluster_avg_price'] = df['cluster_med_price'] = 0
        return df

    df = df.merge(cluster_stats, on='geo_cluster', how='left')
    med = cluster_stats['cluster_avg_price'].median()
    df['cluster_avg_price'] = df['cluster_avg_price'].fillna(med)
    df['cluster_med_price'] = df['cluster_med_price'].fillna(med)
    return df

def detect_and_normalize_anomalies(df, y_col, for_training=True):
    """
    AGGRESSIVE neighborhood-level anomaly detection and normalization.
    Detects properties with extreme feature combinations relative to their cohort.
    """
    print(f"\n{'='*60}")
    print("PREPROCESSING: Aggressive Anomaly Detection")
    print(f"{'='*60}")

    df['anomaly_flag'] = 0
    df['anomaly_reason'] = 'none'

    # Create cohorts: geo_cluster + value_indicator quartile
    if 'geo_cluster' not in df.columns or 'value_indicator' not in df.columns:
        print("⚠️  Skipping anomaly detection - missing clustering features")
        return df

    # Skip for very small datasets
    if len(df) < 10:
        print(f"⚠️  Skipping cohort-based anomaly detection - only {len(df)} properties (need 10+)")
        return df

    # Handle quartile binning errors gracefully
    try:
        df['value_quartile'] = pd.qcut(
            df['value_indicator'],
            q=4,
            labels=False,
            duplicates='drop'
        )
        df['value_quartile'] = df['value_quartile'].map({
            0: 'Q1', 1: 'Q2', 2: 'Q3', 3: 'Q4'
        }).fillna('Q1')

    except (ValueError, TypeError) as e:
        print(f"⚠️  Could not create quartiles ({str(e)}), using single cohort")
        df['value_quartile'] = 'Q1'

    df['cohort'] = df['geo_cluster'].astype(str) + '_' + df['value_quartile'].astype(str)

    # ULTRA AGGRESSIVE feature checks - VERY TIGHT thresholds
    feature_checks = {
        'living_sqft': ('sqft', 0.7, 1.5),      # Was 0.6-1.8, now 0.7-1.5
        'full_baths': ('baths', 0, 7),           # Was 0-8, now 0-7
        'luxury_score': ('luxury', 0.5, 2.0),    # Was 0.4-2.5, now 0.5-2.0
        'property_age': ('age', -5, 100),        # Was -10-120, now -5-100
        'bedrooms': ('beds', 0, 9),              # Was 0-10, now 0-9
        'lot_sqft': ('lot', 0.4, 2.5),           # Was 0.3-3.0, now 0.4-2.5
    }

    anomalies_found = 0

    for cohort in df['cohort'].unique():
        cohort_df = df[df['cohort']==cohort]
        if len(cohort_df) < 10:
            continue

        cohort_idx = cohort_df.index

        # Check each feature for anomalies
        for feat, (name, min_mult, max_mult) in feature_checks.items():
            if feat not in df.columns:
                continue

            cohort_vals = cohort_df[feat].dropna()
            if len(cohort_vals) < 5:
                continue

            cohort_median = cohort_vals.median()
            cohort_std = cohort_vals.std()

            if cohort_median == 0 or cohort_std == 0:
                continue

            # Method 1: Ratio-based detection (TIGHTER thresholds)
            if feat in ['living_sqft', 'luxury_score', 'lot_sqft']:
                ratios = df.loc[cohort_idx, feat] / cohort_median
                anomaly_mask = (ratios < min_mult) | (ratios > max_mult)

                if anomaly_mask.sum() > 0:
                    df.loc[cohort_idx[anomaly_mask], feat] = np.clip(
                        df.loc[cohort_idx[anomaly_mask], feat],
                        cohort_median * min_mult,
                        cohort_median * max_mult
                    )
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_flag'] = 1
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_reason'] = f'{name}_extreme'
                    anomalies_found += anomaly_mask.sum()

            # Method 2: Z-score detection (ULTRA TIGHT - 2.0 std instead of 3)
            elif feat in ['property_age', 'full_baths', 'bedrooms']:
                z_scores = np.abs((df.loc[cohort_idx, feat] - cohort_median) / (cohort_std + 1))
                anomaly_mask = z_scores > 2.0  # Was 2.5, now 2.0

                if anomaly_mask.sum() > 0:
                    df.loc[cohort_idx[anomaly_mask], feat] = np.clip(
                        df.loc[cohort_idx[anomaly_mask], feat],
                        cohort_median - 2.0*cohort_std,
                        cohort_median + 2.0*cohort_std
                    )
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_flag'] = 1
                    df.loc[cohort_idx[anomaly_mask], 'anomaly_reason'] = f'{name}_outlier'
                    anomalies_found += anomaly_mask.sum()

    # NEW: Check living_sqft vs bedrooms ratio (TIGHTER)
    if 'living_sqft' in df.columns and 'bedrooms' in df.columns:
        sqft_per_bed = df['living_sqft'] / (df['bedrooms'] + 1)
        # Typical range: 400-1200 sqft/bedroom (was 250-2000)
        extreme_ratio = (sqft_per_bed < 300) | (sqft_per_bed > 1500)
        if extreme_ratio.sum() > 0:
            df.loc[extreme_ratio, 'anomaly_flag'] = 1
            df.loc[extreme_ratio, 'anomaly_reason'] = 'sqft_bed_mismatch'
            anomalies_found += extreme_ratio.sum()

    # EXISTING: Cross-feature consistency (TIGHTER)
    if 'living_sqft' in df.columns and 'luxury_score' in df.columns:
        sqft_low = df['living_sqft'] < df['living_sqft'].quantile(0.30)  # Was 0.25
        luxury_high = df['luxury_score'] > df['luxury_score'].quantile(0.70)  # Was 0.75
        inconsistent = sqft_low & luxury_high

        if inconsistent.sum() > 0:
            df.loc[inconsistent, 'luxury_score'] = df.loc[inconsistent, 'luxury_score'] * 0.5  # Was 0.6
            df.loc[inconsistent, 'anomaly_flag'] = 1
            df.loc[inconsistent, 'anomaly_reason'] = 'sqft_luxury_mismatch'
            anomalies_found += inconsistent.sum()

    # NEW: HEAVILY penalize census-based value indicators (they're very unreliable)
    if '_value_source' in df.columns:
        census_mask = df['_value_source'] == 'census'
        if census_mask.sum() > 0:
            # AGGRESSIVE: Reduce impact of census values significantly
            if 'luxury_score' in df.columns:
                df.loc[census_mask, 'luxury_score'] = df.loc[census_mask, 'luxury_score'] * 0.75  # Was 0.85, now 0.75
            if 'living_sqft' in df.columns:
                median_sqft = df['living_sqft'].median()
                # More aggressive sqft reduction for census properties
                extreme_sqft = df.loc[census_mask, 'living_sqft'] > median_sqft * 1.3  # Was 1.5, now 1.3
                if extreme_sqft.sum() > 0:
                    df.loc[census_mask & extreme_sqft, 'living_sqft'] = df.loc[census_mask & extreme_sqft, 'living_sqft'] * 0.85  # Was 0.9, now 0.85

            # Also dampen any engineered features that might inflate predictions
            if 'sqft_per_bedroom' in df.columns:
                df.loc[census_mask, 'sqft_per_bedroom'] = df.loc[census_mask, 'sqft_per_bedroom'] * 0.9

            df.loc[census_mask, 'anomaly_flag'] = 1
            df.loc[census_mask, 'anomaly_reason'] = 'census_unreliable'
            anomalies_found += census_mask.sum()

    # TRAINING ONLY: Check price vs value_indicator ratio (ULTRA TIGHT)
    if for_training and y_col in df.columns:
        df['price_to_value_ratio'] = df[y_col] / (df['value_indicator'] + 1)

        # ULTRA TIGHT: ratio >2.0x or <0.6x (was 2.5x and 0.5x)
        extreme_ratio = (df['price_to_value_ratio'] > 2.0) | (df['price_to_value_ratio'] < 0.6)

        if extreme_ratio.sum() > 0:
            print(f"  ⚠️  Found {extreme_ratio.sum()} properties with extreme price/value ratios")
            print(f"      These will be EXCLUDED from training (likely data errors)")
            df.loc[extreme_ratio, 'anomaly_flag'] = 2  # Flag 2 = exclude
            df.loc[extreme_ratio, 'anomaly_reason'] = 'extreme_price_ratio'
            anomalies_found += extreme_ratio.sum()

    if anomalies_found > 0:
        cohort_count = df['cohort'].nunique() if 'cohort' in df.columns else 0
        print(f"  ✓ Detected {anomalies_found:,} anomalies across {cohort_count} cohorts")
    else:
        print(f"  ✓ No anomalies detected")

    # For training: exclude extreme anomalies (flag=2)
    if for_training:
        before = len(df)
        df = df[df['anomaly_flag'] != 2].copy()
        if len(df) < before:
            print(f"  ✓ Excluded {before-len(df):,} extreme anomalies from training")

    # Clean up temporary columns
    cols_to_drop = ['value_quartile', 'cohort']
    if 'price_to_value_ratio' in df.columns:
        cols_to_drop.append('price_to_value_ratio')
    df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    return df

def add_prediction_confidence(pred_df):
    """
    Add a confidence score and warning flags for predictions.
    Helps identify predictions that are likely to be unreliable.
    """
    pred_df['confidence_score'] = 100  # Start at 100%
    pred_df['warning_flags'] = ''

    warnings = []

    # Check 1: Predicted vs value_indicator ratio (TIGHTER)
    if 'predicted' in pred_df.columns and 'value_indicator' in pred_df.columns:
        ratio = pred_df['predicted'] / (pred_df['value_indicator'] + 1)

        # More aggressive penalties for deviation
        extreme_high = ratio > 1.6  # Was 2.0, now 1.6
        moderate_high = (ratio > 1.3) & (ratio <= 1.6)  # New moderate tier
        extreme_low = ratio < 0.7  # Was 0.6, now 0.7

        if extreme_high.sum() > 0:
            pred_df.loc[extreme_high, 'confidence_score'] -= 50  # Was 40, now 50
            pred_df.loc[extreme_high, 'warning_flags'] = pred_df.loc[extreme_high, 'warning_flags'] + 'PRED_MUCH_HIGHER_THAN_VALUE|'

        if moderate_high.sum() > 0:
            pred_df.loc[moderate_high, 'confidence_score'] -= 30
            pred_df.loc[moderate_high, 'warning_flags'] = pred_df.loc[moderate_high, 'warning_flags'] + 'PRED_HIGHER_THAN_VALUE|'

        if extreme_low.sum() > 0:
            pred_df.loc[extreme_low, 'confidence_score'] -= 35  # Was 30, now 35
            pred_df.loc[extreme_low, 'warning_flags'] = pred_df.loc[extreme_low, 'warning_flags'] + 'PRED_MUCH_LOWER_THAN_VALUE|'

    # Check 2: Census-based value indicators are VERY unreliable (more aggressive)
    if 'value_source' in pred_df.columns:
        census_mask = pred_df['value_source'] == 'census'
        if census_mask.sum() > 0:
            pred_df.loc[census_mask, 'confidence_score'] -= 35  # Was 25, now 35
            pred_df.loc[census_mask, 'warning_flags'] = pred_df.loc[census_mask, 'warning_flags'] + 'CENSUS_VALUE_UNRELIABLE|'

    # Check 3: Properties with anomaly flags
    if 'anomaly_flag' in pred_df.columns:
        anomaly_mask = pred_df['anomaly_flag'] > 0
        if anomaly_mask.sum() > 0:
            pred_df.loc[anomaly_mask, 'confidence_score'] -= 20
            pred_df.loc[anomaly_mask, 'warning_flags'] = pred_df.loc[anomaly_mask, 'warning_flags'] + 'FEATURES_NORMALIZED|'

    # Check 4: Missing key features (if we had them in training)
    if 'prior_sale_price' in pred_df.columns:
        missing_prior = pred_df['prior_sale_price'].isna() | (pred_df['prior_sale_price'] == pred_df['prior_sale_price'].median())
        if missing_prior.sum() > 0:
            pred_df.loc[missing_prior, 'confidence_score'] -= 15
            pred_df.loc[missing_prior, 'warning_flags'] = pred_df.loc[missing_prior, 'warning_flags'] + 'NO_PRIOR_SALE|'

    # Clean up warning flags
    pred_df['warning_flags'] = pred_df['warning_flags'].str.rstrip('|')
    pred_df['warning_flags'] = pred_df['warning_flags'].replace('', 'NONE')

    # Ensure confidence doesn't go below 0
    pred_df['confidence_score'] = pred_df['confidence_score'].clip(lower=0)

    # Add recommendation (stricter thresholds)
    pred_df['recommendation'] = 'USE'
    pred_df.loc[pred_df['confidence_score'] < 50, 'recommendation'] = 'CAUTION'  # Was 40, now 50
    pred_df.loc[pred_df['confidence_score'] < 30, 'recommendation'] = 'DO_NOT_USE'  # Was 20, now 30

    return pred_df

def train_single_model(df, feats, y_col, id_col, state_col):
    print(f"\nTraining SINGLE GLOBAL MODEL on {len(df):,} properties")

    # Engineer price features for anomaly detection
    df = engineer(df, y_col, with_price=True)

    # PREPROCESSING: Detect and normalize anomalies
    df = detect_and_normalize_anomalies(df, y_col, for_training=True)

    # Basic filtering
    if 'price_per_sqft' in df.columns:
        lb,ub = df['price_per_sqft'].quantile([.05,.95])
        df = df[(df['price_per_sqft']>=lb)&(df['price_per_sqft']<=ub)].drop(columns=['price_per_sqft'])
    if 'lot_sqft' in df.columns: df = df[df['lot_sqft']<=df['lot_sqft'].quantile(.98)]
    if 'year_built' in df.columns: df = df[(df['year_built']>=1900)&(df['year_built']<=2025)]

    print(f"  After filtering: {len(df):,} properties")

    # Split train/test
    train_idx = df.sample(frac=1-TEST_SIZE, random_state=RAND_STATE).index
    train_df = df.loc[train_idx].copy()
    test_df = df.loc[df.index.difference(train_idx)].copy()

    # Add cluster stats from training data
    cluster_stats = None
    if 'geo_cluster' in train_df.columns:
        cluster_stats = train_df.groupby('geo_cluster')[y_col].agg(['mean','median']).reset_index()
        cluster_stats.columns = ['geo_cluster','cluster_avg_price','cluster_med_price']
        train_df = add_cluster_feats(train_df, cluster_stats)
        test_df = add_cluster_feats(test_df, cluster_stats)

    X_tr, y_tr = train_df[feats].values, train_df[y_col].values
    X_te, y_te = test_df[feats].values, test_df[y_col].values

    print(f"\n{'='*60}")
    print(f"MODEL TRAINING")
    print(f"{'='*60}")
    print(f"  Training: {len(X_tr):,} | Test: {len(X_te):,}")

    # Train single model
    model = XGBRegressor(
        n_estimators=N_EST,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RAND_STATE,
        n_jobs=N_JOBS,
        tree_method='hist'
    ).fit(X_tr, y_tr, verbose=False)

    y_pred = model.predict(X_te)

    mae = mean_absolute_error(y_te, y_pred)
    mape = np.mean(np.abs((y_te-y_pred)/y_te))*100
    r2 = r2_score(y_te, y_pred)

    print(f"  Results: MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    # Feature importance
    scores = model.get_booster().get_score(importance_type="gain")
    imp = [(feats[int(k[1:])],v) for k,v in scores.items() if int(k[1:])<len(feats)]
    imp.sort(key=lambda x: x[1], reverse=True)
    total = sum(v for _,v in imp)
    importance_df = pd.DataFrame([
        {'feature':f,'gain':g,'importance':g/total} for f,g in imp[:20]
    ]) if total>0 else pd.DataFrame(columns=['feature','gain','importance'])

    # Predictions
    ids = test_df[id_col].values
    states = test_df[state_col].values if state_col and state_col in test_df.columns else ['Unknown']*len(test_df)
    value_ind = test_df['value_indicator'].values if 'value_indicator' in test_df.columns else [np.nan]*len(test_df)
    value_src = test_df['_value_source'].values if '_value_source' in test_df.columns else ['unknown']*len(test_df)
    anomaly_flag = test_df['anomaly_flag'].values if 'anomaly_flag' in test_df.columns else [0]*len(test_df)

    preds_df = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'anomaly_flag':anomaly_flag,
        'actual':y_te, 'predicted':y_pred,
        'error':y_te-y_pred,
        'pct_error':100*(y_te-y_pred)/y_te
    })

    return {
        'model': model,
        'metrics': {'mae':mae, 'mape':mape, 'r2':r2, 'n_train':len(X_tr), 'n_test':len(X_te)},
        'predictions': preds_df,
        'feature_importance': importance_df,
        'cluster_stats': cluster_stats
    }

def predict_new(pred_df, model, feats, y_col, id_col, state_col, kmeans, cluster_stats):
    print(f"\n{'='*60}\nPREDICTING {len(pred_df):,} NEW PROPERTIES\n{'='*60}")

    pred_df = engineer(pred_df, y_col)
    pred_df, _ = geo_cluster(pred_df, kmeans)
    pred_df = create_value_indicator(pred_df)
    pred_df = add_cluster_feats(pred_df, cluster_stats)

    # AGGRESSIVE anomaly detection
    pred_df = detect_and_normalize_anomalies(pred_df, y_col, for_training=False)

    # Fill missing features
    for f in feats:
        if f not in pred_df.columns: pred_df[f] = 0
        else: pred_df[f] = pred_df[f].fillna(pred_df[f].median() if pred_df[f].notna().sum()>0 else 0)

    if '_value_source' in pred_df.columns:
        print(f"\nValue sources: {dict(pred_df['_value_source'].value_counts())}")

    if 'anomaly_flag' in pred_df.columns:
        anomaly_cnt = (pred_df['anomaly_flag'] > 0).sum()
        if anomaly_cnt > 0:
            print(f"⚠️  {anomaly_cnt} properties had features normalized due to anomalies")

    X = pred_df[feats].values
    y_pred = model.predict(X)

    # CRITICAL: Hard cap predictions to prevent extreme overpredictions
    # Don't let prediction exceed value_indicator by more than 1.8x (or 1.5x for census)
    pred_to_value_ratio = y_pred / (pred_df['value_indicator'].values + 1)

    # Even stricter cap for census-based properties
    census_mask = pred_df['_value_source'] == 'census'
    max_ratio = np.where(census_mask, 1.5, 1.8)  # 1.5x for census, 1.8x for others

    extreme_high = pred_to_value_ratio > max_ratio

    if extreme_high.sum() > 0:
        print(f"⚠️  CAPPING {extreme_high.sum()} extreme predictions")
        y_pred[extreme_high] = pred_df.loc[extreme_high, 'value_indicator'].values * max_ratio[extreme_high]

    # Also cap very low predictions
    extreme_low = pred_to_value_ratio < 0.6
    if extreme_low.sum() > 0:
        print(f"⚠️  CAPPING {extreme_low.sum()} extreme low predictions (<0.6x value_indicator)")
        y_pred[extreme_low] = pred_df.loc[extreme_low, 'value_indicator'].values * 0.6

    ids = pred_df[id_col].values
    states = pred_df[state_col].values if state_col and state_col in pred_df.columns else ['Unknown']*len(pred_df)
    actual = pred_df[y_col].values if y_col in pred_df.columns else [np.nan]*len(pred_df)
    value_ind = pred_df['value_indicator'].values
    value_src = pred_df['_value_source'].values
    anomaly_flag = pred_df['anomaly_flag'].values if 'anomaly_flag' in pred_df.columns else [0]*len(pred_df)
    anomaly_reason = pred_df['anomaly_reason'].values if 'anomaly_reason' in pred_df.columns else ['none']*len(pred_df)

    result = pd.DataFrame({
        'property_id':ids, 'state':states, 'value_indicator':value_ind, 'value_source':value_src,
        'anomaly_flag':anomaly_flag, 'anomaly_reason':anomaly_reason,
        'actual':actual, 'predicted':y_pred,
        'error':[actual[i]-y_pred[i] if not np.isnan(actual[i]) else np.nan for i in range(len(actual))],
        'pct_error':[100*(actual[i]-y_pred[i])/actual[i] if not np.isnan(actual[i]) and actual[i]!=0 else np.nan for i in range(len(actual))]
    })

    # ADD CONFIDENCE SCORING
    result = add_prediction_confidence(result)

    print(f"\n✓ Generated {len(result):,} predictions")

    # Show confidence summary
    if 'confidence_score' in result.columns:
        print(f"\nConfidence Summary:")
        print(f"  High confidence (70-100): {(result['confidence_score'] >= 70).sum()}")
        print(f"  Medium confidence (50-69): {((result['confidence_score'] >= 50) & (result['confidence_score'] < 70)).sum()}")
        print(f"  Low confidence (30-49): {((result['confidence_score'] >= 30) & (result['confidence_score'] < 50)).sum()}")
        print(f"  Very low confidence (0-29): {(result['confidence_score'] < 30).sum()}")

    valid = result['actual'].notna().sum()
    if valid>0:
        valid_df = result[result['actual'].notna()]
        mae = mean_absolute_error(valid_df['actual'], valid_df['predicted'])
        mape = np.mean(np.abs((valid_df['actual']-valid_df['predicted'])/valid_df['actual']))*100
        r2 = r2_score(valid_df['actual'], valid_df['predicted'])
        print(f"\nValidation ({valid}): MAE=${mae:,.0f} | MAPE={mape:.2f}% | R²={r2:.4f}")

    return result

def save_results(results, out_dir, new_preds=None):
    print(f"\nSaving results...")
    preds, metrics, fi = results['predictions'], results['metrics'], results['feature_importance']

    wb = Workbook()
    wb.remove(wb.active)

    ws = wb.create_sheet("Summary", 0)
    ws['A1'].font, ws['A1'].value = Font(bold=True,size=14), 'ULTRA AGGRESSIVE ANOMALY DETECTION + AVM'
    ws['A2'].font, ws['A2'].value = Font(italic=True,size=10), 'Strictest thresholds → heavy census penalty → hard prediction caps → confidence scoring'

    data = [
        ['Metric','Value'],
        ['Train Properties',metrics['n_train']],
        ['Test Properties',metrics['n_test']],
        ['R²',f"{metrics['r2']:.4f}"],
        ['MAE',f"${metrics['mae']:,.0f}"],
        ['MAPE%',f"{metrics['mape']:.2f}%"]
    ]
    if new_preds is not None: data.append(['New Predictions',len(new_preds)])

    for i,(k,v) in enumerate(data,5):
        ws[f'A{i}'].font, ws[f'A{i}'].value, ws[f'B{i}'].value = Font(bold=True), k, v

    ws = wb.create_sheet("Feature_Importance")
    for r_idx,row in enumerate(dataframe_to_rows(fi,index=False,header=True),1):
        for c_idx,value in enumerate(row,1):
            cell = ws.cell(row=r_idx,column=c_idx,value=value)
            if r_idx==1:
                cell.font = Font(bold=True,color='FFFFFF')
                cell.fill = PatternFill(start_color='366092',end_color='366092',fill_type='solid')

    for sheet_name,data,color in [('Test_Predictions',preds,'366092'),('New_Predictions',new_preds,'4472C4')]:
        if data is None: continue
        ws = wb.create_sheet(sheet_name)
        for i,h in enumerate(data.columns,1):
            c = ws.cell(1,i,h)
            c.font, c.fill = Font(bold=True,color='FFFFFF'), PatternFill(start_color=color,end_color=color,fill_type='solid')
        for i,row in enumerate(data.itertuples(index=False),2):
            for j,v in enumerate(row,1): ws.cell(i,j,v)

    ts = time.strftime("%Y%m%d_%H%M%S")
    xl_path = f"{out_dir}/ultra_aggressive_avm_{ts}.xlsx"
    wb.save(xl_path)

    preds.to_csv(f"{out_dir}/ultra_aggressive_test_predictions_{ts}.csv", index=False)
    fi.to_csv(f"{out_dir}/ultra_aggressive_importance_{ts}.csv", index=False)
    if new_preds is not None:
        new_preds.to_csv(f"{out_dir}/ultra_aggressive_new_predictions_{ts}.csv", index=False)

    print(f"✓ Excel: {xl_path}")
    print(f"✓ CSVs saved")

def main():
    t0 = time.time()
    print("="*60+"\nULTRA AGGRESSIVE ANOMALY DETECTION + AVM\nStrictest Thresholds → Heavy Census Penalty → Hard Prediction Caps\n"+"="*60)

    # Load and prepare training data
    df, y_col, id_col, state_col = load_data(TRAINING_INPUT_PATH)
    df = df[df[y_col]>=MIN_PRICE]
    df = engineer(df, y_col)
    df, kmeans = geo_cluster(df)
    df = create_value_indicator(df)

    # Define features
    base = ["living_sqft","lot_sqft","year_built","bedrooms","full_baths","half_baths","garage_spaces",
            "latitude","longitude","geo_cluster","value_indicator","log_value_indicator"]
    eng = ["sqft_per_bedroom","lot_to_living_ratio","property_age","is_new","has_garage","luxury_score","log_sqft","age_squared"]
    prior = ["prior_sale_price","prior_price_per_sqft","prior_appreciated","years_since_last_sale","has_prior_sale","recently_sold"]
    cluster = ["cluster_avg_price","cluster_med_price"]
    census = ["median_household_income","median_home_value","pct_bachelors_degree","income_education_score"]

    all_feats = base + eng + prior + cluster + census
    feats = [f for f in all_feats if f in df.columns]

    print(f"{len(feats)}/{len(all_feats)} features available")

    cols = list(set(feats+[y_col,id_col]+([state_col] if state_col and state_col in df.columns else [])))
    df = df[[c for c in cols if c in df.columns]].copy()
    df[feats] = df[feats].fillna(df[feats].median())
    df = df.dropna(subset=[y_col])

    # Train model
    results = train_single_model(df, feats, y_col, id_col, state_col)

    # Predict new properties
    new_preds = None
    if PREDICTION_INPUT_PATH:
        pred_df, _, _, _ = load_data(PREDICTION_INPUT_PATH)
        new_preds = predict_new(pred_df, results['model'], feats, y_col, id_col, state_col,
                               kmeans, results['cluster_stats'])

    # Save results
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    save_results(results, OUTPUT_DIR, new_preds)

    print(f"\n{'='*60}\n✓ COMPLETE in {time.time()-t0:.1f}s")
    if new_preds is not None: print(f"  New predictions: {len(new_preds):,}")
    print("="*60)

if __name__=="__main__": main()

ULTRA AGGRESSIVE ANOMALY DETECTION + AVM
Strictest Thresholds → Heavy Census Penalty → Hard Prediction Caps
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/Untitled 5_2025-12-23-1212.csv
127,326 records | 90.4MB | Price:sale_price ID:property_id
  ⚠️  Filtered 29,431 bad assessed values
  ⚠️  Filtered 29,431 bad assessed values
24/32 features available

Training SINGLE GLOBAL MODEL on 127,258 properties

PREPROCESSING: Aggressive Anomaly Detection
  ⚠️  Found 75048 properties with extreme price/value ratios
      These will be EXCLUDED from training (likely data errors)
  ✓ Detected 210,788 anomalies across 32 cohorts
  ✓ Excluded 75,048 extreme anomalies from training
  After filtering: 44,703 properties

MODEL TRAINING
  Training: 31,292 | Test: 13,411
  Results: MAE=$214,446 | MAPE=14.78% | R²=0.8059
Loading /Users/jenny.lin/ImageDataParser/XGBoost_with_ImageData/data/MLS_w_luxury_AVM_outliers.csv
1 records | 0.0MB | Price:sale_price ID:property_id

PREDICTING 1