# Bishkek Real Estate Price Prediction v3

**Goal:** Predict apartment price per square meter ($/m²) in Bishkek, Kyrgyzstan

**v3 Improvements (Phase 1 - Research-based):**
- **Spatial Lag Features** - neighbor price statistics (Zillow-inspired)
- **H3 Geographic Tiles** - Uber H3 hexagonal indexing at multiple resolutions
- **Market Trend Features** - rolling price statistics by district
- **Density Features** - listing density as desirability proxy

**References:**
- [Zillow Neural Zestimate](https://www.zillow.com/tech/building-the-neural-zestimate/)
- [Multi-Head Gated Attention Paper](https://arxiv.org/abs/2405.07456)
- [Spatial ML Methods](https://www.mdpi.com/2071-1050/14/15/9056)

In [None]:
# Install dependencies (for Kaggle)
!pip install -q optuna catboost lightgbm h3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from typing import Dict, List, Tuple, Optional
import hashlib

warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.neighbors import BallTree
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import h3

# Notebook-wide settings.
RANDOM_STATE = 42                     # seed handed to the CV splitter and every model below
CURRENT_YEAR = datetime.now().year    # reference year for the building-age features

# Emit the three start-up lines in a single call (same text, same order).
print("\n".join([
    f"Current year: {CURRENT_YEAR}",
    f"H3 version: {h3.__version__}",
    "Libraries loaded successfully!",
]))

## 1. Load & Clean Data

In [None]:
import sqlite3
import os

# Load from SQLite database.
# The Kaggle input path is tried first, then the path used by a local checkout.
if os.path.exists('/kaggle/input/bishkek-real-estate-2025/bishkek.db'):
    db_path = '/kaggle/input/bishkek-real-estate-2025/bishkek.db'
elif os.path.exists('../data/databases/bishkek.db'):
    db_path = '../data/databases/bishkek.db'
else:
    raise FileNotFoundError("Database not found!")

conn = sqlite3.connect(db_path)
try:
    # One row per apartment, enriched with the residential complex it belongs to
    # (LEFT JOIN keeps apartments without a complex; their jk_* columns are NULL).
    # Rows lacking a price, an area or a positive price per m² are dropped here.
    df_raw = pd.read_sql('''
    SELECT 
        a.*,
        rc.name as jk_name,
        rc.class as jk_class,
        rc.status as jk_status,
        rc.developer_name
    FROM apartments a
    LEFT JOIN residential_complexes rc ON a.residential_complex_id = rc.id
    WHERE a.price_usd IS NOT NULL 
      AND a.area IS NOT NULL
      AND a.price_per_m2 > 0
''', conn)
finally:
    # Release the database handle even when the query fails.
    conn.close()

print(f"Database: {db_path}")
print(f"Raw dataset: {len(df_raw)} rows")
print(f"Columns: {len(df_raw.columns)}")

In [None]:
# ===================
# 1.1 Remove Duplicates
# ===================

df_raw['parsed_at'] = pd.to_datetime(df_raw['parsed_at'])

# Signature used to recognise the same apartment across listings:
# lower-cased address + floor + area + rooms, joined with underscores.
# NOTE(review): a missing address becomes '', so address-less listings with the
# same floor/area/rooms share a signature and are treated as duplicates — confirm
# this is intended.
df_raw['building_signature'] = (
    df_raw['address'].fillna('').str.lower()
    .add('_').add(df_raw['floor'].fillna(0).astype(str))
    .add('_').add(df_raw['area'].fillna(0).astype(str))
    .add('_').add(df_raw['rooms'].fillna(0).astype(str))
)

# Sort chronologically so that keep='last' retains the most recently parsed listing.
duplicates_before = len(df_raw)
df_raw = df_raw.sort_values('parsed_at')
df_raw = df_raw.drop_duplicates(subset=['building_signature'], keep='last')
duplicates_removed = duplicates_before - len(df_raw)

print(f"Duplicates removed: {duplicates_removed}")
print(f"Dataset after dedup: {len(df_raw)} rows")

In [None]:
# ===================
# 1.2 Outlier Detection (IQR + Domain Rules)
# ===================

target = 'price_per_m2'

def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Return a boolean mask that is True for values INSIDE the IQR fences.

    Despite the name, True marks a value to keep: it lies within
    [Q1 - k*IQR, Q3 + k*IQR] (both ends inclusive). Missing values are False.

    Args:
        series: numeric values to screen.
        k: fence multiplier applied to the inter-quartile range.

    Returns:
        Boolean Series aligned with ``series``.
    """
    q_low, q_high = series.quantile(0.25), series.quantile(0.75)
    spread = q_high - q_low
    return series.between(q_low - k * spread, q_high + k * spread)

# Statistical screen: keep prices within 2.0 * IQR of the quartiles.
price_mask = detect_outliers_iqr(df_raw[target], k=2.0)

# Domain screen: hard bounds on price per m², area, rooms and floor.
# Missing rooms/floor are treated as 1 so they pass the upper bound.
domain_mask = (
    df_raw[target].between(300, 5000)
    & df_raw['area'].between(15, 500)
    & df_raw['rooms'].fillna(1).le(10)
    & df_raw['floor'].fillna(1).le(50)
)

# A row survives only if it passes both screens.
valid_mask = price_mask & domain_mask
outliers_removed = (~valid_mask).sum()

print(f"Outliers detected: {outliers_removed}")

df = df_raw.loc[valid_mask].copy()
print(f"Final dataset: {len(df)} rows")

## 2. Temporal Train/Test Split

In [None]:
# TEMPORAL SPLIT (not random!)
# Chronological ordering first: the earliest 80% of listings train the model,
# the most recent 20% are held out.
df = df.sort_values('parsed_at').reset_index(drop=True)

split_idx = int(len(df) * 0.8)
split_date = df['parsed_at'].iloc[split_idx]  # timestamp of the first held-out row

train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

first_day = df['parsed_at'].min().date()
last_day = df['parsed_at'].max().date()
print(f"Temporal Split:")
print(f"  Train: {len(train_df)} samples ({first_day} to {split_date.date()})")
print(f"  Test:  {len(test_df)} samples ({split_date.date()} to {last_day})")

## 3. Feature Engineering (v3 - with Advanced Features)

In [None]:
# ===================
# POI (Points of Interest) - Bishkek coordinates
# ===================

# Named points of interest, each stored as a (latitude, longitude) tuple in
# decimal degrees. FeatureEngineer.transform derives one `dist_<name>` column
# per entry.
POI = {
    'dordoi_plaza': (42.8750, 74.6128),
    'bishkek_park': (42.8741, 74.5888),
    'tsum': (42.8746, 74.6031),
    'vefa_center': (42.8668, 74.5931),
    'asia_mall': (42.8489, 74.5672),
    'karavan': (42.8562, 74.5686),
    'ala_too_square': (42.8746, 74.6030),
    'philharmonic': (42.8749, 74.6108),
    'white_house': (42.8760, 74.6097),
    'victory_square': (42.8722, 74.5875),
    'knu': (42.8778, 74.6027),
    'auca': (42.8634, 74.6167),
    'krsu': (42.8750, 74.5861),
    'west_bus_station': (42.8628, 74.5294),
    'east_bus_station': (42.8605, 74.6550),
    'railway_station': (42.8588, 74.6339),
    'osh_bazaar': (42.8722, 74.5761),
    'dordoi_bazaar': (42.9453, 74.6494),
    'ortosay_bazaar': (42.8478, 74.5542),
    'center': (42.8746, 74.5888),
}

# Parks described by four corner vertices, each a (latitude, longitude) tuple.
# FeatureEngineer.transform measures distance to the mean of the vertices,
# not to the polygon outline.
PARKS = {
    'dubovy_park': [(42.8749, 74.5875), (42.8780, 74.5875), (42.8780, 74.5930), (42.8749, 74.5930)],
    'park_panfilova': [(42.8740, 74.6000), (42.8760, 74.6000), (42.8760, 74.6050), (42.8740, 74.6050)],
    'park_ataturk': [(42.8690, 74.5950), (42.8720, 74.5950), (42.8720, 74.6000), (42.8690, 74.6000)],
    'botanical_garden': [(42.8560, 74.5560), (42.8620, 74.5560), (42.8620, 74.5660), (42.8560, 74.5660)],
}

# Ordered (latitude, longitude) vertices tracing the river; consumed by
# distance_to_polyline to build the `dist_river` feature.
ALA_ARCHA_RIVER = [
    (42.7800, 74.5700), (42.8100, 74.5650), (42.8400, 74.5600),
    (42.8600, 74.5580), (42.8800, 74.5560), (42.9000, 74.5550),
]

# Earth radius in kilometres; used to convert search radii in km to radians
# for the haversine BallTree queries.
EARTH_RADIUS_KM = 6371.0

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Every argument may be a scalar or an array-like; the usual NumPy
    broadcasting rules apply, so an array of points can be measured against a
    single reference point.
    """
    R = 6371  # sphere radius in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    hav = np.sin(d_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam/2)**2
    return R * 2 * np.arcsin(np.sqrt(hav))

def distance_to_polyline(lat, lon, polyline):
    """Shortest distance in km from a point to a polyline, segments included.

    The previous implementation measured the distance to the vertices only,
    which overstates the distance for points lying beside the middle of a long
    segment. Here the vertices are projected onto a local flat plane centred on
    the query point (equirectangular approximation, adequate at city scale) and
    the distance to every segment is evaluated.

    Args:
        lat, lon: query point in decimal degrees (scalars).
        polyline: sequence of (latitude, longitude) vertices in decimal degrees.

    Returns:
        Distance in kilometres as a float (NaN when lat or lon is NaN).

    Raises:
        ValueError: if ``polyline`` has no vertices.
    """
    vertices = np.asarray(polyline, dtype=float).reshape(-1, 2)
    if len(vertices) == 0:
        raise ValueError("polyline must contain at least one vertex")

    earth_radius_km = 6371  # same sphere radius as haversine_distance

    # Local planar coordinates (km) with the query point at the origin.
    y = np.radians(vertices[:, 0] - lat) * earth_radius_km
    x = np.radians(vertices[:, 1] - lon) * earth_radius_km * np.cos(np.radians(lat))
    points = np.column_stack([x, y])

    best = np.hypot(x, y).min()  # nearest vertex

    if len(points) > 1:
        start = points[:-1]
        step = points[1:] - start
        step_len2 = (step ** 2).sum(axis=1)
        # Position of the origin's projection along each segment; zero-length
        # segments fall back to their start vertex.
        with np.errstate(divide='ignore', invalid='ignore'):
            t = np.where(step_len2 > 0, -(start * step).sum(axis=1) / step_len2, 0.0)
        t = np.clip(t, 0.0, 1.0)  # stay on the segment
        nearest = start + t[:, None] * step
        best = min(best, np.hypot(nearest[:, 0], nearest[:, 1]).min())

    return float(best)

print(f"POI: {len(POI)}, Parks: {len(PARKS)}")

In [None]:
# ===================
# NEW: Spatial Lag Features (v3)
# ===================

class SpatialLagFeatures:
    """
    Neighbour price statistics ("spatial lag") around each listing.

    ``fit`` indexes the fitted listings in a haversine BallTree; ``transform``
    looks up every fitted listing within ``radius_km`` of each query row and
    summarises their prices (mean, median, std, count).
    """

    def __init__(self, radius_km: float = 0.5, min_neighbors: int = 3):
        self.radius_km = radius_km          # search radius in kilometres
        self.min_neighbors = min_neighbors  # below this, statistics are left missing
        self.tree = None
        self.train_prices = None
        self.train_indices = None
        self._fitted = False

    def fit(self, df: pd.DataFrame, price_col: str = 'price_per_m2'):
        """Index the coordinates of ``df`` and remember its prices and index labels."""
        fitted_coords = np.radians(df[['latitude', 'longitude']].values)
        self.tree = BallTree(fitted_coords, metric='haversine')
        self.train_prices = df[price_col].values
        self.train_indices = df.index.values
        self._fitted = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the four neighbour columns appended.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("Must call fit() first!")

        df = df.copy()
        query_coords = np.radians(df[['latitude', 'longitude']].values)
        neighbour_sets = self.tree.query_radius(
            query_coords, r=self.radius_km / EARTH_RADIUS_KM
        )

        # The frame counts as "the training frame" when its index labels match
        # the fitted ones; each row is then removed from its own neighbourhood.
        drop_self = set(df.index.values) == set(self.train_indices)

        summaries = []
        for row_label, hits in zip(df.index, neighbour_sets):
            if drop_self and len(hits) > 0:
                hits = hits[self.train_indices[hits] != row_label]

            prices = self.train_prices[hits]
            if len(prices) < self.min_neighbors:
                summaries.append((np.nan, np.nan, np.nan, len(hits)))
            else:
                summaries.append(
                    (np.mean(prices), np.median(prices), np.std(prices), len(hits))
                )

        output_columns = (
            'neighbor_price_mean',
            'neighbor_price_median',
            'neighbor_price_std',
            'neighbor_count',
        )
        for position, column in enumerate(output_columns):
            df[column] = [summary[position] for summary in summaries]

        # Sparse neighbourhoods fall back to the mean fitted price
        # (also used for the median column) and a spread of zero.
        fallback_price = np.mean(self.train_prices)
        for column, filler in (
            ('neighbor_price_mean', fallback_price),
            ('neighbor_price_median', fallback_price),
            ('neighbor_price_std', 0),
        ):
            df[column] = df[column].fillna(filler)

        return df

print("SpatialLagFeatures defined")

In [None]:
# ===================
# NEW: H3 Geographic Tiles (v3)
# ===================

class H3Features:
    """
    Generate Uber H3 hexagonal tile features at multiple resolutions.

    For every resolution, ``fit`` assigns an integer code to each H3 cell seen
    in the fitted frame (in order of first appearance); ``transform`` adds the
    cell id column ``h3_res<res>`` and its code ``h3_res<res>_encoded``. Cells
    not seen during ``fit`` are encoded as -1.

    Resolution guide:
    - 7: ~5.16 km² (district level)
    - 8: ~0.74 km² (neighborhood level)
    - 9: ~0.11 km² (block level)
    """

    def __init__(self, resolutions: Optional[List[int]] = None):
        # A fresh list per instance: a list literal used as the default would be
        # shared (and mutable) across all instances.
        self.resolutions = [7, 8, 9] if resolutions is None else list(resolutions)
        self.encoders = {}
        self._fitted = False

    @staticmethod
    def _cells(df: pd.DataFrame, res: int) -> pd.Series:
        """H3 cell id of every row of ``df`` at resolution ``res``."""
        cells = [
            h3.latlng_to_cell(lat, lon, res)
            for lat, lon in zip(df['latitude'], df['longitude'])
        ]
        return pd.Series(cells, index=df.index, dtype=object)

    def fit(self, df: pd.DataFrame):
        """Learn the cell -> integer code mapping for each resolution."""
        for res in self.resolutions:
            seen_cells = self._cells(df, res).unique()
            self.encoders[res] = {cell: code for code, cell in enumerate(seen_cells)}
        self._fitted = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with cell id and encoded columns per resolution.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("Must call fit() first!")

        df = df.copy()

        for res in self.resolutions:
            col = f'h3_res{res}'
            encoder = self.encoders[res]
            df[col] = self._cells(df, res)
            # -1 marks a cell that was not present in the fitted frame.
            df[f'{col}_encoded'] = df[col].map(lambda cell: encoder.get(cell, -1))

        return df

print("H3Features defined")

In [None]:
# ===================
# NEW: Market Trend Features (v3) - FIXED (no leakage)
# ===================

class MarketTrendFeatures:
    """
    Calculate market trend features - trailing price statistics per district.

    Columns added by ``transform``:
    - ``district_price_<N>d_mean``: mean price of listings in the same district
      parsed during the N days strictly before the row's own ``parsed_at``.
    - ``days_on_market``: days elapsed since the earliest ``parsed_at`` seen in ``fit``.
    - ``district_listing_count``: number of fitted listings in the row's district.

    NOTE: We do NOT include price_vs_district_zscore as it would use target!
    NOTE(review): the trailing means are computed from ``price_col`` of the
    frame passed to ``transform``; for a held-out frame this means earlier
    held-out prices feed later rows - confirm this is acceptable.
    """

    def __init__(self, windows: Optional[List[int]] = None):
        # Copy so that neither a shared default nor the caller's list is aliased.
        self.windows = [30, 60, 90] if windows is None else list(windows)
        self.district_stats = {}
        self.global_mean = 0
        self.first_date = None
        self._fitted = False

    def fit(self, df: pd.DataFrame, price_col: str = 'price_per_m2'):
        """Record the global mean price, the first date and per-district mean/count."""
        self.global_mean = df[price_col].mean()
        self.first_date = df['parsed_at'].min()

        # Start clean so a refit never keeps districts from an earlier frame.
        self.district_stats = {}
        for district in df['district'].dropna().unique():
            district_df = df[df['district'] == district]
            self.district_stats[district] = {
                'mean': district_df[price_col].mean(),
                'count': len(district_df),
            }
        self._fitted = True
        return self

    @staticmethod
    def _trailing_district_mean(df: pd.DataFrame, price_col: str, days) -> np.ndarray:
        """Per-row mean price of same-district listings in the preceding ``days`` days.

        ``df`` must already be sorted by ``parsed_at``. Rows with a missing
        district or timestamp, and rows with no earlier listing inside the
        window, are returned as NaN.
        """
        trailing = np.full(len(df), np.nan)
        prices = df[price_col].to_numpy(dtype=float)
        stamps = df['parsed_at']
        window = pd.Timedelta(days=days)

        # GroupBy.indices maps each district to the row POSITIONS of its listings
        # (rows with a missing district are left out).
        for positions in df.groupby('district', sort=False).indices.values():
            positions = np.sort(np.asarray(positions))
            positions = positions[stamps.iloc[positions].notna().to_numpy()]
            if len(positions) == 0:
                continue
            history = pd.Series(
                prices[positions], index=pd.DatetimeIndex(stamps.iloc[positions])
            )
            # closed='left' -> window is [t - days, t): the row itself and any
            # listing with the same timestamp are excluded.
            trailing[positions] = (
                history.rolling(window, closed='left', min_periods=1).mean().to_numpy()
            )
        return trailing

    def transform(self, df: pd.DataFrame, price_col: str = 'price_per_m2') -> pd.DataFrame:
        """Return a copy of ``df``, sorted by ``parsed_at``, with trend columns added.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("Must call fit() first!")

        df = df.copy().sort_values('parsed_at', kind='stable')

        # Trailing statistics by district over a window measured in calendar
        # days (a plain integer rolling window would count rows, not days).
        for days in self.windows:
            col_mean = f'district_price_{days}d_mean'
            df[col_mean] = self._trailing_district_mean(df, price_col, days)
            df[col_mean] = df[col_mean].fillna(self.global_mean)

        # Days since the first fitted listing (temporal feature, no price info)
        df['days_on_market'] = (df['parsed_at'] - self.first_date).dt.days

        # District activity (count of fitted listings - no price info)
        df['district_listing_count'] = df['district'].map(
            lambda d: self.district_stats.get(d, {}).get('count', 0)
        )

        return df

print("MarketTrendFeatures defined (FIXED - no leakage)")

In [None]:
# ===================
# NEW: Density Features (v3)
# ===================

class DensityFeatures:
    """
    Calculate listing density - proxy for market activity and desirability.

    ``fit`` indexes the fitted listings in a haversine BallTree and stores the
    average neighbour count per radius. ``transform`` adds, for each radius,
    ``listings_<metres>m`` (fitted listings within the radius) and
    ``listings_<metres>m_ratio`` (that count divided by the fitted average).

    Note: when the transformed rows are the fitted rows themselves, each count
    includes the row's own point, because it is part of the tree being queried.
    """

    def __init__(self, radii_km: Optional[List[float]] = None):
        # A fresh list per instance instead of a shared mutable default.
        self.radii_km = [0.5, 1.0] if radii_km is None else list(radii_km)
        self.tree = None
        self.avg_density = {}
        self._fitted = False

    def fit(self, df: pd.DataFrame):
        """Build the spatial index and the average neighbour count per radius."""
        coords = np.radians(df[['latitude', 'longitude']].values)
        self.tree = BallTree(coords, metric='haversine')

        for radius in self.radii_km:
            radius_rad = radius / EARTH_RADIUS_KM  # km -> radians on the unit sphere
            counts = self.tree.query_radius(coords, r=radius_rad, count_only=True)
            self.avg_density[radius] = np.mean(counts)

        self._fitted = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with count and ratio columns for each radius.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self._fitted:
            raise ValueError("Must call fit() first!")

        df = df.copy()
        coords = np.radians(df[['latitude', 'longitude']].values)

        for radius in self.radii_km:
            radius_rad = radius / EARTH_RADIUS_KM
            counts = self.tree.query_radius(coords, r=radius_rad, count_only=True)

            col = f'listings_{int(radius * 1000)}m'
            df[col] = counts

            # Floor the denominator at 1 so the ratio is always defined.
            avg = max(self.avg_density[radius], 1)
            df[f'{col}_ratio'] = counts / avg

        return df

print("DensityFeatures defined")

In [None]:
# ===================
# Basic Feature Engineer (from v2)
# ===================

class FeatureEngineer:
    """Basic apartment, building and location features.

    ``fit`` learns the imputation medians (and the mean target) from the
    fitted frame; ``transform`` imputes missing values with those medians and
    derives floor, building, area, kitchen, ceiling, condition, amenity,
    house-type, residential-complex and distance features.
    """

    def __init__(self):
        self.fitted = False
        self.medians = {}
        self.global_target_mean = None  # set by fit()

    def fit(self, df: pd.DataFrame, target_col: str):
        """Learn imputation medians and the mean of ``target_col`` from ``df``."""
        # kitchen_area is coerced to numeric exactly as transform() does, so
        # non-numeric entries become missing instead of breaking the division.
        kitchen_area_num = pd.to_numeric(df['kitchen_area'], errors='coerce')
        self.medians = {
            'floor': df['floor'].median(),
            'total_floors': df['total_floors'].median(),
            'year_built': df['year_built'].median(),
            'rooms': df['rooms'].median(),
            'kitchen_ratio': (kitchen_area_num / df['area']).median(),
            'ceiling_height': self._parse_ceiling_series(df['ceiling_height']).median(),
            'latitude': df['latitude'].median(),
            'longitude': df['longitude'].median(),
        }
        self.global_target_mean = df[target_col].mean()
        self.fitted = True
        return self

    def _parse_ceiling_series(self, series: pd.Series) -> pd.Series:
        """Parse ceiling heights such as '2,7 м' into floats; unparsable -> NaN."""
        def parse_one(val):
            if pd.isna(val):
                return np.nan
            # Drop the Cyrillic metre suffix and accept a decimal comma.
            val = str(val).replace('м', '').replace(',', '.').strip()
            try:
                return float(val)
            except ValueError:
                return np.nan
        return series.apply(parse_one)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with imputed values and all derived features.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not self.fitted:
            raise ValueError("Must call fit() first!")

        df = df.copy()

        # Impute missing values
        for column in ('floor', 'total_floors', 'year_built', 'rooms', 'latitude', 'longitude'):
            df[column] = df[column].fillna(self.medians[column])

        # Floor features (a total of 0 floors is treated as 1 to avoid division by zero)
        df['floor_ratio'] = df['floor'] / df['total_floors'].replace(0, 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['total_floors']).astype(int)

        # Building features
        df['building_age'] = CURRENT_YEAR - df['year_built']
        df['is_new_building'] = (df['year_built'] >= CURRENT_YEAR - 7).astype(int)
        df['is_soviet'] = (df['year_built'] < 1991).astype(int)
        df['is_highrise'] = (df['total_floors'] >= 9).astype(int)

        # Area features (0 rooms is treated as 1 to avoid division by zero)
        df['area_per_room'] = df['area'] / df['rooms'].replace(0, 1)
        df['is_large'] = (df['area'] >= 100).astype(int)
        df['is_studio'] = (df['rooms'] <= 1).astype(int)

        # Kitchen: ratio clipped to [0, 0.5]; the missing flag is taken before imputation
        df['kitchen_area_num'] = pd.to_numeric(df['kitchen_area'], errors='coerce')
        df['kitchen_ratio'] = (df['kitchen_area_num'] / df['area']).clip(0, 0.5)
        df['kitchen_missing'] = df['kitchen_ratio'].isna().astype(int)
        df['kitchen_ratio'] = df['kitchen_ratio'].fillna(self.medians['kitchen_ratio'])

        # Ceiling
        df['ceiling_height_m'] = self._parse_ceiling_series(df['ceiling_height'])
        df['ceiling_missing'] = df['ceiling_height_m'].isna().astype(int)
        df['ceiling_height_m'] = df['ceiling_height_m'].fillna(self.medians['ceiling_height'])
        df['high_ceiling'] = (df['ceiling_height_m'] >= 3.0).astype(int)

        # Condition: ordinal score, unknown labels default to the middle value
        condition_map = {'евроремонт': 4, 'хороший': 3, 'средний': 2,
                         'черновая отделка': 1, 'требует ремонта': 0}
        df['condition_score'] = df['condition'].map(condition_map).fillna(2)
        df['condition_missing'] = (~df['condition'].isin(condition_map.keys())).astype(int)

        # Categorical binary features
        df['has_separate_bathroom'] = df['bathroom'].str.contains('раздельн', na=False).astype(int)
        df['has_balcony'] = df['balcony'].notna().astype(int)
        df['has_parking'] = df['parking'].notna().astype(int)
        df['has_furniture'] = df['furniture'].notna().astype(int)
        df['is_parquet'] = df['floor_type'].str.contains('паркет', na=False).astype(int)

        # Security: guard counts double, video and intercom count once each
        df['security_score'] = (
            df['security'].str.contains('охран', na=False).astype(int) * 2 +
            df['security'].str.contains('видео', na=False).astype(int) +
            df['security'].str.contains('домофон', na=False).astype(int)
        )

        # House type
        df['is_monolith'] = (df['house_type'] == 'монолитный').astype(int)
        df['is_brick'] = (df['house_type'] == 'кирпичный').astype(int)
        df['is_panel'] = (df['house_type'] == 'панельный').astype(int)

        # JK (residential complex) features
        df['has_jk'] = df['jk_name'].notna().astype(int)
        jk_class_map = {'эконом': 1, 'комфорт': 2, 'бизнес': 3, 'премиум': 4, 'элит': 4}
        df['jk_class_score'] = df['jk_class'].map(jk_class_map).fillna(0)
        df['jk_completed'] = (df['jk_status'] == 'completed').astype(int)

        # Geo features (POI distances)
        lats = df['latitude'].values
        lons = df['longitude'].values

        for poi_name, (poi_lat, poi_lon) in POI.items():
            df[f'dist_{poi_name}'] = haversine_distance(lats, lons, poi_lat, poi_lon)

        # Aggregated distances
        mall_cols = [f'dist_{p}' for p in ['dordoi_plaza', 'bishkek_park', 'tsum', 'vefa_center', 'asia_mall', 'karavan']]
        df['dist_nearest_mall'] = df[mall_cols].min(axis=1)

        transport_cols = [f'dist_{p}' for p in ['west_bus_station', 'east_bus_station', 'railway_station']]
        df['dist_nearest_transport'] = df[transport_cols].min(axis=1)

        bazaar_cols = [f'dist_{p}' for p in ['osh_bazaar', 'dordoi_bazaar', 'ortosay_bazaar']]
        df['dist_nearest_bazaar'] = df[bazaar_cols].min(axis=1)

        # Parks: distance to the mean of each polygon's vertices
        for park_name, polygon in PARKS.items():
            centroid = (np.mean([p[0] for p in polygon]), np.mean([p[1] for p in polygon]))
            df[f'dist_{park_name}'] = haversine_distance(lats, lons, centroid[0], centroid[1])

        park_cols = [f'dist_{p}' for p in PARKS.keys()]
        df['dist_nearest_park'] = df[park_cols].min(axis=1)

        # River distance. A plain loop over the coordinate arrays avoids the
        # per-row Series construction of DataFrame.apply and also copes with
        # an empty frame.
        df['dist_river'] = [
            distance_to_polyline(lat, lon, ALA_ARCHA_RIVER)
            for lat, lon in zip(lats, lons)
        ]

        return df

    def fit_transform(self, df: pd.DataFrame, target_col: str) -> pd.DataFrame:
        """Fit on ``df`` and return its transformed copy."""
        return self.fit(df, target_col).transform(df)

In [None]:
# ===================
# Target Encoding with K-Fold CV
# ===================

class TargetEncoderCV:
    """Smoothed target (mean) encoding with out-of-fold values for the fitted frame.

    For each column in ``cols`` a ``<col>_target_enc`` column is produced. A
    category's encoding is a blend of its mean target and a prior mean,
    weighted by ``count / (count + min_samples)``.
    """

    def __init__(self, cols: List[str], min_samples: int = 30, n_folds: int = 5):
        self.cols = cols
        self.min_samples = min_samples  # smoothing strength
        self.n_folds = n_folds
        self.encodings = {}
        self.global_mean = None

    def fit_transform(self, df: pd.DataFrame, target_col: str) -> pd.DataFrame:
        """Encode ``df`` out-of-fold and store full-data encodings for ``transform``.

        Each row is encoded from the other folds only. The smoothing prior and
        the fallback for unseen categories are the mean target of those other
        folds, so no statistic of the held-out fold reaches its own encoding.
        """
        df = df.copy()
        self.global_mean = df[target_col].mean()

        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=RANDOM_STATE)
        # The splitter is seeded, so every column would receive these same folds.
        folds = list(kf.split(df))

        for col in self.cols:
            out_of_fold = np.full(len(df), np.nan)

            for train_idx, val_idx in folds:
                train_fold = df.iloc[train_idx]
                fold_prior = train_fold[target_col].mean()
                encoding = self._compute_encoding(train_fold, col, target_col, prior=fold_prior)
                # Positional write: independent of index labels and alignment.
                out_of_fold[val_idx] = (
                    df[col].iloc[val_idx].map(encoding).fillna(fold_prior).astype(float).to_numpy()
                )

            df[f'{col}_target_enc'] = out_of_fold
            self.encodings[col] = self._compute_encoding(df, col, target_col)

        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode ``df`` with the stored encodings; unseen categories get the global mean.

        Raises:
            ValueError: if ``fit_transform`` has not been called.
        """
        if self.global_mean is None or any(col not in self.encodings for col in self.cols):
            raise ValueError("Must call fit_transform() first!")

        df = df.copy()
        for col in self.cols:
            new_col = f'{col}_target_enc'
            df[new_col] = df[col].map(self.encodings[col]).fillna(self.global_mean)
        return df

    def _compute_encoding(self, df: pd.DataFrame, col: str, target_col: str,
                          prior: Optional[float] = None) -> Dict:
        """Return {category: smoothed mean target} for ``col``.

        ``prior`` is the value small categories are shrunk toward; it defaults
        to ``self.global_mean``.
        """
        if prior is None:
            prior = self.global_mean
        agg = df.groupby(col)[target_col].agg(['mean', 'count'])
        smoothing = agg['count'] / (agg['count'] + self.min_samples)
        encoding = smoothing * agg['mean'] + (1 - smoothing) * prior
        return encoding.to_dict()

print("TargetEncoderCV defined")

In [None]:
# ===================
# Apply ALL Feature Engineering
# ===================

# Every transformer is fitted on train_df only and then applied to both frames.
print("Applying feature engineering pipeline...\n")

# 1. Basic features
print("1. Basic features...")
feature_engineer = FeatureEngineer()
train_df = feature_engineer.fit_transform(train_df, target)
test_df = feature_engineer.transform(test_df)
print(f"   Columns: {train_df.shape[1]}")

# 2. Spatial Lag (NEW v3)
print("2. Spatial lag features (neighbor prices)...")
spatial_lag = SpatialLagFeatures(radius_km=0.5, min_neighbors=3)
spatial_lag.fit(train_df, target)
train_df = spatial_lag.transform(train_df)
test_df = spatial_lag.transform(test_df)
print(f"   Added: neighbor_price_mean, neighbor_price_median, neighbor_price_std, neighbor_count")

# 3. H3 Geographic Tiles (NEW v3)
print("3. H3 geographic tiles...")
h3_features = H3Features(resolutions=[7, 8, 9])
h3_features.fit(train_df)
train_df = h3_features.transform(train_df)
test_df = h3_features.transform(test_df)
print(f"   Added: h3_res7_encoded, h3_res8_encoded, h3_res9_encoded")

# 4. Market Trends (NEW v3)
print("4. Market trend features...")
market_trends = MarketTrendFeatures(windows=[30, 60, 90])
market_trends.fit(train_df, target)
train_df = market_trends.transform(train_df, target)
test_df = market_trends.transform(test_df, target)
# The message lists the columns MarketTrendFeatures.transform really creates;
# price_vs_district_zscore is not produced.
print(f"   Added: district_price_30/60/90d_mean, days_on_market, district_listing_count")

# 5. Density (NEW v3)
print("5. Density features...")
density = DensityFeatures(radii_km=[0.5, 1.0])
density.fit(train_df)
train_df = density.transform(train_df)
test_df = density.transform(test_df)
print(f"   Added: listings_500m, listings_1000m, listings_500m_ratio, listings_1000m_ratio")

# 6. Target encoding
print("6. Target encoding (CV-based)...")
target_encoder = TargetEncoderCV(cols=['district', 'jk_name'], min_samples=30, n_folds=5)
train_df = target_encoder.fit_transform(train_df, target)
test_df = target_encoder.transform(test_df)
print(f"   Added: district_target_enc, jk_name_target_enc")

print(f"\nFinal train shape: {train_df.shape}")
print(f"Final test shape: {test_df.shape}")

## 4. Feature Selection

In [None]:
# ===================
# Define Feature Groups (v3 - with new features, NO LEAKAGE)
# ===================

# Model inputs grouped by theme. Each value lists column names expected in the
# engineered train/test frames; the groups are flattened into ALL_FEATURES below.
FEATURE_GROUPS = {
    'core': ['rooms', 'area', 'floor_ratio', 'building_age'],

    'building': ['is_new_building', 'is_soviet', 'is_highrise', 'total_floors'],

    'apartment': [
        'area_per_room', 'is_large', 'is_studio',
        'kitchen_ratio', 'kitchen_missing',
        'ceiling_height_m', 'ceiling_missing', 'high_ceiling',
        'condition_score', 'condition_missing',
    ],

    'amenities': [
        'has_separate_bathroom', 'has_balcony', 'has_parking',
        'has_furniture', 'is_parquet', 'security_score',
    ],

    'house_type': ['is_monolith', 'is_brick', 'is_panel'],

    'jk': ['has_jk', 'jk_class_score', 'jk_completed'],

    'target_encoding': ['district_target_enc', 'jk_name_target_enc'],

    'location': [
        'dist_center', 'dist_nearest_mall', 'dist_nearest_transport',
        'dist_nearest_bazaar', 'dist_nearest_park', 'dist_river',
        'latitude', 'longitude',
    ],

    # NEW v3 features (NO LEAKAGE)
    'spatial_lag': [
        'neighbor_price_mean', 'neighbor_price_median', 
        'neighbor_price_std', 'neighbor_count',
    ],

    'h3_tiles': [
        'h3_res7_encoded', 'h3_res8_encoded', 'h3_res9_encoded',
    ],

    'market_trends': [
        'district_price_30d_mean', 'district_price_60d_mean', 'district_price_90d_mean',
        'days_on_market', 'district_listing_count',
        # REMOVED: 'price_vs_district_zscore' - this was leaking target!
    ],

    'density': [
        'listings_500m', 'listings_1000m',
        'listings_500m_ratio', 'listings_1000m_ratio',
    ],
}

# Flatten the groups (in declaration order) and keep only the columns that the
# pipeline actually produced.
ALL_FEATURES = sum(FEATURE_GROUPS.values(), [])
available_features = [column for column in ALL_FEATURES if column in train_df.columns]

print(f"Features defined: {len(ALL_FEATURES)}")
print(f"Features available: {len(available_features)}")
print(f"\nNEW v3 features: {sum(len(FEATURE_GROUPS[g]) for g in ('spatial_lag', 'h3_tiles', 'market_trends', 'density'))}")
print("NOTE: price_vs_district_zscore REMOVED (was leaking target)")

In [None]:
# ===================
# Prepare Final Data
# ===================

final_features = available_features

# Design matrices: any remaining missing feature value is replaced by 0.
X_train, X_test = (
    frame[final_features].fillna(0).values for frame in (train_df, test_df)
)
# Targets as plain arrays.
y_train, y_test = (frame[target].values for frame in (train_df, test_df))

# Spatial groups for CV
def create_spatial_groups(df: pd.DataFrame, precision: int = 3) -> np.ndarray:
    """Assign an integer group id to each row from its rounded coordinates.

    Rows whose latitude and longitude agree after rounding to ``precision``
    decimals receive the same id, so a grouped CV keeps them in one fold.
    """
    def as_text(column: str) -> pd.Series:
        # Rounded coordinate rendered as a string key component.
        return df[column].round(precision).astype(str)

    cell_keys = as_text('latitude') + '_' + as_text('longitude')
    return LabelEncoder().fit_transform(cell_keys.fillna('unknown'))

# One spatial group id per training row, used by the grouped cross-validation.
train_groups = create_spatial_groups(train_df)

print("\n".join([
    f"X_train: {X_train.shape}",
    f"X_test: {X_test.shape}",
    f"Features: {len(final_features)}",
    f"Spatial groups for CV: {len(np.unique(train_groups))}",
]))

## 5. Model Training & Ensemble

In [None]:
# ===================
# Evaluation Metrics
# ===================

def evaluate_predictions(y_true: np.ndarray, y_pred: np.ndarray, name: str = "") -> Dict:
    """Compute the regression metrics reported throughout the notebook.

    Returns a dict with the model ``name`` plus MAE, RMSE, R², median absolute
    error, median absolute percentage error (in %) and the share of
    predictions within 10% of the true value (in %). Percentage errors are
    relative to ``y_true``.
    """
    residual = y_true - y_pred
    relative_error = np.abs(residual / y_true)

    report = {'name': name}
    report['MAE'] = mean_absolute_error(y_true, y_pred)
    report['RMSE'] = np.sqrt(mean_squared_error(y_true, y_pred))
    report['R²'] = r2_score(y_true, y_pred)
    report['MedAE'] = np.median(np.abs(residual))
    report['MedAPE%'] = np.median(relative_error) * 100
    report['Within10%'] = np.mean(relative_error <= 0.10) * 100
    return report

def print_metrics(metrics: Dict):
    """Print the metric dict produced by evaluate_predictions as six aligned lines."""
    report_lines = [
        f"  MAE:       ${metrics['MAE']:,.0f}/m²",
        f"  RMSE:      ${metrics['RMSE']:,.0f}/m²",
        f"  R²:        {metrics['R²']:.4f}",
        f"  MedAE:     ${metrics['MedAE']:,.0f}/m²",
        f"  MedAPE:    {metrics['MedAPE%']:.1f}%",
        f"  Within 10%: {metrics['Within10%']:.1f}%",
    ]
    print("\n".join(report_lines))

In [None]:
# ===================
# Train Base Models
# ===================

print("Training base models...\n")

# Three gradient-boosting libraries configured with the same size, depth and learning rate.
base_models = {
    'XGBoost': XGBRegressor(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        verbose=-1,
        n_jobs=-1,
    ),
    'CatBoost': CatBoostRegressor(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.05,
        random_state=RANDOM_STATE,
        verbose=0,
    ),
}

results, trained_models, oof_predictions = [], {}, {}

for name, model in base_models.items():
    print(f"{name}:")

    # Fit on the full training matrix and score the held-out period.
    model.fit(X_train, y_train)
    trained_models[name] = model
    y_pred_test = model.predict(X_test)

    # Out-of-fold predictions on the training set (inputs for the stacking
    # meta-model); folds are grouped by spatial cell.
    gkf = GroupKFold(n_splits=5)
    oof = cross_val_predict(
        model, X_train, y_train, cv=gkf, groups=train_groups, n_jobs=-1
    )
    oof_predictions[name] = oof

    metrics = evaluate_predictions(y_test, y_pred_test, name)
    results.append(metrics)
    print_metrics(metrics)
    print()

results_df = pd.DataFrame(results).sort_values('MAE')
separator = "=" * 60
print("\n" + separator)
print("BASE MODELS COMPARISON")
print(separator)
print(results_df.to_string(index=False))

In [None]:
# ===================
# Stacking Ensemble
# ===================

print("\nTraining Stacking Ensemble...")

# One column per base model. The test-set predictions are computed once here
# and reused for both the stacked and the averaged ensemble.
stack_train = np.column_stack([oof_predictions[name] for name in base_models.keys()])
stack_test = np.column_stack([trained_models[name].predict(X_test) for name in base_models.keys()])

# Meta-model: ridge regression over the base models' out-of-fold predictions.
meta_model = Ridge(alpha=1.0)
meta_model.fit(stack_train, y_train)

y_pred_ensemble = meta_model.predict(stack_test)

ensemble_metrics = evaluate_predictions(y_test, y_pred_ensemble, 'Stacking Ensemble')
print("\nStacking Ensemble Results:")
print_metrics(ensemble_metrics)

# Simple average of the base models' test predictions
y_pred_avg = stack_test.mean(axis=1)
avg_metrics = evaluate_predictions(y_test, y_pred_avg, 'Average Ensemble')
print("\nSimple Average Ensemble Results:")
print_metrics(avg_metrics)

# Choose best
# NOTE(review): the winner is picked by test-set MAE, so the reported score of
# the chosen ensemble is optimistically biased.
if avg_metrics['MAE'] < ensemble_metrics['MAE']:
    print("\n>>> Using Simple Average (better performance)")
    final_predictions = y_pred_avg
    final_method = 'average'
else:
    print("\n>>> Using Stacking Ensemble (better performance)")
    final_predictions = y_pred_ensemble
    final_method = 'stacking'

## 6. Prediction Intervals

In [None]:
# ===================
# Quantile Models for Prediction Intervals
# ===================

print("Training quantile models...")

# One LightGBM model per quantile; q10 and q90 bound the reported interval.
quantile_models = {}

for q, name in zip((0.10, 0.50, 0.90), ('q10', 'q50', 'q90')):
    model = LGBMRegressor(
        objective='quantile',
        alpha=q,
        n_estimators=200,
        max_depth=8,
        learning_rate=0.05,
        random_state=RANDOM_STATE,
        verbose=-1,
    )
    model.fit(X_train, y_train)
    quantile_models[name] = model

pred_q10, pred_q50, pred_q90 = (
    quantile_models[key].predict(X_test) for key in ('q10', 'q50', 'q90')
)

# Share of test targets that fall inside [q10, q90], and the mean band width.
inside_band = (y_test >= pred_q10) & (y_test <= pred_q90)
coverage = np.mean(inside_band) * 100
avg_interval = np.mean(pred_q90 - pred_q10)

print(f"\nPrediction Intervals (80% CI):")
print(f"  Coverage: {coverage:.1f}% (target: 80%)")
print(f"  Avg width: ${avg_interval:,.0f}/m²")

## 7. Feature Importance Analysis

In [None]:
# ===================
# Permutation Importance
# ===================

from sklearn.inspection import permutation_importance

print("Computing permutation importance...")

# Importance of each feature for the XGBoost base model, measured on the
# test split with 10 shuffles per feature.
perm_importance = permutation_importance(
    trained_models['XGBoost'],
    X_test,
    y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Mark new v3 features: flatten the four v3 feature groups into one list.
_v3_groups = ('spatial_lag', 'h3_tiles', 'market_trends', 'density')
new_features = [feat for group in _v3_groups for feat in FEATURE_GROUPS[group]]

# One row per feature, most important first.
# assumes final_features is in the same order as the columns of X_test —
# TODO confirm
_importance_columns = {
    'feature': final_features,
    'importance_mean': perm_importance.importances_mean,
    'importance_std': perm_importance.importances_std,
}
importance_df = pd.DataFrame(_importance_columns).sort_values(
    'importance_mean', ascending=False
)
importance_df['is_v3_new'] = importance_df['feature'].isin(new_features)

print("\nTop 20 Features:")
print(importance_df.iloc[:20].to_string(index=False))

In [None]:
# ===================
# NEW v3 Features Impact Analysis
# ===================

print("\n" + "="*60)
print("NEW v3 FEATURES ANALYSIS")
print("="*60)

# Rows of the importance table that belong to the new v3 feature groups.
v3_importance = importance_df[importance_df['is_v3_new']].sort_values('importance_mean', ascending=False)

# Permutation importance can be negative (shuffling a noisy feature may
# improve the score by chance). Summing raw values would let the "share of
# total" leave the 0-100% range, so negative importances count as zero here.
v3_total_importance = v3_importance['importance_mean'].clip(lower=0).sum()
total_importance = importance_df['importance_mean'].clip(lower=0).sum()

# Guard the degenerate case where no feature has positive importance.
if total_importance > 0:
    v3_share = v3_total_importance / total_importance * 100
else:
    v3_share = 0.0

print(f"\nv3 features contribution: {v3_share:.1f}% of total importance")
print("\nTop v3 features:")
print(v3_importance[['feature', 'importance_mean']].head(10).to_string(index=False))

In [None]:
# Visualize: two horizontal bar charts of permutation importance.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Top 15 overall (fewer bars if the model has fewer than 15 features).
top15 = importance_df.head(15)
# Size the bar positions from the data instead of hard-coding 15, so that
# barh() never receives y positions and widths of different lengths.
n_top = len(top15)
colors = ['#ff7f0e' if is_new else '#1f77b4' for is_new in top15['is_v3_new']]
axes[0].barh(range(n_top), top15['importance_mean'], color=colors)
axes[0].set_yticks(range(n_top))
axes[0].set_yticklabels(top15['feature'])
axes[0].invert_yaxis()  # most important feature at the top
axes[0].set_xlabel('Importance')
axes[0].set_title('Top 15 Features (orange = NEW v3)')

# v3 features only
v3_top = v3_importance.head(10)
n_v3 = len(v3_top)
axes[1].barh(range(n_v3), v3_top['importance_mean'], color='#ff7f0e')
axes[1].set_yticks(range(n_v3))
axes[1].set_yticklabels(v3_top['feature'])
axes[1].invert_yaxis()  # most important feature at the top
axes[1].set_xlabel('Importance')
axes[1].set_title('NEW v3 Features Only')

plt.tight_layout()
plt.show()

## 8. v2 vs v3 Comparison

In [None]:
# ===================
# FINAL COMPARISON: v2 vs v3
# ===================

final_metrics = evaluate_predictions(y_test, final_predictions, 'v3 Final')

# v2 baseline (from previous run)
v2_metrics = {'MAE': 144, 'MedAE': 103, 'R²': 0.668, 'MedAPE%': 7.0, 'Within10%': 64.9}


def _relative_change(metric):
    """Return the v3 value's percent change relative to the v2 baseline."""
    baseline = v2_metrics[metric]
    return (final_metrics[metric] - baseline) / baseline * 100


_banner = "=" * 70
print(_banner)
print("MODEL COMPARISON: v2 vs v3")
print(_banner)
print(f"\n{'Metric':<15} {'v2':<15} {'v3':<15} {'Change':<15}")
print("-" * 60)

# Dollar-valued metrics: "$" prefix, thousands separator, no decimals.
for metric in ('MAE', 'MedAE'):
    old_value = v2_metrics[metric]
    new_value = final_metrics[metric]
    print(f"{metric:<15} ${old_value:<14,.0f} ${new_value:<14,.0f} {_relative_change(metric):+.1f}%")

# Unitless / percentage metrics: (row label, metrics key, decimals shown).
_plain_rows = (
    ('R²', 'R²', 3),
    ('MedAPE%', 'MedAPE%', 1),
    ('Within 10%', 'Within10%', 1),
)
for row_label, metric, decimals in _plain_rows:
    old_value = v2_metrics[metric]
    new_value = final_metrics[metric]
    print(f"{row_label:<15} {old_value:<15.{decimals}f} {new_value:<15.{decimals}f} {_relative_change(metric):+.1f}%")

# The v2 interval coverage is a fixed figure written inline, not part of v2_metrics.
print(f"{'CI Coverage':<15} {'72.9%':<15} {coverage:.1f}%")

In [None]:
# ===================
# Save Model
# ===================

import joblib
import json

# --- Model bundle ----------------------------------------------------------
# The meta-learner is stored only when stacking was the selected ensemble.
if final_method == 'stacking':
    _saved_meta_model = meta_model
else:
    _saved_meta_model = None

model_artifacts = dict(
    version=3,
    base_models=trained_models,
    meta_model=_saved_meta_model,
    ensemble_method=final_method,
    quantile_models=quantile_models,
    feature_engineer=feature_engineer,
    spatial_lag=spatial_lag,
    h3_features=h3_features,
    market_trends=market_trends,
    density=density,
    target_encoder=target_encoder,
    features=final_features,
)

_model_path = 'bishkek_model_v3.joblib'
joblib.dump(model_artifacts, _model_path)
print(f"Model saved: {_model_path}")

# --- JSON config -----------------------------------------------------------
# JSON key -> key in final_metrics; values are cast to plain floats.
_metric_keys = {
    'mae': 'MAE',
    'rmse': 'RMSE',
    'r2': 'R²',
    'medae': 'MedAE',
    'medape': 'MedAPE%',
    'within_10pct': 'Within10%',
}
_metrics_summary = {
    json_key: float(final_metrics[metric]) for json_key, metric in _metric_keys.items()
}
_metrics_summary['ci_coverage'] = float(coverage)


def _improvement(metric, lower_is_better):
    """Format the v3-vs-v2 improvement for `metric` as a percent string."""
    old_value = v2_metrics[metric]
    new_value = final_metrics[metric]
    gain = (old_value - new_value) if lower_is_better else (new_value - old_value)
    return f"{gain / old_value * 100:.1f}%"


config = {
    'version': 3,
    'features': final_features,
    'new_v3_features': new_features,
    'ensemble_method': final_method,
    'metrics': _metrics_summary,
    'v2_comparison': {
        'mae_improvement': _improvement('MAE', lower_is_better=True),
        'medape_improvement': _improvement('MedAPE%', lower_is_better=True),
        'r2_improvement': _improvement('R²', lower_is_better=False),
    },
}

_config_path = 'bishkek_model_v3_config.json'
with open(_config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"Config saved: {_config_path}")

In [None]:
# ===================
# FINAL SUMMARY
# ===================

_rule = "=" * 70
print(_rule)
print("BISHKEK REAL ESTATE PRICE PREDICTION v3 - SUMMARY")
print(_rule)

# Dataset sizes.
print("\n[DATA]")
print(f"  Total samples: {len(df):,}")
print(f"  Train: {len(train_df):,} | Test: {len(test_df):,}")

# Static description of the feature groups added in v3.
print("\n[NEW v3 FEATURES]")
for _description in (
    "  Spatial Lag: neighbor_price_mean/median/std, neighbor_count",
    "  H3 Tiles: h3_res7/8/9_encoded (Uber H3)",
    "  Market Trends: district_price_30/60/90d_mean, price_vs_district_zscore",
    "  Density: listings_500m/1000m, listings_ratio",
):
    print(_description)
print(f"  Total new features: {len(new_features)}")

# Headline metrics of the final v3 predictions; labels are padded to 12 chars.
print("\n[PERFORMANCE]")
_performance_rows = (
    ('MAE:', f"${final_metrics['MAE']:,.0f}/m²"),
    ('MedAE:', f"${final_metrics['MedAE']:,.0f}/m²"),
    ('R²:', f"{final_metrics['R²']:.3f}"),
    ('MedAPE:', f"{final_metrics['MedAPE%']:.1f}%"),
    ('Within 10%:', f"{final_metrics['Within10%']:.1f}%"),
)
for _label, _value in _performance_rows:
    print(f"  {_label:<12}{_value}")

# Relative change vs the v2 baseline; the sign is arranged so that a
# positive number means v3 is better (lower error, higher R²).
print("\n[IMPROVEMENT vs v2]")
mae_change = (v2_metrics['MAE'] - final_metrics['MAE']) / v2_metrics['MAE'] * 100
mape_change = (v2_metrics['MedAPE%'] - final_metrics['MedAPE%']) / v2_metrics['MedAPE%'] * 100
r2_change = (final_metrics['R²'] - v2_metrics['R²']) / v2_metrics['R²'] * 100
for _label, _change in (('MAE:', mae_change), ('MedAPE:', mape_change), ('R²:', r2_change)):
    _verdict = '(better)' if _change > 0 else ''
    print(f"  {_label:<8}{_change:+.1f}% {_verdict}")

print("\n" + _rule)