# Bishkek Real Estate Price Prediction

**Goal:** Predict apartment price per square meter ($/m²) in Bishkek, Kyrgyzstan

**Dataset:** 8,821 apartment listings from house.kg (January 2025)

**Approach:**
1. Feature Engineering
2. Baseline Models Comparison
3. Hyperparameter Tuning (Optuna)
4. Final Evaluation & SHAP Analysis

In [None]:
# Install dependencies (for Kaggle)
!pip install -q optuna shap catboost lightgbm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
import shap

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
import sqlite3
from contextlib import closing
from pathlib import Path

# Load from the SQLite database: try the local checkout first, then the
# Kaggle dataset mount.
# sqlite3.connect() does not fail for a missing file when the parent directory
# exists - it silently creates an empty database - so the path is chosen by
# checking that the file is really there instead of waiting for connect() to
# raise.
db_candidates = [
    Path('../data/databases/bishkek.db'),                       # Local path
    Path('/kaggle/input/bishkek-real-estate-2025/bishkek.db'),  # Kaggle path
]
db_path = next((p for p in db_candidates if p.is_file()), None)
if db_path is None:
    raise FileNotFoundError(
        "bishkek.db not found; looked in: "
        + ", ".join(str(p) for p in db_candidates)
    )

# closing() releases the connection even if the query itself raises
with closing(sqlite3.connect(str(db_path))) as conn:
    df = pd.read_sql('''
    SELECT 
        a.*,
        rc.name as jk_name,
        rc.class as jk_class,
        rc.status as jk_status,
        rc.developer_name
    FROM apartments a
    LEFT JOIN residential_complexes rc ON a.residential_complex_id = rc.id
    WHERE a.price_usd IS NOT NULL 
      AND a.area IS NOT NULL
      AND a.price_per_m2 > 0
      AND a.price_per_m2 < 10000  -- Filter outliers (> $10k/m2 is unrealistic)
''', conn)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

In [None]:
# Quick overview
df.head()

In [None]:
# Summary statistics of the target, plus how many listings are linked to a
# residential complex ("JK")
target = 'price_per_m2'
price_series = df[target]

print("Target variable statistics:")
print(f"  Mean:   ${price_series.mean():,.0f}/m²")
print(f"  Median: ${price_series.median():,.0f}/m²")
print(f"  Std:    ${price_series.std():,.0f}/m²")
print(f"  Min:    ${price_series.min():,.0f}/m²")
print(f"  Max:    ${price_series.max():,.0f}/m²")

has_jk_mask = df['jk_name'].notna()
print(f"\nResidential complex coverage:")
print(f"  With JK: {has_jk_mask.sum()} ({has_jk_mask.mean()*100:.1f}%)")
print(f"  Unique JK: {df['jk_name'].nunique()}")

In [None]:
# Target distribution: raw scale on the left, log1p scale on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
raw_ax, log_ax = axes
median_price = df[target].median()

raw_ax.hist(df[target], bins=50, edgecolor='black', alpha=0.7)
raw_ax.set(
    xlabel='Price per m² (USD)',
    ylabel='Count',
    title='Price Distribution',
)
raw_ax.axvline(median_price, color='red', linestyle='--', label=f'Median: ${median_price:,.0f}')
raw_ax.legend()

log_ax.hist(np.log1p(df[target]), bins=50, edgecolor='black', alpha=0.7, color='green')
log_ax.set(
    xlabel='Log(Price per m²)',
    ylabel='Count',
    title='Log-transformed Price Distribution',
)

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
# ===================
# POI (Points of Interest) - Bishkek coordinates
# ===================

# Every coordinate below is a (latitude, longitude) pair in decimal degrees;
# create_features unpacks them in that order and haversine_distance converts
# them to radians.
# NOTE(review): the values look like hand-entered approximations - verify
# against a map before relying on sub-100 m precision.
POI = {
    # Shopping malls
    'dordoi_plaza': (42.8750, 74.6128),
    'bishkek_park': (42.8741, 74.5888),
    'tsum': (42.8746, 74.6031),
    'vefa_center': (42.8668, 74.5931),
    'asia_mall': (42.8489, 74.5672),
    'karavan': (42.8562, 74.5686),
    
    # Key landmarks
    'ala_too_square': (42.8746, 74.6030),
    'philharmonic': (42.8749, 74.6108),
    'white_house': (42.8760, 74.6097),
    'victory_square': (42.8722, 74.5875),
    
    # Universities
    'knu': (42.8778, 74.6027),  # Kyrgyz National University
    'auca': (42.8634, 74.6167),  # American University of Central Asia
    'krsu': (42.8750, 74.5861),  # Kyrgyz-Russian Slavic University
    
    # Transport
    'west_bus_station': (42.8628, 74.5294),
    'east_bus_station': (42.8605, 74.6550),
    'railway_station': (42.8588, 74.6339),
    
    # Markets
    'osh_bazaar': (42.8722, 74.5761),
    'dordoi_bazaar': (42.9453, 74.6494),
    'ortosay_bazaar': (42.8478, 74.5542),
    
    # City center
    # This is also the point create_features substitutes for missing
    # coordinates; it yields the 'dist_center' feature.
    'center': (42.8746, 74.5888),
}

# ===================
# Parks in Bishkek
# ===================

# Each park is a list of polygon vertices consumed by point_in_polygon.
# All five outlines are four-vertex, axis-aligned rectangles (bounding boxes),
# not surveyed park boundaries.
PARKS = {
    'dubovy_park': [
        (42.8749, 74.5875), (42.8780, 74.5875), (42.8780, 74.5930), (42.8749, 74.5930)
    ],
    'park_panfilova': [
        (42.8740, 74.6000), (42.8760, 74.6000), (42.8760, 74.6050), (42.8740, 74.6050)
    ],
    'park_atatürk': [
        (42.8690, 74.5950), (42.8720, 74.5950), (42.8720, 74.6000), (42.8690, 74.6000)
    ],
    'botanical_garden': [
        (42.8560, 74.5560), (42.8620, 74.5560), (42.8620, 74.5660), (42.8560, 74.5660)
    ],
    'youth_park': [
        (42.8650, 74.5700), (42.8700, 74.5700), (42.8700, 74.5800), (42.8650, 74.5800)
    ],
}

# ===================
# Ala-Archa River (through Bishkek)
# ===================
# Open polyline of 8 vertices listed from south (lat 42.78) to north
# (lat 42.94); passed to distance_to_polyline to build the river features.
ALA_ARCHA_RIVER = [
    (42.7800, 74.5700), (42.8100, 74.5650), (42.8400, 74.5600),
    (42.8600, 74.5580), (42.8800, 74.5560), (42.9000, 74.5550),
    (42.9200, 74.5540), (42.9400, 74.5520)
]

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points (Haversine).

    All four arguments are in decimal degrees. Only NumPy ufuncs are used,
    so scalars and arrays (with normal broadcasting) are both accepted.
    """
    earth_radius_km = 6371

    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return earth_radius_km * central_angle

def distance_to_polyline(lat, lon, polyline):
    """Minimum distance in km from a point to a polyline.

    For every segment the point is projected onto the segment (clamped to its
    endpoints) and the Haversine distance to that closest point is taken.
    The previous implementation only sampled each segment's two endpoints and
    its midpoint, which over-estimates the distance for points that lie next
    to the middle part of a long segment.

    Parameters
    ----------
    lat, lon : float or numpy array
        Query point(s) in decimal degrees.
    polyline : sequence of (lat, lon) pairs
        Vertices in order, decimal degrees.

    Returns
    -------
    Distance in km (same shape as ``lat``/``lon``). ``inf`` for an empty
    polyline; for a single-vertex polyline, the distance to that vertex.
    """
    if len(polyline) == 0:
        return float('inf')
    if len(polyline) == 1:
        only_lat, only_lon = polyline[0]
        return haversine_distance(lat, lon, only_lat, only_lon)

    # The projection is done in a local flat (equirectangular) frame:
    # longitude differences are scaled by cos(latitude) so that both axes are
    # in comparable units. This is accurate at city scale.
    lon_scale = np.cos(np.radians(lat))
    min_dist = np.inf

    for (a_lat, a_lon), (b_lat, b_lon) in zip(polyline, polyline[1:]):
        seg_x = (b_lon - a_lon) * lon_scale
        seg_y = b_lat - a_lat
        pt_x = (lon - a_lon) * lon_scale
        pt_y = lat - a_lat

        seg_len_sq = seg_x ** 2 + seg_y ** 2
        # A zero-length segment has a zero dot product as well, so dividing
        # by 1 instead of 0 gives t = 0, i.e. the segment's single point.
        safe_len_sq = np.where(seg_len_sq > 0, seg_len_sq, 1.0)
        # t is the position of the closest point along the segment, 0..1
        t = np.clip((pt_x * seg_x + pt_y * seg_y) / safe_len_sq, 0.0, 1.0)

        nearest_lat = a_lat + t * (b_lat - a_lat)
        nearest_lon = a_lon + t * (b_lon - a_lon)
        min_dist = np.minimum(
            min_dist, haversine_distance(lat, lon, nearest_lat, nearest_lon)
        )

    return min_dist

def point_in_polygon(lat, lon, polygon):
    """Ray-casting test: is the point (lat, lon) inside `polygon`?

    `polygon` is a sequence of (lat, lon) vertices; the last vertex is joined
    back to the first. Every edge crossed by the ray flips the result.
    """
    vertex_count = len(polygon)
    crossings_odd = False

    prev_lat, prev_lon = polygon[0]
    for k in range(1, vertex_count + 1):
        cur_lat, cur_lon = polygon[k % vertex_count]
        lon_low, lon_high = min(prev_lon, cur_lon), max(prev_lon, cur_lon)

        # lon_low < lon <= lon_high implies the edge's two longitudes differ,
        # so the division below can never be by zero.
        if lon_low < lon <= lon_high and lat <= max(prev_lat, cur_lat):
            if prev_lat == cur_lat:
                crossings_odd = not crossings_odd
            else:
                crossing_lat = (lon - prev_lon) * (cur_lat - prev_lat) / (cur_lon - prev_lon) + prev_lat
                if lat <= crossing_lat:
                    crossings_odd = not crossings_odd

        prev_lat, prev_lon = cur_lat, cur_lon

    return crossings_odd

def point_in_any_park(lat, lon):
    """Return True when (lat, lon) lies inside at least one polygon of PARKS."""
    return any(
        point_in_polygon(lat, lon, outline) for outline in PARKS.values()
    )

print(f"Loaded {len(POI)} POI locations")
print(f"Loaded {len(PARKS)} park polygons")
print(f"Loaded Ala-Archa river with {len(ALA_ARCHA_RIVER)} points")

In [None]:
def create_features(df, fit_encoders=None, target_col=None):
    """
    Create all engineered features for the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Columns read: total_floors, floor, year_built, rooms,
        area, kitchen_area, ceiling_height, condition, bathroom, balcony,
        parking, floor_type, security, furniture, house_type, jk_name,
        jk_class, jk_status, latitude, longitude.
    fit_encoders : dict or None
        None  -> "fit" mode: imputation values are computed from `df` and
                 returned in the encoders dict.
        dict  -> "transform" mode: the stored imputation values are reused,
                 so statistics of the rows being transformed are not used.
                 Keys missing from the dict fall back to the defaults
                 (0.12 for the kitchen ratio, 2.7 for the ceiling height,
                 the frame's own median for the other columns).
        The passed dict is never modified.
    target_col : unused
        Accepted for backward compatibility only.

    Returns
    -------
    (pandas.DataFrame, dict)
        A copy of `df` with the feature columns added, and the encoders dict
        with keys total_floors_median, year_built_median, rooms_median,
        kitchen_median and ceiling_median.

    To keep test-set statistics out of training, call once on the training
    rows with ``fit_encoders=None`` and pass the returned dict when
    transforming any other rows.
    """
    df = df.copy()
    fitting = fit_encoders is None
    # Copy so that the caller's dict is never aliased or mutated
    encoders = {} if fitting else dict(fit_encoders)

    def impute(series, key, default=None):
        """Fill NaNs with the statistic stored under `key`.

        Fit mode computes and stores the series median first. Transform mode
        uses the stored value, then `default`, then the series' own median.
        """
        if fitting:
            encoders[key] = series.median()
        fill_value = encoders.get(key, default)
        if fill_value is None:
            fill_value = series.median()
        return series.fillna(fill_value)

    # ===================
    # Floor features
    # ===================
    df['total_floors'] = impute(df['total_floors'], 'total_floors_median')
    df['floor'] = df['floor'].fillna(1)
    # replace(0, 1) guards against division by zero for bad records
    df['floor_ratio'] = df['floor'] / df['total_floors'].replace(0, 1)
    df['is_first_floor'] = (df['floor'] == 1).astype(int)
    df['is_last_floor'] = (df['floor'] == df['total_floors']).astype(int)

    # ===================
    # Building features
    # ===================
    df['year_built'] = impute(df['year_built'], 'year_built_median')
    df['building_age'] = 2025 - df['year_built']
    df['is_new_building'] = (df['year_built'] >= 2018).astype(int)
    df['is_highrise'] = (df['total_floors'] >= 9).astype(int)

    # ===================
    # Area features
    # ===================
    df['rooms'] = impute(df['rooms'], 'rooms_median')
    df['area_per_room'] = df['area'] / df['rooms'].replace(0, 1)
    df['is_large_apartment'] = (df['area'] >= 100).astype(int)

    # Kitchen
    df['kitchen_area'] = pd.to_numeric(df['kitchen_area'], errors='coerce')
    df['kitchen_ratio'] = (df['kitchen_area'] / df['area']).clip(0, 0.5)
    # The missing flag must be taken before the ratio is imputed
    df['kitchen_missing'] = df['kitchen_ratio'].isna().astype(int)
    df['kitchen_ratio'] = impute(df['kitchen_ratio'], 'kitchen_median', default=0.12)

    # ===================
    # Ceiling height
    # ===================
    def parse_ceiling(val):
        """Parse values such as '2,7 м' into a float; NaN when unparseable."""
        if pd.isna(val):
            return np.nan
        text = str(val).replace('м', '').replace(',', '.').strip()
        try:
            return float(text)
        except ValueError:
            return np.nan

    df['ceiling_height_m'] = df['ceiling_height'].apply(parse_ceiling)
    df['ceiling_missing'] = df['ceiling_height_m'].isna().astype(int)
    df['ceiling_height_m'] = impute(df['ceiling_height_m'], 'ceiling_median', default=2.7)

    # ===================
    # Condition
    # ===================
    condition_map = {
        'евроремонт': 4,
        'хороший': 3,
        'средний': 2,
        'черновая отделка': 1,
        'требует ремонта': 0
    }
    df['condition_score'] = df['condition'].map(condition_map)
    df['condition_missing'] = df['condition_score'].isna().astype(int)
    # Unknown or missing condition is treated as the middle grade
    df['condition_score'] = df['condition_score'].fillna(2)

    # ===================
    # Bathroom
    # ===================
    df['has_separate_bathroom'] = df['bathroom'].str.contains('раздельн', na=False).astype(int)
    df['has_2plus_bathrooms'] = df['bathroom'].str.contains('2|два', na=False, case=False).astype(int)

    # ===================
    # Balcony
    # ===================
    df['has_balcony'] = df['balcony'].notna().astype(int)
    df['has_loggia'] = df['balcony'].str.contains('лоджия', na=False).astype(int)

    # ===================
    # Parking
    # ===================
    df['has_parking'] = df['parking'].notna().astype(int)

    # ===================
    # Floor type
    # ===================
    df['is_parquet'] = df['floor_type'].str.contains('паркет', na=False).astype(int)
    df['is_laminate'] = df['floor_type'].str.contains('ламинат', na=False).astype(int)

    # ===================
    # Security
    # ===================
    # A guard counts double; video, intercom and concierge count once each
    df['security_score'] = (
        df['security'].str.contains('охран', na=False).astype(int) * 2 +
        df['security'].str.contains('видео', na=False).astype(int) +
        df['security'].str.contains('домофон', na=False).astype(int) +
        df['security'].str.contains('консьерж', na=False).astype(int)
    )

    # ===================
    # Furniture
    # ===================
    df['has_furniture'] = df['furniture'].notna().astype(int)

    # ===================
    # House type
    # ===================
    df['is_monolith'] = (df['house_type'] == 'монолитный').astype(int)
    df['is_brick'] = (df['house_type'] == 'кирпичный').astype(int)
    df['is_panel'] = (df['house_type'] == 'панельный').astype(int)

    # ===================
    # Residential complex features
    # ===================
    df['has_jk'] = df['jk_name'].notna().astype(int)

    # JK class (economy / comfort / business / premium); 0 = no or unknown class
    jk_class_map = {
        'эконом': 1, 'комфорт': 2, 'бизнес': 3, 'премиум': 4, 'элит': 4
    }
    df['jk_class_score'] = df['jk_class'].map(jk_class_map).fillna(0)

    # JK status (under construction / completed)
    # NOTE(review): the comparison uses the English value 'completed' while
    # the other categorical columns hold Russian text - confirm the values
    # actually stored in residential_complexes.status.
    df['jk_is_completed'] = (df['jk_status'] == 'completed').astype(int)

    # ===================
    # Location features
    # ===================
    # District encoding will be done via target encoding

    # Fill missing coordinates with city center
    df['latitude'] = df['latitude'].fillna(42.8746)
    df['longitude'] = df['longitude'].fillna(74.5888)

    lats = df['latitude'].values
    lons = df['longitude'].values

    # Distance to river (a plain loop over the coordinate arrays; avoids
    # building a Series per row as DataFrame.apply(axis=1) does)
    df['dist_river'] = pd.Series(
        [distance_to_polyline(la, lo, ALA_ARCHA_RIVER) for la, lo in zip(lats, lons)],
        index=df.index, dtype=float,
    )
    df['near_river'] = (df['dist_river'] <= 0.5).astype(int)

    # ===================
    # POI distances
    # ===================
    # Vectorised: one haversine call per POI over all rows.
    # POI['center'] produces the 'dist_center' column.
    for poi_name, (poi_lat, poi_lon) in POI.items():
        df[f'dist_{poi_name}'] = haversine_distance(lats, lons, poi_lat, poi_lon)

    # Aggregated distances
    mall_pois = ['dordoi_plaza', 'bishkek_park', 'tsum', 'vefa_center', 'asia_mall', 'karavan']
    df['dist_nearest_mall'] = df[[f'dist_{p}' for p in mall_pois]].min(axis=1)

    transport_pois = ['west_bus_station', 'east_bus_station', 'railway_station']
    df['dist_nearest_station'] = df[[f'dist_{p}' for p in transport_pois]].min(axis=1)

    market_pois = ['osh_bazaar', 'dordoi_bazaar', 'ortosay_bazaar']
    df['dist_nearest_bazaar'] = df[[f'dist_{p}' for p in market_pois]].min(axis=1)

    # ===================
    # Park features
    # ===================
    # near_park is 1 only when the point lies inside a park polygon
    df['near_park'] = pd.Series(
        [point_in_any_park(la, lo) for la, lo in zip(lats, lons)],
        index=df.index, dtype=bool,
    ).astype(int)

    # Distance to each park is measured to the mean of its polygon vertices
    for park_name, polygon in PARKS.items():
        centroid_lat = np.mean([p[0] for p in polygon])
        centroid_lon = np.mean([p[1] for p in polygon])
        df[f'dist_{park_name}'] = haversine_distance(lats, lons, centroid_lat, centroid_lon)

    park_dist_cols = [f'dist_{p}' for p in PARKS.keys()]
    df['dist_nearest_park'] = df[park_dist_cols].min(axis=1)

    return df, encoders

In [None]:
# Build the engineered feature frame from the raw listings
df_features, feature_encoders = create_features(df)
print(f"Features created. New shape: {df_features.shape}")

# Preview: summary statistics of the first five 'dist_*' columns
is_dist_col = df_features.columns.str.startswith('dist_')
poi_dist_cols = df_features.columns[is_dist_col][:5].tolist()
print(f"\nDistance features sample:")
print(df_features[poi_dist_cols].describe().round(2))

In [None]:
def create_target_encoders(train_df, cols, target_col, min_samples=5):
    """
    Fit smoothed target (mean) encoders on the training frame.

    For every column in `cols` each category is mapped to a blend of its own
    target mean and the global target mean. The category's weight is
    count / (count + min_samples). Missing values form their own
    '__MISSING__' category.

    Returns {col: {'mapping': {category: value}, 'global_mean': float}}.
    """
    prior = train_df[target_col].mean()
    fitted = {}

    for col in cols:
        pairs = pd.DataFrame({
            'val': train_df[col].fillna('__MISSING__'),
            'target': train_df[target_col],
        })
        stats = pairs.groupby('val')['target'].agg(['mean', 'count'])

        # More samples in a category -> more trust in its own mean
        weight = stats['count'] / (stats['count'] + min_samples)
        blended = weight * stats['mean'] + (1 - weight) * prior

        fitted[col] = {'mapping': blended.to_dict(), 'global_mean': prior}

    return fitted


def apply_target_encoding(df, encoders):
    """Add a '<col>_price_mean' column for every fitted encoder.

    Works on a copy of `df`. Missing values are looked up as '__MISSING__';
    categories absent from the mapping receive the encoder's global mean.
    """
    encoded = df.copy()

    for col, enc in encoders.items():
        lookup_keys = encoded[col].fillna('__MISSING__')
        mapped = lookup_keys.map(enc['mapping'])
        encoded[f'{col}_price_mean'] = mapped.fillna(enc['global_mean'])

    return encoded


print("Target encoding functions defined (will apply after train/test split)")

In [None]:
# ===================
# FEATURE LIST
# ===================

# Core numeric
numeric_features = [
    'rooms', 'area', 'floor', 'total_floors', 'year_built',
    'latitude', 'longitude'
]

# Floor & Building
floor_building_features = [
    'floor_ratio', 'is_first_floor', 'is_last_floor',
    'building_age', 'is_new_building', 'is_highrise',
]

# Area & Kitchen
area_features = [
    'area_per_room', 'is_large_apartment',
    'kitchen_ratio', 'kitchen_missing',
]

# Ceiling
ceiling_features = [
    'ceiling_height_m', 'ceiling_missing',
]

# Condition
condition_features = [
    'condition_score', 'condition_missing',
]

# Bathroom & Balcony
bathroom_balcony_features = [
    'has_separate_bathroom', 'has_2plus_bathrooms',
    'has_balcony', 'has_loggia',
]

# Other amenities
amenity_features = [
    'has_parking', 'is_parquet', 'is_laminate',
    'security_score', 'has_furniture',
]

# House type
house_type_features = ['is_monolith', 'is_brick', 'is_panel']

# Residential complex
jk_features = [
    'has_jk', 'jk_class_score', 'jk_is_completed',
]

# Target encoding
target_encoding_features = [
    'district_price_mean',
    'jk_name_price_mean',  # Target encoding by residential complex
]

# Location & River
location_features = [
    'dist_river', 'near_river',
]

# POI distances
# POI contains a 'center' entry, so 'dist_center' is already produced by the
# comprehension. dict.fromkeys() drops the repeated name (keeping first-seen
# order) so the model is not fed the same column twice.
poi_features = list(dict.fromkeys(
    [f'dist_{poi}' for poi in POI.keys()] + [
        'dist_nearest_mall', 'dist_nearest_station', 'dist_nearest_bazaar', 'dist_center',
    ]
))

# Park features
park_features = [f'dist_{park}' for park in PARKS.keys()] + [
    'dist_nearest_park', 'near_park',
]

# Combine all features
all_features = (
    numeric_features + 
    floor_building_features + 
    area_features +
    ceiling_features +
    condition_features +
    bathroom_balcony_features +
    amenity_features +
    house_type_features +
    jk_features +
    target_encoding_features +
    location_features +
    poi_features + 
    park_features
)

print(f"TOTAL FEATURES: {len(all_features)}")

In [None]:
# ===================
# TRAIN/TEST SPLIT FIRST (before target encoding!)
# ===================
target = 'price_per_m2'

# Random 80/20 split on the index labels
train_idx, test_idx = train_test_split(
    df_features.index, test_size=0.2, random_state=42
)
train_df, test_df = (df_features.loc[labels].copy() for labels in (train_idx, test_idx))

print(f"Train set: {len(train_df)} samples")
print(f"Test set:  {len(test_df)} samples")

# ===================
# TARGET ENCODING (fit on train only!)
# ===================
target_cols = ['district', 'jk_name']
target_encoders = create_target_encoders(
    train_df, target_cols, target, min_samples=10
)

# The encoders fitted on the training rows are applied to both sets
train_df, test_df = (
    apply_target_encoding(part, target_encoders) for part in (train_df, test_df)
)

print("\nTarget encoding applied (fitted on train only - no leakage)")

# Mean price per residential complex (training rows only, >= 5 listings)
print("\nTop 10 JK by average price (from training data):")
named_rows = train_df.loc[train_df['jk_name'].notna()]
jk_stats = named_rows.groupby('jk_name')[target].agg(['mean', 'count'])
jk_stats = jk_stats.loc[jk_stats['count'] >= 5].sort_values('mean', ascending=False)
print(jk_stats.head(10).round(0))

In [None]:
# ===================
# PREPARE FINAL FEATURES
# ===================

# Split the requested feature names into those present in the frame and
# those that are not
present_columns = set(train_df.columns)
available_features = [name for name in all_features if name in present_columns]
missing_features = [name for name in all_features if name not in present_columns]

if missing_features:
    print(f"Warning: {len(missing_features)} features not found:")
    print(f"  {missing_features[:10]}...")

print(f"\nUsing {len(available_features)} features")

# Feature matrices and target vectors as NumPy arrays; any NaN that is still
# left in the features becomes 0
X_train = np.nan_to_num(train_df[available_features].values, nan=0.0)
X_test = np.nan_to_num(test_df[available_features].values, nan=0.0)
y_train = train_df[target].values
y_test = test_df[target].values

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")

In [None]:
# ===================
# CREATE GROUP LABELS for GroupKFold
# ===================
# Apartments in the same residential complex should not be split across train/validation

complex_col = 'jk_name'

# Listings without a complex each get their own group ('unknown_<index>').
# Series.fillna() accepts only a scalar, dict or Series - passing the bare
# Index produced by the string concatenation raises TypeError - so the
# fallback labels are wrapped in a Series aligned on the training index.
fallback_groups = pd.Series(
    'unknown_' + train_df.index.astype(str), index=train_df.index
)
train_groups = train_df[complex_col].fillna(fallback_groups)

# Convert to numeric group IDs
from sklearn.preprocessing import LabelEncoder
group_encoder = LabelEncoder()
train_group_ids = group_encoder.fit_transform(train_groups.values)

n_complexes = len(np.unique(train_group_ids))
print(f"Created {n_complexes} unique groups for GroupKFold")
# Number of distinct complexes (not the number of listings that have one)
print(f"  - Named complexes: {train_df[complex_col].nunique()}")
print(f"  - Unnamed (individual): {train_df[complex_col].isna().sum()}")

## 3. Baseline Models Comparison

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit `model` on the training data and score it on both sets.

    Returns (metrics, model): `metrics` is a dict with the model name,
    train/test MAE, train/test R² and the test MAPE in percent; `model` is
    the same object, now fitted.
    """
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    abs_pct_error = np.abs((y_test - test_pred) / y_test)

    metrics = {
        'Model': model_name,
        'Train MAE': mean_absolute_error(y_train, train_pred),
        'Test MAE': mean_absolute_error(y_test, test_pred),
        'Train R²': r2_score(y_train, train_pred),
        'Test R²': r2_score(y_test, test_pred),
        'MAPE (%)': np.mean(abs_pct_error) * 100,
    }
    return metrics, model

In [None]:
# Baseline models. The three gradient-boosting libraries share one set of
# core settings so that they are compared like for like.
boosting_settings = {
    'n_estimators': 300,
    'max_depth': 10,
    'learning_rate': 0.05,
    'random_state': 42,
}

models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=200, max_depth=15, random_state=42, n_jobs=-1
    ),
    'XGBoost': XGBRegressor(**boosting_settings),
    'LightGBM': LGBMRegressor(**boosting_settings, verbose=-1),
    'CatBoost': CatBoostRegressor(**boosting_settings, verbose=0),
}

In [None]:
# Fit every baseline, keeping its metrics row and the fitted estimator
results, trained_models = [], {}

for name, model in models.items():
    print(f"Training {name}...")
    metrics, trained_models[name] = evaluate_model(
        model, X_train, X_test, y_train, y_test, name
    )
    results.append(metrics)
    print(f"  Test MAE: ${metrics['Test MAE']:,.0f}/m² | Test R²: {metrics['Test R²']:.3f} | MAPE: {metrics['MAPE (%)']:.1f}%")

print("\nDone!")

In [None]:
# Comparison table, best (lowest) test MAE first
results_df = pd.DataFrame(results).sort_values('Test MAE')

banner = "=" * 80
print(banner)
print("BASELINE MODELS COMPARISON")
print(banner)
print(results_df.to_string(index=False))

In [None]:
# Bar charts of test MAE and test R²; the best model in each chart is green
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
mae_ax, r2_ax = axes

best_mae = results_df['Test MAE'].min()
colors = ['#2ecc71' if m == best_mae else '#3498db' for m in results_df['Test MAE']]
mae_ax.barh(results_df['Model'], results_df['Test MAE'], color=colors)
mae_ax.set(xlabel='MAE ($/m²)', title='Test MAE by Model (lower is better)')

best_r2 = results_df['Test R²'].max()
colors = ['#2ecc71' if r == best_r2 else '#3498db' for r in results_df['Test R²']]
r2_ax.barh(results_df['Model'], results_df['Test R²'], color=colors)
r2_ax.set(xlabel='R²', title='Test R² by Model (higher is better)')

plt.tight_layout()
plt.show()

## 4. Hyperparameter Tuning (Optuna)

In [None]:
def objective(trial):
    """Optuna objective: mean validation MAE of an XGBoost model over a
    5-fold GroupKFold on the training data (groups = train_group_ids)."""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'random_state': 42
    }
    model = XGBRegressor(**params)

    # All rows sharing a group id stay in the same fold (no leakage)
    splitter = GroupKFold(n_splits=5)
    fold_maes = []
    for fit_rows, val_rows in splitter.split(X_train, y_train, groups=train_group_ids):
        model.fit(X_train[fit_rows], y_train[fit_rows])
        val_pred = model.predict(X_train[val_rows])
        fold_maes.append(mean_absolute_error(y_train[val_rows], val_pred))

    return np.mean(fold_maes)

print("Objective function defined with GroupKFold")

In [None]:
# Run Optuna optimization
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so the search proposes the same trials on every run,
# in line with random_state=42 used for every model in this notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

print(f"\nBest trial:")
print(f"  MAE (CV): ${study.best_trial.value:,.0f}/m²")
print(f"\nBest parameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

In [None]:
# Optuna visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Trials that did not finish have value None; keep only the scored ones so
# the running minimum below cannot fail on None.
scored_trials = [t for t in study.trials if t.value is not None]
trial_numbers = [t.number for t in scored_trials]
trials = [t.value for t in scored_trials]
# Running minimum in a single pass
best_so_far = np.minimum.accumulate(trials)
axes[0].plot(trial_numbers, trials, 'o-', alpha=0.5, label='Trial MAE')
axes[0].plot(trial_numbers, best_so_far, 'r-', linewidth=2, label='Best so far')
axes[0].set_xlabel('Trial')
axes[0].set_ylabel('MAE ($/m²)')
axes[0].set_title('Optimization History')
axes[0].legend()

# The importance plot is best-effort: if it cannot be computed the panel
# shows a placeholder and the reason is printed instead of being discarded.
try:
    importance = optuna.importance.get_param_importances(study)
    params = list(importance.keys())
    values = list(importance.values())
    axes[1].barh(params, values, color='steelblue')
    axes[1].set_xlabel('Importance')
    axes[1].set_title('Hyperparameter Importance')
except Exception as e:
    print(f"Could not compute importance: {e}")
    axes[1].text(0.5, 0.5, 'Could not compute importance', ha='center', va='center')

plt.tight_layout()
plt.show()

## 5. Final Model Training

In [None]:
# Refit XGBoost on the whole training set with the best Optuna parameters
best_params = study.best_trial.params
best_params['random_state'] = 42

final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

y_pred_train = final_model.predict(X_train)
y_pred_test = final_model.predict(X_test)

# Metrics on both sets
train_mae = mean_absolute_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)
test_mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

rule = "=" * 60
print(rule)
print("FINAL MODEL RESULTS")
print(rule)
print(f"\nTrain Set:")
print(f"  MAE:  ${train_mae:,.0f}/m²")
print(f"  R²:   {train_r2:.4f}")
print(f"\nTest Set:")
print(f"  MAE:  ${test_mae:,.0f}/m²")
print(f"  R²:   {test_r2:.4f}")
print(f"  MAPE: {test_mape:.2f}%")
print(f"  RMSE: ${test_rmse:,.0f}/m²")

In [None]:
# Left: predicted against actual with the y = x line; right: residuals
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, resid_ax = axes

actual_low, actual_high = y_test.min(), y_test.max()
scatter_ax.scatter(y_test, y_pred_test, alpha=0.3, s=10)
scatter_ax.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', linewidth=2)
scatter_ax.set(
    xlabel='Actual Price ($/m²)',
    ylabel='Predicted Price ($/m²)',
    title='Predicted vs Actual',
)

residuals = y_test - y_pred_test
resid_ax.hist(residuals, bins=50, edgecolor='black', alpha=0.7)
resid_ax.axvline(0, color='red', linestyle='--')
resid_ax.set(
    xlabel='Residual ($/m²)',
    ylabel='Count',
    title=f'Residuals Distribution (Mean: ${residuals.mean():,.0f})',
)

plt.tight_layout()
plt.show()

## 6. Feature Importance & SHAP Analysis

In [None]:
# Feature importances reported by the fitted XGBoost model, largest first
importance_df = pd.DataFrame({
    'feature': available_features,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

top_n = 15
# Reversed so that the most important feature ends up at the top of the chart
top_reversed = importance_df.head(top_n).iloc[::-1]

plt.figure(figsize=(10, 8))
plt.barh(top_reversed['feature'], top_reversed['importance'], color='steelblue')
plt.xlabel('Feature Importance')
plt.title(f'Top {top_n} Feature Importances (XGBoost)')
plt.tight_layout()
plt.show()

print("\nTop 15 features:")
for rec in importance_df.head(15).itertuples(index=False):
    print(f"  {rec.feature:30s}: {rec.importance:.4f} ({rec.importance*100:.1f}%)")

In [None]:
# SHAP values for the test set. Best effort: a failure is reported and
# recorded in `shap_computed` so the plotting cell can be skipped.
print("Computing SHAP values...")
try:
    explainer = shap.TreeExplainer(final_model)
    shap_values = explainer.shap_values(X_test)
except Exception as e:
    print(f"SHAP computation failed: {e}")
    shap_computed = False
else:
    print("Done!")
    shap_computed = True

In [None]:
# SHAP summary plot
# Drawn only when the previous cell managed to compute the SHAP values.
if shap_computed:
    plt.figure(figsize=(10, 8))
    # show=False so that the title can be added before the figure is shown
    shap.summary_plot(shap_values, X_test, feature_names=available_features, show=False)
    plt.title('SHAP Feature Impact on Price Prediction')
    plt.tight_layout()
    plt.show()

## 7. Error Analysis

In [None]:
# Test rows together with the prediction and the derived error columns
# (error = actual - predicted; pct_error is relative to the actual price)
test_analysis = test_df.assign(
    predicted=y_pred_test,
    error=lambda d: d[target] - d['predicted'],
    abs_error=lambda d: np.abs(d['error']),
    pct_error=lambda d: d['abs_error'] / d[target] * 100,
)

In [None]:
# Mean absolute / percentage error and listing count per district,
# smallest absolute error first
district_error = (
    test_analysis.groupby('district')
    .agg(
        abs_error=('abs_error', 'mean'),
        pct_error=('pct_error', 'mean'),
        count=(target, 'count'),
    )
    .sort_values('abs_error')
)

print("Error by District:")
print(district_error.round(0))

In [None]:
# Same error summary, split by whether the listing has a residential complex
jk_error = test_analysis.groupby('has_jk').agg(
    abs_error=('abs_error', 'mean'),
    pct_error=('pct_error', 'mean'),
    count=(target, 'count'),
)

print("\nError by Residential Complex Presence:")
print(jk_error.round(1))
print("\n0 = No JK, 1 = Has JK")

In [None]:
# Same error summary, split by number of rooms
room_error = test_analysis.groupby('rooms').agg(
    abs_error=('abs_error', 'mean'),
    pct_error=('pct_error', 'mean'),
    count=(target, 'count'),
)

print("\nError by Room Count:")
print(room_error.round(0))

## 8. Save Model

In [None]:
import joblib
import json

# Persist the fitted model
joblib.dump(final_model, 'bishkek_price_model.joblib')
print("Model saved to: bishkek_price_model.joblib")

# Everything needed to rebuild the model inputs, plus the headline metrics.
# Values are cast to plain str/float so that json can serialise them.
serialisable_target_encoders = {
    col: {
        'mapping': {str(k): float(v) for k, v in enc['mapping'].items()},
        'global_mean': float(enc['global_mean'])
    } for col, enc in target_encoders.items()
}
model_config = {
    'features': available_features,
    'best_params': best_params,
    'target_encoders': serialisable_target_encoders,
    'feature_encoders': {
        'kitchen_median': float(feature_encoders.get('kitchen_median', 0.12)),
        'ceiling_median': float(feature_encoders.get('ceiling_median', 2.7))
    },
    'metrics': {
        'test_mae': float(mean_absolute_error(y_test, y_pred_test)),
        'test_r2': float(r2_score(y_test, y_pred_test)),
        'test_mape': float(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100)
    }
}

# Save feature list and encoders
with open('bishkek_model_config.json', 'w', encoding='utf-8') as f:
    json.dump(model_config, f, indent=2, ensure_ascii=False)
print("Model config saved to: bishkek_model_config.json")

## 9. Summary

In [None]:
# Headline numbers are computed once and reused in the report below
final_mae = mean_absolute_error(y_test, y_pred_test)
final_mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
final_r2 = r2_score(y_test, y_pred_test)
jk_known = df['jk_name'].notna()

print("="*70)
print("BISHKEK REAL ESTATE PRICE PREDICTION - SUMMARY")
print("="*70)
print(f"\nDataset: {len(df):,} apartments")
print(f"Features: {len(available_features)}")
print(f"Train/Test split: 80/20")
print(f"\nResidential Complex Coverage:")
print(f"  With JK: {jk_known.sum()} ({jk_known.mean()*100:.1f}%)")
print(f"  Unique JK: {df['jk_name'].nunique()}")
print(f"\nBest Model: XGBoost (tuned with Optuna, 50 trials)")
print(f"\nFinal Results:")
print(f"  MAE:  ${final_mae:,.0f}/m²")
print(f"  MAPE: {final_mape:.1f}%")
print(f"  R²:   {final_r2:.3f}")
print(f"\nInterpretation:")
print(f"  For an average apartment (60m², ~$95,000):")
# Per-m² error scaled to a 60 m² apartment
print(f"  Expected prediction error: ~${final_mae * 60:,.0f}")
print(f"\nTop 5 Most Important Features:")
for rec in importance_df.head(5).itertuples(index=False):
    print(f"  - {rec.feature} ({rec.importance*100:.1f}%)")
print("\n" + "="*70)