# Astana Real Estate Price Prediction

**Goal:** Predict apartment price per square meter (₸/m²) in Astana, Kazakhstan

**Dataset:** 18,293 apartment listings from krisha.kz (January 2025)

**Approach:**
1. Feature Engineering
2. Baseline Models Comparison
3. Hyperparameter Tuning (Optuna)
4. Final Evaluation & SHAP Analysis

In [None]:
# Install dependencies (for Kaggle)
!pip install -q optuna shap catboost lightgbm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GroupKFold
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
import shap

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Read the cleaned listings table from the attached Kaggle dataset.
DATA_PATH = '/kaggle/input/astana-real-estate-2025/astana_clean.csv'
df = pd.read_csv(DATA_PATH)

# Report the table size and the available columns.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

In [None]:
# Quick overview: the first rows are rendered as the cell's output.
df.head()

In [None]:
# Target variable statistics
# `target` holds the name of the price-per-square-metre column and is reused
# by later cells.
target = 'price_per_m2_kzt'

# Summary statistics of the target, rounded to whole tenge.
print("Target variable statistics:")
print(f"  Mean:   {df[target].mean():,.0f} ₸/m²")
print(f"  Median: {df[target].median():,.0f} ₸/m²")
print(f"  Std:    {df[target].std():,.0f} ₸/m²")
print(f"  Min:    {df[target].min():,.0f} ₸/m²")
print(f"  Max:    {df[target].max():,.0f} ₸/m²")

In [None]:
# Target distribution: raw prices on the left, log1p-transformed on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: raw price histogram with the median marked by a dashed line.
axes[0].hist(df[target], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Price per m² (KZT)')
axes[0].set_ylabel('Count')
axes[0].set_title('Price Distribution')
axes[0].axvline(df[target].median(), color='red', linestyle='--', label=f'Median: {df[target].median():,.0f}')
axes[0].legend()

# Right panel: histogram of log(1 + price).
axes[1].hist(np.log1p(df[target]), bins=50, edgecolor='black', alpha=0.7, color='green')
axes[1].set_xlabel('Log(Price per m²)')
axes[1].set_ylabel('Count')
axes[1].set_title('Log-transformed Price Distribution')

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
# ===================
# POI (Points of Interest) - verified coordinates
# ===================
# Maps a POI key to a (latitude, longitude) tuple in decimal degrees.
# create_features() derives one `dist_<key>` column per entry, so the keys
# double as feature-name suffixes.

POI = {
    # Shopping malls
    'khan_shatyr': (51.1260, 71.4023),
    'mega_silk_way': (51.0881, 71.4088),
    'asia_park': (51.1280, 71.4116),
    'saryarka_mall': (51.1609, 71.4113),
    'keruen_city': (51.14591, 71.414001),
    'keruen': (51.128223, 71.424591),
    'abu_dhabi_plaza': (51.12218, 71.426543),
    
    # Key landmarks
    'baiterek': (51.1283, 71.4305),  # also used as the city-centre reference (dist_center)
    'akorda': (51.1258, 71.4464),
    'expo_nur_alem': (51.089487, 71.415327),
    'nazarbayev_university': (51.0906, 71.3972),
    'hazrat_sultan_mosque': (51.1250, 71.4722),
    
    # Transport
    'nurly_zhol_station': (51.1124, 71.5318),
    'astana_1_station': (51.1956, 71.4089),
    
    # Markets
    'astanalyk_bazaar': (51.17283, 71.43662),
}

# ===================
# Park polygons
# ===================
# Maps a park key to its outline as an ordered list of (latitude, longitude)
# vertices. The outline is implicitly closed: point_in_polygon() connects the
# last vertex back to the first. The keys double as `dist_<key>` feature
# suffixes in create_features().

PARKS = {
    'presidential_park': [
        (51.138959, 71.435097), (51.133512, 71.434477), (51.132215, 71.440333),
        (51.131091, 71.445017), (51.12119, 71.441642), (51.112633, 71.438886),
        (51.103723, 71.453767), (51.100046, 71.468578), (51.100565, 71.486835),
        (51.104372, 71.486284), (51.109779, 71.476639), (51.110125, 71.462034),
        (51.114017, 71.455007), (51.120379, 71.455321), (51.119168, 71.473302),
        (51.123882, 71.475437), (51.12773, 71.459041), (51.135064, 71.460771),
        (51.138782, 71.452435), (51.140122, 71.444582)
    ],
    'central_park': [
        (51.146409, 71.412506), (51.156925, 71.411967), (51.159836, 71.420565),
        (51.15151, 71.427833), (51.14756, 71.422292)
    ],
    'botanical_garden': [
        (51.100993, 71.42198), (51.109475, 71.425268), (51.111839, 71.410623),
        (51.10302, 71.407754)
    ],
    'zhetisu_park': [
        (51.1335, 71.434528), (51.138415, 71.434804), (51.139564, 71.440678),
        (51.131704, 71.446176)
    ],
    'nurzhol_boulevard': [
        (51.12767, 71.438654), (51.126423, 71.438118), (51.127407, 71.432151),
        (51.123086, 71.4302), (51.123734, 71.426605), (51.128079, 71.428326),
        (51.129911, 71.417377), (51.128182, 71.416497), (51.130991, 71.397792),
        (51.135839, 71.399284), (51.132599, 71.418486), (51.130823, 71.417874),
        (51.129287, 71.428775), (51.130919, 71.430038), (51.130919, 71.433442)
    ],
    'triathlon_park': [
        (51.134808, 71.454883), (51.1387, 71.452378), (51.139723, 71.445062),
        (51.138526, 71.444108), (51.132064, 71.448203), (51.132762, 71.454048)
    ]
}

# ===================
# Yesil (Ishim) River - line through Astana
# ===================
# Ordered (latitude, longitude) vertices of an open polyline approximating
# the river's course; consecutive vertices form the segments consumed by
# distance_to_polyline().
YESIL_RIVER = [
    (51.097223, 71.586765), (51.103356, 71.525837), (51.101073, 71.515104),
    (51.103691, 71.501088), (51.099736, 71.488939), (51.102744, 71.475066),
    (51.105752, 71.471695), (51.106755, 71.454575), (51.109985, 71.450051),
    (51.116166, 71.449519), (51.120747, 71.445462), (51.124199, 71.451228),
    (51.128096, 71.450164), (51.133551, 71.445906), (51.14101, 71.441027),
    (51.147465, 71.439785), (51.150858, 71.429495), (51.16015, 71.42231),
    (51.157257, 71.408203), (51.160094, 71.397558), (51.162764, 71.395075),
    (51.165156, 71.359681)
]

def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two coordinate pairs (Haversine).

    All four arguments are in decimal degrees. Scalars and NumPy arrays are
    both accepted; array inputs are combined element-wise.
    """
    earth_radius_km = 6371

    # Convert every coordinate from degrees to radians.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return earth_radius_km * central_angle

def distance_to_polyline(lat, lon, polyline):
    """Return the minimum distance in km from a point to a polyline (river).

    The point and every vertex are mapped into a local planar frame centred
    on the query point (equirectangular approximation, in km). Within that
    frame the point is projected onto each segment, the projection is clamped
    to the segment, and the shortest resulting distance is returned. The
    planar approximation is intended for city-scale distances.

    Args:
        lat: Latitude of the query point, decimal degrees (scalar).
        lon: Longitude of the query point, decimal degrees (scalar).
        polyline: Ordered sequence of (latitude, longitude) vertices.

    Returns:
        Distance in km to the nearest point on the polyline. An empty
        polyline yields ``inf``; a single vertex yields the distance to it.
    """
    if len(polyline) == 0:
        return float('inf')

    earth_radius_km = 6371  # same radius as haversine_distance
    # Longitude degrees shrink with latitude; scale them at the query point.
    km_per_rad_lon = earth_radius_km * np.cos(np.radians(lat))

    # Vertex coordinates in km relative to the query point (which is the origin).
    vertices = [
        (np.radians(p_lon - lon) * km_per_rad_lon,
         np.radians(p_lat - lat) * earth_radius_km)
        for p_lat, p_lon in polyline
    ]

    if len(vertices) == 1:
        return float(np.hypot(vertices[0][0], vertices[0][1]))

    min_dist = float('inf')
    for (x1, y1), (x2, y2) in zip(vertices, vertices[1:]):
        dx = x2 - x1
        dy = y2 - y1
        seg_len_sq = dx * dx + dy * dy

        if seg_len_sq == 0:
            # Degenerate segment (repeated vertex): use the vertex itself.
            t = 0.0
        else:
            # Parameter of the origin's orthogonal projection onto the segment
            # line, clamped to [0, 1] so the result stays on the segment.
            t = -(x1 * dx + y1 * dy) / seg_len_sq
            t = min(1.0, max(0.0, t))

        dist = np.hypot(x1 + t * dx, y1 + t * dy)
        min_dist = min(min_dist, dist)

    return min_dist

def point_in_polygon(lat, lon, polygon):
    """Ray-casting test: is the point (lat, lon) inside the polygon?

    `polygon` is an ordered sequence of (latitude, longitude) vertices; the
    last vertex is implicitly joined back to the first. Each edge whose
    longitude span contains the point toggles the result when the point lies
    at or below the edge's latitude at that longitude.
    """
    inside = False

    prev_lat, prev_lon = polygon[0]
    # Walk the edges (v0, v1), (v1, v2), ..., (v_last, v0).
    for next_lat, next_lon in list(polygon[1:]) + [polygon[0]]:
        lon_in_span = min(prev_lon, next_lon) < lon <= max(prev_lon, next_lon)
        if lon_in_span and lat <= max(prev_lat, next_lat):
            # A non-empty longitude span implies prev_lon != next_lon,
            # so the interpolation below cannot divide by zero.
            if prev_lat == next_lat:
                inside = not inside
            else:
                edge_lat = (lon - prev_lon) * (next_lat - prev_lat) / (next_lon - prev_lon) + prev_lat
                if lat <= edge_lat:
                    inside = not inside
        prev_lat, prev_lon = next_lat, next_lon

    return inside

def point_in_any_park(lat, lon):
    """Return True when the point lies inside at least one polygon in PARKS."""
    return any(
        point_in_polygon(lat, lon, outline)
        for outline in PARKS.values()
    )

# Report how much geographic reference data was defined above.
print(f"Loaded {len(POI)} POI locations")
print(f"Loaded {len(PARKS)} park polygons")
print(f"Loaded Yesil river with {len(YESIL_RIVER)} points")

In [None]:
def create_features(df, fit_encoders=None, target_col=None, reference_year=2025):
    """
    Create all features for the model.

    Args:
        df: DataFrame with raw data. It is copied; the caller's frame is
            left untouched.
        fit_encoders: Dict with fitted imputation values ('kitchen_median',
            'ceiling_median') to re-apply, e.g. to a test set. If None, the
            medians are computed from `df` itself.
        target_col: Accepted for backward compatibility; not used by this
            function (target encoding is handled by create_target_encoders).
        reference_year: Year against which `building_age` is measured.

    Returns:
        df: Copy of the input with the feature columns added.
        encoders: Dict with the imputation values that were used, for
            applying the same transformation to another set.
    """
    df = df.copy()
    encoders = fit_encoders or {}
    is_fitting = fit_encoders is None

    # ===================
    # Floor features (simplified - remove multicollinear)
    # ===================
    # A zero total_floors would produce +/-inf; treat it as missing instead.
    df['floor_ratio'] = df['floor'] / df['total_floors'].replace(0, np.nan)
    # Removed: is_first_floor, is_last_floor, is_middle_floor (multicollinear with floor_ratio)

    # ===================
    # Building features
    # ===================
    df['building_age'] = reference_year - df['year_built']
    df['is_new_building'] = (df['year_built'] >= 2020).astype(int)
    df['is_highrise'] = (df['total_floors'] >= 10).astype(int)
    # Removed: is_lowrise (inverse of is_highrise)

    # ===================
    # Area features
    # ===================
    # rooms == 0 is treated as a single room to avoid division by zero.
    df['area_per_room'] = df['area'] / df['rooms'].replace(0, 1)
    df['is_large_apartment'] = (df['area'] >= 100).astype(int)

    # Kitchen with missing indicator
    df['kitchen_area_clean'] = pd.to_numeric(df['kitchen_area'], errors='coerce')
    df['kitchen_ratio'] = (df['kitchen_area_clean'] / df['area']).clip(0, 0.5)
    df['kitchen_missing'] = df['kitchen_ratio'].isna().astype(int)
    if is_fitting:
        # Median of the observed (pre-imputation) ratios.
        kitchen_median = df['kitchen_ratio'].median()
        encoders['kitchen_median'] = kitchen_median
    else:
        kitchen_median = encoders.get('kitchen_median', 0.15)
    df['kitchen_ratio'] = df['kitchen_ratio'].fillna(kitchen_median)

    # ===================
    # Ceiling height with missing indicator
    # ===================
    def parse_ceiling(val):
        """Parse values such as '2,7 м' into a float; NaN when unparseable."""
        if pd.isna(val):
            return np.nan
        val = str(val).replace('м', '').replace(',', '.').strip()
        try:
            return float(val)
        except ValueError:
            return np.nan

    df['ceiling_height_m'] = df['ceiling_height'].apply(parse_ceiling)
    df['ceiling_missing'] = df['ceiling_height_m'].isna().astype(int)
    if is_fitting:
        ceiling_median = df['ceiling_height_m'].median()
        encoders['ceiling_median'] = ceiling_median
    else:
        ceiling_median = encoders.get('ceiling_median', 2.7)
    df['ceiling_height_m'] = df['ceiling_height_m'].fillna(ceiling_median)

    # ===================
    # Condition - use score only (remove redundant binary flags)
    # ===================
    df['condition_score'] = df['condition'].map({
        'свежий ремонт': 4,
        'не новый, но аккуратный ремонт': 3,
        'свободная планировка': 2,
        'черновая отделка': 1,
        'требует ремонта': 0
    })
    df['condition_missing'] = df['condition_score'].isna().astype(int)
    df['condition_score'] = df['condition_score'].fillna(2)  # Unknown = middle of the scale

    # ===================
    # Bathroom - simplified
    # ===================
    df['has_2plus_bathrooms'] = (df['bathroom'] == '2 с/у и более').astype(int)
    df['has_separate_bathroom'] = (df['bathroom'] == 'раздельный').astype(int)

    # ===================
    # Balcony - simplified
    # ===================
    df['has_balcony'] = df['balcony'].notna().astype(int)
    df['has_multiple_balconies'] = df['balcony'].str.contains('несколько|и лоджия', na=False).astype(int)

    # ===================
    # Parking - simplified
    # ===================
    df['has_parking'] = df['parking'].notna().astype(int)

    # ===================
    # Floor type - keep only parquet (premium indicator)
    # ===================
    df['is_parquet'] = df['floor_type'].str.contains('паркет', na=False).astype(int)

    # ===================
    # Security - aggregate score
    # ===================
    # A guard counts double; video surveillance and an intercom count once each.
    df['security_score'] = (
        df['security'].str.contains('охрана', na=False).astype(int) * 2 +
        df['security'].str.contains('видео', na=False).astype(int) +
        df['security'].str.contains('домофон', na=False).astype(int)
    )

    # ===================
    # Other features - keep important only
    # ===================
    df['is_dormitory'] = (df['former_dormitory'] == 'да').astype(int)
    # 'furniture' is the only column treated as optional in the input.
    df['has_furniture'] = df['furniture'].isin(['полностью', 'частично']).astype(int) if 'furniture' in df.columns else 0

    # ===================
    # House type - simplified
    # ===================
    df['is_monolith'] = (df['house_type'] == 'монолитный').astype(int)
    df['is_panel'] = (df['house_type'] == 'панельный').astype(int)

    # ===================
    # Residential complex
    # ===================
    elite_complexes = [
        'Хайвил Астана', 'Гранд Астана', 'Abu Dhabi Plaza', 'Абу-Даби Плаза',
        'Northern Lights', 'Северное сияние', 'Изумрудный квартал',
        'Millennium Park', 'Premium Tower', 'D Tower', 'Talan Towers'
    ]
    df['is_elite_complex'] = df['raw_жилой_комплекс'].isin(elite_complexes).astype(int)
    df['has_complex_name'] = df['raw_жилой_комплекс'].notna().astype(int)

    # ===================
    # Location
    # ===================
    df['is_left_bank'] = (df['district'] == 'Есильский р-н').astype(int)

    # ===================
    # Distance to Yesil River
    # ===================
    df['dist_river'] = df.apply(
        lambda row: distance_to_polyline(row['latitude'], row['longitude'], YESIL_RIVER),
        axis=1
    )
    df['near_river'] = (df['dist_river'] <= 0.5).astype(int)  # within 0.5 km

    # ===================
    # POI distances - keep only aggregates to reduce multicollinearity
    # ===================
    lats = df['latitude'].values
    lons = df['longitude'].values

    # One vectorised distance column per POI
    for poi_name, (poi_lat, poi_lon) in POI.items():
        df[f'dist_{poi_name}'] = haversine_distance(lats, lons, poi_lat, poi_lon)

    # Aggregated distances (main features)
    mall_pois = ['khan_shatyr', 'mega_silk_way', 'asia_park', 'saryarka_mall', 'keruen_city', 'keruen', 'abu_dhabi_plaza']
    df['dist_nearest_mall'] = df[[f'dist_{p}' for p in mall_pois]].min(axis=1)

    transport_pois = ['nurly_zhol_station', 'astana_1_station']
    df['dist_nearest_station'] = df[[f'dist_{p}' for p in transport_pois]].min(axis=1)

    # Baiterek is used as the city-centre reference point.
    df['dist_center'] = df['dist_baiterek']

    # ===================
    # Park features - aggregated only
    # ===================
    # Despite its name, near_park flags listings located INSIDE a park polygon.
    df['near_park'] = df.apply(
        lambda row: point_in_any_park(row['latitude'], row['longitude']),
        axis=1
    ).astype(int)

    # Distance to each park is measured to the mean of its polygon vertices,
    # not to the park boundary.
    for park_name, polygon in PARKS.items():
        centroid_lat = np.mean([p[0] for p in polygon])
        centroid_lon = np.mean([p[1] for p in polygon])
        df[f'dist_{park_name}'] = haversine_distance(lats, lons, centroid_lat, centroid_lon)

    park_dist_cols = [f'dist_{p}' for p in PARKS.keys()]
    df['dist_nearest_park'] = df[park_dist_cols].min(axis=1)

    return df, encoders

In [None]:
# Apply feature engineering (without target encoding - that comes after split)
# NOTE(review): this runs on the full dataset before the train/test split, so
# the kitchen/ceiling imputation medians are computed from all rows, including
# those that later land in the test set.
df_features, feature_encoders = create_features(df)
print(f"Features created. New shape: {df_features.shape}")

# Show distance feature statistics (first five `dist_*` columns only)
poi_dist_cols = [c for c in df_features.columns if c.startswith('dist_')]
print(f"\nDistance features: {len(poi_dist_cols)}")
print(df_features[poi_dist_cols[:5]].describe().round(2))

In [None]:
def create_target_encoders(train_df, cols, target_col, min_samples=5):
    """
    Fit smoothed target-mean encoders on the training frame only.

    For each column in `cols`, every category's mean target is shrunk towards
    the overall training mean; the weight on the category mean is
    count / (count + min_samples), so rare categories stay close to the
    overall mean.

    Returns a dict of {col: {'mapping': {category: value}, 'global_mean': float}}.
    """
    overall_mean = train_df[target_col].mean()

    def fit_column(column):
        stats = train_df.groupby(column)[target_col].agg(['mean', 'count'])
        weight = stats['count'] / (stats['count'] + min_samples)
        shrunk = weight * stats['mean'] + (1 - weight) * overall_mean
        return {
            'mapping': shrunk.to_dict(),
            'global_mean': overall_mean,
        }

    return {column: fit_column(column) for column in cols}


def apply_target_encoding(df, encoders):
    """Add a `<col>_price_mean` column for every fitted encoder.

    Works on a copy of `df`. Categories absent from an encoder's mapping
    (including missing values) receive that encoder's global mean.
    """
    encoded = df.copy()

    for source_col, fitted in encoders.items():
        mapped = encoded[source_col].map(fitted['mapping'])
        encoded[f'{source_col}_price_mean'] = mapped.fillna(fitted['global_mean'])

    return encoded


# Status message only: the encoders are fitted and applied in a later cell.
print("Target encoding functions defined (will apply after train/test split)")

In [None]:
# ===================
# FEATURE LIST - column names produced by create_features() plus the
# target-encoded columns added after the split
# ===================

# Raw numeric columns used as-is
numeric_features = [
    'rooms', 'area', 'floor', 'total_floors', 'year_built',
    'latitude', 'longitude'
]

# Floor position and building characteristics
floor_building_features = [
    'floor_ratio',  # replaces the first/last/middle floor flags
    'building_age', 'is_new_building',
    'is_highrise',  # is_lowrise dropped (it is the inverse)
]

# Area and kitchen
area_features = [
    'area_per_room', 'is_large_apartment',
    'kitchen_ratio', 'kitchen_missing',  # imputed ratio + missing flag
]

# Ceiling height
ceiling_features = [
    'ceiling_height_m', 'ceiling_missing',  # imputed height + missing flag
]

# Renovation condition (ordinal score + missing flag)
condition_features = [
    'condition_score', 'condition_missing',
]

# Bathroom
bathroom_features = [
    'has_separate_bathroom', 'has_2plus_bathrooms',
]

# Balcony
balcony_features = [
    'has_balcony', 'has_multiple_balconies',
]

# Parking
parking_features = ['has_parking']

# Flooring (parquet only, as a premium indicator)
floor_type_features = ['is_parquet']

# Security (single aggregate score)
security_features = ['security_score']

# Miscellaneous flags
other_features = [
    'is_dormitory', 'has_furniture',
]

# Construction type
house_type_features = ['is_monolith', 'is_panel']

# Residential complex
complex_features = ['is_elite_complex', 'has_complex_name']

# Target-encoded columns (created after the train/test split)
target_encoding_features = [
    'district_price_mean',
    'raw_жилой_комплекс_price_mean',
]

# Location and river proximity
location_features = [
    'is_left_bank',
    'dist_river', 'near_river',
]

# Distances to points of interest, plus their aggregates
poi_features = [f'dist_{poi}' for poi in POI.keys()] + [
    'dist_nearest_mall', 'dist_nearest_station', 'dist_center',
]

# Distances to parks, plus their aggregates
park_features = [f'dist_{park}' for park in PARKS.keys()] + [
    'dist_nearest_park', 'near_park',
]

# ===================
# COMBINE ALL FEATURES (group order defines the column order)
# ===================
feature_groups = [
    numeric_features,
    floor_building_features,
    area_features,
    ceiling_features,
    condition_features,
    bathroom_features,
    balcony_features,
    parking_features,
    floor_type_features,
    security_features,
    other_features,
    house_type_features,
    complex_features,
    target_encoding_features,
    location_features,
    poi_features,
    park_features,
]
all_features = [name for group in feature_groups for name in group]

# Summary of how many features each category contributes
print(f"=" * 50)
print(f"TOTAL FEATURES: {len(all_features)}")
print(f"=" * 50)
print(f"\nBy category:")
print(f"  Numeric:          {len(numeric_features)}")
print(f"  Floor/Building:   {len(floor_building_features)}")
print(f"  Area/Kitchen:     {len(area_features)}")
print(f"  Ceiling:          {len(ceiling_features)}")
print(f"  Condition:        {len(condition_features)}")
print(f"  Bathroom:         {len(bathroom_features)}")
print(f"  Balcony:          {len(balcony_features)}")
print(f"  Parking:          {len(parking_features)}")
print(f"  Floor type:       {len(floor_type_features)}")
print(f"  Security:         {len(security_features)}")
print(f"  Other:            {len(other_features)}")
print(f"  House type:       {len(house_type_features)}")
print(f"  Complex:          {len(complex_features)}")
print(f"  Target encoding:  {len(target_encoding_features)}")
print(f"  Location/River:   {len(location_features)}")
print(f"  POI:              {len(poi_features)}")
print(f"  Parks:            {len(park_features)}")

In [None]:
# ===================
# TRAIN/TEST SPLIT FIRST (before target encoding!)
# ===================
target = 'price_per_m2_kzt'

# Split indices: a random 80/20 row-wise split with a fixed seed.
# NOTE(review): the split is not grouped by residential complex, so listings
# from the same complex can appear in both train and test, unlike the
# GroupKFold folds used during tuning.
train_idx, test_idx = train_test_split(
    df_features.index, test_size=0.2, random_state=42
)

train_df = df_features.loc[train_idx].copy()
test_df = df_features.loc[test_idx].copy()

print(f"Train set: {len(train_df)} samples")
print(f"Test set:  {len(test_df)} samples")

# ===================
# TARGET ENCODING (fit on train only!)
# ===================
target_cols = ['district', 'raw_жилой_комплекс']
target_encoders = create_target_encoders(
    train_df, target_cols, target, min_samples=10
)

# Apply to both sets. Each training row's encoding still includes that row's
# own target, since the category means come from the whole training set.
train_df = apply_target_encoding(train_df, target_encoders)
test_df = apply_target_encoding(test_df, target_encoders)

print("\nTarget encoding applied (fitted on train only - no leakage)")

# Show district price stats from training data
print("\nDistrict price statistics (from training data):")
district_stats = train_df.groupby('district')[target].agg(['mean', 'median', 'count'])
district_stats = district_stats.sort_values('median', ascending=False)
print(district_stats.round(0))

In [None]:
# ===================
# PREPARE FINAL FEATURES
# ===================

# Check which features are available: keep the listed features that exist as
# columns and report the rest instead of failing.
available_features = [f for f in all_features if f in train_df.columns]
missing_features = [f for f in all_features if f not in train_df.columns]

if missing_features:
    print(f"Warning: {len(missing_features)} features not found:")
    print(f"  {missing_features}")

print(f"\nUsing {len(available_features)} features")

# Create X and y as plain NumPy arrays; column order follows available_features
X_train = train_df[available_features].values
X_test = test_df[available_features].values
y_train = train_df[target].values
y_test = test_df[target].values

# Replace any remaining NaN with 0
# NOTE: np.nan_to_num also replaces +/-inf with very large finite values by default.
X_train = np.nan_to_num(X_train, nan=0.0)
X_test = np.nan_to_num(X_test, nan=0.0)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape:  {y_test.shape}")

In [None]:
# ===================
# CREATE GROUP LABELS for GroupKFold
# ===================
# Apartments in the same residential complex should not be split across train/validation
# This prevents information leakage during cross-validation

# Create group ID from residential complex (NaN = unique group per sample)
complex_col = 'raw_жилой_комплекс'
# Series.fillna() only accepts a scalar, dict or Series as the fill value, so
# the per-row fallback labels are wrapped in a Series aligned on train_df's
# index (a bare Index is rejected with a TypeError).
fallback_groups = pd.Series(
    'unknown_' + train_df.index.astype(str),
    index=train_df.index
)
train_groups = train_df[complex_col].fillna(fallback_groups).values

# Convert to numeric group IDs
from sklearn.preprocessing import LabelEncoder
group_encoder = LabelEncoder()
train_group_ids = group_encoder.fit_transform(train_groups)

n_complexes = len(np.unique(train_group_ids))
print(f"Created {n_complexes} unique groups for GroupKFold")
# The two counts below are numbers of listings (rows), not of distinct complexes.
print(f"  - Named complexes: {train_df[complex_col].notna().sum()}")
print(f"  - Unnamed (individual): {train_df[complex_col].isna().sum()}")

## 3. Baseline Models Comparison

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit `model` on the training data and score it on both sets.

    Returns a `(metrics, model)` tuple: `metrics` is a dict holding the model
    name, train/test MAE, train/test R² and the test MAPE in percent; `model`
    is the same estimator object, now fitted.
    """
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Absolute error of each test prediction relative to the true value.
    relative_errors = np.abs((y_test - test_pred) / y_test)

    report = {'Model': model_name}
    report['Train MAE'] = mean_absolute_error(y_train, train_pred)
    report['Test MAE'] = mean_absolute_error(y_test, test_pred)
    report['Train R²'] = r2_score(y_train, train_pred)
    report['Test R²'] = r2_score(y_test, test_pred)
    report['MAPE (%)'] = np.mean(relative_errors) * 100

    return report, model

In [None]:
# Define baseline models
# Four tree-ensemble regressors with hand-picked, untuned settings. The three
# boosting models share n_estimators / max_depth / learning_rate, and every
# model uses the same seed.
models = {
    'Random Forest': RandomForestRegressor(
        n_estimators=200, max_depth=15, random_state=42, n_jobs=-1
    ),
    'XGBoost': XGBRegressor(
        n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42, verbose=-1
    ),
    'CatBoost': CatBoostRegressor(
        n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42, verbose=0
    )
}

In [None]:
# Train and evaluate all models
# `results` collects one metrics dict per model; `trained_models` keeps the
# fitted estimators keyed by display name.
results = []
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    metrics, trained_model = evaluate_model(model, X_train, X_test, y_train, y_test, name)
    results.append(metrics)
    trained_models[name] = trained_model
    print(f"  Test MAE: {metrics['Test MAE']:,.0f} ₸/m² | Test R²: {metrics['Test R²']:.3f} | MAPE: {metrics['MAPE (%)']:.1f}%")

print("\nDone!")

In [None]:
# Results comparison: one row per model, best (lowest) test MAE first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Test MAE')

print("=" * 80)
print("BASELINE MODELS COMPARISON")
print("=" * 80)
print(results_df.to_string(index=False))

In [None]:
# Visualize results: horizontal bars for test MAE (left) and test R² (right).
# In each panel the best model is drawn in green, the others in blue.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# MAE comparison
colors = ['#2ecc71' if m == results_df['Test MAE'].min() else '#3498db' for m in results_df['Test MAE']]
axes[0].barh(results_df['Model'], results_df['Test MAE'], color=colors)
axes[0].set_xlabel('MAE (₸/m²)')
axes[0].set_title('Test MAE by Model (lower is better)')
# Value labels, placed a fixed 500 ₸/m² to the right of each bar end
for i, v in enumerate(results_df['Test MAE']):
    axes[0].text(v + 500, i, f'{v:,.0f}', va='center')

# R² comparison
colors = ['#2ecc71' if r == results_df['Test R²'].max() else '#3498db' for r in results_df['Test R²']]
axes[1].barh(results_df['Model'], results_df['Test R²'], color=colors)
axes[1].set_xlabel('R²')
axes[1].set_title('Test R² by Model (higher is better)')
for i, v in enumerate(results_df['Test R²']):
    axes[1].text(v + 0.01, i, f'{v:.3f}', va='center')

plt.tight_layout()
plt.show()

### Feature Selection

Drop features whose share of total LightGBM feature importance falls below 0.5%, to reduce overfitting and improve interpretability.

In [None]:
# Fit a small LightGBM model purely to rank the features by importance
lgbm_selector = LGBMRegressor(n_estimators=100, max_depth=8, random_state=42, verbose=-1)
lgbm_selector.fit(X_train, y_train)

# Tabulate the importances, most important first
importance_df_selection = (
    pd.DataFrame({
        'feature': available_features,
        'importance': lgbm_selector.feature_importances_
    })
    .sort_values('importance', ascending=False)
)

# Keep features that account for at least 0.5% of the summed importance
importance_threshold = 0.005
total_importance = importance_df_selection['importance'].sum()
importance_df_selection['importance_pct'] = importance_df_selection['importance'] / total_importance
keep_mask = importance_df_selection['importance_pct'] >= importance_threshold
selected_features = importance_df_selection.loc[keep_mask, 'feature'].tolist()

print(f"Feature Selection Results:")
print(f"  Original features: {len(available_features)}")
print(f"  Selected features: {len(selected_features)} (importance >= {importance_threshold*100}%)")
print(f"  Removed features:  {len(available_features) - len(selected_features)}")

# List what was dropped, in the original feature order, with each share
removed_features = [f for f in available_features if f not in selected_features]
if removed_features:
    pct_by_feature = dict(zip(
        importance_df_selection['feature'],
        importance_df_selection['importance_pct']
    ))
    print(f"\nRemoved low-importance features:")
    for f in removed_features:
        imp = pct_by_feature[f]
        print(f"  - {f}: {imp*100:.2f}%")

In [None]:
# Update features with selected subset
# Column positions of the kept features within the current matrices.
selected_indices = [available_features.index(f) for f in selected_features]
X_train_selected = X_train[:, selected_indices]
X_test_selected = X_test[:, selected_indices]

print(f"X_train shape: {X_train.shape} -> {X_train_selected.shape}")
print(f"X_test shape:  {X_test.shape} -> {X_test_selected.shape}")

# Use selected features for training
# NOTE: this rebinds X_train, X_test and available_features, so every later
# cell works on the reduced feature set and the full matrices are no longer
# reachable under these names.
X_train = X_train_selected
X_test = X_test_selected
available_features = selected_features

print(f"\nProceeding with {len(available_features)} selected features")

## 4. Hyperparameter Tuning (Optuna)

In [None]:
# Select best baseline model for tuning
# results_df is sorted by test MAE, so the first row is the best baseline.
# NOTE: the value is only reported; tuning below always uses XGBoost.
best_baseline = results_df.iloc[0]['Model']
print(f"Best baseline model: {best_baseline}")
print(f"Proceeding with XGBoost tuning (most stable in practice)")

In [None]:
def objective(trial):
    """Optuna objective: mean validation MAE of an XGBoost model over 5 grouped folds.

    Hyperparameters are sampled from `trial`. The folds come from GroupKFold
    on `train_group_ids`, so all listings sharing a group ID fall into the
    same fold. Reads the module-level X_train, y_train and train_group_ids.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'random_state': 42
    }

    regressor = XGBRegressor(**search_space)
    splitter = GroupKFold(n_splits=5)

    fold_mae = []
    for fit_rows, holdout_rows in splitter.split(X_train, y_train, groups=train_group_ids):
        # The same estimator object is refitted on each fold's training rows.
        regressor.fit(X_train[fit_rows], y_train[fit_rows])
        holdout_pred = regressor.predict(X_train[holdout_rows])
        fold_mae.append(mean_absolute_error(y_train[holdout_rows], holdout_pred))

    return np.mean(fold_mae)

# Status message only: the study itself is run in the next cell.
print("Objective function defined with GroupKFold (no same-complex leakage)")

In [None]:
# Run Optuna optimization
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so repeated runs explore the same hyperparameter sequence;
# without it the tuning result changes from run to run even though every
# model uses random_state=42.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the best cross-validated score and the parameters that produced it
print(f"\nBest trial:")
print(f"  MAE (CV): {study.best_trial.value:,.0f} ₸/m²")
print(f"\nBest parameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

In [None]:
# Optuna visualization: per-trial scores (left) and hyperparameter importance (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Optimization history
trials = [t.value for t in study.trials]
# Running minimum of the objective up to and including each trial
best_so_far = [min(trials[:i+1]) for i in range(len(trials))]
axes[0].plot(trials, 'o-', alpha=0.5, label='Trial MAE')
axes[0].plot(best_so_far, 'r-', linewidth=2, label='Best so far')
axes[0].set_xlabel('Trial')
axes[0].set_ylabel('MAE (₸/m²)')
axes[0].set_title('Optimization History')
axes[0].legend()

# Parameter importance
importance = optuna.importance.get_param_importances(study)
params = list(importance.keys())
values = list(importance.values())
axes[1].barh(params, values, color='steelblue')
axes[1].set_xlabel('Importance')
axes[1].set_title('Hyperparameter Importance')

plt.tight_layout()
plt.show()

## 5. Final Model Training

In [None]:
# Train final model with best parameters
# Build a new dict instead of adding 'random_state' to the dict owned by the
# study's best trial, so the study's record of sampled parameters stays intact.
best_params = {**study.best_trial.params, 'random_state': 42}

# Refit on the whole training set (the CV folds were only used for tuning)
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Predictions
y_pred_train = final_model.predict(X_train)
y_pred_test = final_model.predict(X_test)

# Final metrics
print("=" * 60)
print("FINAL MODEL RESULTS")
print("=" * 60)
print(f"\nTrain Set:")
print(f"  MAE:  {mean_absolute_error(y_train, y_pred_train):,.0f} ₸/m²")
print(f"  R²:   {r2_score(y_train, y_pred_train):.4f}")
print(f"\nTest Set:")
print(f"  MAE:  {mean_absolute_error(y_test, y_pred_test):,.0f} ₸/m²")
print(f"  R²:   {r2_score(y_test, y_pred_test):.4f}")
print(f"  MAPE: {np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100:.2f}%")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_test)):,.0f} ₸/m²")

In [None]:
# Prediction vs Actual plot (left) and residual histogram (right), test set only.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scatter plot; the dashed red diagonal marks perfect predictions
axes[0].scatter(y_test, y_pred_test, alpha=0.3, s=10)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
axes[0].set_xlabel('Actual Price (₸/m²)')
axes[0].set_ylabel('Predicted Price (₸/m²)')
axes[0].set_title('Predicted vs Actual')

# Residuals (actual minus predicted): positive values are under-predictions
residuals = y_test - y_pred_test
axes[1].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
axes[1].axvline(0, color='red', linestyle='--')
axes[1].set_xlabel('Residual (₸/m²)')
axes[1].set_ylabel('Count')
axes[1].set_title(f'Residuals Distribution (Mean: {residuals.mean():,.0f})')

plt.tight_layout()
plt.show()

## 6. Feature Importance & SHAP Analysis

In [None]:
# XGBoost feature importance
# Pair each selected feature name with the final model's importance value,
# most important first. `importance_df` is reused by the summary cell.
importance_df = pd.DataFrame({
    'feature': available_features,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 15 features (reversed so the most important bar is drawn at the top)
plt.figure(figsize=(10, 8))
top_n = 15
plt.barh(importance_df['feature'][:top_n][::-1], 
         importance_df['importance'][:top_n][::-1], 
         color='steelblue')
plt.xlabel('Feature Importance')
plt.title(f'Top {top_n} Feature Importances (XGBoost)')
plt.tight_layout()
plt.show()

print("\nTop 15 features:")
for idx, row in importance_df.head(15).iterrows():
    print(f"  {row['feature']:30s}: {row['importance']:.4f} ({row['importance']*100:.1f}%)")

In [None]:
# SHAP values for the final model, computed on the test matrix only.
print("Computing SHAP values (this may take a minute)...")
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_test)
print("Done!")

In [None]:
# SHAP summary plot
# show=False keeps the figure open so the title and layout can be applied
# before it is displayed.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test, feature_names=available_features, show=False)
plt.title('SHAP Feature Impact on Price Prediction')
plt.tight_layout()
plt.show()

In [None]:
# SHAP bar plot (mean absolute impact)
# Same SHAP values as the summary plot, drawn with plot_type='bar'.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test, feature_names=available_features, plot_type='bar', show=False)
plt.title('Mean Absolute SHAP Values')
plt.tight_layout()
plt.show()

## 7. Error Analysis by Segment

In [None]:
# Create test dataframe with predictions
# y_pred_test is assigned positionally; it was produced from test_df's rows
# in the same order.
test_analysis = test_df.copy()
test_analysis['predicted'] = y_pred_test
test_analysis['error'] = test_analysis[target] - test_analysis['predicted']  # actual - predicted
test_analysis['abs_error'] = np.abs(test_analysis['error'])
test_analysis['pct_error'] = test_analysis['abs_error'] / test_analysis[target] * 100  # percent of the actual price

In [None]:
# Error by district: mean absolute error, mean percentage error and number of
# test listings per district, sorted from lowest to highest mean absolute error.
district_error = test_analysis.groupby('district').agg({
    'abs_error': 'mean',
    'pct_error': 'mean',
    target: 'count'
}).rename(columns={target: 'count'})
district_error = district_error.sort_values('abs_error')

print("Error by District:")
print(district_error.round(0))

In [None]:
# Error by room count: the same three aggregates as the district table,
# grouped by the `rooms` column.
room_error = test_analysis.groupby('rooms').agg({
    'abs_error': 'mean',
    'pct_error': 'mean',
    target: 'count'
}).rename(columns={target: 'count'})

print("\nError by Room Count:")
print(room_error.round(0))

In [None]:
# Error by price segment
# The top bin is open-ended so that listings priced above 2,000,000 ₸/m² are
# counted in the '>1M' segment instead of being dropped as NaN.
test_analysis['price_segment'] = pd.cut(
    test_analysis[target], 
    bins=[0, 400000, 600000, 800000, 1000000, np.inf],
    labels=['<400k', '400-600k', '600-800k', '800k-1M', '>1M']
)

# observed=False keeps every segment in the table, including empty ones.
segment_error = test_analysis.groupby('price_segment', observed=False).agg({
    'abs_error': 'mean',
    'pct_error': 'mean',
    target: 'count'
}).rename(columns={target: 'count'})

print("\nError by Price Segment:")
print(segment_error.round(0))

## 8. Save Model

In [None]:
import joblib
import json

# Save model
joblib.dump(final_model, 'astana_price_model.joblib')
print("Model saved to: astana_price_model.joblib")

# Save feature list and encoders.
#
# The payload is built and serialised to a string BEFORE the output file is
# opened. Opening with mode 'w' truncates immediately, so if any conversion
# or JSON encoding step raised while the file was already open, a truncated
# or empty model_config.json would be left on disk.

# Target-encoder mappings: keys are stringified and values cast to plain
# floats so the structure is JSON-serialisable.
encoder_payload = {
    col: {
        'mapping': {str(k): float(v) for k, v in enc['mapping'].items()},
        'global_mean': float(enc['global_mean']),
    }
    for col, enc in target_encoders.items()
}

model_config = {
    'features': available_features,
    'best_params': best_params,
    'target_encoders': encoder_payload,
    'feature_encoders': {
        # Fallback values are used only when the key is absent.
        'kitchen_median': float(feature_encoders.get('kitchen_median', 0.15)),
        'ceiling_median': float(feature_encoders.get('ceiling_median', 2.7)),
    },
    'metrics': {
        'test_mae': float(mean_absolute_error(y_test, y_pred_test)),
        'test_r2': float(r2_score(y_test, y_pred_test)),
        'test_mape': float(np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100),
    },
}

config_text = json.dumps(model_config, indent=2, ensure_ascii=False)
with open('model_config.json', 'w', encoding='utf-8') as f:
    f.write(config_text)
print("Model config saved to: model_config.json")

## 9. Summary

In [None]:
# Final report: dataset size, headline test metrics, a rough per-apartment
# error estimate and the five top-ranked rows of `importance_df`.
rule = "=" * 70

# Test-set metrics, computed once and reused in the lines below.
summary_mae = mean_absolute_error(y_test, y_pred_test)
summary_mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
summary_r2 = r2_score(y_test, y_pred_test)

top_five = importance_df.head(5)

report_lines = [
    rule,
    "ASTANA REAL ESTATE PRICE PREDICTION - SUMMARY",
    rule,
    f"\nDataset: {len(df):,} apartments",
    f"Features: {len(available_features)}",
    "Train/Test split: 80/20",
    "\nBest Model: XGBoost (tuned with Optuna, 50 trials)",
    "\nFinal Results:",
    f"  MAE:  {summary_mae:,.0f} KZT/m2",
    f"  MAPE: {summary_mape:.1f}%",
    f"  R2:   {summary_r2:.3f}",
    "\nInterpretation:",
    "  For an average apartment (60m2, ~35M KZT):",
    # Per-m2 MAE scaled to a 60 m2 apartment, expressed in millions of KZT.
    f"  Expected prediction error: ~{summary_mae * 60 / 1e6:.1f} million KZT",
    "\nTop 5 Most Important Features:",
    *(
        f"  - {feat_name} ({feat_weight*100:.1f}%)"
        for feat_name, feat_weight in zip(top_five['feature'], top_five['importance'])
    ),
    "\nKey Improvements:",
    "  - No data leakage (target encoding fit on train only)",
    "  - Reduced multicollinearity (simplified features)",
    "  - Missing value indicators for kitchen/ceiling/condition",
    "\n" + rule,
]
print("\n".join(report_lines))