# Bishkek Real Estate: v3 + Computer Vision
#
This notebook combines:
- **v3 Baseline**: 39 features, POI distances, Optuna tuning (MAE $122, R² 0.76)
- **CV Embeddings**: ResNet-50 image features (64 PCA components)
#
Based on research:
- [MHPP (arXiv 2024)](https://arxiv.org/abs/2409.05335): +21-26% MAE improvement with images
- [PLOS One 2025](https://pmc.ncbi.nlm.nih.gov/articles/PMC12088074/): ResNet-101 + t-SNE
#
Expected improvement: MAE $122 → $100-110 (-10% to -18%)



In [1]:
# Install dependencies
import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "optuna"])

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import gc
import shutil
warnings.filterwarnings('ignore')

# ML
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Boosting models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Optuna for hyperparameter tuning
try:
    import optuna
    from optuna.integration import XGBoostPruningCallback, LightGBMPruningCallback
    OPTUNA_AVAILABLE = True
    print("Optuna available for hyperparameter tuning")
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not available")

# For image processing
try:
    import torch
    import torchvision.models as models
    import torchvision.transforms as transforms
    from PIL import Image
    TORCH_AVAILABLE = True
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"PyTorch available, device: {DEVICE}")
    if torch.cuda.is_available():
        print(f"  GPU: {torch.cuda.get_device_name(0)}")
except ImportError:
    TORCH_AVAILABLE = False
    DEVICE = None
    print("PyTorch not available")

print(f"\nSetup complete!")



Optuna available for hyperparameter tuning
PyTorch available, device: cuda
  GPU: Tesla T4

Setup complete!


## POI (Points of Interest)



In [2]:
from math import radians, sin, cos, sqrt, atan2

# POI Bishkek - key locations by category
BISHKEK_POI = {
    'bazaars': [
        ('osh_bazaar', 42.874823, 74.569599),
        ('dordoi_bazaar', 42.939732, 74.620613),
        ('ortosay_bazaar', 42.836209, 74.615931),
        ('alamedin_bazaar', 42.88683, 74.637305),
    ],
    'parks': [
        ('dubovy_park', 42.877681, 74.606759),
        ('ataturk_park', 42.839587, 74.595725),
        ('karagach_grove', 42.900362, 74.619652),
        ('victory_park', 42.872456, 74.615523),
        ('botanical_garden', 42.829413, 74.616985),
    ],
    'malls': [
        ('bishkek_park', 42.871234, 74.593345),
        ('dordoi_plaza', 42.878456, 74.618234),
        ('vefa_center', 42.854123, 74.612345),
        ('tsum', 42.874234, 74.600123),
    ],
    'universities': [
        ('auca', 42.824891, 74.618307),
        ('krsu', 42.873917, 74.595389),
        ('bgu', 42.875423, 74.613456),
        ('knu', 42.872345, 74.603456),
    ],
    'hospitals': [
        ('national_hospital', 42.873456, 74.621234),
        ('city_hospital_1', 42.856789, 74.598765),
        ('republican_hospital', 42.869012, 74.615678),
    ],
    'transport': [
        ('west_bus_station', 42.874567, 74.554321),
        ('east_bus_station', 42.873456, 74.650123),
        ('railway_station', 42.871234, 74.573456),
    ],
    'admin': [
        ('jogorku_kenesh', 42.874567, 74.604321),
        ('erkindik', 42.872345, 74.599876),
    ],
}

BISHKEK_PREMIUM_ZONES = {
    'erkindik_south': (42.867, 74.600),
    'center': (42.874, 74.600),
    'filarmonia': (42.872, 74.616),
    'tsum_area': (42.876, 74.602),
    'vefa_area': (42.862, 74.618),
}

BISHKEK_CENTER = (42.874621, 74.569199)

def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate distance between two points in km"""
    R = 6371
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c



## Load Data



In [3]:
# Load data from Kaggle
df = pd.read_csv('/kaggle/input/bishkek-real-estate-2025/bishkek_apartments.csv')

print(f"Dataset: {len(df)} apartments, {len(df.columns)} columns")
print(f"year_built filled: {df['year_built'].notna().mean()*100:.1f}%")
print(f"JK linked: {df['jk_name'].notna().mean()*100:.1f}%")



Dataset: 8821 apartments, 60 columns
year_built filled: 73.1%
JK linked: 48.9%


## Feature Engineering (Same as v3)



In [4]:
class FeatureEngineer:
    """
    Feature engineering pipeline for real estate data.
    Handles imputation, encoding, POI distances, and derived features.

    IMPORTANT: All transformations are fit on train data only to prevent data leakage.
    """

    def __init__(self, n_district_clusters=30, include_poi=True):
        self.n_district_clusters = n_district_clusters
        self.include_poi = include_poi
        self.year_medians_by_series = None
        self.year_medians_by_type_floor = None
        self.global_year_median = None
        self.district_kmeans = None
        self.target_encodings = {}
        self.label_encoders = {}
        self.is_fitted = False

    def fit(self, df, y=None):
        """Fit all transformers on training data"""
        df = df.copy()

        # 1. Year built imputation medians
        df['floor_group'] = pd.cut(
            df['total_floors'].fillna(9),
            bins=[0, 5, 9, 12, 16, 100],
            labels=['1-5', '6-9', '10-12', '13-16', '17+']
        )

        filled = df[df['year_built'].notna()]
        self.year_medians_by_series = filled.groupby('building_series')['year_built'].median().to_dict()
        self.year_medians_by_type_floor = filled.groupby(['house_type', 'floor_group'])['year_built'].median().to_dict()
        self.global_year_median = filled['year_built'].median()

        # 2. District clustering (K-means on coordinates)
        coords = df[['latitude', 'longitude']].dropna()
        if len(coords) > self.n_district_clusters:
            self.district_kmeans = KMeans(
                n_clusters=self.n_district_clusters,
                random_state=42,
                n_init=10
            )
            self.district_kmeans.fit(coords)

        # 3. Target encoding for categorical variables (if y provided)
        if y is not None:
            df_with_target = df.copy()
            df_with_target['target'] = y

            for col in ['jk_name', 'district', 'building_series', 'house_type']:
                if col in df.columns:
                    means = df_with_target.groupby(col)['target'].mean()
                    counts = df_with_target.groupby(col)['target'].count()

                    # Smoothing with global mean
                    global_mean = y.mean()
                    smoothing = 10

                    smoothed = (means * counts + global_mean * smoothing) / (counts + smoothing)
                    self.target_encodings[col] = smoothed.to_dict()

        # 4. Label encoders for remaining categoricals
        for col in ['condition', 'heating', 'bathroom']:
            if col in df.columns:
                le = LabelEncoder()
                valid_values = df[col].dropna().unique()
                le.fit(list(valid_values) + ['unknown'])
                self.label_encoders[col] = le

        self.is_fitted = True
        return self

    def transform(self, df):
        """Transform data using fitted transformers"""
        if not self.is_fitted:
            raise ValueError("FeatureEngineer must be fitted before transform")

        df = df.copy()

        # 1. Impute year_built
        df['floor_group'] = pd.cut(
            df['total_floors'].fillna(9),
            bins=[0, 5, 9, 12, 16, 100],
            labels=['1-5', '6-9', '10-12', '13-16', '17+']
        )

        mask = df['year_built'].isna()
        for idx in df[mask].index:
            series = df.loc[idx, 'building_series']
            htype = df.loc[idx, 'house_type']
            fgroup = df.loc[idx, 'floor_group']

            if series in self.year_medians_by_series:
                df.loc[idx, 'year_built'] = self.year_medians_by_series[series]
            elif (htype, fgroup) in self.year_medians_by_type_floor:
                df.loc[idx, 'year_built'] = self.year_medians_by_type_floor[(htype, fgroup)]
            else:
                df.loc[idx, 'year_built'] = self.global_year_median

        # 2. District clusters
        if self.district_kmeans is not None:
            coords = df[['latitude', 'longitude']].fillna(df[['latitude', 'longitude']].median())
            df['district_cluster'] = self.district_kmeans.predict(coords)

        # 3. Derived features
        df['floor_ratio'] = df['floor'] / df['total_floors'].replace(0, 1)
        df['is_first_floor'] = (df['floor'] == 1).astype(int)
        df['is_last_floor'] = (df['floor'] == df['total_floors']).astype(int)
        df['building_age'] = 2025 - df['year_built']
        df['is_new_building'] = (df['year_built'] >= 2020).astype(int)
        df['area_per_room'] = df['area'] / df['rooms'].replace(0, 1)
        df['is_highrise'] = (df['total_floors'] >= 12).astype(int)

        # 4. Binary amenity features
        df['has_balcony'] = df['balcony'].notna().astype(int)
        df['has_parking'] = df['parking'].notna().astype(int)
        df['has_furniture'] = df['furniture'].notna().astype(int)

        # 5. Target encoding
        for col, encoding in self.target_encodings.items():
            if col in df.columns:
                global_mean = np.mean(list(encoding.values()))
                df[f'{col}_encoded'] = df[col].map(encoding).fillna(global_mean)

        # 6. Label encoding
        for col, le in self.label_encoders.items():
            if col in df.columns:
                df[f'{col}_encoded'] = df[col].fillna('unknown').apply(
                    lambda x: le.transform([x])[0] if x in le.classes_ else le.transform(['unknown'])[0]
                )

        # 7. House type one-hot
        if 'house_type' in df.columns:
            df['is_monolith'] = (df['house_type'] == 'монолит').astype(int)
            df['is_brick'] = (df['house_type'] == 'кирпич').astype(int)
            df['is_panel'] = (df['house_type'] == 'панель').astype(int)

        # 8. POI distances
        if self.include_poi:
            df = self._add_poi_features(df)

        return df

    def _add_poi_features(self, df):
        """Add POI distance features"""
        df = df.copy()

        # Distance to city center
        df['dist_to_center'] = df.apply(
            lambda row: haversine_distance(
                row['latitude'], row['longitude'],
                BISHKEK_CENTER[0], BISHKEK_CENTER[1]
            ) if pd.notna(row['latitude']) and pd.notna(row['longitude']) else np.nan,
            axis=1
        )

        # Distance to each POI category (minimum distance to nearest POI in category)
        for category, pois in BISHKEK_POI.items():
            col_name = f'dist_to_{category}'
            df[col_name] = df.apply(
                lambda row: self._min_distance_to_pois(row, pois),
                axis=1
            )

        # Distance to premium zones (minimum)
        df['dist_to_premium'] = df.apply(
            lambda row: self._min_distance_to_premium(row),
            axis=1
        )

        # Binary: is in premium zone (within 1km)
        df['is_premium_zone'] = (df['dist_to_premium'] <= 1.0).astype(int)

        return df

    def _min_distance_to_pois(self, row, pois):
        """Calculate minimum distance to a list of POIs"""
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            return np.nan
        distances = [
            haversine_distance(row['latitude'], row['longitude'], lat, lon)
            for name, lat, lon in pois
        ]
        return min(distances) if distances else np.nan

    def _min_distance_to_premium(self, row):
        """Calculate minimum distance to premium zones"""
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            return np.nan
        distances = [
            haversine_distance(row['latitude'], row['longitude'], lat, lon)
            for name, (lat, lon) in BISHKEK_PREMIUM_ZONES.items()
        ]
        return min(distances) if distances else np.nan

    def fit_transform(self, df, y=None):
        """Fit and transform in one step"""
        self.fit(df, y)
        return self.transform(df)

    def get_feature_columns(self):
        """Return list of feature columns for model"""
        base_features = [
            # Core numeric
            'latitude', 'longitude', 'area', 'rooms', 'floor', 'total_floors',
            'year_built', 'ceiling_height',

            # Derived
            'floor_ratio', 'is_first_floor', 'is_last_floor',
            'building_age', 'is_new_building', 'area_per_room', 'is_highrise',

            # Binary
            'has_balcony', 'has_parking', 'has_furniture',

            # House type
            'is_monolith', 'is_brick', 'is_panel',

            # Cluster
            'district_cluster',
        ]

        # POI distances
        if self.include_poi:
            base_features.extend([
                'dist_to_center',
                'dist_to_bazaars', 'dist_to_parks', 'dist_to_malls',
                'dist_to_universities', 'dist_to_hospitals',
                'dist_to_transport', 'dist_to_admin',
                'dist_to_premium', 'is_premium_zone',
            ])

        # Target encoded
        for col in self.target_encodings.keys():
            base_features.append(f'{col}_encoded')

        # Label encoded
        for col in self.label_encoders.keys():
            base_features.append(f'{col}_encoded')

        return base_features



## Image Feature Extractor (CV)



In [5]:
class ImageFeatureExtractor:
    """
    Extract image embeddings using pretrained ResNet-50.

    Based on research:
    - MHPP (arXiv 2024): ResNet-50 + mean pooling
    - PLOS One 2025: ResNet-101 + PCA
    """

    def __init__(self, embedding_dim=64, batch_size=32):
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.pca = None
        self.model = None
        self.transform = None

        if TORCH_AVAILABLE:
            # Load pretrained ResNet-50, remove classification head
            resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
            self.model = torch.nn.Sequential(*list(resnet.children())[:-1])
            self.model.eval()
            self.model.to(DEVICE)

            # Image preprocessing (ImageNet standard)
            self.transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]
                )
            ])

    def extract_single_image(self, image_path):
        """Extract 2048-dim embedding from single image"""
        if not TORCH_AVAILABLE or self.model is None:
            return None

        try:
            img = Image.open(image_path).convert('RGB')
            img_tensor = self.transform(img).unsqueeze(0).to(DEVICE)
            img.close()  # Prevent memory leak

            with torch.no_grad():
                embedding = self.model(img_tensor)

            return embedding.squeeze().cpu().numpy()
        except Exception as e:
            return None

    def extract_listing_embedding(self, image_paths, max_images=10):
        """Extract mean embedding from multiple images of a listing"""
        embeddings = []

        for path in image_paths[:max_images]:
            emb = self.extract_single_image(path)
            if emb is not None:
                embeddings.append(emb)

        if not embeddings:
            return None

        # Mean pooling (robust to varying image counts)
        return np.mean(embeddings, axis=0)

    def fit_pca(self, embeddings):
        """Fit PCA on training embeddings"""
        valid = [e for e in embeddings if e is not None]
        if len(valid) > self.embedding_dim:
            self.pca = PCA(n_components=self.embedding_dim, random_state=42)
            self.pca.fit(valid)
            print(f"PCA fitted: {len(valid)} samples, explained variance: {self.pca.explained_variance_ratio_.sum():.2%}")
        return self

    def transform_embeddings(self, embeddings):
        """Apply PCA to embeddings"""
        result = []
        for emb in embeddings:
            if emb is not None and self.pca is not None:
                result.append(self.pca.transform([emb])[0])
            else:
                result.append(np.zeros(self.embedding_dim))
        return np.array(result)



In [6]:
def get_image_paths(listing_id, image_dir):
    """Get all image paths for a listing"""
    listing_dir = Path(image_dir) / str(listing_id)
    if listing_dir.exists():
        return list(listing_dir.glob('*.jpg')) + list(listing_dir.glob('*.jpeg')) + list(listing_dir.glob('*.png'))
    return []

def get_listing_id_from_url(url):
    """Extract listing ID from URL like /details/33889236900a242685078-39865021"""
    if pd.isna(url):
        return None
    import re
    match = re.search(r'/details/([^?]+)', str(url))
    if match:
        return match.group(1).split('?')[0]
    return None



## Metrics



In [7]:
def calculate_metrics(y_true, y_pred):
    """Calculate regression metrics"""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    medape = np.median(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    within_10 = np.mean(np.abs((y_true - y_pred) / y_true) <= 0.1) * 100
    return {
        'MAE': mae, 'RMSE': rmse, 'MAPE': mape,
        'MedAPE': medape, 'R2': r2, 'Within10%': within_10
    }

def print_metrics(metrics, prefix=""):
    """Print metrics nicely"""
    print(f"{prefix}MAE: ${metrics['MAE']:.2f}/m²")
    print(f"{prefix}RMSE: ${metrics['RMSE']:.2f}/m²")
    print(f"{prefix}MAPE: {metrics['MAPE']:.2f}%")
    print(f"{prefix}MedAPE: {metrics['MedAPE']:.2f}%")
    print(f"{prefix}R²: {metrics['R2']:.4f}")



## Prepare Data



In [8]:
# Prepare data
TARGET = 'price_per_m2'

# Remove outliers (SAME AS v3)
df_clean = df[
    (df[TARGET] >= 500) &
    (df[TARGET] <= 5000) &
    (df['area'] >= 15) &
    (df['area'] <= 300)
].copy()

print(f"After outlier removal: {len(df_clean)} apartments")

# Extract listing IDs from URL
df_clean['listing_id'] = df_clean['url'].apply(get_listing_id_from_url)

# Split data (RANDOM SPLIT - same as v3)
X = df_clean.drop(columns=[TARGET, 'price_usd', 'price_local'], errors='ignore')
y = df_clean[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")



After outlier removal: 8727 apartments
Train: 6981, Test: 1746


## Feature Engineering



In [9]:
# Feature engineering
fe = FeatureEngineer(n_district_clusters=30)
X_train_fe = fe.fit_transform(X_train, y_train)
X_test_fe = fe.transform(X_test)

feature_cols = fe.get_feature_columns()
# Filter to existing columns
feature_cols = [c for c in feature_cols if c in X_train_fe.columns]

print(f"Tabular features: {len(feature_cols)}")
print(feature_cols)



Tabular features: 39
['latitude', 'longitude', 'area', 'rooms', 'floor', 'total_floors', 'year_built', 'ceiling_height', 'floor_ratio', 'is_first_floor', 'is_last_floor', 'building_age', 'is_new_building', 'area_per_room', 'is_highrise', 'has_balcony', 'has_parking', 'has_furniture', 'is_monolith', 'is_brick', 'is_panel', 'district_cluster', 'dist_to_center', 'dist_to_bazaars', 'dist_to_parks', 'dist_to_malls', 'dist_to_universities', 'dist_to_hospitals', 'dist_to_transport', 'dist_to_admin', 'dist_to_premium', 'is_premium_zone', 'jk_name_encoded', 'district_encoded', 'building_series_encoded', 'house_type_encoded', 'condition_encoded', 'heating_encoded', 'bathroom_encoded']


## Image Features (CV)



In [10]:
# Setup images directory
# Kaggle auto-extracts zips to folders like bishkek_p1, bishkek_p2, etc.
IMAGES_BASE = Path('/kaggle/input/bishkek-real-estate-images')
MERGED_IMAGES = Path('/kaggle/working/images')

# Check what's available
print(f"Images base exists: {IMAGES_BASE.exists()}")
if IMAGES_BASE.exists():
    contents = list(IMAGES_BASE.iterdir())
    print(f"Contents: {[c.name for c in contents[:10]]}")

    # Merge all parts into single directory
    image_parts = sorted(IMAGES_BASE.glob("bishkek_p*"))
    if image_parts:
        print(f"\nFound {len(image_parts)} image parts, merging...")
        MERGED_IMAGES.mkdir(parents=True, exist_ok=True)

        total_listings = 0
        for part_dir in image_parts:
            if part_dir.is_dir():
                for listing_dir in part_dir.iterdir():
                    if listing_dir.is_dir():
                        dest = MERGED_IMAGES / listing_dir.name
                        if not dest.exists():
                            shutil.copytree(listing_dir, dest)
                            total_listings += 1

        print(f"Merged {total_listings} listing directories")
        IMAGE_DIR = MERGED_IMAGES
    else:
        # Maybe images are directly in base
        IMAGE_DIR = IMAGES_BASE
else:
    IMAGE_DIR = None
    print("No images available")



Images base exists: True
Contents: ['bishkek_p1', 'bishkek_p2', 'bishkek_p3', 'bishkek_p4']

Found 4 image parts, merging...
Merged 7869 listing directories


In [11]:
# Extract image embeddings
USE_CV = TORCH_AVAILABLE and IMAGE_DIR is not None and IMAGE_DIR.exists()

if USE_CV:
    print("Extracting image embeddings with ResNet-50...")
    print(f"Image directory: {IMAGE_DIR}")

    img_extractor = ImageFeatureExtractor(embedding_dim=64)

    # Get train listing IDs and their image paths
    train_ids = X_train_fe['listing_id'].tolist() if 'listing_id' in X_train_fe.columns else X_train['listing_id'].tolist()
    test_ids = X_test_fe['listing_id'].tolist() if 'listing_id' in X_test_fe.columns else X_test['listing_id'].tolist()

    # Check how many listings have images
    train_with_images = sum(1 for lid in train_ids if lid and get_image_paths(lid, IMAGE_DIR))
    test_with_images = sum(1 for lid in test_ids if lid and get_image_paths(lid, IMAGE_DIR))
    print(f"Listings with images - Train: {train_with_images}/{len(train_ids)}, Test: {test_with_images}/{len(test_ids)}")

    if train_with_images > 100:  # Need enough images to train
        # Extract train embeddings
        print("\nExtracting train embeddings...")
        train_embeddings = []
        for i, lid in enumerate(train_ids):
            if lid:
                paths = get_image_paths(lid, IMAGE_DIR)
                emb = img_extractor.extract_listing_embedding(paths) if paths else None
            else:
                emb = None
            train_embeddings.append(emb)

            if (i + 1) % 500 == 0:
                print(f"  Processed {i+1}/{len(train_ids)}")
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        # Fit PCA on train embeddings
        img_extractor.fit_pca(train_embeddings)

        # Transform train embeddings
        train_img_features = img_extractor.transform_embeddings(train_embeddings)

        # Extract and transform test embeddings
        print("\nExtracting test embeddings...")
        test_embeddings = []
        for i, lid in enumerate(test_ids):
            if lid:
                paths = get_image_paths(lid, IMAGE_DIR)
                emb = img_extractor.extract_listing_embedding(paths) if paths else None
            else:
                emb = None
            test_embeddings.append(emb)

            if (i + 1) % 500 == 0:
                print(f"  Processed {i+1}/{len(test_ids)}")

        test_img_features = img_extractor.transform_embeddings(test_embeddings)

        print(f"\nImage features shape: train={train_img_features.shape}, test={test_img_features.shape}")
        CV_FEATURES_AVAILABLE = True
    else:
        print("Not enough listings with images, skipping CV features")
        CV_FEATURES_AVAILABLE = False
        train_img_features = None
        test_img_features = None
else:
    print("CV features disabled (no PyTorch or no images)")
    CV_FEATURES_AVAILABLE = False
    train_img_features = None
    test_img_features = None



Extracting image embeddings with ResNet-50...
Image directory: /kaggle/working/images
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 193MB/s] 


Listings with images - Train: 6246/6981, Test: 1565/1746

Extracting train embeddings...
  Processed 500/6981
  Processed 1000/6981
  Processed 1500/6981
  Processed 2000/6981
  Processed 2500/6981
  Processed 3000/6981
  Processed 3500/6981
  Processed 4000/6981
  Processed 4500/6981
  Processed 5000/6981
  Processed 5500/6981
  Processed 6000/6981
  Processed 6500/6981
PCA fitted: 6246 samples, explained variance: 81.36%

Extracting test embeddings...
  Processed 500/1746
  Processed 1000/1746
  Processed 1500/1746

Image features shape: train=(6981, 64), test=(1746, 64)


## Prepare Final Features



In [12]:
# Prepare final feature matrices
X_train_final = X_train_fe[feature_cols].fillna(0)
X_test_final = X_test_fe[feature_cols].fillna(0)

# Add image features if available
if CV_FEATURES_AVAILABLE and train_img_features is not None:
    img_cols = [f'img_{i}' for i in range(train_img_features.shape[1])]

    train_img_df = pd.DataFrame(train_img_features, columns=img_cols, index=X_train_final.index)
    test_img_df = pd.DataFrame(test_img_features, columns=img_cols, index=X_test_final.index)

    X_train_final = pd.concat([X_train_final, train_img_df], axis=1)
    X_test_final = pd.concat([X_test_final, test_img_df], axis=1)

    print(f"Combined features: {X_train_final.shape[1]} (tabular: {len(feature_cols)}, image: {len(img_cols)})")
else:
    print(f"Tabular features only: {X_train_final.shape[1]}")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)



Combined features: 103 (tabular: 39, image: 64)


## GPU Detection



In [13]:
# Detect GPU availability for boosting models
USE_GPU_BOOSTING = False
USE_GPU_XGB = False
USE_GPU_LGB = False
USE_GPU_CAT = False

# Check if GPU is available (on Kaggle)
try:
    import subprocess
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    if result.returncode == 0:
        USE_GPU_BOOSTING = True
        USE_GPU_XGB = True  # XGBoost 2.0+ with CUDA
        USE_GPU_LGB = True  # LightGBM with GPU
        USE_GPU_CAT = True  # CatBoost with GPU
        print("GPU detected! Enabling GPU acceleration for boosting models.")
        print(result.stdout.split('\n')[8] if len(result.stdout.split('\n')) > 8 else "")
except:
    pass

if not USE_GPU_BOOSTING:
    print("No GPU detected or not available. Using CPU for boosting models.")
    print("Tip: On Kaggle, enable GPU accelerator in Settings for faster training.")



GPU detected! Enabling GPU acceleration for boosting models.
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |


## Train Base Models



In [14]:
# Define base models with GPU support
xgb_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
}

# XGBoost GPU settings
if USE_GPU_XGB:
    xgb_params['tree_method'] = 'hist'
    xgb_params['device'] = 'cuda'

lgb_params = {
    'n_estimators': 500,
    'max_depth': 6,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1,
}

if USE_GPU_LGB:
    lgb_params['device'] = 'gpu'

cat_params = {
    'iterations': 500,
    'depth': 6,
    'learning_rate': 0.05,
    'random_state': 42,
    'verbose': 0,
}

if USE_GPU_CAT:
    cat_params['task_type'] = 'GPU'



In [15]:
# Train individual models
print("Training XGBoost...")
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_test_scaled)
print_metrics(calculate_metrics(y_test, xgb_pred), "XGBoost: ")

print("\nTraining LightGBM...")
lgb_model = LGBMRegressor(**lgb_params)
lgb_model.fit(X_train_scaled, y_train)
lgb_pred = lgb_model.predict(X_test_scaled)
print_metrics(calculate_metrics(y_test, lgb_pred), "LightGBM: ")

print("\nTraining CatBoost...")
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X_train_scaled, y_train)
cat_pred = cat_model.predict(X_test_scaled)
print_metrics(calculate_metrics(y_test, cat_pred), "CatBoost: ")



Training XGBoost...
XGBoost: MAE: $129.83/m²
XGBoost: RMSE: $189.10/m²
XGBoost: MAPE: 8.32%
XGBoost: MedAPE: 6.08%
XGBoost: R²: 0.7315

Training LightGBM...




LightGBM: MAE: $131.30/m²
LightGBM: RMSE: $190.20/m²
LightGBM: MAPE: 8.46%
LightGBM: MedAPE: 6.09%
LightGBM: R²: 0.7284

Training CatBoost...
CatBoost: MAE: $140.67/m²
CatBoost: RMSE: $200.97/m²
CatBoost: MAPE: 9.11%
CatBoost: MedAPE: 6.74%
CatBoost: R²: 0.6967


## Optuna Hyperparameter Tuning



In [16]:
def run_optuna_tuning(X_train, y_train, n_trials=50, use_gpu_xgb=False, use_gpu_lgb=False, use_gpu_cat=False):
    """
    Run Optuna hyperparameter optimization for XGBoost, LightGBM, and CatBoost.
    Returns best parameters for each model.
    """
    if not OPTUNA_AVAILABLE:
        print("Optuna not available, using default parameters")
        return None, None, None

    print("Starting Optuna hyperparameter tuning...")
    print(f"GPU: XGBoost={use_gpu_xgb}, LightGBM={use_gpu_lgb}, CatBoost={use_gpu_cat}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # XGBoost objective
    def xgb_objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
            'random_state': 42,
        }
        if use_gpu_xgb:
            params['tree_method'] = 'hist'
            params['device'] = 'cuda'

        model = XGBRegressor(**params)
        scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
        return -scores.mean()

    # LightGBM objective
    def lgb_objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'random_state': 42,
            'verbose': -1,
        }
        if use_gpu_lgb:
            params['device'] = 'gpu'

        model = LGBMRegressor(**params)
        scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
        return -scores.mean()

    # CatBoost objective
    def cat_objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0, log=True),
            'random_state': 42,
            'verbose': 0,
        }
        if use_gpu_cat:
            params['task_type'] = 'GPU'

        model = CatBoostRegressor(**params)
        scores = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
        return -scores.mean()

    # Run optimization
    print("\n[1/3] Tuning XGBoost...")
    xgb_study = optuna.create_study(direction='minimize')
    xgb_study.optimize(xgb_objective, n_trials=n_trials, show_progress_bar=False)
    print(f"  Best MAE: ${-xgb_study.best_value:.2f}/m²")

    print("\n[2/3] Tuning LightGBM...")
    lgb_study = optuna.create_study(direction='minimize')
    lgb_study.optimize(lgb_objective, n_trials=n_trials, show_progress_bar=False)
    print(f"  Best MAE: ${-lgb_study.best_value:.2f}/m²")

    print("\n[3/3] Tuning CatBoost...")
    cat_study = optuna.create_study(direction='minimize')
    cat_study.optimize(cat_objective, n_trials=n_trials, show_progress_bar=False)
    print(f"  Best MAE: ${-cat_study.best_value:.2f}/m²")

    # Get best params
    xgb_best = xgb_study.best_params.copy()
    xgb_best['random_state'] = 42
    if use_gpu_xgb:
        xgb_best['tree_method'] = 'hist'
        xgb_best['device'] = 'cuda'

    lgb_best = lgb_study.best_params.copy()
    lgb_best['random_state'] = 42
    lgb_best['verbose'] = -1
    if use_gpu_lgb:
        lgb_best['device'] = 'gpu'

    cat_best = cat_study.best_params.copy()
    cat_best['random_state'] = 42
    cat_best['verbose'] = 0
    if use_gpu_cat:
        cat_best['task_type'] = 'GPU'

    return xgb_best, lgb_best, cat_best



In [18]:
# Run Optuna tuning
USE_OPTUNA = True

if USE_OPTUNA and OPTUNA_AVAILABLE:
    xgb_params_opt, lgb_params_opt, cat_params_opt = run_optuna_tuning(
        X_train_scaled, y_train, n_trials=30,
        use_gpu_xgb=False, use_gpu_lgb=False, use_gpu_cat=False
    )

    if xgb_params_opt:
        print("\n" + "="*50)
        print("OPTIMIZED PARAMETERS:")
        print("="*50)
        print(f"\nXGBoost best params: {xgb_params_opt}")
        print(f"\nLightGBM best params: {lgb_params_opt}")
        print(f"\nCatBoost best params: {cat_params_opt}")

        # Update params for ensemble
        xgb_params = xgb_params_opt
        lgb_params = lgb_params_opt
        cat_params = cat_params_opt

        # Retrain with optimized parameters
        print("\nRetraining with optimized parameters...")
        xgb_model = XGBRegressor(**xgb_params)
        xgb_model.fit(X_train_scaled, y_train)
        xgb_pred = xgb_model.predict(X_test_scaled)
        print_metrics(calculate_metrics(y_test, xgb_pred), "XGBoost (tuned): ")

        lgb_model = LGBMRegressor(**lgb_params)
        lgb_model.fit(X_train_scaled, y_train)
        lgb_pred = lgb_model.predict(X_test_scaled)
        print_metrics(calculate_metrics(y_test, lgb_pred), "\nLightGBM (tuned): ")

        cat_model = CatBoostRegressor(**cat_params)
        cat_model.fit(X_train_scaled, y_train)
        cat_pred = cat_model.predict(X_test_scaled)
        print_metrics(calculate_metrics(y_test, cat_pred), "\nCatBoost (tuned): ")



Starting Optuna hyperparameter tuning...
GPU: XGBoost=False, LightGBM=False, CatBoost=False

[1/3] Tuning XGBoost...
  Best MAE: $-135.10/m²

[2/3] Tuning LightGBM...




  Best MAE: $-136.73/m²

[3/3] Tuning CatBoost...
  Best MAE: $-138.01/m²

OPTIMIZED PARAMETERS:

XGBoost best params: {'n_estimators': 980, 'max_depth': 8, 'learning_rate': 0.01429682190405642, 'subsample': 0.7453340306688304, 'colsample_bytree': 0.6525204072741028, 'min_child_weight': 2, 'reg_alpha': 0.0004203123283104735, 'reg_lambda': 0.008661281301109377, 'random_state': 42}

LightGBM best params: {'n_estimators': 720, 'max_depth': 12, 'learning_rate': 0.03781509460184321, 'num_leaves': 78, 'subsample': 0.7589847094582766, 'colsample_bytree': 0.8423211583618249, 'random_state': 42, 'verbose': -1}

CatBoost best params: {'iterations': 916, 'depth': 8, 'learning_rate': 0.07025183668627026, 'l2_leaf_reg': 0.16162769670807364, 'random_state': 42, 'verbose': 0}

Retraining with optimized parameters...
XGBoost (tuned): MAE: $126.06/m²
XGBoost (tuned): RMSE: $185.15/m²
XGBoost (tuned): MAPE: 8.10%
XGBoost (tuned): MedAPE: 5.71%
XGBoost (tuned): R²: 0.7426

LightGBM (tuned): MAE: $128.70/

## Stacking Ensemble



In [20]:
# Ensemble with stacking
print("\nTraining Stacking Ensemble...")

estimators = [
    ('xgb', XGBRegressor(**xgb_params)),
    ('lgb', LGBMRegressor(**lgb_params)),
    ('cat', CatBoostRegressor(**cat_params))
]

stacking = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1
)

stacking.fit(X_train_scaled, y_train)
ensemble_pred = stacking.predict(X_test_scaled)

print("\n" + "="*50)
print("ENSEMBLE RESULTS:")
print("="*50)
metrics = calculate_metrics(y_test, ensemble_pred)
print(f"MAE: ${metrics['MAE']:.2f}/m²")
print(f"RMSE: ${metrics['RMSE']:.2f}/m²")
print(f"MAPE: {metrics['MAPE']:.2f}%")
print(f"MedAPE: {metrics['MedAPE']:.2f}%")
print(f"R²: {metrics['R2']:.4f}")




Training Stacking Ensemble...





ENSEMBLE RESULTS:
MAE: $125.28/m²
RMSE: $183.62/m²
MAPE: 8.02%
MedAPE: 5.76%
R²: 0.7468


## Summary



In [21]:
# Print summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)

print(f"\nDataset: {len(df_clean)} apartments")
print(f"Train: {len(X_train)}, Test: {len(X_test)}")
print(f"Tabular features: {len(feature_cols)}")
if CV_FEATURES_AVAILABLE:
    print(f"Image features: 64 (ResNet-50 + PCA)")
    print(f"Total features: {X_train_final.shape[1]}")
else:
    print("Image features: Not used")

print(f"\nFinal Model Performance:")
print(f"  MAE: ${metrics['MAE']:.2f}/m²")
print(f"  MedAPE: {metrics['MedAPE']:.2f}%")
print(f"  R²: {metrics['R2']:.4f}")

if CV_FEATURES_AVAILABLE:
    print(f"\nComparison with v3 baseline (tabular only):")
    print(f"  v3 baseline: MAE ~$122, R² ~0.76")
    print(f"  v3 + CV:     MAE ${metrics['MAE']:.2f}, R² {metrics['R2']:.4f}")




FINAL SUMMARY

Dataset: 8727 apartments
Train: 6981, Test: 1746
Tabular features: 39
Image features: 64 (ResNet-50 + PCA)
Total features: 103

Final Model Performance:
  MAE: $125.28/m²
  MedAPE: 5.76%
  R²: 0.7468

Comparison with v3 baseline (tabular only):
  v3 baseline: MAE ~$122, R² ~0.76
  v3 + CV:     MAE $125.28, R² 0.7468


In [22]:
# Save predictions
results = pd.DataFrame({
    'actual': y_test,
    'predicted': ensemble_pred,
    'error': y_test - ensemble_pred,
    'error_pct': (y_test - ensemble_pred) / y_test * 100
})

results.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv")

print("\n" + "="*60)
print("DONE!")
print("="*60)



Predictions saved to predictions.csv

DONE!
