# Data Preprocessing Pipeline
## Satellite Imagery-Based Property Valuation

This notebook covers:
1. Data loading and cleaning
2. Feature engineering
3. Feature and target scaling
4. Train/validation split
5. Saving the preprocessor and processed arrays
6. Image embedding extraction (ResNet18)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
import pickle

# Configuration
# Modelling constants: the column to predict, the fraction of rows held
# out for validation, and the seed that makes the split reproducible.
TARGET_COLUMN = "price"
VALIDATION_SPLIT = 0.2
RANDOM_SEED = 42

# Input and output locations, relative to the working directory.
RAW_DATA_DIR = Path("data/raw")
PROCESSED_DATA_DIR = Path("data/processed")
# Make sure the output directory (and any missing parents) exists;
# this is a no-op when it is already there.
PROCESSED_DATA_DIR.mkdir(exist_ok=True, parents=True)

## 1. Load Data

In [None]:
# Read the raw training table from disk. Going by the file name, each row
# carries an image path next to its tabular attributes — TODO confirm schema.
train_csv = RAW_DATA_DIR / "train_with_images.csv"
train_df = pd.read_csv(train_csv)
print(f"Loaded training data: {train_df.shape}")
train_df.head()

In [None]:
# Per-column count of missing values, restricted to the columns that
# actually have at least one gap.
print("Missing values:")
null_counts = train_df.isnull().sum()
null_counts[null_counts > 0]

## 2. Data Cleaning

In [None]:
def clean_data(df):
    """Return a cleaned version of ``df``; the caller's frame is not modified.

    Steps, in order:
      1. Drop rows whose ``lat`` or ``long`` is missing.
      2. Drop rows whose ``image_path`` is missing (only when that column exists).
      3. Fill the remaining gaps in numeric columns with that column's median.
         Every numeric column is considered, so a numeric target column is
         filled as well.
      4. Drop exact duplicate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain ``lat`` and ``long`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. Index labels of the surviving rows are kept as-is.

    Raises
    ------
    KeyError
        If ``lat`` or ``long`` is not a column of ``df``.
    """
    print("\nCleaning data...")
    initial_shape = df.shape

    # Remove rows with missing coordinates
    df = df.dropna(subset=['lat', 'long'])

    # Remove rows with missing images
    if 'image_path' in df.columns:
        df = df[df['image_path'].notna()]

    # Fill missing numeric values with the column median.
    # A single non-inplace DataFrame.fillna is used instead of
    # `df[col].fillna(..., inplace=True)`: the latter is chained assignment
    # on an intermediate Series, which warns on recent pandas and does not
    # update `df` at all when Copy-on-Write is enabled.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    cols_with_gaps = [col for col in numeric_cols if df[col].isna().any()]
    if cols_with_gaps:
        # The Series of medians is keyed by column name, so each column is
        # filled with its own median and all other columns are untouched.
        df = df.fillna(df[cols_with_gaps].median())

    # Remove duplicates
    df = df.drop_duplicates()

    print(f"Data cleaned: {initial_shape} -> {df.shape}")
    print(f"Removed {initial_shape[0] - df.shape[0]} rows")

    return df

# Replace the raw table with its cleaned version.
train_df = train_df.pipe(clean_data)

## 3. Feature Engineering

In [None]:
# Raw features from dataset, in the column order used downstream.
TABULAR_FEATURES = [
    # rooms and floor areas
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot',
    'floors',
    # categorical-style ratings
    'waterfront', 'view', 'condition', 'grade',
    # above-ground / basement areas
    'sqft_above', 'sqft_basement',
    # construction and renovation years
    'yr_built', 'yr_renovated',
    # the "15" variants of living and lot area
    'sqft_living15', 'sqft_lot15',
]

print(f"Raw features: {len(TABULAR_FEATURES)}")

In [None]:
def engineer_features(df, current_year=2024):
    """Return a copy of ``df`` with twelve derived feature columns added.

    The input frame is copied first, so the caller's DataFrame is not
    modified (re-running the cell that calls this stays idempotent).

    Added columns:
      - ``house_age``, ``years_since_renovation``, ``was_renovated``
      - ``living_lot_ratio``, ``bedroom_bathroom_ratio``
      - ``has_basement``, ``basement_ratio``
      - ``living_vs_neighbors``, ``lot_vs_neighbors``
      - ``total_rooms``, ``quality_score``, ``lat_long_interaction``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``yr_built``, ``yr_renovated``, ``sqft_living``,
        ``sqft_lot``, ``bedrooms``, ``bathrooms``, ``sqft_basement``,
        ``sqft_living15``, ``sqft_lot15``, ``condition``, ``grade``,
        ``lat`` and ``long``.
    current_year : int, default 2024
        Reference year used for the age features. The default reproduces
        the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the derived ones.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    print("\nEngineering features...")

    # Work on a copy so the caller's frame is never mutated in place.
    df = df.copy()

    # Age features
    df['house_age'] = current_year - df['yr_built']
    # A yr_renovated of 0 is treated as "never renovated" and maps to 0
    # years; every other value maps to the elapsed years. Vectorised
    # equivalent of the former row-wise apply.
    renovation_year = df['yr_renovated']
    df['years_since_renovation'] = (current_year - renovation_year).where(
        renovation_year != 0, 0
    )
    df['was_renovated'] = (renovation_year > 0).astype(int)

    # Ratio features
    # NOTE(review): a zero sqft_lot or sqft_living yields inf/NaN in the
    # unguarded ratios below (living_lot_ratio, basement_ratio) — confirm
    # the data never contains zeros there.
    df['living_lot_ratio'] = df['sqft_living'] / df['sqft_lot']
    df['bedroom_bathroom_ratio'] = df['bedrooms'] / (df['bathrooms'] + 1)

    # Basement features
    df['has_basement'] = (df['sqft_basement'] > 0).astype(int)
    df['basement_ratio'] = df['sqft_basement'] / df['sqft_living']

    # Neighborhood comparison (the +1 keeps the denominator away from zero)
    df['living_vs_neighbors'] = df['sqft_living'] / (df['sqft_living15'] + 1)
    df['lot_vs_neighbors'] = df['sqft_lot'] / (df['sqft_lot15'] + 1)

    # Combined features
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']
    df['quality_score'] = df['condition'] * df['grade']
    df['lat_long_interaction'] = df['lat'] * df['long']

    print(f"Engineered features. New shape: {df.shape}")
    return df

# Extend the cleaned table with the derived feature columns.
train_df = train_df.pipe(engineer_features)

In [None]:
# Final feature list (15 raw + 12 engineered = 27).
# Raw columns come first, followed by the derived columns.
ALL_FEATURES = [
    *TABULAR_FEATURES,
    # age / renovation
    'house_age', 'years_since_renovation', 'was_renovated',
    # size ratios
    'living_lot_ratio', 'bedroom_bathroom_ratio',
    # basement
    'has_basement', 'basement_ratio',
    # comparison against the "15" columns
    'living_vs_neighbors', 'lot_vs_neighbors',
    # combinations
    'total_rooms', 'quality_score', 'lat_long_interaction',
]

print(f"Total features: {len(ALL_FEATURES)}")

## 4. Feature Scaling

In [None]:
# Pull the model inputs out of the DataFrame as NumPy arrays.
feature_frame = train_df[ALL_FEATURES]
target_series = train_df[TARGET_COLUMN]

X = feature_frame.values
# Targets are reshaped into a single column (n_rows, 1), the 2-D layout
# the target scaler is later fitted on.
y = target_series.values.reshape(-1, 1)
image_paths = train_df['image_path'].values

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Image paths: {len(image_paths)}")

In [None]:
# Fit both scalers on the training rows only. Fitting them on the full
# dataset would let validation-set statistics (feature medians/IQRs and
# target mean/std) leak into the transform applied to the training data.
#
# The split here uses the same index array, test_size and random_state as
# the train/validation split cell, so it selects exactly the same
# training rows as that cell does.
fit_idx, _ = train_test_split(
    np.arange(len(X)),
    test_size=VALIDATION_SPLIT,
    random_state=RANDOM_SEED
)

# Initialize scalers and learn their statistics from the training rows
feature_scaler = RobustScaler().fit(X[fit_idx])
target_scaler = StandardScaler().fit(y[fit_idx])

# Transform every row with the training-fitted statistics; the split cell
# slices these arrays into their train and validation parts.
X_scaled = feature_scaler.transform(X)
y_scaled = target_scaler.transform(y)

print("Scaling complete.")

## 5. Train/Validation Split

In [None]:
# Split row positions rather than the arrays themselves, so the tabular
# features, the targets and the image paths are partitioned identically.
indices = np.arange(len(X_scaled))
train_idx, val_idx = train_test_split(
    indices,
    random_state=RANDOM_SEED,
    test_size=VALIDATION_SPLIT,
)

# Apply the same positions to every aligned array.
X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
y_train, y_val = y_scaled[train_idx], y_scaled[val_idx]
images_train, images_val = image_paths[train_idx], image_paths[val_idx]

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

## 6. Save Preprocessed Data

In [None]:
# Bundle the fitted scalers together with the feature order they were
# fitted on, and pickle the bundle for later reuse.
preprocessor = dict(
    scaler=feature_scaler,
    target_scaler=target_scaler,
    feature_names=ALL_FEATURES,
)

preprocessor_path = PROCESSED_DATA_DIR / "preprocessor.pkl"
with preprocessor_path.open('wb') as f:
    pickle.dump(preprocessor, f)

# Write every split array into a single .npz archive.
# NOTE(review): if the image-path arrays have object dtype they are stored
# pickled, and loading them needs np.load(..., allow_pickle=True) — confirm.
processed_arrays = {
    'X_train': X_train,
    'y_train': y_train,
    'images_train': images_train,
    'X_val': X_val,
    'y_val': y_val,
    'images_val': images_val,
}
np.savez(PROCESSED_DATA_DIR / "train_processed.npz", **processed_arrays)

print("Saved preprocessor and processed data.")

## 7. Extract Image Embeddings (ResNet18)

In [None]:
import torch
import torch.nn as nn
from torchvision import models
from PIL import Image
from tqdm import tqdm

def embed_images(image_paths, batch_size=64):
    """Extract 512-d embeddings from images using ResNet18.

    A pretrained torchvision ResNet18 is loaded with its classification
    head replaced by an identity layer, so the forward pass returns the
    feature vector that would otherwise feed the classifier. Images are
    preprocessed with the transforms bundled with the pretrained weights
    and run through the network in batches with gradients disabled.

    Parameters
    ----------
    image_paths : sequence of path-like
        Image files to embed. Output row ``k`` corresponds to
        ``image_paths[k]``.
    batch_size : int, default 64
        Number of images per forward pass.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(image_paths), 512)``. For an empty input an
        empty ``(0, 512)`` float32 array is returned and no model is loaded.

    Raises
    ------
    FileNotFoundError, OSError
        Propagated from ``PIL.Image.open`` when a path cannot be read.
    """
    # Nothing to embed: return an empty matrix rather than letting
    # np.concatenate fail on an empty list, and skip loading the model.
    if len(image_paths) == 0:
        return np.empty((0, 512), dtype=np.float32)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load pretrained ResNet18 together with its matching preprocessing
    weights = models.ResNet18_Weights.DEFAULT
    tfms = weights.transforms()

    model = models.resnet18(weights=weights)
    model.fc = nn.Identity()  # Remove classification head
    model.eval()
    model.to(device)

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting embeddings"):
            batch_paths = image_paths[i:i+batch_size]
            imgs = []
            for p in batch_paths:
                # The context manager closes the underlying file promptly;
                # convert() returns a fully loaded image, so the result is
                # safe to use after the file has been closed.
                with Image.open(p) as img:
                    imgs.append(tfms(img.convert("RGB")))

            x = torch.stack(imgs, dim=0).to(device)
            # Move features back to host memory before converting to NumPy.
            feat = model(x).cpu().numpy()
            embeddings.append(feat)

    return np.concatenate(embeddings, axis=0)

print("Embedding function ready.")

In [None]:
# Run the extractor once per split. Rows of each result follow the order
# of the corresponding path array.
EMBED_BATCH_SIZE = 64

print("Extracting training embeddings...")
emb_train = embed_images(images_train, batch_size=EMBED_BATCH_SIZE)
print(f"Train embeddings shape: {emb_train.shape}")

print("\nExtracting validation embeddings...")
emb_val = embed_images(images_val, batch_size=EMBED_BATCH_SIZE)
print(f"Validation embeddings shape: {emb_val.shape}")

In [None]:
# Persist each embedding matrix as its own .npy file in the processed-data
# directory.
for emb_filename, emb_matrix in (
    ("img_emb_train.npy", emb_train),
    ("img_emb_val.npy", emb_val),
):
    np.save(PROCESSED_DATA_DIR / emb_filename, emb_matrix)

print("Saved image embeddings.")

## Summary

**Preprocessing Complete:**
- Raw features: 15
- Engineered features: 12
- Total tabular features: 27
- Image embedding dimensions: 512
- Training samples: ~13,000
- Validation samples: ~3,200