# Exploratory Data Analysis

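This notebook explores the building footprint dataset: the manifest contents, per-city image counts, image size distribution, sample images, building annotation statistics, and an image/annotation overlay.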

In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rasterio
import geopandas as gpd
from tqdm import tqdm

# Add project root to path.
# Path() is the current working directory, so the "project root" is taken to
# be its parent; assumes the notebook is launched from a directory directly
# under the project root — TODO confirm.
project_root = Path().absolute().parent
# Inserted at position 0 so it is searched before the other sys.path entries.
sys.path.insert(0, str(project_root))

# Global matplotlib style for every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1. Load Manifest

In [None]:
# Location of the manifest CSV, relative to the project root.
manifest_path = project_root / 'data' / 'manifest.csv'

if manifest_path.exists():
    df = pd.read_csv(manifest_path)
    print(f'Total images: {len(df)}')
    # A bare expression is only auto-displayed when it is the last top-level
    # statement of a cell; nested inside this `if` its value is discarded,
    # so the preview has to be printed explicitly.
    print(df.head())
else:
    # Later cells skip themselves when `df` is undefined; report why instead
    # of leaving the rest of the notebook silently empty.
    print(f'Manifest not found: {manifest_path}')

## 2. Dataset Statistics

In [None]:
# Runs only when the manifest cell defined `df`.
if 'df' in dir():
    print('Images per city:')
    # Tally once and reuse the result for both the printout and the chart.
    city_counts = df['city'].value_counts()
    print(city_counts)

    fig, ax = plt.subplots(figsize=(10, 6))
    city_counts.plot(kind='bar', ax=ax)
    ax.set(
        title='Number of Images per City',
        xlabel='City',
        ylabel='Count',
    )
    # Slanted tick labels keep long city names from overlapping.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 3. Image Size Distribution

In [None]:
# Both columns are plotted below, so both must be present; checking only
# 'width' would raise KeyError on a manifest that lacks 'height'.
if 'df' in dir() and {'width', 'height'}.issubset(df.columns):
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One histogram per dimension: width on the left, height on the right.
    for size_ax, size_column in zip(axes, ('width', 'height')):
        size_label = size_column.capitalize()
        size_ax.hist(df[size_column], bins=30, edgecolor='black')
        size_ax.set_title(f'Image {size_label} Distribution')
        size_ax.set_xlabel(f'{size_label} (pixels)')
        size_ax.set_ylabel('Count')

    plt.tight_layout()
    plt.show()

## 4. Sample Images

In [None]:
def show_sample_images(df, n_samples=4, random_state=None):
    """Display a grid of randomly sampled images, one row per city.

    Args:
        df: Manifest DataFrame. Its 'city' column selects the grid rows and
            its 'image_path' column is opened with rasterio.
        n_samples: Maximum number of images drawn per city (grid columns).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a
            selection can be reproduced. The default ``None`` keeps the
            sampling random.

    An image that fails to load is reported with a printed message and its
    grid cell is left blank. Nothing is drawn when the manifest has no
    cities or ``n_samples`` is less than 1.
    """
    cities = df['city'].unique()

    # A grid with zero rows or zero columns cannot be created.
    if len(cities) == 0 or n_samples < 1:
        print('No images to display.')
        return

    # squeeze=False keeps `axes` two-dimensional even for a single city or a
    # single column, so axes[i, j] is valid for every grid shape.
    fig, axes = plt.subplots(
        len(cities), n_samples, figsize=(16, 4 * len(cities)), squeeze=False
    )

    # Hide every cell up front: cells that never receive an image (a city
    # with fewer than n_samples images, or a failed load) then stay blank
    # instead of showing an empty frame with ticks.
    for cell in axes.flat:
        cell.axis('off')

    for i, city in enumerate(cities):
        city_rows = df[df['city'] == city]
        city_df = city_rows.sample(
            min(n_samples, len(city_rows)), random_state=random_state
        )

        for j, image_path in enumerate(city_df['image_path']):
            # Best effort: one unreadable image must not abort the grid.
            try:
                with rasterio.open(image_path) as src:
                    # Bands 1-3, reordered from (band, row, col) to
                    # (row, col, band) for imshow.
                    img = src.read([1, 2, 3])
                    img = np.transpose(img, (1, 2, 0))

                    # Scale into [0, 1] for display; presumably the imagery
                    # is 8-bit — TODO confirm for higher bit depths.
                    if img.max() > 1:
                        img = img / 255.0

                    ax = axes[i, j]
                    ax.imshow(img)
                    ax.set_title(f'{city}')
            except Exception as e:
                print(f'Error loading {image_path}: {e}')

    plt.tight_layout()
    plt.show()


# Runs only when the manifest cell defined `df`.
if 'df' in dir():
    show_sample_images(df)

## 5. Building Statistics

In [None]:
def analyze_annotations(df, sample_size=50, random_state=None):
    """Collect building counts and footprint areas from sampled annotations.

    Args:
        df: Manifest DataFrame; annotation files are taken from its
            'annotation_path' column.
        sample_size: Maximum number of manifest rows to inspect.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so a
            run can be reproduced. The default ``None`` keeps the sampling
            random.

    Returns:
        A ``(building_counts, building_areas)`` tuple of lists.
        ``building_counts`` has one entry (the number of features) per
        annotation file that was read successfully; ``building_areas`` is
        the flat list of geometry areas across those files. Rows with a
        missing, empty or non-existent path are skipped, and files that
        fail to load are reported with a printed message.
    """
    building_counts = []
    building_areas = []

    # Nothing to scan without rows or without the annotation column.
    if df.empty or 'annotation_path' not in df.columns:
        return building_counts, building_areas

    sample_df = df.sample(min(sample_size, len(df)), random_state=random_state)

    for annotation_path in tqdm(sample_df['annotation_path'], total=len(sample_df)):
        if pd.isna(annotation_path) or not annotation_path:
            continue

        if not Path(annotation_path).exists():
            continue

        try:
            gdf = gpd.read_file(annotation_path)
            # NOTE(review): areas are in the squared units of the file's
            # coordinates; if annotations are stored as lon/lat these are
            # not physical areas — confirm the coordinate space.
            areas = gdf.geometry.area.tolist()
        except Exception as e:
            print(f'Error reading {annotation_path}: {e}')
            continue

        # Record both results only after both succeeded, so a failure while
        # computing areas cannot leave a count without its areas.
        building_counts.append(len(gdf))
        building_areas.extend(areas)

    return building_counts, building_areas

# Runs only when the manifest cell defined `df`.
if 'df' in dir():
    counts, areas = analyze_annotations(df)

    # Skip plotting and the summary when no annotation file could be read.
    if counts:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        count_ax, area_ax = axes

        # Left panel: number of buildings found in each annotated image.
        count_ax.hist(counts, bins=30, edgecolor='black')
        count_ax.set(
            title='Buildings per Image',
            xlabel='Number of Buildings',
            ylabel='Count',
        )

        # Right panel: individual footprint areas, drawn only if any exist.
        if areas:
            area_ax.hist(areas, bins=50, edgecolor='black')
            area_ax.set(
                title='Building Area Distribution',
                xlabel='Area',
                ylabel='Count',
            )

        plt.tight_layout()
        plt.show()

        # Text summary of the per-image building counts.
        print(f'Average buildings per image: {np.mean(counts):.1f}')
        print(f'Max buildings per image: {np.max(counts)}')
        print(f'Min buildings per image: {np.min(counts)}')

## 6. Sample with Overlay

In [None]:
def _is_existing_path(path):
    """Return True when *path* is a non-empty str/Path naming an existing file."""
    # A missing manifest cell arrives as NaN, which is a truthy float; a
    # plain truthiness check would pass it on to Path() and raise TypeError.
    return isinstance(path, (str, Path)) and bool(path) and Path(path).exists()


def show_image_with_mask(image_path, mask_path=None, annotation_path=None):
    """Show an image next to its mask and an annotation overlay.

    Draws three panels: the image, the mask (or a 'No mask' placeholder),
    and the image again with annotation boundaries drawn in red.

    Args:
        image_path: Raster opened with rasterio; bands 1-3 are displayed.
        mask_path: Optional raster whose first band is shown in grayscale.
        annotation_path: Optional vector file read with geopandas.

    A ``mask_path`` or ``annotation_path`` that is ``None``, NaN, empty or
    does not exist is treated as absent. Errors opening ``image_path``
    propagate to the caller.
    """
    # Read the image before creating the figure so that a failure here does
    # not leave an empty figure behind.
    with rasterio.open(image_path) as src:
        img = src.read([1, 2, 3])
    # (band, row, col) -> (row, col, band) for imshow.
    img = np.transpose(img, (1, 2, 0))
    # Scale into [0, 1] for display; presumably the imagery is 8-bit —
    # TODO confirm for higher bit depths.
    if img.max() > 1:
        img = img / 255.0

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    axes[0].imshow(img)
    axes[0].set_title('Image')
    axes[0].axis('off')

    if _is_existing_path(mask_path):
        with rasterio.open(mask_path) as src:
            mask = src.read(1)
        axes[1].imshow(mask, cmap='gray')
        axes[1].set_title('Mask')
        axes[1].axis('off')
    else:
        axes[1].text(0.5, 0.5, 'No mask', ha='center', va='center')
        axes[1].axis('off')

    axes[2].imshow(img)
    if _is_existing_path(annotation_path):
        gdf = gpd.read_file(annotation_path)
        # NOTE(review): the image is drawn in pixel coordinates while the
        # boundaries are drawn in the annotation file's own coordinates;
        # they line up only if the annotations are stored in pixel space —
        # confirm against the dataset.
        gdf.boundary.plot(ax=axes[2], color='red', linewidth=1)
    axes[2].set_title('Overlay')
    axes[2].axis('off')

    plt.tight_layout()
    plt.show()


# Preview the first manifest row; runs only when `df` exists and has rows.
if 'df' in dir() and len(df) > 0:
    sample_row = df.iloc[0]
    show_image_with_mask(
        sample_row['image_path'],
        annotation_path=sample_row.get('annotation_path')
    )