<a href="https://colab.research.google.com/github/ruany-doehnert/Deforestation_Amazon/blob/main/data_source_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install and authenticate Earth Engine in Colab
# !pip install earthengine-api --quiet
# !pip install rasterio

Amazon Deforestation Detection - Data Extraction Pipeline
========================================================
Author: Ruany Doehnert
Project: ML-based Amazon Deforestation Prediction
Date: July 2025

This notebook extracts satellite imagery features for Amazon deforestation
prediction using Google Earth Engine and creates a balanced training dataset.


In [14]:
import rasterio
import numpy as np
import pandas as pd
import ee
import folium
import sys
import subprocess
from datetime import datetime

Package Installation

In [15]:
def install_package(package_name, import_name=None):
    """Install package with progress tracking"""
    if import_name is None:
        import_name = package_name

    try:
        __import__(import_name)
        print(f"✅ {package_name} already installed")
        return True
    except ImportError:
        print(f"📦 Installing {package_name}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name, "--quiet"])
        print(f"✅ {package_name} installed successfully")
        return True

print("🚀 Amazon Deforestation ML Project - Setup")
print("=" * 45)
print(f"⏰ Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Install required packages
packages = [
    ("earthengine-api", "ee"),
    ("rasterio", "rasterio"),
    ("folium", "folium")
]

for package, import_name in packages:
    install_package(package, import_name)

print("\n🎯 Environment ready for satellite data processing!")

🚀 Amazon Deforestation ML Project - Setup
⏰ Started: 2025-07-14 05:13:11
✅ earthengine-api already installed
✅ rasterio already installed
✅ folium already installed

🎯 Environment ready for satellite data processing!


Enhanced Imports with Validation

In [16]:
import rasterio
import numpy as np
import pandas as pd
import ee
import folium
import json
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

print("📚 Loading Required Libraries")
print("=" * 30)

# Validate library versions
libraries = {
    'rasterio': rasterio.__version__,
    'numpy': np.__version__,
    'pandas': pd.__version__,
    'folium': folium.__version__
}

for lib, version in libraries.items():
    print(f"   ✅ {lib}: v{version}")

print(f"   ✅ earthengine-api: ready")
print(f"\n🌍 Ready to process Amazon satellite data!")

📚 Loading Required Libraries
   ✅ rasterio: v1.4.3
   ✅ numpy: v2.0.2
   ✅ pandas: v2.2.2
   ✅ folium: v0.19.7
   ✅ earthengine-api: ready

🌍 Ready to process Amazon satellite data!


Google Drive Connection

In [17]:
print("💾 Connecting to Google Drive")
print("=" * 28)

try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✅ Google Drive mounted successfully")
    print("📁 Export folder: /content/drive/MyDrive/amazon_ml_project/")
except Exception as e:
    print(f"⚠️ Drive mount failed: {e}")
    print("📝 Note: Exports will go to default Google Drive location")

print("🔗 Google Drive connection established!")

💾 Connecting to Google Drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully
📁 Export folder: /content/drive/MyDrive/amazon_ml_project/
🔗 Google Drive connection established!


Data Processing

In [18]:
print("🌍 Initializing Amazon Deforestation Analysis")
print("=" * 45)

# Authenticate and initialize Earth Engine
try:
    ee.Authenticate()
    ee.Initialize(project='amazon-deforestation-462101')
    print("✅ Earth Engine authenticated and initialized")
except Exception as e:
    print(f"❌ Earth Engine initialization failed: {e}")
    raise

# Define study region (Amazon basin - Brazil/Peru/Colombia border area)
region = ee.Geometry.BBox(-71, -17, -44, -1)

# Calculate area with proper error margin (THIS FIXES THE ERROR!)
region_area_km2 = region.area(maxError=1000).divide(1e6).getInfo()
print(f"📍 Study Region: {region_area_km2:.0f} km² in Amazon basin")

print("\n🛰️ Loading Satellite Data Sources...")

# Hansen Global Forest Change (baseline tree cover)
gfc = ee.Image('UMD/hansen/global_forest_change_2022_v1_10')
treecover = gfc.select('treecover2000')
print("   ✅ Hansen Global Forest Change (2000 baseline)")

# Landsat 8 Surface Reflectance (2021 composite)
landsat = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2') \
    .filterDate('2021-01-01', '2021-12-31') \
    .filterBounds(region) \
    .median()

# Check data availability
landsat_count = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2') \
    .filterDate('2021-01-01', '2021-12-31') \
    .filterBounds(region) \
    .size().getInfo()

print(f"   ✅ Landsat 8 Surface Reflectance ({landsat_count} scenes, 2021)")

print("\n🧮 Computing Vegetation Indices...")

# Calculate vegetation indices
ndvi = landsat.normalizedDifference(['SR_B5', 'SR_B4']).rename('NDVI')
nbr = landsat.normalizedDifference(['SR_B5', 'SR_B7']).rename('NBR')

print("   ✅ NDVI (forest health indicator)")
print("   ✅ NBR (burn/disturbance indicator)")

# Topographic data
elevation = ee.Image('USGS/SRTMGL1_003').rename('elevation')
print("   ✅ SRTM Elevation (30m resolution)")

print("\n🔗 Stacking Feature Bands...")
# Stack all features
features = treecover.rename('treecover') \
    .addBands([ndvi, nbr, elevation])

feature_list = ['treecover', 'NDVI', 'NBR', 'elevation']
print(f"   📊 Total features: {len(feature_list)}")
for i, feature in enumerate(feature_list, 1):
    print(f"   {i}. {feature}")

print("\n🎯 Improved Sampling Strategy...")

# Enhanced sampling for better class balance
samples = features.sample(
    region=region,
    scale=30,
    numPixels=2000,
    seed=42,
    geometries=True
)

# Improved labeling function
def label_fn(f):
    """Assign forest/non-forest label based on tree cover threshold"""
    return f.set('label', ee.Number(f.get('treecover')).gt(30).int())

samples = samples.map(label_fn)

# Get sample statistics
total_samples = samples.size().getInfo()
print(f"   📊 Total samples: {total_samples}")

# Calculate class distribution
forest_samples = samples.filter(ee.Filter.eq('label', 1)).size().getInfo()
non_forest_samples = samples.filter(ee.Filter.eq('label', 0)).size().getInfo()

print(f"   🌲 Forest samples (label=1): {forest_samples}")
print(f"   🏞️ Non-forest samples (label=0): {non_forest_samples}")
print(f"   ⚖️ Class balance: {forest_samples/(forest_samples+non_forest_samples)*100:.1f}% forest")

print("\n✅ Feature extraction completed successfully!")

🌍 Initializing Amazon Deforestation Analysis
✅ Earth Engine authenticated and initialized
📍 Study Region: 5258491 km² in Amazon basin

🛰️ Loading Satellite Data Sources...
   ✅ Hansen Global Forest Change (2000 baseline)
   ✅ Landsat 8 Surface Reflectance (4221 scenes, 2021)

🧮 Computing Vegetation Indices...
   ✅ NDVI (forest health indicator)
   ✅ NBR (burn/disturbance indicator)
   ✅ SRTM Elevation (30m resolution)

🔗 Stacking Feature Bands...
   📊 Total features: 4
   1. treecover
   2. NDVI
   3. NBR
   4. elevation

🎯 Improved Sampling Strategy...
   📊 Total samples: 2000
   🌲 Forest samples (label=1): 1456
   🏞️ Non-forest samples (label=0): 544
   ⚖️ Class balance: 72.8% forest

✅ Feature extraction completed successfully!


 Export with Metadata

In [19]:
print("💾 Exporting Training Dataset")
print("=" * 29)

# Create export metadata
export_info = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'region_area_km2': region_area_km2,
    'total_samples': total_samples,
    'forest_samples': forest_samples,
    'non_forest_samples': non_forest_samples
}

# Export to Drive with better naming
task = ee.batch.Export.table.toDrive(
    collection=samples,
    description='amazon_deforestation_training_data_v2',
    folder='amazon_ml_project',
    fileNamePrefix='training_data_amazon_balanced',
    fileFormat='CSV',
    selectors=['treecover', 'NDVI', 'NBR', 'elevation', 'label', '.geo']
)

task.start()

print("🚀 Export initiated to Google Drive")
print(f"📁 Folder: amazon_ml_project")
print(f"📄 Filename: training_data_amazon_balanced.csv")
print(f"🔢 Features: {feature_list}")
print(f"📊 Expected samples: {total_samples}")
print(f"📍 Includes geometry coordinates")

print(f"\n📋 Dataset Summary:")
print(f"   • Region: Amazon basin ({region_area_km2:.0f} km²)")
print(f"   • Resolution: 30m Landsat/SRTM")
print(f"   • Year: 2021 composite")
print(f"   • Features: {len(feature_list)} environmental variables")
print(f"   • Balance: {forest_samples} forest + {non_forest_samples} non-forest")
print(f"   • Total size: {total_samples} training samples")

print("\n⏱️ Export typically takes 2-5 minutes. Check Google Drive!")

💾 Exporting Training Dataset
🚀 Export initiated to Google Drive
📁 Folder: amazon_ml_project
📄 Filename: training_data_amazon_balanced.csv
🔢 Features: ['treecover', 'NDVI', 'NBR', 'elevation']
📊 Expected samples: 2000
📍 Includes geometry coordinates

📋 Dataset Summary:
   • Region: Amazon basin (5258491 km²)
   • Resolution: 30m Landsat/SRTM
   • Year: 2021 composite
   • Features: 4 environmental variables
   • Balance: 1456 forest + 544 non-forest
   • Total size: 2000 training samples

⏱️ Export typically takes 2-5 minutes. Check Google Drive!


 Visualization

In [20]:
print("🗺️ Creating Interactive Study Area Map")
print("=" * 35)

# Define the bounding box coordinates (fixed coordinates)
lon_min, lat_min, lon_max, lat_max = -71, -17, -44, -1

# Center of the box
center_lat = (lat_min + lat_max) / 2
center_lon = (lon_min + lon_max) / 2

print(f"📍 Map Center: {center_lat:.2f}°N, {center_lon:.2f}°W")

# Create enhanced folium map
m = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=6,
    tiles='OpenStreetMap'
)

# Add satellite imagery layer
folium.TileLayer(
    tiles='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}',
    attr='Esri WorldImagery',
    name='Satellite',
    overlay=False,
    control=True
).add_to(m)

# Add the bounding box with enhanced styling
folium.Rectangle(
    bounds=[[lat_min, lon_min], [lat_max, lon_max]],
    color='#FF4444',
    weight=3,
    fill=True,
    fillColor='#FF4444',
    fillOpacity=0.15,
    popup=f"""
    <div style='font-family: Arial; font-size: 12px;'>
    <h4>🌲 Amazon Study Region</h4>
    <b>Area:</b> {region_area_km2:.0f} km²<br>
    <b>Samples:</b> {total_samples:,}<br>
    <b>Forest:</b> {forest_samples} ({forest_samples/total_samples*100:.1f}%)<br>
    <b>Non-forest:</b> {non_forest_samples} ({non_forest_samples/total_samples*100:.1f}%)<br>
    <b>Resolution:</b> 30m<br>
    <b>Year:</b> 2021
    </div>
    """
).add_to(m)

# Enhanced legend
legend_html = f'''
<div style="position: fixed;
            bottom: 50px; left: 50px; width: 220px; height: 140px;
            background-color: white; border:2px solid grey; z-index:9999;
            font-size:12px; padding: 15px; border-radius: 8px; box-shadow: 3px 3px 10px rgba(0,0,0,0.3);">
<h4 style="margin-top:0;">🌲 Amazon Deforestation Study</h4>
<p><strong>📊 Total Samples:</strong> {total_samples:,}</p>
<p><strong>🌲 Forest:</strong> {forest_samples:,} ({forest_samples/total_samples*100:.1f}%)</p>
<p><strong>🏞️ Non-forest:</strong> {non_forest_samples:,} ({non_forest_samples/total_samples*100:.1f}%)</p>
<hr style="margin: 8px 0;">
<p><strong>📐 Area:</strong> {region_area_km2:.0f} km²</p>
<p><strong>🎯 Resolution:</strong> 30m</p>
</div>
'''

m.get_root().html.add_child(folium.Element(legend_html))

# Add layer control
folium.LayerControl().add_to(m)

print("✅ Interactive map created successfully!")
print(f"   • Study area: {region_area_km2:.0f} km² Amazon region")
print(f"   • Satellite imagery overlay available")
print(f"   • Enhanced legend with sample statistics")

# Display the map
m

🗺️ Creating Interactive Study Area Map
📍 Map Center: -9.00°N, -57.50°W
✅ Interactive map created successfully!
   • Study area: 5258491 km² Amazon region
   • Satellite imagery overlay available
   • Enhanced legend with sample statistics
