<a href="https://colab.research.google.com/github/rmccormick314/GFSAD/blob/main/preprocessing/Stacked_Image_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I. Set Up

## a. Requirements

In [1]:
repo_url = "https://raw.githubusercontent.com/GoogleCloudPlatform/python-docs-samples/main/people-and-planet-ai/land-cover-classification"

!wget --quiet {repo_url}/requirements.txt

!pip install --quiet --upgrade pip
!pip install --quiet -r requirements.txt

# Restart the runtime by ending the process.
exit()

## b. Set Project Info
Authenticates the user to Earth Engine and sets GCP variables.

In [7]:
from __future__ import annotations

import os
from google.colab import auth

auth.authenticate_user()

# Please fill in these values.
project = "gfsad-446404"  # @param {type:"string"}
bucket = "lgcip30"  # @param {type:"string"}
location = "us-west1"  # @param {type:"string"}

# Quick input validations.
assert project, "⚠️ Please provide a Google Cloud project ID"
assert bucket, "⚠️ Please provide a Cloud Storage bucket name"
assert not bucket.startswith(
    "gs://"
), f"⚠️ Please remove the gs:// prefix from the bucket name: {bucket}"
assert location, "⚠️ Please provide a Google Cloud location"

# Set GOOGLE_CLOUD_PROJECT for google.auth.default().
os.environ["GOOGLE_CLOUD_PROJECT"] = project

# Set the gcloud project for other gcloud commands.
!gcloud config set project {project}

Updated property [core/project].


## c. Clone GitHub Code

In [8]:
# Now let's get the code from GitHub and navigate to the sample.
!git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
%cd python-docs-samples/people-and-planet-ai/land-cover-classification

Cloning into 'python-docs-samples'...
remote: Enumerating objects: 118955, done.[K
remote: Counting objects: 100% (6969/6969), done.[K
remote: Compressing objects: 100% (1266/1266), done.[K
remote: Total 118955 (delta 6428), reused 5705 (delta 5703), pack-reused 111986 (from 3)[K
Receiving objects: 100% (118955/118955), 242.97 MiB | 27.21 MiB/s, done.
Resolving deltas: 100% (71878/71878), done.
Updating files: 100% (5500/5500), done.
/content/python-docs-samples/people-and-planet-ai/land-cover-classification/python-docs-samples/people-and-planet-ai/land-cover-classification


## d. Initialize Earth Engine

In [9]:
import ee
import google.auth

credentials, _ = google.auth.default()
ee.Initialize(
    credentials.with_quota_project(None),
    project=project,
    opt_url="https://earthengine-highvolume.googleapis.com",
)

# II. Create Dataset

## a. Set Parameters

In [10]:
YEAR = "2020" # @param {"type":"string"}
GAEZ = "8" # @param {"type":"string"}

## b. Visualize CDL

In [41]:
import ee
import folium

CLASSIFICATIONS = {
    "Perennial" : "1E90FF",
    "Fodder"    : "00FF00",
    "Dbl. Crop" : "FFD700",
    "Annual"    : "FF4500",
    "Fallow"    : "bfbf77"
}

image = (
    ee.Image("USDA/NASS/CDL/" + YEAR)
    .select("cropland")
    .remap(
        [1, 2, 3, 4, 5, 6, 10, 11, 12, 13, 14, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 81, 82, 83, 87, 88, 92, 111, 112, 121, 122, 123, 124, 131, 141, 142, 143, 152, 176, 190, 195, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 254],
        [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 4, 4, 1, 4, 4, 1, 1, 1, 4, 4, 1, 4, 1, 1, 4, 1, 1, 4, 1, 4, 3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 4, 4, 4, 4, 4, 4, 4, 1, 3]
    )
    .rename("label")
)

vis_params = {
    "max": len(CLASSIFICATIONS),
    "palette": list(CLASSIFICATIONS.values()),
    "bands": ["label"],
}
folium.Map(
    location=(38, -106),
    zoom_start=5,
    tiles=image.getMapId(vis_params)["tile_fetcher"].url_format,
    attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
)

## c. Build Image Stack

In [42]:
import logging
import datetime
import ee

states = ee.FeatureCollection("TIGER/2018/States")
S2 = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
csPlus = ee.ImageCollection("GOOGLE/CLOUD_SCORE_PLUS/V1/S2_HARMONIZED")
GAEZ = ee.FeatureCollection("users/rmccormick314/gfsad30-raez-74zones")
GCEP1k = ee.Image("USGS/GFSAD1000_V1")
LGRIP30 = ee.Image("projects/usgs-gee-wgscflag/assets/LGRIP30USA2020/LGRIP30USA2020-smoothed70m")
ESA = ee.ImageCollection("ESA/WorldCover/v200")

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# Create a handler (e.g., to output to console)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)

# Create a formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(handler)

# Set the year and region
YEAR_INT = int(YEAR)
START_DATE = ee.Date( YEAR+"-01-01" )
END_DATE = ee.Date( YEAR+"-12-31" )

# Define the region of interest
logger.info("Defining region of interest")

region = CDL = ee.Image("USDA/NASS/CDL/"+YEAR).select("cropland").geometry()

#region = ee.FeatureCollection('users/rmccormick314/gfsad30-raez-74zones')\
#    .filter(ee.Filter.eq("Zone", "8"))\
#    .geometry()

# Validate the geometry
if region == None:
    raise ValueError("The defined region is empty. Please check the filter criteria.")

# Load and prepare the GCEP data
logger.info("Preparing GCEP data")
GCEP = LGRIP30.clip(region)
GCEP = GCEP.updateMask(GCEP.neq(0))

# Load and process the Cropland Data Layer (CDL)
logger.info("Preparing CDL data")
CDL = ee.Image("USDA/NASS/CDL/"+YEAR).select("cropland").clip(region)
attribute_codes = [1, 2, 3, 4, 5, 6, 10, 11, 12, 13, 14, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 81, 82, 83, 87, 88, 92, 111, 112, 121, 122, 123, 124, 131, 141, 142, 143, 152, 176, 190, 195, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 254]
reclass_codes = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 5, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 4, 4, 1, 4, 4, 1, 1, 1, 4, 4, 1, 4, 1, 1, 4, 1, 1, 4, 1, 4, 3, 3, 4, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 4, 4, 4, 4, 4, 4, 4, 1, 3]
CDL = CDL.remap(attribute_codes, reclass_codes).rename(["CropType"]).updateMask(GCEP)


geometry = ee.Geometry.Polygon([
    [[-106.0, 37.0], [-106.0, 38.0], [-105.0, 38.0], [-105.0, 37.0], [-106.0, 37.0]]
])

# Load and process Sentinel-2 data
logger.info("Preparing Sentinel-2 data")
S2 = ee.ImageCollection('COPERNICUS/S2_HARMONIZED')\
    .filterDate(START_DATE, END_DATE)\
    .filterBounds(region)\
    .map(lambda img: img.clip(region).updateMask(GCEP))


In [43]:
# Calculate NDVI for each image
logger.info("Calculating NDVI")
def calculate_ndvi(image):
    ndvi = image.normalizedDifference(['B8', 'B4']).rename('NDVI')
    return image.addBands(ndvi)

S2 = S2.map(calculate_ndvi).select('NDVI')

# Temporal stacking
logger.info("Creating temporal stacks")
def temporal_stack(dates):
    def stack_period(date_range):
        start, end = date_range
        period_images = S2.filterDate(start, end)
        mosaic = period_images.max() if period_images.size().getInfo() > 0 else ee.Image(0).rename('NDVI')
        return mosaic

    return ee.ImageCollection([stack_period(d) for d in dates])

date_ranges = [
    # Year before
    ('2019-01-01', '2019-01-31'), ('2019-02-01', '2019-02-28'), ('2019-03-01', '2019-03-31'),
    ('2019-04-01', '2019-04-30'), ('2019-05-01', '2019-05-31'), ('2019-06-01', '2019-06-30'),
    ('2019-07-01', '2019-07-31'), ('2019-08-01', '2019-08-31'), ('2019-09-01', '2019-09-30'),
    ('2019-10-01', '2019-10-31'), ('2019-11-01', '2019-11-30'), ('2019-12-01', '2019-12-31'),

    # Current year
    ('2020-01-01', '2020-01-31'), ('2020-02-01', '2020-02-29'), ('2020-03-01', '2020-03-31'),
    ('2020-04-01', '2020-04-30'), ('2020-05-01', '2020-05-31'), ('2020-06-01', '2020-06-30'),
    ('2020-07-01', '2020-07-31'), ('2020-08-01', '2020-08-31'), ('2020-09-01', '2020-09-30'),
    ('2020-10-01', '2020-10-31'), ('2020-11-01', '2020-11-30'), ('2020-12-01', '2020-12-31'),

    # Year after
    ('2021-01-01', '2021-01-31'), ('2021-02-01', '2021-02-28'), ('2021-03-01', '2021-03-31'),
    ('2021-04-01', '2021-04-30'), ('2021-05-01', '2021-05-31'), ('2021-06-01', '2021-06-30'),
    ('2021-07-01', '2021-07-31'), ('2021-08-01', '2021-08-31'), ('2021-09-01', '2021-09-30'),
    ('2021-10-01', '2021-10-31'), ('2021-11-01', '2021-11-30'), ('2021-12-01', '2021-12-31'),
]

stacked_images = temporal_stack(date_ranges)
stacked_image = stacked_images.toBands().clip(region)

In [57]:
# Stratified sampling
stratified_sample = CDL.updateMask(GCEP).stratifiedSample(
    numPoints=4000,
    classBand='CropType',
    region=region,
    scale=30,
    seed=69420,
    geometries=True,
    classValues=[1, 2, 3, 4, 5],
    classPoints=[150, 150, 150, 3400, 150]
    # classPoints: [1000,1000,1000,1000,1000]
)

stacked_image = image.updateMask(GCEP)
band_names = stacked_image.bandNames().getInfo() # Get band names as a list
training_data = stacked_image.sampleRegions(
    collection=stratified_sample,
    properties=['CropType'],
    scale=30
)

def feature_collection_to_pandas(feature_collection):
  """Converts an ee.FeatureCollection to a pandas DataFrame.

  Args:
    feature_collection: The ee.FeatureCollection to convert.

  Returns:
    A pandas DataFrame.
  """
  features = feature_collection.getInfo()['features']
  df = pd.DataFrame([feature['properties'] for feature in features])
  return df

training_df = feature_collection_to_pandas(training_data)
print(training_df.head())



EEException: Computation timed out.

In [32]:
# prompt: generate a training data set using CDL as labels and stacked_image as data. use stratified sample and limit the number of points per class to 1000. then prepare the data for use in a neural network

import ee
# Instead of importing the entire numpy library, import only the unique function
# But instead of numpy.unique, we are going to use ee.List.distinct()
# to get the list of unique crop types
#from numpy import unique

# Assuming 'stacked_image', 'CDL', and 'region' are defined from the previous code

# Function to sample the data
def sample_data(image, labels, region, num_samples_per_class):
    samples = image.addBands(labels).sampleRegions(
        collection=region,
        properties=['CropType'],
        scale=30  # Adjust the scale based on your data's resolution
    )

    # Stratified sampling
    sampled_data = []
    # Convert the server-side list to a client-side list to get unique values
    # This avoids using np.unique, which produces numpy data types not recognized by ee

    # Get a list of unique crop types
    distinct_crop_types = labels.reduceRegion(
        reducer=ee.Reducer.toList(),
        geometry=region,
        scale=60
    ).get('CropType')

    # We need to cast the result to ee.List
    distinct_crop_types = ee.List(distinct_crop_types).distinct()

    # Now loop through the list of unique values using ee.List.map()
    # and filter the samples
    sampled_data = distinct_crop_types.map(lambda class_label: samples.filter(ee.Filter.eq('CropType', class_label)).limit(num_samples_per_class))

    #sampled_data = ee.FeatureCollection(sampled_data).flatten()

    # Cast the result to ee.FeatureCollection and flatten
    sampled_data = ee.FeatureCollection(sampled_data).flatten()

    return sampled_data


# Generate training dataset with stratified sampling and a maximum of 1000 samples per class
training_data = sample_data(stacked_image, CDL, region, 1000)

# Export training data to a CSV file in Google Cloud Storage
task = ee.batch.Export.table.toCloudStorage(
    collection=training_data,
    description='export_training_data',
    bucket=bucket,
    fileNamePrefix='training_data',
    fileFormat='CSV',
    selectors=['NDVI_' + str(i) for i in range(1,37)] + ['CropType'], # Update band names and label if needed
)
task.start()

# Check the status of the export task (optional)
print(f"Export task ID: {task.id}")


Export task ID: XRXPEIEL6K6NMVY4HORPQH6X
CommandException: No URLs matched: gs://lgcip30/training_data.csv


FileNotFoundError: [Errno 2] No such file or directory: 'training_data.csv'

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Download the exported CSV file
!gsutil cp gs://{bucket}/training_data.csv ./training_data.csv

# Load the data into a pandas DataFrame
df = pd.read_csv('training_data.csv')

# Preprocess the data (handle missing values, scale features, etc.)
# Example: Remove rows with any NaN value
df.dropna(inplace=True)

# Separate features (X) and labels (y)
X = df.drop('CropType', axis=1).values
y = df['CropType'].values

# One-hot encode labels (convert categorical labels to numerical)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y.reshape(-1, 1))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Normalize the data (scale to [0,1] or standardize)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Data prepared for neural network.")
print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, Labels shape: {y_test.shape}")

Copying gs://lgcip30/training_data.csv...
/ [1 files][  289.0 B/  289.0 B]                                                
Operation completed over 1 objects/289.0 B.                                      


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [None]:
# Save or use image
logger.info("Exporting image to Google Cloud Storage")
output_bucket = bucket
output_project = project

def export_to_bucket(image, bucket, region):
    task = ee.batch.Export.image.toCloudStorage(
        image=image,
        description='stacked_image_export',
        bucket=bucket,
        fileNamePrefix='stacked_image',
        region=region,  # Pass the GeoJSON bounding box
        scale=30,
        maxPixels=1e13
    )
    task.start()
    logger.info("Export task started: %s", task.id)

export_to_bucket(stacked_image, output_bucket, region)
logger.info("Script finished.")