In [6]:
#!/usr/bin/env python3
"""
Notebook to extract training and validation data from shape files to a text file parallelised across features

Inputs custom function for temporal statistics calculation or multiple products

"""

# Load modules

import os
import datacube
import numpy as np
import pandas as pd
import xarray as xr
import subprocess as sp
import geopandas as gpd
from pathlib import Path
from odc.io.cgroups import get_cpu_quota
from datacube.utils.geometry import assign_crs

import sys
from dea_tools.bandindices import calculate_indices
from dea_tools.classification import collect_training_data

import warnings
warnings.filterwarnings("ignore")

# --- Run configuration -------------------------------------------------------

# Need ls5 for 2010 and ls8 for 2015+
time = "2010"
product = ["ga_ls5t_nbart_gm_cyear_3"]
# ga_ls8c_nbart_gm_cyear_3

# Point the path to the correct file - this can be either the training data or the validation data
# Example of path for validation: /home/jovyan/arvum/data/dea_landcover/validation/{time}_validation_complete.shp

path = f"/home/jovyan/arvum/data/dea_landcover/{time}_merged/{time}_merged.shp"

# The field variable refers to the true classification column.
# The true classification column for the training data is classnum
# while the true classification in the validation data is called output

field = "classnum"
#field = 'output'

# NOTE(review): fail_ratio, fail_threshold and return_coords are defined here but
# the collect_training_data call in this notebook does not pass fail_ratio or
# fail_threshold and hard-codes return_coords=False - confirm which is intended.
zonal_stats = 'median'
resolution = (-30, 30)
fail_ratio = 0.05
fail_threshold  = 0.02
return_coords=True

# --- Number of worker processes ----------------------------------------------

# Manual override for the number of CPUs; set to None to use the detected value.
ncpus_override = 31

# get_cpu_quota() can return None (no quota set, or it could not be read), so
# guard before rounding and fall back to the machine's CPU count.
cpu_quota = get_cpu_quota()
if cpu_quota is not None:
    detected_ncpus = max(1, round(cpu_quota))
else:
    detected_ncpus = os.cpu_count() or 1

ncpus = detected_ncpus if ncpus_override is None else ncpus_override

# Report the value that is actually used, alongside what was detected.
print('ncpus = ' + str(ncpus) + ' (detected: ' + str(detected_ncpus) + ')')


In [3]:
def feature_layers(query):
    """
    Load the feature layers for one datacube query.

    Loads the product(s) named by the notebook-level ``product`` variable,
    adds a set of band indices to them, loads the ``ga_ls_fc_pc_cyear_3``
    product onto the same grid, and merges both into one dataset.

    Parameters
    ----------
    query : dict
        Keyword arguments forwarded to ``dc.load`` (e.g. ``time``,
        ``resolution``, ``group_by`` and whatever spatial extent the caller
        adds). Must not contain ``product`` or ``measurements``, which are
        set here.

    Returns
    -------
    xarray.Dataset
        The loaded bands plus the calculated indices (``drop=False`` keeps
        the original bands), merged with the ``ga_ls_fc_pc_cyear_3``
        variables.
    """
    measurements = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'sdev', 'edev', 'bcdev']
    indices = ["NDVI", "MNDWI", "BAI", "BUI", "BSI", "TCG", "TCW", "TCB", "NDMI", "LAI", "EVI", "AWEI_sh", "BAEI", "NDSI", "SAVI", "NBR"]

    # Connect to the datacube
    dc = datacube.Datacube(app='custom_feature_layers')

    # Load ls geomedian
    ds = dc.load(product=product, measurements=measurements, **query)

    # Calculate some band indices
    gm = calculate_indices(ds, index=indices, drop=False, collection="ga_ls_3")

    # Load the second product on the same grid as the first. Take the time
    # range from the query that was passed in, so both loads always cover the
    # same period; fall back to the notebook-level `time` only when the query
    # does not specify one.
    fc = dc.load(product='ga_ls_fc_pc_cyear_3', time=query.get("time", time), like=ds.geobox)

    return xr.merge([gm, fc])

In [4]:
# Base datacube query, built from the settings in the configuration cell.
# It is passed to collect_training_data as `dc_query`.
query = dict(
    time=time,
    resolution=resolution,
    group_by="solar_day",
)

In [9]:
# Read the shapefile at `path` (training or validation data, depending on the
# configuration cell). The result is used as the `gdf` input for the extraction.

input_data = gpd.read_file(path)

# Validation data only: uncomment to convert the 'output' label column to numeric
# (presumably it is stored as text in the validation shapefile - confirm).
# input_data['output'] = pd.to_numeric(input_data['output'])

# Uncomment to preview the first five rows
# input_data.head()

In [14]:
%%time
# Only the first 100 features are extracted here.
sample_gdf = input_data[:100]

# NOTE(review): return_coords is hard-coded to False, and the fail_ratio /
# fail_threshold values from the configuration cell are not passed, so the
# function's own defaults apply - confirm this is intended.
column_names, model_input = collect_training_data(
    gdf=sample_gdf,
    dc_query=query,
    feature_func=feature_layers,
    field=field,
    zonal_stats=zonal_stats,
    return_coords=False,
    ncpus=ncpus,
)

Taking zonal statistic: median
Collecting training data in parallel mode


  0%|          | 0/100 [00:00<?, ?it/s]

Percentage of possible fails after run 1 = 0.0 %
Removed 0 rows wth NaNs &/or Infs
Output shape:  (100, 36)
CPU times: user 277 ms, sys: 198 ms, total: 475 ms
Wall time: 15.6 s


In [15]:
# Show the column order, a blank separator line, then a compact preview of
# the extracted array (2 decimal places, tiny values printed as 0)
print(column_names, end="\n\n")
print(np.array_str(model_input, precision=2, suppress_small=True))

['output', 'blue', 'green', 'red', 'nir', 'swir1', 'swir2', 'sdev', 'edev', 'bcdev', 'NDVI', 'MNDWI', 'BAI', 'BUI', 'BSI', 'TCG', 'TCW', 'TCB', 'NDMI', 'LAI', 'EVI', 'AWEI_sh', 'BAEI', 'NDSI', 'SAVI', 'NBR', 'pv_pc_10', 'pv_pc_50', 'pv_pc_90', 'bs_pc_10', 'bs_pc_50', 'bs_pc_90', 'npv_pc_10', 'npv_pc_50', 'npv_pc_90', 'qa']

[[ 112.  471.  947. ...   33.   47.    2.]
 [ 112.  551. 1144. ...   50.   54.    2.]
 [ 112.  536. 1124. ...   27.   40.    2.]
 ...
 [ 112.  558.  937. ...   39.   56.    2.]
 [ 111.  769. 1130. ...   58.   84.    2.]
 [ 111.  809. 1396. ...   41.   62.    2.]]


In [16]:
# Save the data to disk

# Per the configuration cell, training shapefiles keep the true class in
# "classnum" and validation shapefiles keep it in "output". Deriving the mode
# from `field` removes the need to comment lines in and out by hand.
is_training = field == "classnum"

# Append the year as a constant column. New objects are built instead of
# overwriting `model_input` / appending to `column_names`, so re-running this
# cell does not stack an extra "time" column on every execution.
time_column = np.full((model_input.shape[0], 1), int(time))
model_input_with_time = np.hstack((model_input, time_column))
print(model_input_with_time.shape)

output_columns = [*column_names, "time"]

# Name the file after the kind of data actually being written.
dataset_kind = "training" if is_training else "validation"
output_file = f"{time}_{dataset_kind}_data.csv"

data = pd.DataFrame(data=model_input_with_time, columns=output_columns)

if is_training:
    # Training set only: replace the multi-class label with a binary one
    # (1 where the class is 111, otherwise 0) and drop the original label.
    # NOTE(review): 111 is presumably the "cultivated" class - confirm.
    data = data.assign(
        binary_class=(data[field] == 111).astype(int)
    ).drop(columns=[field])

data.to_csv(output_file, index=False)

(100, 37)


In [9]:
# Merge the 2010 and 2015 training tables into a single CSV.
# The combined file is the one used to train the cultivated model.

training_csvs = [
    "/home/jovyan/arvum/data/dea_landcover/c3/2010_training_data.csv",
    "/home/jovyan/arvum/data/dea_landcover/c3/2015_training_data.csv",
]
data_2010, data_2015 = (pd.read_csv(csv_path) for csv_path in training_csvs)

# Stack the two years row-wise; the index is not written to the output file
data = pd.concat([data_2010, data_2015])
data.to_csv(
    "/home/jovyan/arvum/data/dea_landcover/c3/2010_2015_training_data.csv",
    index=False,
)