# Data preparation

1) Obtaining a cloud-free composite for the area of interest (AOI) using the Google Earth Engine (GEE) Python API
2) Masking the composite raster with the label polygons, which carry the following classes:

1 - ``trees`` \
2 - ``flooded vegetation`` \
3 - ``open water`` \
4 - ``settlements`` \
5 - ``bare soil`` - the target class whose dynamics are of most interest, along with ``trees`` \
6 - ``agriculture and grassland`` \
7 - ``shrubs``

In [3]:
#libraries
import ee
import pandas as pd
#import altair as alt 
import numpy as np
import geemap.foliumap as geemap
import folium
import geopandas as gpd
import rasterio as rio
from rasterio.mask import mask

import requests
import json
from geojson import Point, Feature, FeatureCollection, dump

import matplotlib.pyplot as plt

In [4]:
import warnings
# Silences every warning category for the rest of the session, which also
# hides deprecation notices from the libraries used below.
# NOTE(review): consider narrowing this filter to specific categories.
warnings.filterwarnings('ignore')

## Sentinel-2 summer composite download using GEE

Based on the [Sentinel-2 Cloud Masking with s2cloudless](https://developers.google.com/earth-engine/tutorials/community/sentinel-2-s2cloudless) tutorial.

In [18]:
#authorisation
# Interactive Earth Engine sign-in; on success an authorization token is
# saved (see the cell output), so this only needs re-running when the
# stored token is missing or no longer valid.
ee.Authenticate()


Successfully saved authorization token.


In [19]:
# Initialise the Earth Engine client for this kernel session; must run
# before any other `ee.*` call below.
ee.Initialize()

In [21]:
#all helpers from GEE
def get_s2_sr_cld_col(aoi, start_date, end_date):
    """Return Sentinel-2 SR images with their s2cloudless image attached.

    Parameters
    ----------
    aoi : geometry passed to ``filterBounds`` for both collections.
    start_date, end_date : values passed to ``filterDate`` for both collections.

    Returns
    -------
    ee.ImageCollection
        The filtered 'COPERNICUS/S2_SR' collection in which every image
        carries the matching 'COPERNICUS/S2_CLOUD_PROBABILITY' image under
        the 's2cloudless' property.

    Notes
    -----
    Reads the module-level constant ``CLOUD_FILTER`` at call time.
    """
    # Surface-reflectance scenes over the AOI and period, keeping only those
    # whose scene-level cloud percentage does not exceed CLOUD_FILTER.
    scene_cloud_limit = ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER)
    surface_reflectance = ee.ImageCollection('COPERNICUS/S2_SR')
    surface_reflectance = surface_reflectance.filterBounds(aoi)
    surface_reflectance = surface_reflectance.filterDate(start_date, end_date)
    surface_reflectance = surface_reflectance.filter(scene_cloud_limit)

    # Cloud-probability images for the same area and period.
    cloud_probability = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
    cloud_probability = cloud_probability.filterBounds(aoi)
    cloud_probability = cloud_probability.filterDate(start_date, end_date)

    # Pair the two collections on their shared 'system:index' property.
    same_index = ee.Filter.equals(leftField='system:index',
                                  rightField='system:index')
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=surface_reflectance,
        secondary=cloud_probability,
        condition=same_index,
    )
    return ee.ImageCollection(joined)

def add_cloud_bands(img):
    """Append the cloud probability and a binary cloud flag to ``img``.

    The image stored under the 's2cloudless' property supplies the
    'probability' band; pixels whose probability is greater than the
    module-level ``CLD_PRB_THRESH`` are flagged in a new band named 'clouds'.

    Returns ``img`` with the 'probability' and 'clouds' bands added.
    """
    s2cloudless_img = ee.Image(img.get('s2cloudless'))
    probability = s2cloudless_img.select('probability')

    # 1 where the probability exceeds the threshold, 0 elsewhere.
    cloud_flag = probability.gt(CLD_PRB_THRESH).rename('clouds')

    extra_bands = ee.Image([probability, cloud_flag])
    return img.addBands(extra_bands)

def add_shadow_bands(img):
    """Append dark-pixel, cloud-projection and shadow bands to ``img``.

    Expects ``img`` to already carry the 'clouds' band (see
    ``add_cloud_bands``). Reads the module-level constants
    ``NIR_DRK_THRESH`` and ``CLD_PRJ_DIST`` at call time.

    Returns ``img`` with three added bands: 'dark_pixels',
    'cloud_transform' and 'shadows'.
    """
    # SCL value 6 is treated as water; everything else is "not water".
    land = img.select('SCL').neq(6)

    # Dark NIR pixels that are not water are potential cloud shadows.
    # The threshold is expressed as a fraction of the 1e4 band scale.
    reflectance_scale = 1e4
    nir_cutoff = NIR_DRK_THRESH * reflectance_scale
    dark_pixels = img.select('B8').lt(nir_cutoff).multiply(land).rename('dark_pixels')

    # Direction in which shadows are projected from clouds (assumes UTM projection).
    solar_azimuth = ee.Number(img.get('MEAN_SOLAR_AZIMUTH_ANGLE'))
    shadow_azimuth = ee.Number(90).subtract(solar_azimuth)

    # Project the cloud band along that direction for CLD_PRJ_DIST*10 units,
    # evaluated on the first band's projection at a scale of 100.
    projected = img.select('clouds').directionalDistanceTransform(
        shadow_azimuth, CLD_PRJ_DIST*10)
    projected = projected.reproject(crs=img.select(0).projection(), scale=100)
    cld_proj = projected.select('distance').mask().rename('cloud_transform')

    # A shadow is a dark pixel that falls inside the cloud projection.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')

    return img.addBands(ee.Image([dark_pixels, cld_proj, shadows]))

def add_cld_shdw_mask(img):
    """Append the combined cloud/shadow mask band 'cloudmask' to ``img``.

    Runs ``add_cloud_bands`` and ``add_shadow_bands`` on the image, merges
    their 'clouds' and 'shadows' bands into one binary mask, cleans it up
    and returns the image with every intermediate band plus 'cloudmask'.
    Reads the module-level constant ``BUFFER`` at call time.
    """
    with_components = add_shadow_bands(add_cloud_bands(img))

    # 1 where a pixel is cloud or shadow (or both), 0 otherwise.
    clouds = with_components.select('clouds')
    shadows = with_components.select('shadows')
    combined = clouds.add(shadows).gt(0)

    # focalMin removes small patches, focalMax dilates what remains by the
    # BUFFER input. The 20 m scale is for speed and assumes clouds do not
    # require 10 m precision.
    cleaned = combined.focalMin(2).focalMax(BUFFER*2/20)
    cleaned = cleaned.reproject(crs=img.select([0]).projection(), scale=20)
    cloud_shadow_mask = cleaned.rename('cloudmask')

    return with_components.addBands(cloud_shadow_mask)

# Helper that renders an Earth Engine image as a tile layer on a folium map.
def add_ee_layer(self, ee_image_object, vis_params, name, show=True, opacity=1, min_zoom=0):
    """Add ``ee_image_object`` to this folium map as a toggleable overlay.

    ``vis_params`` is forwarded to ``ee.Image.getMapId``; ``name``, ``show``,
    ``opacity`` and ``min_zoom`` are forwarded to the folium ``TileLayer``.
    """
    tile_url = ee.Image(ee_image_object).getMapId(vis_params)['tile_fetcher'].url_format
    layer = folium.raster_layers.TileLayer(
        tiles=tile_url,
        attr='Map Data &copy; <a href="https://earthengine.google.com/">Google Earth Engine</a>',
        name=name,
        show=show,
        opacity=opacity,
        min_zoom=min_zoom,
        overlay=True,
        control=True,
    )
    layer.add_to(self)

# Expose the helper as a method on every folium map.
folium.Map.add_ee_layer = add_ee_layer

def apply_cld_shdw_mask(img):
    """Return the 'B.*' bands of ``img`` with cloud/shadow pixels masked out.

    Expects ``img`` to carry the 'cloudmask' band added by
    ``add_cld_shdw_mask``.
    """
    # Invert the mask so that cloud/shadow pixels are 0 and clear pixels are 1.
    clear_sky = img.select('cloudmask').Not()

    reflectance_bands = img.select('B.*')
    return reflectance_bands.updateMask(clear_sky)

In [22]:
#bbox of the area of interest

# The context manager closes the GeoJSON file as soon as it has been parsed.
with open('data/monche_bbox.geojson') as f_kola:
    kola_bbox = json.load(f_kola)

# The AOI is the geometry of the first feature in the file.
aoi_poly = ee.Geometry.MultiPolygon(kola_bbox['features'][0]['geometry']['coordinates'])

# Show the AOI on an interactive map for a visual sanity check.
Map = geemap.Map()
Map.addLayer(aoi_poly,{},'AOI_Monchegorsk')
Map.setCenter(32.8, 67.92,  11)
Map

In [133]:
#settings of example
# These names are read as globals by the GEE helper functions defined above,
# so this cell must run before the composite is built.
AOI = aoi_poly  # area of interest geometry built from the bbox GeoJSON
START_DATE = '2021-06-15'  # start of the compositing period (passed to filterDate)
END_DATE = '2021-08-25'  # end of the compositing period (passed to filterDate)
CLOUD_FILTER = 60  # keep scenes whose CLOUDY_PIXEL_PERCENTAGE is <= this value
CLD_PRB_THRESH = 15  # s2cloudless probability above which a pixel is flagged as cloud
NIR_DRK_THRESH = 0.15  # B8 values below this * 1e4 mark a dark (potential shadow) pixel
CLD_PRJ_DIST = 1  # multiplied by 10 and used as the cloud-shadow projection distance
BUFFER = 50  # dilation around cloud/shadow pixels (focalMax radius is BUFFER*2/20)

In [134]:
#getting cloud-free median composite for AOI and period of time
# Build the joined SR + s2cloudless collection once; a second, identical
# collection under another name was never used and has been dropped.
s2_sr_cld_col = get_s2_sr_cld_col(AOI, START_DATE, END_DATE)

# Per image: add the cloud/shadow mask, apply it, then reduce the whole
# stack to a per-pixel median.
s2_sr_median = (s2_sr_cld_col.map(add_cld_shdw_mask)
                             .map(apply_cld_shdw_mask)
                             .median())
kola_2021 = s2_sr_median.clip(AOI)

# Create a folium map object.
# The centroid coordinates are reversed because folium expects the
# opposite coordinate order from the one Earth Engine returns.
center = AOI.centroid(10).coordinates().reverse().getInfo()
m = folium.Map(location=center, zoom_start=12)

# Add layers to the folium map.
m.add_ee_layer(kola_2021,
                {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 2500, 'gamma': 1.1},
                'S2 cloud-free mosaic', True, 1, 9)

# Add a layer control panel to the map.
m.add_child(folium.LayerControl())

# Display the map.
display(m)

In [122]:
# Display the clipped median composite object for inspection.
kola_2021

In [125]:
# Export the median composite to a GeoTIFF file. WARNING: the file is written to your Google Drive, not to local disk. Uncomment the lines below to run the export.

#task = ee.batch.Export.image.toDrive(image=kola_2021,
#                                     description='2021_kola_median_composite',
#                                     scale=30,
#                                     #region=AOI,
#                                     fileNamePrefix='2021_kola_median_composite',
#                                     crs='EPSG:4326',
#                                     fileFormat='GeoTIFF')
#task.start()
#task.status()

{'state': 'READY',
 'description': '2018_kola_median_composite',
 'creation_timestamp_ms': 1692556318737,
 'update_timestamp_ms': 1692556318737,
 'start_timestamp_ms': 0,
 'task_type': 'EXPORT_IMAGE',
 'id': 'OLXCRKALKEDU4OYSVAEIMHLV',
 'name': 'projects/earthengine-legacy/operations/OLXCRKALKEDU4OYSVAEIMHLV'}

## Getting dataset for predictions

In [5]:
#helper indices function

def get_spectral_indices(df, scale=1.0):
    """Add spectral-index columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the band columns 'B2', 'B3', 'B4', 'B8', 'B11' and
        'B12'. The frame is modified in place (callers rely on this).
    scale : float, default 1.0
        Divisor applied to the band values before the indices are computed.
        With the default the bands are used exactly as stored. Pure ratio
        indices (ndvi, msi, bsi, ndbi, mbi, ndsoil, blfei) do not depend on
        it. ``evi`` and ``savi`` contain additive constants and ``nbi`` /
        ``bal`` are not ratios, so their values depend on the input units;
        pass the band scale factor (e.g. ``1e4``) to evaluate them on
        unscaled values instead.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns 'ndvi', 'evi', 'savi',
        'msi', 'bsi', 'ndbi', 'nbi', 'bal', 'mbi', 'ndsoil' and 'blfei'
        added (or overwritten).
    """
    def band(name):
        # Leave the values untouched for the default scale so the result is
        # identical to using the stored columns directly.
        values = df[name]
        return values if scale == 1 else values / scale

    blue = band('B2')
    green = band('B3')
    red = band('B4')
    nir = band('B8')
    swir_b11 = band('B11')
    swir_b12 = band('B12')

    df['ndvi'] = (nir - red) / (nir + red)
    df['evi'] = 2.5 * ((nir - red) / (nir + 6 * red - 7.5 * blue + 1))
    df['savi'] = (nir - red) / (nir + red + 0.428) * (1.428)
    df['msi'] = swir_b11 / nir
    df['bsi'] = ((swir_b11 + red) - (nir + blue)) / ((swir_b11 + red) + (nir + blue))
    #https://doi.org/10.1016/j.envc.2022.100568
    df['ndbi'] = (swir_b11 - nir) / (swir_b11 + nir)
    df['nbi'] = (red * swir_b11) / nir

    #https://awesome-ee-spectral-indices.readthedocs.io/en/latest/list.html#soil
    df['bal'] = (red + swir_b11 - nir)
    df['mbi'] = ((swir_b11 - swir_b12 - nir) / (swir_b11 + swir_b12 + nir)) + 0.5
    df['ndsoil'] = (swir_b12 - green) / (swir_b12 + green)
    # Mean of green, red and B12 compared against B11.
    visible_swir_mean = (green + red + swir_b12) / 3.0
    df['blfei'] = (visible_swir_mean - swir_b11) / (visible_swir_mean + swir_b11)

    return df

In [6]:
# Input files for the dataset extraction below.
label_path = 'data/labels_satellite_osm.geojson'  # label polygons (GeoJSON FeatureCollection)
image_path = 'data/2021_kola_median_composite.tif'  # raster that is masked by the label polygons

In [7]:
# New labels corrected with OSM; the context manager closes the file once
# the GeoJSON has been parsed.
with open(label_path) as label_osm:
    labels = json.load(label_osm)

# Give every label polygon a unique 1-based plot number so that the
# train/test split can later be stratified by plot to avoid data overlap.
for count, item in enumerate(labels['features'], start=1):
    item['plot'] = count
labels

{'type': 'FeatureCollection',
 'name': 'yet_another_osm_4',
 'crs': {'type': 'name',
  'properties': {'name': 'urn:ogc:def:crs:OGC:1.3:CRS84'}},
 'features': [{'type': 'Feature',
   'properties': {'id': 9},
   'geometry': {'type': 'MultiPolygon',
    'coordinates': [[[[32.79289856178918, 67.9563941463257, 0.0],
       [32.79289856178918, 67.95715441232346, 0.0],
       [32.79538170622863, 67.95715441232346, 0.0],
       [32.79538170622863, 67.9563941463257, 0.0],
       [32.79289856178918, 67.9563941463257, 0.0]]]]},
   'plot': 1},
  {'type': 'Feature',
   'properties': {'id': 9},
   'geometry': {'type': 'MultiPolygon',
    'coordinates': [[[[32.80899000244736, 67.94912120494185, 0.0],
       [32.80899000244736, 67.94991127322693, 0.0],
       [32.81142927760853, 67.94991127322693, 0.0],
       [32.81142927760853, 67.94912120494185, 0.0],
       [32.80899000244736, 67.94912120494185, 0.0]]]]},
   'plot': 2},
  {'type': 'Feature',
   'properties': {'id': 4},
   'geometry': {'type': 'Mul

In [8]:
#getting dataset of band values in pixels of polygons
# One frame per label polygon is collected and concatenated once after the
# loop, instead of growing (and re-filtering, re-indexing and recomputing
# the indices of) the full table on every iteration.
pixel_frames = []

# Read-only mode is sufficient: the raster is sampled, never modified.
with rio.open(image_path, 'r') as dataset:
    band_names = list(dataset.descriptions)
    for row in labels['features']:
        # mask() returns (array, transform); the array is cropped to the
        # polygon and flattened to one row per pixel, one column per band.
        pixels = rio.mask.mask(dataset,
                               [row['geometry']],
                               all_touched=False,
                               crop=True)[0]
        df = pd.DataFrame(pixels.reshape([dataset.count, -1]).T,
                          columns=band_names)
        df['id'] = row['properties']['id']
        df['plot'] = row['plot']
        pixel_frames.append(df)

if pixel_frames:
    general = pd.concat(pixel_frames)
    # Keep only pixels with at least one positive band value; this drops
    # the pixels that fall outside the polygon within the cropped window.
    # `axis` must be given by keyword: pandas 2.x rejects it positionally.
    general = general[(general[band_names] > 0).any(axis=1)]
    general = general.reset_index(drop=True)
    # Adds the spectral-index columns to `general` in place.
    get_spectral_indices(general)
else:
    # No label polygons: keep the empty table the loop would have left.
    general = pd.DataFrame()

general

Unnamed: 0,B1,B2,B3,B4,B5,B6,B7,B8,B8A,B9,...,evi,savi,msi,bsi,ndbi,nbi,bal,mbi,ndsoil,blfei
0,446.0,494.0,652.0,680.0,1021.0,1645.0,1888.0,1932.0,2096.0,2058.0,...,1.356153,0.684366,1.108696,0.075457,0.051546,753.913043,890.0,0.260383,0.410488,-0.379266
1,482.5,445.0,596.5,595.5,993.5,1577.0,1776.5,1873.5,1991.0,1910.0,...,1.514218,0.739031,1.161996,0.089177,0.074929,691.968775,899.0,0.277084,0.444858,-0.408226
2,482.5,445.0,596.5,595.5,993.5,1577.0,1776.5,1873.5,1991.0,1910.0,...,1.514218,0.739031,1.161996,0.089177,0.074929,691.968775,899.0,0.277084,0.444858,-0.408226
3,482.5,704.5,888.5,1011.0,1206.5,1521.5,1677.5,1711.0,1838.0,1910.0,...,0.701614,0.367172,1.405319,0.171497,0.168509,1420.777031,1704.5,0.294876,0.370528,-0.305906
4,482.5,704.5,888.5,1011.0,1206.5,1521.5,1677.5,1711.0,1838.0,1910.0,...,0.701614,0.367172,1.405319,0.171497,0.168509,1420.777031,1704.5,0.294876,0.370528,-0.305906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3933,742.0,734.0,872.0,930.0,1084.0,1169.0,1221.0,1165.0,1226.0,1616.0,...,0.473409,0.160149,1.460944,0.161774,0.187304,1358.678112,1467.0,0.272057,0.277548,-0.208521
3934,742.0,888.0,1095.0,1209.0,1307.0,1344.0,1393.0,1360.0,1313.0,1616.0,...,0.193095,0.083921,1.388971,0.158997,0.162819,1679.265441,1738.0,0.253641,0.233998,-0.164253
3935,742.0,888.0,1095.0,1209.0,1307.0,1344.0,1393.0,1360.0,1313.0,1616.0,...,0.193095,0.083921,1.388971,0.158997,0.162819,1679.265441,1738.0,0.253641,0.233998,-0.164253
3936,592.0,821.5,1021.5,1195.5,1324.0,1424.0,1514.0,1592.0,1649.0,1581.5,...,0.380555,0.203091,1.534862,0.202478,0.211002,1834.927293,2047.0,0.293409,0.350501,-0.256137


In [9]:
# Persist the per-pixel table (bands, class id, plot number, indices)
# without the positional index column.
general.to_csv('data/general_osm_2021.csv', index=False)  

🙌