## Extraction of Data from the HydroSHEDS Dataset (GEE) ##

This notebook follows the same extraction process that is used for all the other datasets.

In [1]:
# Import Google Earth Engine API and Initialize it. 
import ee
import pandas as pd

# Authenticate the session, then initialise the Earth Engine client
# against the project used throughout this notebook.
GEE_PROJECT = "ey-data-and-ai-challenge"

ee.Authenticate()
ee.Initialize(project=GEE_PROJECT)

In [10]:
# Load the water quality training samples and drop the three measurement
# columns, which are not needed for the extraction. Latitude, Longitude and
# Sample Date are kept; only the coordinates and the row id are sent to
# Earth Engine below.
dropped_columns = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

wq_df = (
    pd.read_csv('../data/water_quality_training_dataset.csv')
    .drop(columns=dropped_columns)
    # Row id taken from the index; used later to join the exported values back.
    .assign(id=lambda frame: frame.index)
)
wq_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,id
0,-28.760833,17.730278,02-01-2011,0
1,-26.861111,28.884722,03-01-2011,1
2,-26.45,28.085833,03-01-2011,2
3,-27.671111,27.236944,03-01-2011,3
4,-27.356667,27.286389,03-01-2011,4


In [11]:
# Convert coordinates to ee.Features: one buffered point per sample, tagged
# with the sample's row id so results can be joined back afterwards.

# .tolist() yields native Python numbers for the Earth Engine client.
longitudes = wq_df['Longitude'].tolist()
latitudes = wq_df['Latitude'].tolist()
sample_ids = wq_df['id'].tolist()

features = [
    ee.Feature(
        # Add a 100m buffer in case of inexact coordinates.
        ee.Geometry.Point([lon, lat]).buffer(100),
        {'id': sample_id},
    )
    for lon, lat, sample_id in zip(longitudes, latitudes, sample_ids)
]

In [16]:
# Load the HydroSHEDS image from Earth Engine, keep only band 'b1' and give
# it a descriptive name.
hydro_img = (
    ee.Image("WWF/HydroSHEDS/15ACC")
    .select('b1')
    .rename('flow_accumulation')
)

# Mean reducer whose output is named 'flow_accumulation'.
reducer = ee.Reducer.mean().setOutputs(['flow_accumulation'])

In [17]:
# Wrap the list of features in a single ee.FeatureCollection.
fc = ee.FeatureCollection(features)

In [18]:
# Reduce the image over every buffered point using reduceRegions().
# NOTE(review): scale=464 presumably matches the dataset's native pixel size
# in metres - confirm against the dataset documentation.
hydro_collection = hydro_img.reduceRegions(
    collection=fc,
    reducer=reducer,
    scale=464,
)

In [19]:
# Export the reduced feature collection to Google Drive as a CSV table.
# NOTE(review): the next cell reads ../data/hydrosheds_features_training.csv
# from local disk; presumably the exported file has to be downloaded from
# Drive into ../data once the task has finished - confirm.
export_options = {
    'collection': hydro_collection,
    'description': "hydro_csv_export",
    'fileNamePrefix': "hydrosheds_features_training",
    'fileFormat': 'CSV',
}

task = ee.batch.Export.table.toDrive(**export_options)
task.start()

In [20]:
# Load the exported table, remove the export-only columns, and re-attach the
# sample coordinates and dates through the shared row id.
hydro_raw = pd.read_csv("../data/hydrosheds_features_training.csv")

hydro_df = (
    hydro_raw
    .drop(columns=[".geo", "system:index"])  # irrelevant columns from the export
    .merge(wq_df, on='id', how='left')
    .drop(columns=['id'])  # the id was only needed for the join
)
hydro_df

Unnamed: 0,flow_accumulation,Latitude,Longitude,Sample Date
0,4.131443e+06,-28.760833,17.730278,02-01-2011
1,1.162031e+04,-26.861111,28.884722,03-01-2011
2,1.000000e+00,-26.450000,28.085833,03-01-2011
3,1.878500e+04,-27.671111,27.236944,03-01-2011
4,4.778000e+03,-27.356667,27.286389,03-01-2011
...,...,...,...,...
9314,5.009590e+03,-27.527500,30.858056,23-12-2015
9315,1.162031e+04,-26.861111,28.884722,23-12-2015
9316,3.094600e+04,-26.984722,26.632278,23-12-2015
9317,1.000000e+00,-27.935000,26.126667,23-12-2015


In [21]:
# Save the merged table. The row index is written as the first column
# (index=True is the pandas default, spelled out here).
# NOTE(review): this path is the same file the load cell reads, so the raw
# export is overwritten; re-running the load cell afterwards will not find
# the ".geo", "system:index" and "id" columns it expects.
hydro_df.to_csv("../data/hydrosheds_features_training.csv", index=True)