## Extraction of Data from the JRC Global Surface Water Dataset (GEE) ##

This notebook follows the same extraction process used for all the other datasets.

In [1]:
# Import the Google Earth Engine API and pandas, then set up the Earth Engine session.
import ee
import pandas as pd

# Authenticate first, then initialize against the project named below.
ee.Authenticate()
ee.Initialize(project="ey-data-and-ai-challenge")

In [2]:
# Load the sampling locations from the water quality training dataset.
# The three measured water-quality columns are dropped; every other column is kept.
# The row index is copied into an explicit 'id' column so that extracted values
# can be joined back to their sample later in the notebook.

wq_df = (
    pd.read_csv('../data/water_quality_training_dataset.csv')
    .drop(columns=['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus'])
    .assign(id=lambda frame: frame.index)
)
wq_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,id
0,-28.760833,17.730278,02-01-2011,0
1,-26.861111,28.884722,03-01-2011,1
2,-26.45,28.085833,03-01-2011,2
3,-27.671111,27.236944,03-01-2011,3
4,-27.356667,27.286389,03-01-2011,4


In [3]:
# Convert coordinates to ee.Features

# One feature per sample: a point at (Longitude, Latitude) with a 100m buffer
# in case of inexact coordinates, tagged with the sample's 'id'.
features = [
    ee.Feature(
        ee.Geometry.Point([lon, lat]).buffer(100),
        {'id': sample_id},
    )
    for lon, lat, sample_id in zip(
        wq_df['Longitude'].tolist(),
        wq_df['Latitude'].tolist(),
        wq_df['id'].tolist(),
    )
]

In [9]:
# Read JRC dataset as Earth Engine image

# Keep only these two bands. The same list is passed a second time as the
# output band names, so the selected bands keep their original names.
jrc_bands = ['occurrence', 'seasonality']
jrc_img = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select(jrc_bands, jrc_bands)

In [10]:
# Wrap the list of per-sample features in a single Earth Engine FeatureCollection.
fc = ee.FeatureCollection(features)

In [11]:
# Run geospatial data query using reduceRegions()

# Apply a mean reducer to the selected bands over each feature in `fc`,
# sampling the image at scale=30.
mean_reducer = ee.Reducer.mean()
jrc_collection = jrc_img.reduceRegions(
    collection=fc,
    reducer=mean_reducer,
    scale=30,
)

In [12]:
# Process data and export to Google Drive

# NOTE(review): this cell only creates and starts the export task. The resulting
# CSV presumably has to be fetched from Drive into ../data/ before the next cell
# can read it -- confirm the intended workflow.
export_options = {
    'collection': jrc_collection,
    'description': "jrc CSV_export",
    'fileNamePrefix': "jrc_features_training",
    'fileFormat': 'CSV',
}
task = ee.batch.Export.table.toDrive(**export_options)
task.start()

In [None]:
# Load the JRC features and attach the sample coordinates and dates.
#
# The file at this path can be in one of two states:
#   * the raw Earth Engine export, which still has the 'id' join key plus the
#     '.geo' / 'system:index' bookkeeping columns, or
#   * the merged table that the last cell of this notebook writes back to the
#     same path (no 'id' column, plus the saved index as an 'Unnamed: ...' column).
# Handling both keeps this cell re-runnable. Previously a second run failed with
# a KeyError because the columns to drop no longer existed in the file.
jrc_loaded = pd.read_csv("../data/jrc_features_training.csv")

if "id" in jrc_loaded.columns:
    # Raw export: drop irrelevant columns, join back to the samples on 'id',
    # then discard the join key. Built as a new frame rather than mutated in place.
    jrc_df = (
        jrc_loaded
        .drop(columns=[".geo", "system:index"])
        .merge(wq_df, on='id', how='left')
        .drop(columns=['id'])
    )
else:
    # Already merged by an earlier run: only strip the saved index column(s).
    saved_index_cols = [col for col in jrc_loaded.columns if col.startswith("Unnamed")]
    jrc_df = jrc_loaded.drop(columns=saved_index_cols)

jrc_df

Unnamed: 0,occurrence,seasonality,Latitude,Longitude,Sample Date
0,93.447449,11.991149,-28.760833,17.730278,02-01-2011
1,,,-26.861111,28.884722,03-01-2011
2,,,-26.450000,28.085833,03-01-2011
3,22.413793,3.882883,-27.671111,27.236944,03-01-2011
4,,,-27.356667,27.286389,03-01-2011
...,...,...,...,...,...
9314,,,-27.527500,30.858056,23-12-2015
9315,,,-26.861111,28.884722,23-12-2015
9316,,,-26.984722,26.632278,23-12-2015
9317,,,-27.935000,26.126667,23-12-2015


Imputation and preprocessing will be done in a later notebook.

In [16]:
# Save the merged table.
# NOTE(review): this writes to the same path the Earth Engine export was read from
# earlier in the notebook, so that file is replaced by the merged table. The
# DataFrame index is also written (pandas default), which shows up as an unnamed
# column when the file is read back -- confirm downstream notebooks expect this.
jrc_df.to_csv("../data/jrc_features_training.csv")