## Extraction of Data from the JRC Global Surface Water Dataset (GEE) ##

The same extraction process is followed for all datasets.

In [1]:
# Import Google Earth Engine API and Initialize it. 
import ee
import pandas as pd

# Authenticate this session, then initialize the Earth Engine client with the
# Cloud project id below.
GEE_PROJECT = "ey-data-and-ai-challenge"

ee.Authenticate()
ee.Initialize(project=GEE_PROJECT)

In [2]:
# Load the sampling locations from the water quality training dataset.
# The three water-quality measurement columns are dropped; 'Sample Date' is
# kept in the frame but is not used by the extraction in this notebook.
# The row index is copied into an explicit 'id' column so the values extracted
# from Earth Engine can be joined back to these rows later.
DROPPED_COLUMNS = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

wq_df = (
    pd.read_csv('../data/water_quality_training_dataset.csv')
    .drop(columns=DROPPED_COLUMNS)
    .assign(id=lambda frame: frame.index)
)
wq_df.head()

Unnamed: 0,Latitude,Longitude,Sample Date,id
0,-28.760833,17.730278,02-01-2011,0
1,-26.861111,28.884722,03-01-2011,1
2,-26.45,28.085833,03-01-2011,2
3,-27.671111,27.236944,03-01-2011,3
4,-27.356667,27.286389,03-01-2011,4


In [3]:
# Convert coordinates to ee.Features: one feature per sample row, carrying the
# row's 'id' as its only property. Each point is buffered by 100 (add a 100m
# buffer in case of inexact coordinates).
features = [
    ee.Feature(
        ee.Geometry.Point([sample['Longitude'], sample['Latitude']]).buffer(100),
        {'id': sample['id']},
    )
    for _, sample in wq_df.iterrows()
]

In [9]:
# Load the JRC Global Surface Water asset as an Earth Engine image, keeping
# only the two bands used here. The same list is passed as both the selection
# and the output names, so the bands keep their original names.
JRC_BANDS = ['occurrence', 'seasonality']

jrc_img = ee.Image("JRC/GSW1_4/GlobalSurfaceWater").select(JRC_BANDS, JRC_BANDS)

In [10]:
# Wrap the list of per-sample features in a single ee.FeatureCollection.
fc = ee.FeatureCollection(features)

In [11]:
# Run the geospatial query with reduceRegions(): for every buffered sample
# area in `fc`, take the mean of each selected band (scale=30).
mean_reducer = ee.Reducer.mean()

jrc_collection = jrc_img.reduceRegions(
    collection=fc,
    reducer=mean_reducer,
    scale=30,
)

In [12]:
# Export the reduced collection to Google Drive as a CSV table.
# NOTE(review): nothing in this cell waits for the task or checks its final
# state — presumably the CSV is downloaded from Drive into ../data by hand
# before the next cell is run; confirm.
export_options = {
    "collection": jrc_collection,
    "description": "jrc CSV_export",
    "fileNamePrefix": "jrc_features_training",
    "fileFormat": 'CSV',
}

task = ee.batch.Export.table.toDrive(**export_options)
task.start()

In [None]:
# Load the exported JRC table, discard the export-only columns, then attach
# the original sample columns from wq_df via the shared 'id' key. The key is
# dropped once the join is done.
jrc_df = (
    pd.read_csv("../data/jrc_features_training.csv")
    .drop(columns=[".geo", "system:index"])  # drop irrelevant columns
    .merge(wq_df, on='id', how='left')
    .drop(columns=['id'])
)
jrc_df

Unnamed: 0,occurrence,seasonality,Latitude,Longitude,Sample Date
0,93.447449,11.991149,-28.760833,17.730278,02-01-2011
1,,,-26.861111,28.884722,03-01-2011
2,,,-26.450000,28.085833,03-01-2011
3,22.413793,3.882883,-27.671111,27.236944,03-01-2011
4,,,-27.356667,27.286389,03-01-2011
...,...,...,...,...,...
9314,,,-27.527500,30.858056,23-12-2015
9315,,,-26.861111,28.884722,23-12-2015
9316,,,-26.984722,26.632278,23-12-2015
9317,,,-27.935000,26.126667,23-12-2015


Imputation and preprocessing will be done in a later notebook.

In [16]:
# Persist the merged training features.
# NOTE(review): this writes to the same path the raw export was read from in
# the load/merge cell above, so the raw file is replaced. After this runs, that
# cell can no longer be re-run as-is (the '.geo', 'system:index' and 'id'
# columns it expects are gone) without fetching the export again.
# The DataFrame index is also written as an unnamed first column because
# index=False is not passed — confirm downstream readers expect that.
jrc_df.to_csv("../data/jrc_features_training.csv")

# Repeat for Validation Set

In [18]:
# Load the submission template and copy its row index into an explicit 'id'
# column, used later to join the extracted values back to these rows.
val_df = pd.read_csv("../data/submission_template.csv").assign(
    id=lambda frame: frame.index
)
val_df

Unnamed: 0,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus,id
0,-32.043333,27.822778,01-09-2014,,,,0
1,-33.329167,26.077500,16-09-2015,,,,1
2,-32.991639,27.640028,07-05-2015,,,,2
3,-34.096389,24.439167,07-02-2012,,,,3
4,-32.000556,28.581667,01-10-2014,,,,4
...,...,...,...,...,...,...,...
195,-33.771111,25.386667,06-12-2012,,,,195
196,-33.185361,27.390750,04-09-2014,,,,196
197,-32.043333,27.822778,28-09-2015,,,,197
198,-33.001667,25.161389,08-01-2015,,,,198


In [19]:
# One ee.Feature per validation row, carrying the row's 'id' as its only
# property. Each point is buffered by 100 (add a 100m buffer in case of
# inexact coordinates).
features_val = [
    ee.Feature(
        ee.Geometry.Point([sample['Longitude'], sample['Latitude']]).buffer(100),
        {'id': sample['id']},
    )
    for _, sample in val_df.iterrows()
]

In [20]:
# Bundle the validation features into a collection, then take the mean of each
# JRC band over every buffered validation area (scale=30).
fc_val = ee.FeatureCollection(features_val)
mean_reducer_val = ee.Reducer.mean()

jrc_collection_val = jrc_img.reduceRegions(
    collection=fc_val,
    reducer=mean_reducer_val,
    scale=30,
)

In [21]:
# Export the reduced validation collection to Google Drive as a CSV table.
# NOTE(review): nothing in this cell waits for the task or checks its final
# state — presumably the CSV is downloaded from Drive into ../data by hand
# before the next cell is run; confirm.
export_options_val = {
    "collection": jrc_collection_val,
    "description": "jrc_val_csv_export",
    "fileNamePrefix": "jrc_features_validation",
    "fileFormat": 'CSV',
}

task_val = ee.batch.Export.table.toDrive(**export_options_val)
task_val.start()

In [22]:
# Load the exported validation table, discard the export-only columns, then
# attach the submission-template columns from val_df via the shared 'id' key.
# The key is dropped once the join is done.
jrc_val_df = (
    pd.read_csv("../data/jrc_features_validation.csv")
    .drop(columns=[".geo", "system:index"])  # drop irrelevant columns
    .merge(val_df, on='id', how='left')
    .drop(columns=['id'])
)
jrc_val_df

Unnamed: 0,occurrence,seasonality,Latitude,Longitude,Sample Date,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,50.000000,12.000000,-32.043333,27.822778,01-09-2014,,,
1,,,-33.329167,26.077500,16-09-2015,,,
2,,,-32.991639,27.640028,07-05-2015,,,
3,,,-34.096389,24.439167,07-02-2012,,,
4,69.856004,10.181440,-32.000556,28.581667,01-10-2014,,,
...,...,...,...,...,...,...,...,...
195,49.065789,2.499590,-33.771111,25.386667,06-12-2012,,,
196,16.000000,12.000000,-33.185361,27.390750,04-09-2014,,,
197,50.000000,12.000000,-32.043333,27.822778,28-09-2015,,,
198,,,-33.001667,25.161389,08-01-2015,,,


In [23]:
# Persist the merged validation features.
# NOTE(review): this writes to the same path the raw export was read from in
# the load/merge cell above, so the raw file is replaced. After this runs, that
# cell can no longer be re-run as-is (the '.geo', 'system:index' and 'id'
# columns it expects are gone) without fetching the export again.
# The DataFrame index is also written as an unnamed first column because
# index=False is not passed — confirm downstream readers expect that.
jrc_val_df.to_csv("../data/jrc_features_validation.csv")

-- EXTRACTION FINISHED --