# Processing and Writing Radiant ML Data to CSV

This notebook focuses on extracting AgriFieldNet Competition field data from GeoTiffs and saving the results to CSV. Much of the code to perform these tasks originates from Radiant Earth's [starter notebook](https://github.com/radiantearth/agrifieldnet_india_competition/blob/main/Starter%20notebook.ipynb), but it has been heavily modified to meet the specific goals of the author.

*Note: The related sibling notebook, 'Processing - RML Data (Geospatial)', goes into greater depth: it breaks up and recreates scaled per-field GeoTiffs for each band. However, the results from the Geospatial notebook were not ultimately used by this project.*

## Setup

Import the libraries + load config

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('..')

from utils.file_ops import *
from utils.data_prep import *
from utils.vegetation_indices import *

# Load the project configuration from the YAML file two directories up.
# read_yaml is not defined in this notebook; it arrives through one of the star
# imports above (presumably utils.file_ops — TODO confirm).
CONFIG = read_yaml('../../conf.yaml')

Setting up vars from config

In [2]:
# Root folder for every dataset and CSV this notebook reads or writes.
# Keep the trailing slash: later cells build paths by plain string concatenation.
data_dir = '../../data/'

# Values taken straight from the config file.
Full_bands = CONFIG['BANDS']
collection_id = CONFIG['COLLECTION_ID']
assets = CONFIG['ASSETS']
processed_collection_id = CONFIG['PROCESSED_COLLECTION_ID']

# Parallel lists holding the 'id' and 'name' of every configured crop entry.
crops = [c['id'] for c in CONFIG['CROPS']]
crops_names = [c['name'] for c in CONFIG['CROPS']]

# Band ids only (Full_bands above keeps the complete band entries).
selected_bands = [b['id'] for b in CONFIG['BANDS']]
# Names of the three sub-collections, derived from the dataset id by suffix.
source_collection = f'{collection_id}_source'
train_label_collection = f'{collection_id}_labels_train'
test_label_collection = f'{collection_id}_labels_test'

# (width, height) of a source tile as given in the config.
src_tile_size = (CONFIG['SRC_TILE_WIDTH'], CONFIG['SRC_TILE_HEIGHT'])

## Download the data if needed

In [3]:
# requires MLHUB_API_KEY ENV var to be set

def fetch_rml_data(collection_id, assets, selected_bands):
    """Download the dataset into ``data_dir`` unless its folder already exists.

    Args:
        collection_id: Dataset id handed to ``Dataset.fetch``. It is also the name
            of the sub-folder of ``data_dir`` that receives the download and the
            prefix of the three collection names used in the download filter.
        assets: Asset keys requested for the ``<collection_id>_labels_train``
            collection. Only the first entry is requested for
            ``<collection_id>_labels_test``.
        selected_bands: Asset keys requested for ``<collection_id>_source``.

    Returns:
        None. The outcome is reported with ``print``; ordinary download errors
        are reported rather than raised so the notebook keeps running.

    Note:
        The download requires the MLHUB_API_KEY environment variable to be set.
    """
    target_dir = data_dir + collection_id

    if exists(target_dir):
        print(f'the directory {collection_id} already exists.')
        return

    try:
        dataset = Dataset.fetch(collection_id)

        # Derive the collection names from collection_id (same convention as the
        # *_collection variables in the setup cell) instead of hard-coding them.
        my_filter = {
            f'{collection_id}_labels_train': assets,
            f'{collection_id}_labels_test': [assets[0]],
            f'{collection_id}_source': selected_bands,
        }

        dataset.download(output_dir=target_dir, collection_filter=my_filter)
    except Exception as err:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still stop a long download, and show
        # the cause so that a failure can actually be diagnosed.
        print(f'fetch_rml_data failed to download {collection_id} data!')
        print(f'reason: {type(err).__name__}: {err}')
    else:
        print(f'{collection_id} successfully downloaded!')


In [4]:
# Only prints a message when the target directory under data_dir already exists.
fetch_rml_data(collection_id, assets, selected_bands)

the directory ref_agrifieldnet_competition_v1 already exists.


## Prepare Train and Test data

In [5]:
def get_field_and_label_paths(collection_id, label_collection, bin='train'):
    """Collect folder ids and GeoTiff paths for one label collection.

    Reads ``collection.json`` of ``label_collection`` (under ``data_dir``), skips
    its first four link entries, and derives a folder id from each remaining
    link's ``href`` (the text after the last ``_`` up to the first ``.``).

    Args:
        collection_id: Name of the dataset folder inside ``data_dir``.
        label_collection: Name of the label collection folder inside the dataset.
        bin: ``'train'`` to also build ``raster_labels.tif`` paths; any other
            value leaves the label path list empty.

    Returns:
        Tuple ``(folder_ids, field_paths, label_paths)`` of lists in link order.
    """
    collection_root = f'{data_dir}{collection_id}/{label_collection}'

    with open(f'{collection_root}/collection.json') as handle:
        links = json.load(handle)['links']

    folder_ids, field_paths, label_paths = [], [], []
    # NOTE(review): the first four links are assumed not to be item links.
    for link in links[4:]:
        folder_id = link['href'].split('_')[-1].split('.')[0]
        item_dir = f'{collection_root}/{label_collection}_{folder_id}'

        folder_ids.append(folder_id)
        field_paths.append(f'{item_dir}/field_ids.tif')
        if bin == 'train':
            label_paths.append(f'{item_dir}/raster_labels.tif')

    return folder_ids, field_paths, label_paths

In [6]:
# Resolve folder ids and field-id raster paths for both splits.
# The test call passes 'test', so test_label_paths comes back as an empty list.
train_folder_ids, train_field_paths, train_label_paths = get_field_and_label_paths(collection_id, train_label_collection)
test_folder_ids, test_field_paths, test_label_paths = get_field_and_label_paths(collection_id, test_label_collection, 'test')

### Train

In [7]:
# Build a two-column frame for the train split: one row per tile folder, with
# the folder id and the path of that folder's field_ids.tif.

competition_train_data = pd.DataFrame(train_folder_ids, columns=['unique_folder_id'])
competition_train_data['field_paths'] = train_field_paths
# Path is given without an extension; the printed message suggests the helper
# appends '.csv' and leaves an existing file alone — TODO confirm in utils.
write_csv_from_df(
    competition_train_data, 
    f'{data_dir}interstitial_data/folder_ids_and_field_paths_TRAIN'
)

print(competition_train_data.shape)
competition_train_data.head()

the file ../../data/interstitial_data/folder_ids_and_field_paths_TRAIN.csv already exists.
(1165, 2)


Unnamed: 0,unique_folder_id,field_paths
0,28852,../../data/ref_agrifieldnet_competition_v1/ref...
1,d987c,../../data/ref_agrifieldnet_competition_v1/ref...
2,ca1d4,../../data/ref_agrifieldnet_competition_v1/ref...
3,2ec18,../../data/ref_agrifieldnet_competition_v1/ref...
4,7575d,../../data/ref_agrifieldnet_competition_v1/ref...


### Test

In [8]:
# Build a two-column frame for the test split: one row per tile folder, with
# the folder id and the path of that folder's field_ids.tif.

competition_test_data = pd.DataFrame(test_folder_ids, columns=['unique_folder_id'])
competition_test_data['field_paths'] = test_field_paths
# Path is given without an extension; the printed message suggests the helper
# appends '.csv' and leaves an existing file alone — TODO confirm in utils.
write_csv_from_df(
    competition_test_data, 
    f'{data_dir}interstitial_data/folder_ids_and_field_paths_TEST'
)

print(competition_test_data.shape)
competition_test_data.head()

the file ../../data/interstitial_data/folder_ids_and_field_paths_TEST.csv already exists.
(707, 2)


Unnamed: 0,unique_folder_id,field_paths
0,6199c,../../data/ref_agrifieldnet_competition_v1/ref...
1,6c81d,../../data/ref_agrifieldnet_competition_v1/ref...
2,1ebeb,../../data/ref_agrifieldnet_competition_v1/ref...
3,586a2,../../data/ref_agrifieldnet_competition_v1/ref...
4,65812,../../data/ref_agrifieldnet_competition_v1/ref...


## PREPROCESS FIELDS AND CROPS IN TILES FOR TRAINING

### Train

In [9]:
# Field-to-crop labels for the train split.
# The extractor is wrapped in a lambda so it is only evaluated if the helper
# decides to call it — presumably when the CSV is missing (TODO confirm in utils).
field_crop_data = get_df_from_csv_if_exists(
    f'{data_dir}labels_TRAIN.csv',
    lambda: field_crop_extractor(train_folder_ids, collection_id, train_label_collection)
)

# Path is passed without the '.csv' extension, unlike the read above.
write_csv_from_df(field_crop_data, f'{data_dir}labels_TRAIN')
print(field_crop_data.shape)
field_crop_data.head()

the file ../../data/labels_TRAIN.csv already exists.
(5551, 2)


Unnamed: 0,field_id,crop_id
1,757,6
2,756,6
3,1372,5
4,1374,1
5,1986,4


In [10]:
# Both constants are reused by the test-split extraction cell further down.
img_sh = 256  # presumably the tile edge length in pixels — TODO confirm
n_obs = 1  #imagery per chip(no time series)

# Per-pixel band values for the train split.
# data_dir already ends with '/', so no extra separator is added here; the
# previous f'{data_dir}/...' form produced '../../data//pixel_data_TRAIN.csv'
# and disagreed with the path style used for the test split.
train_data = get_df_from_csv_if_exists(
    f'{data_dir}pixel_data_TRAIN.csv',
    lambda: pixel_data_extractor_1(competition_train_data, source_collection,
                                   collection_id, selected_bands, n_obs, img_sh)
)

write_csv_from_df(train_data, f'{data_dir}pixel_data_TRAIN')
print(train_data.shape)
train_data.head()

the file ../../data//pixel_data_TRAIN.csv already exists.
(188228, 16)


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id,folder_id,lon,lat
11031,43,39,38,38,41,54,63,61,64,12,57,37,757,28852,628235,3025485
11287,43,39,38,38,42,57,67,63,72,12,63,42,757,28852,628235,3025475
11288,43,39,38,37,41,59,69,65,78,12,68,43,757,28852,628245,3025475
11289,43,38,37,36,41,59,69,64,78,12,68,43,757,28852,628255,3025475
11543,43,39,38,38,42,57,67,64,72,12,63,42,757,28852,628235,3025465


### Test

In [11]:
# Per-pixel band values for the test split.
# n_obs and img_sh are defined in the train extraction cell above, so that cell
# must have been run first.
test_data = get_df_from_csv_if_exists(
    f'{data_dir}pixel_data_TEST.csv',
    lambda: pixel_data_extractor_1(competition_test_data, source_collection, 
                              collection_id, selected_bands, n_obs, img_sh)
)

write_csv_from_df(test_data, f'{data_dir}pixel_data_TEST')
print(test_data.shape)
test_data.head()

the file ../../data/pixel_data_TEST.csv already exists.
(49318, 16)


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id,folder_id,lon,lat
35283,39,35,35,35,38,48,55,59,60,11,53,39,5407,6199c,688995.0,2725025.0
35284,39,34,33,34,37,49,58,58,63,11,54,40,5407,6199c,689005.0,2725025.0
35538,39,36,36,37,39,59,70,56,76,14,55,37,5407,6199c,688985.0,2725015.0
35539,39,35,36,34,39,59,70,75,76,14,55,37,5407,6199c,688995.0,2725015.0
35540,39,33,34,31,37,70,85,79,90,14,54,34,5407,6199c,689005.0,2725015.0


## Adding Vegetation Indices

In [12]:
# The return values are discarded: the index columns are added to the frames
# in place (the next cell's column listing shows them on train_data).
# NOTE(review): whether re-running this cell is harmless depends on the helper.
add_vegetation_indices(train_data, selected_bands)
add_vegetation_indices(test_data, selected_bands)

## Aggregate the data

In [13]:
# Sanity check: the vegetation-index columns should now follow the band columns.
train_data.columns

Index(['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09',
       'B11', 'B12', 'field_id', 'folder_id', 'lon', 'lat', 'NDVI', 'ARVI',
       'SAVI', 'SRI', 'RENDVI', 'ARI', 'MSI', 'MCARI', 'MARI', 'EVI2', 'NDMI',
       'NDWI', 'brightness'],
      dtype='object')

### Train

In [14]:
# Per-field aggregates for the train split.
# NOTE(review): if the helper returns the existing CSV without calling the
# lambda, changes to train_data upstream will not show up here until
# pixel_data_agg_TRAIN.csv is deleted — confirm against utils.
train_data_agg = get_df_from_csv_if_exists(
    f'{data_dir}pixel_data_agg_TRAIN.csv',
    lambda: group_and_aggregate(train_data)
)

write_csv_from_df(train_data_agg, f'{data_dir}pixel_data_agg_TRAIN')
print(train_data_agg.shape)
train_data_agg.head()

the file ../../data/pixel_data_agg_TRAIN.csv already exists.
(5551, 114)


Unnamed: 0,field_id,pixels,B01_median,B01_mean,B01_std,B01_range,B02_median,B02_mean,B02_std,B02_range,...,NDMI_std,NDMI_range,NDWI_median,NDWI_mean,NDWI_std,NDWI_range,brightness_median,brightness_mean,brightness_std,brightness_range
0,1,18,45.0,45.0,0.0,0,42.0,42.444444,0.51131,1,...,0.010839,0.03266,-0.169811,-0.170352,0.009109,0.035951,63.5,63.688889,0.66056,2.0
1,2,12,45.0,45.0,0.0,0,42.0,42.0,0.738549,2,...,0.012305,0.040125,-0.201852,-0.205101,0.013323,0.04209,64.7,64.475,0.903654,2.8
2,3,16,45.0,45.0,0.0,0,43.0,42.6875,1.25,5,...,0.015181,0.04576,-0.210526,-0.21019,0.01742,0.063147,66.5,66.54375,1.684822,5.1
3,4,15,46.0,45.866667,0.351866,1,43.0,42.466667,0.915475,3,...,0.009599,0.027042,-0.166667,-0.17702,0.023648,0.076416,64.3,63.713333,1.084216,3.2
4,5,42,46.0,46.0,0.0,0,43.0,43.238095,0.576344,2,...,0.008106,0.034662,-0.163636,-0.166941,0.015942,0.062861,64.6,64.704762,0.474189,2.3


### Test

In [15]:
# Per-field aggregates for the test split.
# NOTE(review): if the helper returns the existing CSV without calling the
# lambda, changes to test_data upstream will not show up here until
# pixel_data_agg_TEST.csv is deleted — confirm against utils.
test_data_agg = get_df_from_csv_if_exists(
    f'{data_dir}pixel_data_agg_TEST.csv',
    lambda: group_and_aggregate(test_data)
)

write_csv_from_df(test_data_agg, f'{data_dir}pixel_data_agg_TEST')
print(test_data_agg.shape)
test_data_agg.head()

the file ../../data/pixel_data_agg_TEST.csv already exists.
(1530, 114)


Unnamed: 0,field_id,pixels,B01_median,B01_mean,B01_std,B01_range,B02_median,B02_mean,B02_std,B02_range,...,NDMI_std,NDMI_range,NDWI_median,NDWI_mean,NDWI_std,NDWI_range,brightness_median,brightness_mean,brightness_std,brightness_range
0,11,69,43.0,43.086957,0.283836,1,39.0,39.057971,1.247162,5,...,0.031613,0.119975,-0.25,-0.252188,0.028162,0.100078,60.6,60.378261,2.214516,9.1
1,13,13,44.0,44.0,0.0,0,41.0,40.692308,0.751068,2,...,0.006843,0.026846,-0.222222,-0.226399,0.013934,0.041327,65.1,65.307692,0.754389,2.4
2,19,49,45.0,45.469388,0.504234,1,42.0,41.918367,1.351492,5,...,0.033954,0.134363,-0.207207,-0.218116,0.028949,0.104762,65.3,64.871429,1.595958,6.5
3,21,12,44.0,44.0,0.0,0,40.0,40.25,0.621582,2,...,0.011731,0.042254,-0.264606,-0.261353,0.009164,0.029484,62.7,63.008333,0.811797,3.0
4,25,10,47.0,47.0,0.0,0,44.0,44.3,0.483046,1,...,0.00967,0.025674,-0.201695,-0.202649,0.009086,0.030226,72.7,72.75,0.241523,0.7


## Conclusions and Next Steps

In this notebook, we downloaded the AgriFieldNet competition data, read pixel values from each GeoTiff, added vegetation indices, and aggregated our values by field_id, storing the results in CSVs along the way.

Next, we will perform EDA to gain some familiarity with the data before modeling.