In [1]:
#----------------Johnowhitaker
# Required libraries
import requests
from urllib.parse import urlparse
from pathlib import Path
from datetime import datetime
# Required libraries
import tifffile as tiff
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import os

# Radiant MLHub API key used to build the Authorization header.
# It is read from the MLHUB_API_KEY environment variable when that is set, so
# the real key does not have to be saved inside the notebook; otherwise the
# original placeholder is kept and must be replaced by hand.
API = os.environ.get('MLHUB_API_KEY', 'YOUR KEY HERE')

In [None]:
# Create the download directory. Unlike the shell `mkdir`, this does not
# complain when the directory already exists, so the cell can be re-run.
Path("data").mkdir(parents=True, exist_ok=True)

In [None]:
# Root directory for downloads; the download helpers create their
# per-tile sub-directories beneath it.
output_path = Path("data/")

In [None]:
# HTTP headers shared by every API request: bearer-token authorization
# built from the API key, and a JSON Accept header.
headers = {}
headers['Authorization'] = f'Bearer {API}'
headers['Accept'] = 'application/json'

In [None]:
def get_download_url(item, asset_key, headers):
    """Resolve the download URL of one asset of a catalog item.

    The asset's ``href`` is requested with redirects disabled and the
    redirect target is read from the ``Location`` response header.

    Args:
        item: Item (feature) dictionary that may contain an ``assets`` mapping.
        asset_key: Key of the wanted asset inside ``item['assets']``.
        headers: HTTP headers sent with the request.

    Returns:
        The value of the ``Location`` response header, or ``None`` when the
        asset is not present in the item (a message is printed in that case)
        or the response carries no ``Location`` header.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status,
            e.g. because the API key was rejected.
    """
    asset = item.get('assets', {}).get(asset_key, None)
    if asset is None:
        print(f'Asset "{asset_key}" does not exist in this item')
        return None
    # Redirects are not followed: the redirect target itself is the result.
    r = requests.get(asset.get('href'), headers=headers, allow_redirects=False)
    # Surface authentication/server errors here instead of silently
    # returning None and failing later in the download step.
    r.raise_for_status()
    return r.headers.get('Location')

def _stream_to_file(url, target_dir):
    """Stream the content of ``url`` into a file inside ``target_dir``.

    The file name is the last path segment of the URL. Nothing is downloaded
    (and a message is printed) when ``url`` is empty or ``None``.
    """
    if not url:
        print('No download URL given, nothing downloaded')
        return
    filename = urlparse(url).path.split('/')[-1]
    target_dir.mkdir(parents=True, exist_ok=True)

    # stream=True keeps the body out of memory so the chunked loop below
    # really writes piece by piece; the with-blocks release the connection
    # and the file handle even if the transfer fails half way.
    with requests.get(url, stream=True) as r:
        # Do not save an HTTP error page under the asset's file name.
        r.raise_for_status()
        with open(target_dir/filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)
    print(f'Downloaded {filename}')


def download_label(url, output_path, tileid):
    """Download a label-type asset to ``output_path/tileid/<file name>``.

    Args:
        url: Download URL; ``None``/empty is skipped with a printed message.
        output_path: ``pathlib.Path`` of the download root directory.
        tileid: Name of the tile sub-directory.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    _stream_to_file(url, output_path/tileid)
    return


def download_imagery(url, output_path, tileid, date):
    """Download an imagery asset to ``output_path/tileid/date/<file name>``.

    Args:
        url: Download URL; ``None``/empty is skipped with a printed message.
        output_path: ``pathlib.Path`` of the download root directory.
        tileid: Name of the tile sub-directory.
        date: Name of the date sub-directory below the tile directory.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    _stream_to_file(url, output_path/tileid/date)
    return

In [None]:
# paste the id of the labels collection:
collectionId = 'ref_african_crops_kenya_02_labels'

# these optional parameters can be used to control what items are returned.
# Here, we want to download all the items so:
limit = 100
bounding_box = []
date_time = []

# retrieves the items and their metadata in the collection
# NOTE(review): a single request for up to `limit` items is made and no
# pagination is followed - confirm the collection has at most `limit` items.
r = requests.get(
    f'https://api.radiant.earth/mlhub/v1/collections/{collectionId}/items',
    params={'limit': limit, 'bbox': bounding_box, 'datetime': date_time},
    headers=headers,
)
# Stop here on an HTTP error (e.g. a rejected API key); otherwise the error
# payload would be parsed and the later loops would silently find no features.
r.raise_for_status()
collection = r.json()

In [None]:
# Print, for every feature (tile) of the collection, its id and asset keys.
for feature in collection.get('features', []):
    asset_names = list(feature.get('assets'))
    print("Feature", feature.get('id'), 'with the following assets', asset_names)

In [None]:
for feature in collection.get('features', []):
    # The tile id is the first two characters after 'tile_' in the feature id.
    tileid = feature.get('id').split('tile_')[-1][:2]

    # Fetch the label raster first, then the field-id raster, into the
    # same tile directory.
    for asset_key in ('labels', 'field_ids'):
        download_url = get_download_url(feature, asset_key, headers)
        download_label(download_url, output_path, tileid)

In [None]:
# paste the id of the imagery collection:
collectionId = 'ref_african_crops_kenya_02_source'

# these optional parameters can be used to control what items are returned.
# Here, we want to download all the items so:
limit = 100
bounding_box = []
date_time = []

# retrieves the items and their metadata in the collection
# NOTE(review): a single request for up to `limit` items is made and no
# pagination is followed - confirm the collection has at most `limit` items.
r = requests.get(
    f'https://api.radiant.earth/mlhub/v1/collections/{collectionId}/items',
    params={'limit': limit, 'bbox': bounding_box, 'datetime': date_time},
    headers=headers,
)
# Stop here on an HTTP error (e.g. a rejected API key); otherwise the error
# payload would be parsed and the later loops would silently find no features.
r.raise_for_status()
collection = r.json()

In [None]:
# Show the asset keys of the first feature only; all the features have the
# same type of assets, so one is enough.
for feature in collection.get('features', [])[:1]:
    assets = feature.get('assets').keys()
    print(list(assets))

In [None]:
# This cell downloads all the multi-spectral images throughout the growing season for this competition.
# The size of data is about 1.5 GB, and download time depends on your internet connection.
# Note that you only need to run this cell and download the data once.
i = 0
for feature in collection.get('features', []):
    assets = feature.get('assets').keys()
    tileid = feature.get('id').split('tile_')[-1][:2]
    # `datetime` is the module here: the import cell runs `import datetime`
    # after `from datetime import datetime`, so the class has to be reached
    # as datetime.datetime (calling datetime.strptime raises AttributeError).
    date = datetime.datetime.strptime(
        feature.get('properties')['datetime'], "%Y-%m-%dT%H:%M:%SZ"
    ).strftime("%Y%m%d")
    for asset in assets:
        i += 1
        # To resume after a failure, raise the threshold to the number of
        # assets that were already downloaded.
        if i > 0:
            download_url = get_download_url(feature, asset, headers)
            download_imagery(download_url, output_path, tileid, date)

In [None]:
def load_file(fp):
    """Read an image file with ``tiff.imread``.

    ``fp`` may be a path object or a string; it is converted to its string
    form before being handed to the reader, whose result is returned.
    """
    path_text = str(fp)
    return tiff.imread(path_text)

In [None]:
# Timestamps of the Sentinel-2 observations provided in the training dataset,
# written as (year, month, day, hour, minute[, second]) tuples.
dates = [
    datetime.datetime(*stamp)
    for stamp in (
        (2019, 6, 6, 8, 10, 7),
        (2019, 7, 1, 8, 10, 4),
        (2019, 7, 6, 8, 10, 8),
        (2019, 7, 11, 8, 10, 4),
        (2019, 7, 21, 8, 10, 4),
        (2019, 8, 5, 8, 10, 7),
        (2019, 8, 15, 8, 10, 6),
        (2019, 8, 25, 8, 10, 4),
        (2019, 9, 9, 8, 9, 58),
        (2019, 9, 19, 8, 9, 59),
        (2019, 9, 24, 8, 9, 59),
        (2019, 10, 4, 8, 10),
        (2019, 11, 3, 8, 10),
    )
]

In [None]:
# Band identifiers, in the order used by this notebook.
bands = [
    'B01', 'B02', 'B03', 'B04',
    'B05', 'B06', 'B07', 'B08',
    'B8A', 'B09', 'B11', 'B12',
    'CLD',
]

In [None]:
# Sample file to load:
# The path follows the <tile>/<date>/<file name> layout that
# download_imagery writes below the data directory.
file_name = "data/00/20190825/0_B03_20190825.tif"
band_data = load_file(file_name)

In [None]:
# Show the loaded band as an image on a 7x7 inch figure, with the colour
# scale limited to the value range 0 to 0.15.
fig = plt.figure(figsize=(7, 7))
plt.imshow(band_data, vmin=0, vmax=0.15)

In [None]:
# Quick way to see an RGB image. Can mess with the scaling factor to change brightness (3 in this example)

import numpy as np
def load_rgb(tile, date):
    """Stack the B04, B03 and B02 bands of one tile and date into one array.

    Args:
        tile: Tile directory name such as ``'01'``; its second character is
            used as the file-name prefix.
        date: Date string used for both the directory and the file names,
            e.g. ``'20190825'``.

    Returns:
        The result of ``np.dstack`` over the three bands, in the order
        B04 (red), B03 (green), B02 (blue).

    Side effects:
        Prints the maximum value of the green band.
    """
    prefix = tile[1]
    r, g, b = (
        load_file(f"data/{tile}/{date}/{prefix}_{band}_{date}.tif")
        for band in ('B04', 'B03', 'B02')
    )
    arr = np.dstack((r, g, b))
    # ndarray.max() does the reduction inside numpy instead of iterating over
    # every pixel with the Python built-in max().
    print(g.max())
    return arr

# Draw the RGB composite of tile '01' on 2019-08-25, with the pixel values
# multiplied by the scaling factor 3 before display.
fig, ax = plt.subplots(figsize=(12, 18))
rgb_scaled = load_rgb('01', '20190825') * 3
ax.imshow(rgb_scaled)
plt.tight_layout()

In [None]:
# Build a table with one row per pixel that belongs to a field (non-zero
# field id), for the four tiles 0-3.
import numpy as np
import pandas as pd

row_locs = []
col_locs = []
field_ids = []
labels = []
tiles = []
for tile in range(4):
    # Read from the download directory (output_path) instead of a hard-coded
    # absolute '/content/data' path, so the cell also works when the notebook
    # is not run from /content.
    tile_dir = output_path / f'0{tile}'
    fid_arr = load_file(tile_dir / f'{tile}_field_id.tif')
    lab_arr = load_file(tile_dir / f'{tile}_label.tif')

    # np.nonzero returns the indices of the non-zero field ids in row-major
    # order, i.e. the order a row/column double loop would visit them.
    # The two-name unpacking requires the rasters to be 2-D.
    rows, cols = np.nonzero(fid_arr)
    row_locs.extend(rows.tolist())
    col_locs.extend(cols.tolist())
    field_ids.extend(fid_arr[rows, cols].tolist())
    labels.extend(lab_arr[rows, cols].tolist())
    tiles.extend([tile] * len(rows))

df = pd.DataFrame({
    'fid':field_ids,
    'label':labels,
    'row_loc': row_locs,
    'col_loc':col_locs,
    'tile':tiles
})

# Number of pixel rows, then number of distinct field ids.
print(df.shape)
print(df.groupby('fid').count().shape)
df.head()

In [None]:
# Write the per-pixel table to Full_data.csv without the index column.
df.to_csv('Full_data.csv', index=False)