In [1]:
import glob
import os
import subprocess
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil.parser import parse
from osgeo import gdal
from sklearn.model_selection import cross_val_score
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [2]:
# Label table; later cells read its grid_id, datetime and value columns.
train_labels = pd.read_csv("train_labels.csv")
# Per-grid metadata; used to look up the location of a grid_id.
grid_metadata = pd.read_csv("grid_metadata.csv")
# Granule catalogue; filtered later by location, product, split and Date.
satellite_metadata = pd.read_csv("satellite_metadata.csv")
# Parse time_end into a datetime column so the .dt accessors used in fetch_satellite_meta work.
# NOTE(review): the explicit format assumes time_end is a bare YYYY-MM-DD string — confirm against the CSV.
satellite_metadata['Date'] =  pd.to_datetime(satellite_metadata['time_end'], format='%Y-%m-%d')

In [3]:
####################
# REMOVE THIS LINE #
####################
# Debug-only subsample: the sample size was left blank, so choose n before uncommenting.
# train_labels = train_labels.sample(n=..., random_state=42)

In [4]:
def get_proper_label(text):
    """Build the S3 URI of a granule file.

    The URI is the fixed ``.../pm25/train/maiac/`` bucket prefix, followed by
    the first four characters of ``text`` as a folder, followed by ``text``
    itself.

    Args:
        text: Granule file name, e.g. ``"20180201T191000_maiac_la_0.hdf"``.

    Returns:
        The string ``<prefix><text[:4]>/<text>``.
    """
    prefix = "s3://drivendata-competition-airathon-public-us/pm25/train/maiac/"
    folder = text[:4]
    return prefix + "/".join((folder, text))

In [5]:
def get_grid_data(metadata, grid_id):
    """Return the rows of ``metadata`` whose ``grid_id`` column equals ``grid_id``.

    Args:
        metadata: DataFrame with a ``grid_id`` column.
        grid_id: Value to match.

    Returns:
        The matching rows (possibly empty), with their original index.
    """
    is_requested_grid = metadata["grid_id"] == grid_id
    return metadata[is_requested_grid]

In [6]:
def fetch_satellite_meta(metadata, datetime, location, datatype, split):
    """Select the catalogue rows matching a location, product, split and calendar day.

    Args:
        metadata: Granule catalogue with ``location``, ``product``, ``split``
            and datetime-typed ``Date`` columns.
        datetime: Date/time string, parsed with ``dateutil.parser.parse``.
            Inside this function the name shadows the imported ``datetime`` class.
        location: Location name. ``"Delhi"`` maps to ``"dl"`` and ``"Taipei"``
            to ``"tpe"``; any other value maps to ``"la"``.
        datatype: Value required in the ``product`` column.
        split: Value required in the ``split`` column.

    Returns:
        The rows whose ``Date`` has the same month and day as ``datetime`` and
        a year less than or equal to its year.
    """
    # Anything that is not Delhi or Taipei falls back to the "la" code.
    location = {"Delhi": "dl", "Taipei": "tpe"}.get(location, "la")

    matches_request = (
        (metadata['location'] == location)
        & (metadata['product'] == datatype)
        & (metadata['split'] == split)
    )
    candidates = metadata[matches_request]

    target = parse(datetime)
    dates = candidates['Date'].dt
    same_calendar_day = (dates.month == target.month) & (dates.day == target.day)
    # NOTE(review): `<=` also keeps the same month/day from earlier years — confirm this is intended.
    not_a_later_year = dates.year <= target.year
    return candidates.loc[same_calendar_day & not_a_later_year]

In [7]:
ls dataset/

20180201T024000_maiac_tpe_0.hdf  20180604T054500_maiac_dl_0.hdf
20180201T042000_maiac_tpe_0.hdf  20190514T063000_maiac_dl_0.hdf
20180201T191000_maiac_la_0.hdf   20190604T065000_maiac_dl_0.hdf
20180216T065500_maiac_dl_0.hdf   20200514T064500_maiac_dl_0.hdf
20180514T070000_maiac_dl_0.hdf   20200604T070000_maiac_dl_0.hdf


In [8]:
def load_data(FILEPATH):
    """Open the HDF file at ``FILEPATH`` with GDAL.

    Args:
        FILEPATH: Path of the file to open.

    Returns:
        Whatever ``gdal.Open`` returns for that path, passed through unchanged.
    """
    return gdal.Open(FILEPATH)

def fetch_subset(granule_id, subdataset_index=8):
    """Return band 1 of one sub-dataset of a granule, downloading the file if needed.

    The granule is read from ``dataset/<granule_id>``. When that file does not
    exist it is first copied from the public S3 bucket with the AWS CLI
    (``--no-sign-request``).

    Args:
        granule_id: Granule file name, e.g. ``"20180201T191000_maiac_la_0.hdf"``.
        subdataset_index: Position of the wanted entry in ``GetSubDatasets()``.
            The default, 8, is the entry this notebook labels "grid5km:cosSZA".

    Returns:
        The result of ``ReadAsArray()`` on raster band 1 of that sub-dataset.

    Raises:
        subprocess.CalledProcessError: If the ``aws s3 cp`` command fails.
        RuntimeError: If GDAL returns no handle for the file or the sub-dataset.
    """
    local_dir = "dataset"
    local_path = local_dir + "/" + granule_id

    # Test for the one file we need instead of listing the whole directory on every call.
    if not os.path.isfile(local_path):
        print("Need to download: "+granule_id)
        # Give the copy an explicit file destination inside an existing directory;
        # a bare "./dataset" target becomes a *file* named dataset when the folder is missing.
        os.makedirs(local_dir, exist_ok=True)
        subprocess.run(
            ["aws", "s3", "cp", get_proper_label(granule_id), local_path, "--no-sign-request"],
            check=True,  # fail here, not later with an AttributeError on None
        )

    ds = load_data(local_path)
    if ds is None:
        # gdal.Open returns None on failure unless GDAL exceptions are enabled.
        raise RuntimeError("GDAL could not open " + local_path)

    subdataset_name = ds.GetSubDatasets()[subdataset_index][0]
    raster = gdal.Open(subdataset_name)
    if raster is None:
        raise RuntimeError("GDAL could not open sub-dataset " + str(subdataset_name))

    return raster.GetRasterBand(1).ReadAsArray()

In [9]:
def fetch_training_features(grid_id, datetime, split):
    """Average the rasters of every granule matched to one grid cell and date.

    ``grid_id`` is only used to look up the location in ``grid_metadata``; the
    raster returned by ``fetch_subset`` is used as-is for every matching
    "maiac" granule.

    Args:
        grid_id: Grid cell identifier present in ``grid_metadata``.
        datetime: Date/time string forwarded to ``fetch_satellite_meta``.
        split: Split name forwarded to ``fetch_satellite_meta``.

    Returns:
        A float64 array: the element-wise mean of the matched granule rasters.

    Raises:
        ValueError: If no granule matches the request.
    """
    grid_rows = get_grid_data(grid_metadata, grid_id)
    sat_met = fetch_satellite_meta(satellite_metadata,
                                   datetime,
                                   grid_rows.iloc[0]['location'],
                                   "maiac",
                                   split)
    granule_ids = list(sat_met['granule_id'])
    if not granule_ids:
        # Without this guard the function would end in `None / 0` and raise an opaque TypeError.
        raise ValueError(
            "no maiac granules found for grid_id=%r, datetime=%r, split=%r"
            % (grid_id, datetime, split)
        )

    total = None
    for granule_id in granule_ids:
        # Accumulate in float64: an in-place `+=` on the raw raster keeps its dtype,
        # which can wrap around if the band is stored as a small integer type.
        subset = np.asarray(fetch_subset(granule_id), dtype=np.float64)
        total = subset if total is None else total + subset
    return total / len(granule_ids)

In [10]:
def generate_features(train_labels, split):
    """Build the flattened feature matrix (and labels) for every row of a label table.

    Args:
        train_labels: DataFrame with ``grid_id`` and ``datetime`` columns, plus
            a ``value`` column when ``split`` is ``"train"``.
        split: Split name forwarded to ``fetch_training_features``; labels are
            collected only when it equals ``"train"``.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays. Each feature row is the
        output of ``fetch_training_features`` flattened to 1-D; ``labels`` is
        empty for any split other than ``"train"``.
    """
    collect_labels = split == "train"
    flat_features = []
    targets = []
    for _, row in train_labels.iterrows():
        raster = fetch_training_features(row['grid_id'], row['datetime'], split)
        flat_features.append(np.array(raster).reshape(-1))
        if collect_labels:
            targets.append(row['value'])
    return np.array(flat_features), np.array(targets)

In [None]:
# Build the feature matrix and label vector for the "train" split.
# fetch_subset downloads every granule that is not already under dataset/,
# so this cell can take a long time on the full label table.
features, labels = generate_features(train_labels, "train")

20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T191000_maiac_la_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T024000_maiac_tpe_0.hdf
20180201T042000_maiac_tpe_0.hdf
20180201T060000_maiac_dl_0.hdf
Need to download: 20180201T060000_maiac_dl_0.hdf
20180201T060000_maiac_dl_0.hdf
20180201T060000_maiac_dl_0.hdf
20180201T060000_maiac_dl_0.hdf
2018020

20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180207T070000_maiac_dl_0.hdf
20180208T191500_maiac_la_0.hdf
Need to download: 20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T191500_maiac_la_0.hdf
20180208T025000_maiac_tpe_0.hdf
Need to download: 20180208T025000_maiac_tpe_0.hdf
20180208T025000_maiac_tpe_1.hdf
Need to download: 20180208T025000_maiac_tpe_1.hdf
20180208T025000_maiac_tpe_0.hdf
20180208T025000_maiac_tpe_1.hdf
20180208T025000_maiac_tpe_0.hdf
20180208T

20180214T021000_maiac_tpe_0.hdf
Need to download: 20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T035000_maiac_tpe_0.hdf
20180214T021000_maiac_tpe_0.hdf
20180214T070500_maiac_dl_0.hdf
Need to download: 20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180214T070500_maiac_dl_0.hdf
20180215T192000_maiac_la_0.hdf
Need to download: 20180215T192000_maiac_la_0.hdf
20180215T192000_maiac_la_0.hdf
20

20180220T031500_maiac_tpe_1.hdf
Need to download: 20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T031500_maiac_tpe_0.hdf
20180220T031500_maiac_tpe_1.hdf
20180220T063000_maiac_dl_0.hdf
Need to download: 20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180220T063000_maiac_dl_0.hdf
20180221T202000_maiac_la_0.hdf
Need to download: 20180221T202000_maiac_la_0.hdf
20180221T202000_maiac_la_0.hdf
20

20180309T071500_maiac_dl_0.hdf
20180309T071500_maiac_dl_0.hdf
20180309T071500_maiac_dl_0.hdf
20180310T192500_maiac_la_0.hdf
Need to download: 20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T192500_maiac_la_0.hdf
20180310T062000_maiac_dl_0.hdf
Need to download: 20180310T062000_maiac_dl_0.hdf
20180310T062000_maiac_dl_0.hdf
20180310T062000_maiac_dl_0.hdf
20180310T062000_maiac_dl_0.hdf
20180311T201000_maiac_la_0.hdf
Need to download: 20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_maiac_la_0.hdf
20180311T201000_

20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T195000_maiac_la_0.hdf
20180322T064500_maiac_dl_0.hdf
Need to download: 20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180322T064500_maiac_dl_0.hdf
20180323T203000_maiac_la_0.hdf
Need to download: 201

20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180330T055500_maiac_dl_0.hdf
20180331T194500_maiac_la_0.hdf
Need to download: 20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T194500_maiac_la_0.hdf
20180331T063500_maiac_dl_0.hdf
Need to download: 201

20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180410T202000_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
Need to download: 20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180411T192500_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
Need to download: 20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
20180412T201000_maiac_la_0.hdf
201

20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T201000_maiac_la_0.hdf
20180428T070000_maiac_dl_0.hdf
Need to download: 20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180428T070000_maiac_dl_0.hdf
20180429T191500_maiac_la_0.hdf
Need to download: 20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
20180429T191500_maiac_la_0.hdf
201

20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180507T065500_maiac_dl_0.hdf
20180508T191000_maiac_la_0.hdf
Need to download: 20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T191000_maiac_la_0.hdf
20180508T060000_maiac_dl_0.hdf
Need to download: 20180508T060000_maiac_dl_0.hdf
201

20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180521T201500_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
Need to download: 20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180522T192000_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
Need to download: 20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
20180523T200500_maiac_la_0.hdf
201

20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180610T195000_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
Need to download: 20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180611T203500_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
Need to download: 20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180612T194000_maiac_la_0.hdf
20180613T202000_maiac_la_0.hdf
Need to download: 20180613T202000_maiac_la_0.hdf
20180613T202000_

In [65]:
# Sanity check: (number of label rows processed, flattened raster length).
features.shape

(2, 57600)

In [64]:
# (rows, columns) of the label table, for comparison with the feature matrix above.
train_labels.shape

(34312, 3)

In [63]:
# Label vector returned by generate_features (the 'value' column for the train split).
labels

array([52.33333333, 49.53846154])

In [None]:
def baseline_model(input_dim=None):
    """Build and compile the baseline fully-connected regressor.

    Four ReLU hidden layers of 256, 128, 64 and 32 units feed a single linear
    output unit. The model is compiled with mean-squared-error loss and the
    Adam optimizer.

    Args:
        input_dim: Number of input features. When omitted, no input size is
            declared and Keras infers it from the first batch it is fitted on,
            so the model works for any feature width.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Layer widths belong in `units` (the first argument); `input_dim` describes
    # the incoming feature count and is only meaningful on the first layer.
    first_layer_kwargs = {} if input_dim is None else {"input_dim": input_dim}

    model = Sequential()
    model.add(Dense(256, activation='relu', **first_layer_kwargs))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Wrap the model builder so scikit-learn can clone and cross-validate it.
estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
# No cv argument is passed, so scikit-learn's default splitter is used.
# NOTE(review): the saved outputs show only 2 feature rows, which is presumably too few for the default split — confirm on the full data.
results = cross_val_score(estimator, features, labels)
# NOTE(review): the wrapper's score is believed to be the negated loss, so the figure printed as "MSE" may be negative — confirm.
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [25]:
# Repeat of the earlier shape check.
# NOTE(review): this cell's execution count is lower than those of the cells above it; re-run top to bottom before trusting the output.
features.shape

(2, 57600)

In [38]:
# Repeat of the earlier label inspection.
# NOTE(review): this cell's execution count is out of order relative to the cells above it; re-run top to bottom before trusting the output.
labels


array([52.33333333, 49.53846154])