In [1]:
import pandas as pd
from datetime import datetime
from osgeo import gdal
import numpy as np
import subprocess
import glob
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
from dateutil.parser import parse
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

In [3]:
# Load the competition tables; every path is relative to the notebook's working directory.
train_labels = pd.read_csv("train_labels.csv")
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("pm25_satellite_metadata.csv")
# `time_end` carries a time of day and a UTC offset (see the frame preview), so a
# date-only format such as '%Y-%m-%d' does not describe it and is rejected by strict
# format matching in recent pandas. Let pandas parse the full timestamp and pin the
# result to UTC so the `.dt` accessors used for day matching behave consistently.
satellite_metadata['Date'] = pd.to_datetime(satellite_metadata['time_end'], utc=True)
test_labels = pd.read_csv("submission_format.csv")

In [4]:
# Display the training labels table read from train_labels.csv.
train_labels

Unnamed: 0,datetime,grid_id,value
0,2018-02-01T08:00:00Z,3S31A,11.400000
1,2018-02-01T08:00:00Z,A2FBI,17.000000
2,2018-02-01T08:00:00Z,DJN0F,11.100000
3,2018-02-01T08:00:00Z,E5P9N,22.100000
4,2018-02-01T08:00:00Z,FRITQ,29.800000
...,...,...,...
34307,2020-12-31T18:30:00Z,P8JA5,368.611111
34308,2020-12-31T18:30:00Z,PW0JT,294.425000
34309,2020-12-31T18:30:00Z,VXNN3,224.857143
34310,2020-12-31T18:30:00Z,VYH7U,287.000000


In [12]:
# Summarise the grid metadata columns (count / unique / top / freq).
grid_metadata.describe()

Unnamed: 0,grid_id,location,tz,wkt
count,54,54,54,54
unique,54,3,3,54
top,1X116,Delhi,Asia/Calcutta,"POLYGON ((121.5257644471362 24.97766123020391,..."
freq,1,33,33,1


In [5]:
# Display the satellite granule metadata, including the derived `Date` column.
satellite_metadata

Unnamed: 0,granule_id,time_start,time_end,product,location,split,us_url,eu_url,as_url,cksum,granule_size,Date
0,20180201T191000_maiac_la_0.hdf,2018-02-01T17:25:00.000Z,2018-02-01 19:10:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,911405771,10446736,2018-02-01 19:10:00+00:00
1,20180202T195000_maiac_la_0.hdf,2018-02-02T18:05:00.000Z,2018-02-02 19:50:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2244451908,11090180,2018-02-02 19:50:00+00:00
2,20180203T203000_maiac_la_0.hdf,2018-02-03T17:10:00.000Z,2018-02-03 20:30:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,3799527997,12468482,2018-02-03 20:30:00+00:00
3,20180204T194000_maiac_la_0.hdf,2018-02-04T17:55:00.000Z,2018-02-04 19:40:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,4105997844,13064424,2018-02-04 19:40:00+00:00
4,20180205T202000_maiac_la_0.hdf,2018-02-05T17:00:00.000Z,2018-02-05 20:20:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1805072340,12549313,2018-02-05 20:20:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
7716,20210721T060842_misr_dl_0.nc,2021-07-21T05:11:48.000Z,2021-07-21 06:08:42+00:00,misr,dl,test,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,3768009907,30790672,2021-07-21 06:08:42+00:00
7717,20210730T060258_misr_dl_0.nc,2021-07-30T05:11:45.000Z,2021-07-30 06:02:58+00:00,misr,dl,test,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2262052656,29487879,2021-07-30 06:02:58+00:00
7718,20210806T060933_misr_dl_0.nc,2021-08-06T05:18:20.000Z,2021-08-06 06:09:33+00:00,misr,dl,test,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,4079577923,29589362,2021-08-06 06:09:33+00:00
7719,20210815T060400_misr_dl_0.nc,2021-08-15T05:12:45.000Z,2021-08-15 06:04:00+00:00,misr,dl,test,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2932726739,32837807,2021-08-15 06:04:00+00:00


In [11]:
def load_data(FILEPATH):
    """Open an HDF granule with GDAL and return the dataset handle.

    Parameters
    ----------
    FILEPATH : str
        Path passed unchanged to ``gdal.Open``.

    Returns
    -------
    The object returned by ``gdal.Open`` for ``FILEPATH``.

    Raises
    ------
    RuntimeError
        If ``gdal.Open`` returns ``None`` for ``FILEPATH``.
    """
    ds = gdal.Open(FILEPATH)
    # Unless gdal.UseExceptions() is active, gdal.Open reports failure by returning
    # None rather than raising; fail here, with the path, instead of letting a None
    # dataset propagate into downstream processing.
    if ds is None:
        raise RuntimeError(f"GDAL could not open {FILEPATH!r}")
    return ds

def format_file_path(granule_id):
    """Build the relative path of a granule under the training MAIAC directory.

    The first four characters of ``granule_id`` are used as the year
    sub-directory below ``../../data/raw/train/maiac/``, followed by the
    granule id itself as the file name.
    """
    maiac_train_root = '../../data/raw/train/maiac/'
    year_dir = granule_id[:4]
    return "".join((maiac_train_root, year_dir, '/', granule_id))

def fetch_subset(granule_id):
    """Open one granule and return what ``get_all_data_for_loci`` extracts from it.

    Parameters
    ----------
    granule_id : str
        Granule file name; resolved to a path with ``format_file_path``.

    Returns
    -------
    Whatever ``get_all_data_for_loci(ds, granule_id)`` returns for the opened
    dataset (its exact type is defined by that helper).
    """
    granule_path = format_file_path(granule_id)
    ds = load_data(granule_path)
    # Return the extracted data: without an explicit return the function always
    # yields None, which is useless to any caller that consumes the subset.
    return get_all_data_for_loci(ds, granule_id)
def fetch_satellite_meta(metadata, datetime, location, datatype, split):
    """Select satellite granule rows for a location, product, split and calendar day.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must provide ``location``, ``product``, ``split`` and a datetime-typed
        ``Date`` column.
    datetime : str
        Timestamp text handed to ``dateutil``'s ``parse``. (The parameter name
        shadows the imported ``datetime`` class inside this function.)
    location : str
        ``"Delhi"`` maps to ``"dl"``, ``"Taipei"`` to ``"tpe"``; any other
        value maps to ``"la"``.
    datatype : str
        Value compared against the ``product`` column.
    split : str
        Value compared against the ``split`` column.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``Date`` has the same month and day as ``datetime`` and a
        year that is the same or earlier.
    """
    location_code = {"Delhi": "dl", "Taipei": "tpe"}.get(location, "la")
    target = parse(datetime)
    granule_dates = metadata['Date'].dt
    keep = (
        (metadata['location'] == location_code)
        & (metadata['product'] == datatype)
        & (metadata['split'] == split)
        & (granule_dates.month == target.month)
        & (granule_dates.day == target.day)
        # Same or earlier year: later years are excluded.
        & (granule_dates.year <= target.year)
    )
    return metadata.loc[keep]
def get_grid_data(metadata, grid_id):
    """Return the rows of ``metadata`` whose ``grid_id`` column equals ``grid_id``."""
    matches = metadata["grid_id"] == grid_id
    return metadata.loc[matches]
def fetch_training_features(grid_id, datetime, split):
    """Return the granule ids of the MAIAC satellite rows matching a grid cell and date.

    The grid cell is looked up in the module-level ``grid_metadata``; the
    ``location`` of its first matching row, together with ``datetime`` and
    ``split``, is passed to ``fetch_satellite_meta`` (product ``"maiac"``)
    against the module-level ``satellite_metadata``.

    Parameters
    ----------
    grid_id : str
        Value matched against ``grid_metadata['grid_id']``.
    datetime : str
        Timestamp text forwarded to ``fetch_satellite_meta``.
    split : str
        Split name forwarded to ``fetch_satellite_meta``.

    Returns
    -------
    list
        The ``granule_id`` values of the selected rows, in frame order; empty
        when nothing matches.

    Raises
    ------
    IndexError
        If ``grid_id`` has no row in ``grid_metadata`` (from ``.iloc[0]``).
    """
    grid_rows = get_grid_data(grid_metadata, grid_id)
    sat_met = fetch_satellite_meta(satellite_metadata,
                                   datetime,
                                   grid_rows.iloc[0]['location'],
                                   "maiac",
                                   split)
    return sat_met['granule_id'].tolist()


def fetch_mean_training_features(grid_id, datetime, split):
    """Average the per-granule subsets for a grid cell and date.

    This keeps the averaging logic that previously lived in an earlier
    definition also named ``fetch_training_features``; two definitions under
    one name meant only the later one was ever callable, so the averaging
    variant now has its own name.

    Returns
    -------
    The sum of ``fetch_subset(granule_id)`` over the matching granules divided
    by their count, or ``None`` when no granule matches.
    NOTE(review): assumes ``fetch_subset`` returns objects that support ``+``
    and ``/`` (e.g. NumPy arrays) - confirm against ``get_all_data_for_loci``.
    """
    granule_ids = fetch_training_features(grid_id, datetime, split)
    if not granule_ids:
        # Nothing to average; avoid dividing by a zero count.
        return None
    total = None
    for granule_id in granule_ids:
        subset = fetch_subset(granule_id)
        # Build a fresh sum rather than accumulating in place, so the first
        # subset object is never mutated.
        total = subset if total is None else total + subset
    return total / len(granule_ids)
        
#         subset = fetch_subset(granule_id)
        