In [1]:
import pandas as pd
from datetime import datetime
from osgeo import gdal
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score

#Skyler Added:
from dateutil import parser
from keras.layers import Flatten, Dense

In [2]:
train_labels = pd.read_csv("train_labels.csv")
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("satellite_metadata.csv")
satellite_metadata['Date'] =  pd.to_datetime(satellite_metadata['time_end'], format='%Y-%m-%d')

In [3]:
####################
# REMOVE THIS LINE #
####################
train_labels = train_labels.sample(5, random_state=42)
#print(train_labels)

In [4]:
def get_grid_data(metadata, grid_id):
    return metadata[metadata["grid_id"] == grid_id]

In [5]:
def fetch_satellite_meta(metadata, datetime, location, datatype, split):
    if location == "Delhi":
        location = "dl"
    elif location == "Taipei":
        location = "tpe"
    else:
        location = "la"
        
    metadata = metadata[metadata['location'] == location]
    metadata = metadata[metadata['product'] == datatype]
    metadata = metadata[metadata['split'] == split]
    dateobject = parser.parse(datetime)
    return metadata.loc[(metadata['Date'].dt.month == dateobject.month) & 
                        (metadata['Date'].dt.day == dateobject.day) &
                        (metadata['Date'].dt.year <= dateobject.year)]

In [6]:
# Opens the HDF file
def load_data(FILEPATH):
    print(FILEPATH)
    ds = gdal.Open(FILEPATH)
    return ds

def fetch_subset(granule_id):
    ds = load_data("C:/Users/Skyler/" + granule_id)
    ds.GetSubDatasets()[0]
    band_arr = []
    for i in range(13): #0-12 valid index
        #print(i)
        raster = gdal.Open(ds.GetSubDatasets()[i][0]) #grid5km:cosSZA features only
        band = raster.GetRasterBand(1)
        temp_band_arr = band.ReadAsArray()
        
        if temp_band_arr.shape == (240,240): #if every 5km, expand to full dimension
            temp_band_arr = temp_band_arr.repeat(5, axis=0).repeat(5, axis=1)
            
        #print(temp_band_arr.size)
        band_arr.append(temp_band_arr)
    band_arr = np.array(band_arr).flatten()
    print(band_arr.shape)
    return band_arr

In [7]:
def fetch_training_features(grid_id, datetime, split):
    temp = get_grid_data(grid_metadata, grid_id)
    sat_met = fetch_satellite_meta(satellite_metadata, 
                               datetime, 
                               temp.iloc[0]['location'], 
                               "maiac", 
                               split)
    counter = 0
    features = None
    for i in range(len(sat_met)):
        counter+=1
        #Lines below added
        split = str(sat_met.iloc[i]['split'])
        product = str(sat_met.iloc[i]['product'])
        year = str(sat_met.iloc[i]['Date'].year)
        granule_id = str(sat_met.iloc[i]['granule_id'])
        path_str = split + "/" + product + "/" + year + "/" + granule_id
        
        subset = fetch_subset(path_str)
        if features is None:
            features = subset
        else:
            features+=subset
    return features/counter

In [8]:
def generate_features(train_labels, split):
    labels = []
    features = []
    for i in range(len(train_labels)):
        feature = fetch_training_features(train_labels.iloc[i]['grid_id'], train_labels.iloc[i]['datetime'], split)
        features.append(np.array(feature).reshape(-1))
        if split == "train":
            labels.append(train_labels.iloc[i]['value'])
    return np.array(features), np.array(labels)

In [9]:
features, labels = generate_features(train_labels, "train")

C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
C:/Users/Skyler/train/maiac/2018/20180514T070000_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2019/20190514T063000_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2020/20200514T064500_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2019/20190514T063000_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2020/20200514T064500_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2019/20190514T063000_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2020/20200514T064500_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2019/20190514T063000_maiac_dl_0.hdf
(18720000,)
C:/Users/Skyler/train/maiac/2020/20200514T064500_maiac_dl_0.hdf
(1872000

In [10]:
def baseline_model():
    model = Sequential()
    #model.add(Dense(1, input_dim=10, activation='relu'))
    #model.add(Dense(1))
    
    model.add(Flatten())
    
    model.add(Dense(1, input_dim=10, activation='relu'))
    model.add(Dense(1))
    
    #model.add(Dense(128, input_dim=10, activation='relu'))
    #model.add(Dense(128, activation='relu'))
    #model.add(Dense(10, activation='softmax'))
    
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
results = cross_val_score(estimator, features, labels)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)
  estimator = KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=5, verbose=0)


Baseline: -2917.17 (3256.45) MSE
Baseline: -2917.17 (3256.45) MSE
Baseline: -2917.17 (3256.45) MSE
Baseline: -2917.17 (3256.45) MSE
Baseline: -2917.17 (3256.45) MSE
Baseline: -2917.17 (3256.45) MSE


###### 