In [1]:
from pathlib import Path
import random
from typing import Dict, List, Union

from cloudpathlib import S3Path
import geopandas as gpd
import pandas as pd
import rasterio

#MAIAC imports
import os
import re

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pyhdf.SD import SD, SDC, SDS
import pyproj

# Render inline figures at 100 dots per inch for every plot in this notebook.
mpl.rcParams.update({'figure.dpi': 100})

# Data directories are resolved relative to the current working directory,
# so the notebook must be started from the folder that contains "data".
DATA_PATH = Path.cwd().joinpath("data")
RAW, INTERIM = (DATA_PATH / subdir for subdir in ("raw", "interim"))

# Echo the resolved locations so a reader can verify where files are read from.
for data_dir in (DATA_PATH, RAW, INTERIM):
    print(data_dir)

  _pyproj_global_context_initialize()


C:\Users\Skyler\data
C:\Users\Skyler\data\raw
C:\Users\Skyler\data\interim


In [2]:
# Satellite granule metadata: the first CSV column becomes the index and the
# two acquisition-time columns are parsed into timestamps.
pm_md = pd.read_csv(
    RAW / "pm25_satellite_metadata.csv",
    index_col=0,
    parse_dates=["time_start", "time_end"],
)

# Grid-cell metadata, likewise indexed by its first CSV column.
grid_md = pd.read_csv(RAW / "grid_metadata.csv", index_col=0)

# Plain-text dump of both tables, satellite metadata first.
for metadata_frame in (pm_md, grid_md):
    print(metadata_frame)

                                              time_start  \
granule_id                                                 
20180201T191000_maiac_la_0.hdf 2018-02-01 17:25:00+00:00   
20180202T195000_maiac_la_0.hdf 2018-02-02 18:05:00+00:00   
20180203T203000_maiac_la_0.hdf 2018-02-03 17:10:00+00:00   
20180204T194000_maiac_la_0.hdf 2018-02-04 17:55:00+00:00   
20180205T202000_maiac_la_0.hdf 2018-02-05 17:00:00+00:00   
...                                                  ...   
20210721T060842_misr_dl_0.nc   2021-07-21 05:11:48+00:00   
20210730T060258_misr_dl_0.nc   2021-07-30 05:11:45+00:00   
20210806T060933_misr_dl_0.nc   2021-08-06 05:18:20+00:00   
20210815T060400_misr_dl_0.nc   2021-08-15 05:12:45+00:00   
20210822T061044_misr_dl_0.nc   2021-08-22 05:19:30+00:00   

                                                time_end product location  \
granule_id                                                                  
20180201T191000_maiac_la_0.hdf 2018-02-01 19:10:00+00:00   maiac 

In [3]:
# List the column labels of the satellite metadata table.
pm_md.columns

Index(['time_start', 'time_end', 'product', 'location', 'split', 'us_url',
       'eu_url', 'as_url', 'cksum', 'granule_size'],
      dtype='object')

In [4]:
# How many granules fall into each value of the "split" column.
split_counts = pm_md["split"].value_counts()
split_counts

train    5048
test     2673
Name: split, dtype: int64

In [5]:
# How many granules each satellite product contributes.
# Bracket access is required here: `pm_md.product` would resolve to the
# DataFrame.product method rather than the "product" column.
product_counts = pm_md["product"].value_counts()
product_counts

maiac    6704
misr     1017
Name: product, dtype: int64

In [6]:
# Restrict the metadata to MAIAC granules that belong to the training split.
is_maiac = pm_md["product"] == "maiac"
is_train = pm_md["split"] == "train"

# Copy so later edits to maiac_md never touch (or warn about) pm_md.
maiac_md = pm_md.loc[is_maiac & is_train].copy()
maiac_md.shape

(4260, 10)

In [7]:
# Peek at the first three MAIAC training granules.
maiac_md.head(n=3)

Unnamed: 0_level_0,time_start,time_end,product,location,split,us_url,eu_url,as_url,cksum,granule_size
granule_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20180201T191000_maiac_la_0.hdf,2018-02-01 17:25:00+00:00,2018-02-01 19:10:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,911405771,10446736
20180202T195000_maiac_la_0.hdf,2018-02-02 18:05:00+00:00,2018-02-02 19:50:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2244451908,11090180
20180203T203000_maiac_la_0.hdf,2018-02-03 17:10:00+00:00,2018-02-03 20:30:00+00:00,maiac,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,3799527997,12468482


In [8]:
# Number of MAIAC training granules per location code.
maiac_md["location"].value_counts()

tpe    2130
la     1065
dl     1065
Name: location, dtype: int64

In [9]:
# Number of MAIAC training granules per calendar year of their end time.
maiac_md["time_end"].dt.year.value_counts()

2020    1464
2019    1460
2018    1336
Name: time_end, dtype: int64

In [10]:
# Number of MAIAC training granules per month of their end time,
# ordered by month number rather than by frequency.
end_month = maiac_md["time_end"].dt.month
end_month.value_counts().sort_index()

1     248
2     340
3     372
4     360
5     372
6     360
7     372
8     372
9     360
10    372
11    360
12    372
Name: time_end, dtype: int64

In [11]:
# Summary statistics of each granule's time span (end minus start).
granule_duration = maiac_md["time_end"] - maiac_md["time_start"]
granule_duration.describe()

count                         4260
mean     0 days 01:27:16.549295774
std      0 days 00:55:30.276180728
min                0 days 00:05:00
25%                0 days 00:10:00
50%                0 days 01:45:00
75%                0 days 01:45:00
max                0 days 14:20:00
dtype: object

In [12]:
# Peek at the first three grid cells.
grid_md.head(n=3)

Unnamed: 0_level_0,location,tz,wkt
grid_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1X116,Taipei,Asia/Taipei,"POLYGON ((121.5257644471362 24.97766123020391,..."
1Z2W7,Delhi,Asia/Calcutta,"POLYGON ((77.30453178416276 28.54664454217707,..."
3S31A,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.9338248256995 33.79558357488509...


In [13]:
# (rows, columns) of the grid metadata table.
grid_md.shape

(54, 3)

In [14]:
# Number of grid cells per location.
grid_md["location"].value_counts()

Delhi                  33
Los Angeles (SoCAB)    14
Taipei                  7
Name: location, dtype: int64

In [51]:
# Take the first MAIAC training granule for Los Angeles; the selected row's
# name is its index label, i.e. the granule file name.
la_file = maiac_md.loc[maiac_md["location"] == "la"].iloc[0]
print(la_file)
print()

# Relative granule path: <split>/<product>/<year of time_start>/<granule file name>.
# NOTE(review): the directory year is taken from time_start while the file name
# appears to encode time_end -- confirm they cannot differ across a year boundary.
la_file_path = "/".join(
    str(part)
    for part in (
        la_file["split"],
        la_file["product"],
        la_file["time_start"].year,
        la_file.name,
    )
)
print(la_file_path)

time_start                              2018-02-01 17:25:00+00:00
time_end                                2018-02-01 19:10:00+00:00
product                                                     maiac
location                                                       la
split                                                       train
us_url          s3://drivendata-competition-airathon-public-us...
eu_url          s3://drivendata-competition-airathon-public-eu...
as_url          s3://drivendata-competition-airathon-public-as...
cksum                                                   911405771
granule_size                                             10446736
Name: 20180201T191000_maiac_la_0.hdf, dtype: object

train/maiac/2018/20180201T191000_maiac_la_0.hdf


In [52]:
# Open the Los Angeles granule read-only through pyhdf's SD interface.
# The handle is left open; nothing in this cell closes it.
file = SD(la_file_path, SDC.READ)

# Per the pyhdf documentation, info() yields a 2-tuple:
# (number of datasets, number of file-level attributes).
granule_info = file.info()
print(granule_info)

(13, 8)
