Extracting global VIIRS Level 2 fire detections on the cloud -> daily global txt files

In [7]:
import xarray as xr
import earthaccess
import datetime as dt
import pandas as pd
import time
import os
import fsspec
import numpy as np
#import geoparquet as gpq #optional, if using geoparquet file format

<h4>User-defined time of interest</h4>
The spatial domain is global, but it could be restricted by passing bounding_box or polygon (string of list of tuples) to earthaccess.search_data().

In [27]:
# ---- user settings ----
# First day to process (USER SPECIFY); the notebook then runs the entire month, global
START = '2024-07-01'

# Bucket folders used by the cells below
input_dir = '/projects/shared-buckets/coffield/viirs/' #reading
output_dir = '/projects/my-public-bucket/viirs/' #writing

In [28]:
# Half-open processing window [start, end): one calendar month beginning at START
start = pd.to_datetime(START)
end = start + pd.DateOffset(months=1)

# One entry per day; inclusive='left' keeps `start` and leaves out `end`
day_range = pd.date_range(start=start, end=end, freq='D', inclusive='left')
day_range

DatetimeIndex(['2024-07-01', '2024-07-02', '2024-07-03', '2024-07-04',
               '2024-07-05', '2024-07-06', '2024-07-07', '2024-07-08',
               '2024-07-09', '2024-07-10', '2024-07-11', '2024-07-12',
               '2024-07-13', '2024-07-14', '2024-07-15', '2024-07-16',
               '2024-07-17', '2024-07-18', '2024-07-19', '2024-07-20',
               '2024-07-21', '2024-07-22', '2024-07-23', '2024-07-24',
               '2024-07-25', '2024-07-26', '2024-07-27', '2024-07-28',
               '2024-07-29', '2024-07-30', '2024-07-31'],
              dtype='datetime64[ns]', freq='D')

<h4>Download NOAA21 fire swaths (VJ214IMG) as needed via wget:</h4>

The NOAA21 fire product is not yet published and is not yet on the cloud. Users may need to email LPDAAC to request access to the VIIRS science team datasets (in this case, archive set 4014).

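Current token from Shane will work until Oct 28, 2024 (note: avoid pasting live tokens into shared notebooks; prefer reading them from an environment variable):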

Bearer eyJ0eXAiOiJKV1QiLCJvcmlnaW4iOiJFYXJ0aGRhdGEgTG9naW4iLCJzaWciOiJlZGxqd3RwdWJrZXlfb3BzIiwiYWxnIjoiUlMyNTYifQ.eyJ0eXBlIjoiVXNlciIsInVpZCI6ImNvZmZpZWxkIiwiZXhwIjoxNzMwMTQ1NjM0LCJpYXQiOjE3MjQ5NjE2MzQsImlzcyI6Imh0dHBzOi8vdXJzLmVhcnRoZGF0YS5uYXNhLmdvdiJ9.0KmD3OdpbCoSJd4f7hxjcs7U9cfkbB9Iq0uoX-MRPO6ZHF83FeDj88VW8q4ulVGONURh4-tv6bG58SZt-MV9InrTaI2a9IaxFIMbfwIk3cxkNtyCoptAt-bihj7TvDuzHTLHc-Sxyflya_nKnxTAtdey5G7hWw6MuLFaWNhcj7IvIzXVeUvFEkdZqc6WdDHnQZtC7NROrTurfLVyzXbmFrAbJI7QR8i1n65j6FnchznnXUNllaNTjEE978QLzzCfjIGb91Btl0p2ovvQIYQ8Kaqq_wD2_SkxZm0-IqdFUOfGM5mdg9cSmWflOgx94h8bpUSVnQnFkIZPx55r8-7MmQ

In [None]:
# Mirror the NOAA21 (VJ214IMG) swaths for every day in day_range from LAADS with wget.
# Days whose local folder already exists are skipped.
auth = earthaccess.login()
session = auth.get_session()
token = session.headers['Authorization'] #replace with Shane's token if you don't have access to https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/4014/VJ214IMG/

laads_root = 'https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/4014/VJ214IMG'

for day in day_range:
    # Zero-padded day of year (e.g. '005'): the same "%j" form the extraction cell uses when
    # reading these folders back. The unpadded tm_yday breaks both the URL and the
    # already-downloaded check for days 1-99.
    doy = day.strftime('%j')
    filepath = f'{output_dir}/VJ214IMG/{day.year}/{doy}'

    if os.path.exists(filepath):
        print('NOAA21 data already downloaded for', day)
    else:
        print('downloading NOAA21 for', day)
        command = f'wget -e robots=off -m -np -R .html,.tmp -nH --cut-dirs=3 "{laads_root}/{day.year}/{doy}/" --header "Authorization: {token}" -P {output_dir}'
        #wget command will create/use the VJ214IMG directory inside ~/my-public-bucket/viirs and make subfolders for year & day
        #(do not print `command` when debugging: it contains the bearer token)
        status = os.system(command)
        if status != 0:
            # surface failed/partial downloads instead of silently moving on
            print('WARNING: wget exited with status', status, 'for', day)

<h4>Define access to LPDAAC, run fire detection retrieval</h4>

In [8]:
# S3 filesystem handle for direct reader access to LPDAAC.
# The named profile has to exist in your config file first - ask Alex for help setting it up.
s3_fsspec = fsspec.filesystem(
    "s3",
    profile="maap-data-reader",
)

In [9]:
# Lookup table for pixel sizes, indexed by sample
pix_lut_path = '/projects/shared-buckets/coffield/pix_size_lut.csv'
pix_lut = pd.read_csv(pix_lut_path, index_col='sample')

In [25]:
# (variable name in the L2 file, column name in the output table), in output order
_fp_field_pairs = [
    ('FP_latitude', 'latitude'),
    ('FP_longitude', 'longitude'),
    ('FP_T4', 'bright_ti4'),
    ('FP_sample', 'sample'),
    ('FP_confidence', 'confidence'),
    ('FP_T5', 'bright_ti5'),
    ('FP_power', 'frp'),
    ('FP_day', 'daynight'),
    ('FP_ViewZenAng', 'vza'), #VZA values go up to 70deg for some reason
]

# rename mapping applied to each swath's dataframe
column_names = dict(_fp_field_pairs)

# variables to pull out of each swath
columns = [l2_name for l2_name, _ in _fp_field_pairs]

The columns are described here: https://www.earthdata.nasa.gov/learn/find-data/near-real-time/firms/vnp14imgtdlnrt#ed-viirs-375m-attributes

In [None]:
%%time

products = {'SNPP':'VNP14IMG',
            'NOAA20':'VJ114IMG',
            'NOAA21':'VJ214IMG'}

for satellite in products:
    
    product = products[satellite]

    t = start

    while(t < end): #for each day in the month
        t0 = t.strftime('%Y-%m-%d')
        t1 = (t + dt.timedelta(1) ).strftime('%Y-%m-%d')
        print(satellite, t0, '--------')
        
        if satellite=='NOAA21': #read files from local bucket
            path = f'/projects/shared-buckets/coffield/viirs/VJ214IMG/{t.year}/{t.strftime("%j")}/'
            files = os.listdir(path)
            files = [path + f for f in files]
            print(len(files), 'files found') #should be 240

        else: #cloud query
            results = earthaccess.search_data(
                short_name=product,
                temporal=(t0, t1),
                count=1000) #max files - change if doing more than a couple days at a time

            urls = [r.data_links(access='direct')[0] for r in results]
            files = [s3_fsspec.open(url) for url in urls]

            if files[0].url().split('.')[6]=='2354': del files[0] #sometimes overlap with prev day
            if files[-1].url().split('.')[6]=='0000': del files[-1]
        
        all_dets = pd.DataFrame()

        for f in files: #for each of the 240 files per day
            if satellite=='NOAA21': swath = xr.open_dataset(f)
            else:                   swath = xr.open_dataset(f, phony_dims='sort')
            df = swath[columns].to_dataframe().rename(columns = column_names).reset_index(drop=True)
            df['acq_date'] = swath.StartTime[:10]
            df['acq_time'] = swath.StartTime[11:16] #imprecise
            df['satellite'] = swath.Satellite
            df['version'] = swath.VersionID
            df['scan'] = pix_lut.loc[df['sample'], 'along_scan'].values
            df['track'] = pix_lut.loc[df['sample'], 'along_track'].values
            df.daynight = df.daynight.replace({0:'N',1:'D'})
            df.confidence = df.confidence.replace({7:'l', 8:'n', 9: 'h'})
            
            all_dets = pd.concat([all_dets, df])
            
        #export txt
        out_path = f'{output_dir}/{product}DL/{t.year}'
        if not os.path.exists(out_path):
            os.makedirs(out_path, exist_ok=True)
                
        all_dets.to_csv(f'{out_path}/{t0.replace("-","")}.txt', index=False) #geoparquet instead?

        t += dt.timedelta(1)

In [23]:
# Open question: the maximum view zenith angle reaches ~70deg, but it should only be 55 - why?
# (`swath` is the last swath opened by the extraction loop above)
swath['FP_ViewZenAng'].max()

<h4>Version with candidates included</h4>
Very slow and impractical. Use the separate script "candidate_fire_extration" to extract candidates for a specific space/time of interest instead (it also includes NOAA21).

In [22]:
%%time

products = {'SNPP':['VNP03IMG','VNP14IMG'],
            'NOAA20':['VJ103IMG','VJ114IMG']}

for satellite in products: #for each satellite
    
    product = products[satellite][1] #L2

    t = start

    while(t < end): #for each day in the month
        t0 = t.strftime('%Y-%m-%d')
        t1 = (t + dt.timedelta(1) ).strftime('%Y-%m-%d')
        print(satellite, t0, '--------')
        
        earthaccess.login(strategy='netrc') #for LAADS access - every hour
        
        #Level1 data from LAADS ------
        #geolocation 03IMG
        results = earthaccess.search_data(
            short_name=products[satellite][0],
            temporal=(t0, t1),
            count=1000) #max files - change if doing more than a couple days at a time
        l1_files = earthaccess.open(results)
        
        #Level2 data from LPDAAC -----
        results = earthaccess.search_data(
            short_name=products[satellite][1],
            temporal=(t0, t1),
            count=1000) #max files - change if doing more than a couple days at a time
        urls = [r.data_links(access='direct')[0] for r in results]
        l2_files = [s3_fsspec.open(url) for url in urls]
        
        if l1_files[0].url().split('.')[6]=='2354': del l1_files[0] #sometimes overlap with prev day
        if l1_files[-1].url().split('.')[6]=='0000': del l1_files[-1]
        if l2_files[0].url().split('.')[6]=='2354': del l2_files[0] #sometimes overlap with prev day
        if l2_files[-1].url().split('.')[6]=='0000': del l2_files[-1]
        
        if len(l1_files) != len(l2_files): print('Warning: L1-L2 file mismatch')
        
        all_dets = pd.DataFrame()
        
        for i in range(len(l2_files)): #for each of the 240 files per day
            
            timestamp = l2_files[i].path.split('.')[-4]
            print(timestamp)
            
            #Level 1 geolocation
            match = [f for f in l1_files if f.path.split('.')[-4]==timestamp][0]
            geo = xr.open_dataset(match, engine='h5netcdf', group='geolocation_data')
            lon = geo['longitude'][:]
            lat = geo['latitude'][:]
            _, j = np.indices(geo.longitude.shape) #line and sample
            
            #Level 2 fire product
            data = xr.open_dataset(l2_files[i], phony_dims='sort')

            daynight = data.DayNightFlag #string Day or Night - replace based on solar angle?
            
            qa = data.variables['algorithm QA'][:]
            fire = data.variables['fire mask'][:]  
            not_fires = (fire<7).values
            
            #start with known detections from L2 vectors
            df = data[columns].to_dataframe().rename(columns = column_names).reset_index(drop=True)

            df['scan'] = pix_lut.loc[df['sample'], 'along_scan'].values
            df['track'] = pix_lut.loc[df['sample'], 'along_track'].values
            df.daynight = df.daynight.replace({0:'N',1:'D'})
            df.confidence = df.confidence.replace({7:'l', 8:'n', 9: 'h'})
            
            #another dataframe for candidates
            values, counts = np.unique(qa, return_counts=True)

            table = pd.DataFrame(index = values, columns=range(22,-1,-1)) #[22,21,...0]
            for i1 in table.index:
                b = np.binary_repr(i1, width=23)
                b = [int(s) for s in b]
                table.loc[i1, :] = b
            #report back all the pixels that have an 8 or 10 ~ background or candidate fires
            cands = table[(table.loc[:,8]==1) | (table.loc[:,10]==1)].index
            cands = (np.isin(qa[:], cands) & (not_fires)) 
            
            df_cands = pd.DataFrame()
            df_cands['longitude'] = list(lon.values[cands])
            df_cands['latitude'] = list(lat.values[cands])
            df_cands['daynight'] = daynight[0]
            df_cands['confidence'] = 'c' #c for candidate
            df_cands['sample'] = list(j[cands]) #sample number for pixel size lookup
            df_cands['scan'] = pix_lut.loc[df_cands['sample'], 'along_scan'].values
            df_cands['track'] = pix_lut.loc[df_cands['sample'], 'along_track'].values
            
            combined = pd.concat([df, df_cands])
            
            #same for all in swath
            combined['acq_date'] = data.StartTime[:10]
            combined['acq_time'] = int(data.StartTime[11:13] + data.StartTime[14:16]) #imprecise
            combined['satellite'] = data.Satellite[0] #first character to match firms
            combined['version'] = data.VersionID

            all_dets = pd.concat([all_dets, combined])
            
        #export txt
        out_path = f'/projects/my-public-bucket/viirs/{product}DL/{t0[:4]}' #year
        if not os.path.exists(out_path):
            os.mkdir(out_path)
                
        all_dets.to_csv(f'{out_path}/{t0.replace("-","")}_cands.txt', index=False)

        t += dt.timedelta(1)

SNPP 2024-07-01 --------
Granules found: 242
Opening 242 granules, approx size: 35.51 GB
using provider: LAADS


QUEUEING TASKS | : 242it [00:00, 15795.30it/s]
PROCESSING TASKS | : 100%|██████████| 242/242 [00:01<00:00, 179.07it/s]
COLLECTING RESULTS | : 100%|██████████| 242/242 [00:00<00:00, 530037.37it/s]


Granules found: 242
0000
0006
0012
0018
0024
0030
0036
0042
0048
0054
0100
0106
0112
0118
0124
0130
0136
0142
0148
0154
0200
0206
0212
0218
0224
0230
0236
0242
0248
0254
0300
0306
0312
0318
0324
0330
0336
0342
0348
0354
0400
0406
0412
0418
0424
0430
0436
0442
0448
0454
0500
0506
0512
0518
0524
0530
0536
0542
0548
0554
0600
0606
0612
0618
0624
0630
0636
0642
0648
0654
0700
0706
0712
0718
0724
0730
0736
0742
0748
0754
0800
0806
0812
0818
0824
0830
0836
0842
0848
0854
0900
0906
0912
0918
0924
0930
0936
0942
0948
0954
1000
1006
1012
1018
1024
1030
1036
1042
1048
1054
1100
1106
1112
1118
1124
1130
1136
1142
1148
1154
1200
1206
1212
1218
1224
1230
1236
1242
1248
1254
1300
1306
1312
1318
1324
1330
1336
1342
1348
1354
1400
1406
1412
2036
2042
2048
2054
2100
2106
2112
2118
2124
2130
2136
2142
2148
2154
2200
2206
2212
2218
2224
2230
2236
2242
2248


PermissionError: The provided token has expired.