# Prepare MODIS data for ML
1. Load in all the raw data
2. Drop all of the columns we don't want
3. Turn dates/times into unix timestamps
3. Split into training, validation, and testing

In [1]:
import os
import glob

import pandas as pd
from sklearn.model_selection import train_test_split

from datetime import datetime

Load all of the files

In [2]:
# get files
path = 'dataset/raw_data/'
all_files = glob.glob(os.path.join(path, '*M6*.csv'))

In [3]:
# load files
dataframes = {}
for file in all_files:
    file_str = file.split('/')[-1].split('.')[0]
    print(file_str)
    dataframes[file_str] = pd.read_csv(file, dtype=str)

fire_archive_M6_96619
fire_nrt_M6_96619


## Take a look at the dataset
We will keep the latitude, longitude, acq_date, acq_time, instrument, confidence, bright_t31, brightness, frp, and daynight

In [4]:
to_keep = ["latitude", "longitude", "acq_date", "acq_time", "confidence", "bright_t31", "brightness", "frp", "daynight"]
# drop all of the other columns
for k,v in dataframes.items():
    cols = v.columns
    to_drop = [c for c in cols if c not in to_keep]
    v.drop(to_drop, inplace=True, axis=1)
    print(k)
    display(v.head(10))

fire_archive_M6_96619


Unnamed: 0,latitude,longitude,brightness,acq_date,acq_time,confidence,bright_t31,frp,daynight
0,-11.807,142.0583,313.0,2019-08-01,56,48,297.3,6.6,D
1,-11.7924,142.085,319.3,2019-08-01,56,71,297.3,11.3,D
2,-12.8398,132.8744,311.6,2019-08-01,57,42,298.7,23.1,D
3,-14.4306,143.3035,310.1,2019-08-01,57,33,296.1,6.5,D
4,-12.4953,131.4897,310.3,2019-08-01,57,36,298.8,27.6,D
5,-12.6191,142.1998,314.8,2019-08-01,57,68,297.6,9.3,D
6,-14.3655,143.5682,305.4,2019-08-01,57,24,283.9,5.9,D
7,-14.3195,143.5198,322.9,2019-08-01,57,79,290.9,20.4,D
8,-13.1654,141.9715,317.2,2019-08-01,57,72,300.0,9.9,D
9,-11.5473,132.6796,311.5,2019-08-01,57,40,298.7,27.3,D


fire_nrt_M6_96619


Unnamed: 0,latitude,longitude,brightness,acq_date,acq_time,confidence,bright_t31,frp,daynight
0,-14.281,143.636,323.9,2019-10-01,25,70,302.3,26.8,D
1,-14.284,143.532,343.5,2019-10-01,25,90,306.3,84.3,D
2,-14.302,143.706,320.2,2019-10-01,25,30,305.0,14.1,D
3,-14.283,143.652,320.4,2019-10-01,25,57,303.3,18.4,D
4,-14.285,143.521,349.4,2019-10-01,25,94,304.7,110.7,D
5,-14.273,143.589,328.1,2019-10-01,25,62,304.6,32.4,D
6,-14.768,141.792,324.3,2019-10-01,25,57,308.1,24.0,D
7,-14.304,143.512,338.0,2019-10-01,25,86,304.0,66.1,D
8,-14.267,143.557,329.3,2019-10-01,25,65,304.2,34.2,D
9,-14.307,143.528,331.1,2019-10-01,25,70,305.2,40.8,D


Convert dates/times into timestamps

In [5]:
def time_to_secs(x):
    hrs = int(x[:2])
    mins = int(x[2:])
    return hrs*3600 + mins*60

def date_to_secs(x):
    return int(datetime.strptime(x, '%Y-%m-%d').timestamp())


date_to_secs('2019-10-01')

1569913200

## Drop any values outside of our geographical range

In [11]:
lon_min = 128.37519675220057
lon_max = 138.06885989500768
lat_min = -16.89612224844862
lat_max = -10.803096222169515

for k,v in dataframes.items():
    v = v[v['latitude'].astype(float) <= lat_max]
    v = v[v['latitude'].astype(float) >= lat_min]
    v = v[v['longitude'].astype(float) <= lon_max]
    v = v[v['longitude'].astype(float) >= lon_min]

    dataframes[k] = v

## Compute timestamps from acq_data and acq_time

In [12]:
out_frames = {}
for k,v in dataframes.items():
    print(list(v.columns))
    secs = v['acq_time'].apply(time_to_secs)
    bases = v['acq_date'].apply(date_to_secs)
    ts = pd.Series(data=secs+bases, name='timestamp')
    timestamped = pd.concat([v, ts], axis=1)
    timestamped.drop(['acq_date', 'acq_time'], inplace=True, axis=1)
    out_frames[k] = timestamped
    display(timestamped)

['latitude', 'longitude', 'brightness', 'acq_date', 'acq_time', 'confidence', 'bright_t31', 'frp', 'daynight']


Unnamed: 0,latitude,longitude,brightness,confidence,bright_t31,frp,daynight,timestamp
2,-12.8398,132.8744,311.6,42,298.7,23.1,D,1564646220
4,-12.4953,131.4897,310.3,36,298.8,27.6,D,1564646220
9,-11.5473,132.6796,311.5,40,298.7,27.3,D,1564646220
10,-11.5417,132.649,312.2,42,298,31.4,D,1564646220
11,-11.5471,132.6538,316,65,298,52.7,D,1564646220
...,...,...,...,...,...,...,...,...
35986,-13.9292,132.9576,317.4,95,297,13.6,N,1569886800
35987,-13.9278,132.9481,315.3,91,297.9,11.6,N,1569886800
35988,-13.92,132.959,322,100,298.2,18,N,1569886800
35989,-13.9187,132.9494,310.6,80,297.6,7.3,N,1569886800


['latitude', 'longitude', 'brightness', 'acq_date', 'acq_time', 'confidence', 'bright_t31', 'frp', 'daynight']


Unnamed: 0,latitude,longitude,brightness,confidence,bright_t31,frp,daynight,timestamp
111,-12.987,132.578,335.6,81,304.4,103.3,D,1569920400
112,-12.07,134.281,321.1,56,298.3,69.9,D,1569920400
113,-12.059,134.276,319.2,20,298.3,55.6,D,1569920400
114,-12.983,132.544,324.1,47,301.8,34.3,D,1569920400
115,-12.465,132.504,330.1,76,304.6,58.5,D,1569920400
...,...,...,...,...,...,...,...,...
183226,-16.774,137.45,333.1,72,299.8,41.7,D,1578744900
183227,-16.775,137.443,327.4,48,297.6,25.2,D,1578744900
183228,-16.741,137.438,329.8,54,300.7,27.6,D,1578744900
183589,-16.708,129.646,300.5,16,280.8,2.9,N,1578789900


## Write the data

In [13]:
out_dir = 'dataset/modis_reduced'
os.makedirs(out_dir, exist_ok=True)
for k,v in out_frames.items():
    path = os.path.join(out_dir, k+'.csv')
    print(path)
    v.to_csv(path, index=False)

dataset/modis_reduced/fire_archive_M6_96619.csv
dataset/modis_reduced/fire_nrt_M6_96619.csv
