# Prepare MODIS data for ML
1. Load in all the raw data
2. Drop all of the columns we don't want
3. Turn dates/times into unix timestamps
4. Split into training, validation, and testing

In [None]:
import glob
import os
from datetime import datetime, timezone

import pandas as pd

## Load all of the files

In [None]:
# collect the raw CSV files whose names contain "M6"
path = 'dataset/raw_data/'
pattern = os.path.join(path, '*M6*.csv')
all_files = glob.glob(pattern)

In [None]:
# load files
# Each CSV is stored under its bare file name (directory removed, everything
# from the first '.' onwards removed). os.path.basename is used rather than
# splitting on '/' so the key is still correct on platforms where glob
# returns paths with a different separator.
# Every column is read as a string, so fixed-width fields such as acq_time
# (sliced as HHMM further down) keep any leading zeros.
dataframes = {}
for file in all_files:
    file_str = os.path.basename(file).split('.')[0]
    print(file_str)
    dataframes[file_str] = pd.read_csv(file, dtype=str)

## Take a look at the dataset
We will keep the latitude, longitude, acq_date, acq_time, confidence, bright_t31, brightness, frp, and daynight columns, because these are the only ones that contain interesting information (instrument is dropped along with the rest). Feature selection will be a problem for future me.

In [None]:
# the only columns that survive; every other column is discarded
to_keep = ["latitude", "longitude", "acq_date", "acq_time", "confidence", "bright_t31", "brightness", "frp", "daynight"]
for name, frame in dataframes.items():
    # drop in place so the frames stored in `dataframes` are trimmed directly
    unwanted = [col for col in frame.columns if col not in to_keep]
    frame.drop(columns=unwanted, inplace=True)
    print(name)
    display(frame.head(10))

## Convert dates/times into timestamps

In [None]:
# this converts acq_time to "timestamps" (where 12:00am that day is t=0)
def time_to_secs(x):
    """Convert an HHMM time of day into seconds since midnight.

    Args:
        x: Time of day as hours followed by two minute digits, e.g. '0430'.
            Values that lost their leading zeros ('430', '5') and plain
            integers (430) are accepted as well.

    Returns:
        Whole seconds elapsed since 00:00 of the same day.

    Raises:
        ValueError: if the value does not consist of digits.
    """
    # Left-pad to four digits before slicing; without this '430' would be
    # read as 43 hours instead of 04:30.
    hhmm = str(x).strip().zfill(4)
    hrs = int(hhmm[:2])
    mins = int(hhmm[2:])
    return hrs*3600 + mins*60

# this converts acq_date to timestamps
def date_to_secs(x):
    """Convert a 'YYYY-MM-DD' date string into a unix timestamp.

    Args:
        x: Calendar date formatted as '%Y-%m-%d', e.g. '2019-10-01'.

    Returns:
        Integer seconds since the unix epoch for 00:00 UTC on that date.

    Raises:
        ValueError: if `x` does not match the '%Y-%m-%d' format.
    """
    # A naive datetime's .timestamp() is interpreted in the machine's local
    # timezone, so the result would differ from one computer to the next.
    # Pinning the date to UTC makes the output reproducible.
    # NOTE(review): this assumes acq_date/acq_time are recorded in UTC - confirm
    # against the data provider's documentation.
    midnight = datetime.strptime(x, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    return int(midnight.timestamp())


# quick sanity check: 00:00 UTC on 2019-10-01 is 1569888000
date_to_secs('2019-10-01')

## Drop any values outside of our geographical range
This step is done because we are preparing for the reduced geographical range model. If we were preparing data for the baseline model, we would skip this step.

In [None]:
# bounding box of the reduced geographical range
lon_min = 128.37519675220057
lon_max = 138.06885989500768
lat_min = -16.89612224844862
lat_max = -10.803096222169515

# keep only the rows inside the box (both edges inclusive); the coordinates
# are stored as strings, so they are cast to float for the comparison only
for name, frame in dataframes.items():
    frame = frame[frame['latitude'].astype(float).between(lat_min, lat_max)]
    frame = frame[frame['longitude'].astype(float).between(lon_min, lon_max)]

    dataframes[name] = frame

## Compute timestamps from acq_date and acq_time

In [None]:
out_frames = {}
for name, frame in dataframes.items():
    print(list(frame.columns))

    # seconds elapsed within the day, taken from acq_time
    day_offset = frame['acq_time'].apply(time_to_secs)
    # timestamp of the start of the day, taken from acq_date
    day_start = frame['acq_date'].apply(date_to_secs)

    # append the combined value as a 'timestamp' column, then discard the
    # two source columns it was built from
    stamped = frame.assign(timestamp=day_offset + day_start)
    stamped = stamped.drop(columns=['acq_date', 'acq_time'])

    out_frames[name] = stamped
    display(stamped.head())

## Write the data

In [None]:
out_dir = 'dataset/modis_reduced'
os.makedirs(out_dir, exist_ok=True)

# write one CSV per entry, named after its key, without the index column
for name, frame in out_frames.items():
    path = os.path.join(out_dir, f'{name}.csv')
    print(path)
    frame.to_csv(path, index=False)