In [None]:
import json
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

import preproc

In [None]:
# Fractions (between 0 and 1, not percentages) of the full dataset that are
# reserved for the validation and test splits; the remainder is used for training.
VALID_SIZE = 0.2
TEST_SIZE = 0.2

# Fail fast on a misconfiguration that would leave no data for training.
assert VALID_SIZE > 0 and TEST_SIZE > 0 and VALID_SIZE + TEST_SIZE < 1, \
    'VALID_SIZE and TEST_SIZE must be positive and sum to less than 1'

## Load the original data
We also do our feature selection here. We only use latitude, longitude, and timestamp.

In [None]:
# Location of the reduced MODIS fire archive, relative to the notebook.
infile_path = 'dataset/modis_reduced/fire_archive_M6_96619.csv'

# Feature selection: these columns are discarded right after loading.
unused_columns = ['bright_t31', 'daynight', 'confidence', 'frp', 'brightness']

# Read the CSV and drop the unused columns in one chained expression, so the
# cell always rebuilds dataset_f from the file instead of mutating it in place.
dataset_f = pd.read_csv(infile_path).drop(columns=unused_columns)

## Preprocess the dataset using functions from preproc.py

In [None]:
# Compute the minimum & maximum values of each column (via preproc.get_bounds).
# These bounds are written to bounds.json at the end of the notebook so that the
# scaling can be undone later.
bounds = preproc.get_bounds(dataset_f)

# Scale all of the data using those bounds.
# This is also where tokens would be turned into integers, but this dataset has none.
# NOTE(review): dataset_f is overwritten here, so re-running this cell without
# re-running the load cell would presumably compute bounds from already-scaled
# data -- confirm, and run the notebook top to bottom.
dataset_f = preproc.preprocess(dataset_f, bounds)
# Preview the first rows of the preprocessed result.
dataset_f.head()

In [None]:
# Display the computed bounds for inspection.
bounds

## Arrange the dataset into sequences

In [None]:
# Split the preprocessed data into x and y values.
# NOTE(review): which columns end up in x vs. y is decided inside
# preproc.xy_split -- confirm there.
dset_x, dset_y = preproc.xy_split(dataset_f)
# Use a sliding window algorithm to turn x and y into sequences.
# NOTE(review): no window length or stride is passed here, so presumably
# preproc.sequencify's defaults apply -- verify against preproc.py.
sequences = preproc.sequencify(dset_x, dset_y)

## Split into training, validation, and testing

In [None]:
# First carve off the combined validation+test portion, then divide that
# portion between validation and test. shuffle=False keeps the sequences in
# their original order in every split.
holdout_fraction = TEST_SIZE+VALID_SIZE
train, t2 = train_test_split(sequences, test_size=holdout_fraction, shuffle=False)

# TEST_SIZE is expressed relative to the full dataset, so rescale it to be
# relative to the held-out portion that is being split here.
test_share_of_holdout = TEST_SIZE / holdout_fraction
valid, test = train_test_split(t2, test_size=test_share_of_holdout, shuffle=False)

print(f'{train.shape=}, {valid.shape=}, {test.shape=}')

## Write the preprocessed data to files
Use numpy for the train, test, and validation data (because it has too many dimensions for a csv), and JSON for the bounds because it's already a dict.

In [None]:
# Output directory for everything this notebook produces; created if missing.
root_dir = 'dataset/reduced_preprocessed'
os.makedirs(root_dir, exist_ok=True)

# Destination paths for the three splits and the bounds.
# np.save appends the '.npy' extension to the split paths on its own.
train_path = os.path.join(root_dir, 'train')
valid_path = os.path.join(root_dir, 'valid')
test_path = os.path.join(root_dir, 'test')
bounds_path = os.path.join(root_dir, 'bounds.json')

# Write the training, validation and test data, in that order.
for split_path, split_data in ((train_path, train), (valid_path, valid), (test_path, test)):
    np.save(split_path, split_data)

# The bounds are needed to undo the preprocessing later, so that the model
# outputs can be understood and graphed.
with open(bounds_path, mode='w') as bounds_file:
    json.dump(bounds, bounds_file)