In [1]:
import json
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

import preproc

In [2]:
# Fractions of the sequence set held out for validation and for testing.
# The split cell first carves off TEST_SIZE + VALID_SIZE, then divides that part.
VALID_SIZE = 0.2
TEST_SIZE = 0.2

## Load the original data

In [3]:
# Read the raw CSV and discard the columns this notebook does not keep.
infile_path = 'dataset/modis_reduced/fire_archive_M6_96619.csv'
dropped_columns = ['bright_t31', 'daynight', 'confidence', 'frp', 'brightness']
dataset_f = pd.read_csv(infile_path).drop(columns=dropped_columns)

## Preprocess the dataset

In [4]:
# Compute the bounds from the loaded frame before it is transformed; the
# recorded output shows a per-column dict of 'max'/'min' values.
bounds = preproc.get_bounds(dataset_f)

# NOTE(review): this rebinds dataset_f to the preprocessed frame, so re-running
# this cell without re-running the load cell first would compute `bounds` from
# already-preprocessed data. Consider a separate name for the result.
dataset_f = preproc.preprocess(dataset_f, bounds)
# Preview the first rows of the preprocessed frame.
dataset_f.head()

Unnamed: 0,latitude,longitude,timestamp
0,0.686746,0.502595,0.0
1,0.746756,0.339244,0.0
2,0.911892,0.479615,0.0
3,0.912868,0.476005,0.0
4,0.911927,0.476571,0.0


In [5]:
# Display the bounds dict returned by preproc.get_bounds; the same object is
# written to bounds.json in the final cell.
bounds

{'latitude': {'max': -11.0415, 'min': -16.7822},
 'longitude': {'max': 137.0908, 'min': 128.614},
 'timestamp': {'max': 1569886800.0, 'min': 1564646220.0}}

## Arrange the dataset into sequences

In [6]:
# Split the preprocessed frame into two parts via preproc.xy_split
# (presumably model inputs and targets — TODO confirm against preproc).
dset_x, dset_y = preproc.xy_split(dataset_f)
# Build the sequence array from both parts. The recorded output reports
# sequence_x.shape=(807, 64, 3) and sequence_y.shape=(807, 3); the returned
# `sequences` is what gets split into train/valid/test below.
sequences = preproc.sequencify(dset_x, dset_y)

sequence_x.shape=(807, 64, 3), sequence_y.shape=(807, 3)


In [7]:
# Order-preserving three-way split: shuffle=False keeps the sequences in their
# existing order, so train precedes valid, which precedes test.
holdout_fraction = TEST_SIZE + VALID_SIZE
train, holdout = train_test_split(sequences, test_size=holdout_fraction, shuffle=False)

# Share of the held-out part that becomes the test set; the rest is validation.
test_fraction = TEST_SIZE / holdout_fraction
valid, test = train_test_split(holdout, test_size=test_fraction, shuffle=False)

print(f'{train.shape=}, {valid.shape=}, {test.shape=}')

train.shape=(484, 65, 3), valid.shape=(161, 65, 3), test.shape=(162, 65, 3)


## Write the preprocessed data to disk (`.npy` splits and `bounds.json`)

In [8]:
# Output directory for the three splits and the bounds file.
root_dir = 'dataset/reduced_preprocessed'
os.makedirs(root_dir, exist_ok=True)

# Target paths for the training, validation and test arrays. They carry no
# extension, so np.save appends ".npy" to each file name.
train_path, valid_path, test_path = (
    os.path.join(root_dir, name) for name in ('train', 'valid', 'test')
)
for path, split in ((train_path, train), (valid_path, valid), (test_path, test)):
    np.save(path, split)

# Store the bounds dict next to the splits as JSON.
bounds_path = os.path.join(root_dir, 'bounds.json')
with open(bounds_path, mode='w') as bounds_file:
    json.dump(bounds, bounds_file)