# Imports

In [1]:
import tensorflow as tf

# Dataset
import numpy as np
import pandas as pd

# Constants

In [2]:
# Model
# Training hyper-parameters shared by the cells below.
LEARNING_RATE = 5e-2
# NOTE(review): this is a single optimizer *instance* created at import time;
# every model compiled with OPTIMIZER shares it.
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
LOSS = tf.keras.losses.Huber()
METRICS = ["mae", "mse"]
# NOTE(review): EPOCHS is not referenced by the training cell visible in this
# notebook, which passes a literal epoch count to model.fit instead.
EPOCHS = 1000

# Data split
TEST_SPLIT = 0.1                # Float (fraction of all rows) or int (absolute number of rows)
VALID_SPLIT = 0.1               # Float, fraction of the train+valid rows kept for validation
TRAIN_SPLIT = 1 - VALID_SPLIT   # Float, fraction of the train+valid rows kept for training

# Dataset window
# NOTE(review): the window sizes count dataset rows (time steps). The sample
# output in this notebook shows rows spaced 2 minutes apart, so "minutes" only
# holds for 1-minute sampling -- confirm the intended unit.
STEPS_SIZE = 15               # number of past rows fed to the model (originally annotated "minutes")
PREDICTS_SIZE = 1             # number of future rows to predict (originally annotated "minute(s)")
WINDOW_SIZE = STEPS_SIZE + PREDICTS_SIZE
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 64

# Dataset frame
NUM_OF_FEATURES = 6           # columns per row after preprocessing
NUM_OF_LABELS = 2             # trailing columns used as labels (latitude, longitude)

# File
DATASET_FILE_1 = './dataset/user_1/usr_1_w1_y2022.csv'
DATASET_FILE_2 = './dataset/user_1/usr_1_w2_y2022.csv'
# NOTE(review): lower-case name breaks the UPPER_SNAKE_CASE convention used by
# the other constants; left unchanged because other cells may reference it.
model_1_file = './model/user_1/'
##INFERENCE_DATASET_FILE = './dataset/user_1/usr_1_steps.csv'

# Utility Functions

In [3]:
# Measure distance
def distance(coordinate_1:tuple[float,float], coordinate_2:tuple[float, float]) -> float:
    """Measure the haversine (great-circle) distance between two coordinates.

    Args:
        coordinate_1: (latitude, longitude) in decimal degrees,
            ex. (3.10326051, 91.23206407)
        coordinate_2: (latitude, longitude) in decimal degrees

    Returns:
        Distance in meters on a sphere of radius 6,371,000 m.

    Reference: https://en.wikipedia.org/wiki/Haversine_formula
    """
    # constants
    earth_radius = 6371000 # in meters

    # unpack and convert params to radian
    lat_1, long_1 = np.radians(coordinate_1)
    lat_2, long_2 = np.radians(coordinate_2)

    d_lat = lat_2 - lat_1
    d_long = long_2 - long_1

    # haversine of the central angle
    haversine = (
        np.sin(d_lat/2) ** 2
        + np.cos(lat_1) * np.cos(lat_2) * np.sin(d_long/2) ** 2)

    # Floating-point rounding can push the term marginally outside [0, 1]
    # for (near-)antipodal points, which would make sqrt/arcsin return NaN.
    haversine = np.clip(haversine, 0.0, 1.0)

    # calculate and return distance
    return 2 * earth_radius * np.arcsin(np.sqrt(haversine))
    
distance((-6.200000, 106.816666),(-6.914744, 107.609810))
    

118291.96053744997

# Data

# BigQuery

In [None]:
# Build an authenticated BigQuery client from a service-account key file.
# NOTE(review): these imports live outside the notebook's import cell, so the
# dependency on google-cloud-bigquery is not visible at the top of the notebook.
from google.cloud import bigquery
from google.oauth2 import service_account
# NOTE(review): the key-file path is hard-coded and points inside ./dataset;
# make sure the key file itself is never committed or shared with the notebook.
credentials = service_account.Credentials.from_service_account_file('./dataset/safe-route-351803-701f86f6b63e.json')

project_id = 'safe-route-351803'
# Client used by the query cells below.
bqclient = bigquery.Client(credentials= credentials,project=project_id)

In [None]:
# Submit the training-data query (formatted date, time, lat, long; at most 10000 rows).
# The formatted date column has no alias; fetch_dataset renames 'f0_' to 'date',
# so it presumably comes back under BigQuery's auto-generated name 'f0_'.
# NOTE(review): there is no ORDER BY, so the row order is whatever the engine
# returns; the windowing code treats consecutive rows as consecutive time
# steps -- confirm the rows arrive in chronological order.
query_job = bqclient.query("""
   SELECT (FORMAT_DATE("%m/%d/%Y", datetime)), time, lat, long
   FROM user_data.user_1_week_1
   LIMIT 10000""")

#results = query_job.to_dataframe() # Wait for the job to complete.

In [None]:
# Submit the inference-data query (formatted date, time, latitude, longitude; at most 10000 rows).
# Unlike the training query, the coordinate columns are already named
# latitude/longitude, so fetch_dataset2 only renames the date column.
# NOTE(review): there is no ORDER BY; build_inference_dataset takes the *last*
# rows of the result as the most recent steps -- confirm the ordering.
query_job2 = bqclient.query("""
   SELECT (FORMAT_DATE("%m/%d/%Y", date)), time, latitude, longitude
   FROM temporary_user_data.user_1_temporary_steps
   LIMIT 10000""")


## Fetching

In [None]:
def fetch_dataset(source):
    """Materialize `source` as a pandas DataFrame with normalized column names.

    `source` only needs a `to_dataframe()` method (e.g. a BigQuery query job).
    The columns 'f0_', 'lat' and 'long' are renamed to 'date', 'latitude' and
    'longitude', and the 'time' column is cast to the pandas string dtype.
    """
    column_names = {'f0_':'date','lat':'latitude','long':'longitude'}
    frame = source.to_dataframe().rename(columns=column_names)
    # Keep 'time' as text so it can be split on ':' during preprocessing
    return frame.assign(time=frame["time"].astype("string", errors='ignore'))

data = fetch_dataset(query_job)
data

In [17]:
def fetch_dataset2(source):
    """Materialize `source` as a pandas DataFrame for inference.

    `source` only needs a `to_dataframe()` method (e.g. a BigQuery query job).
    Only the 'f0_' column is renamed (to 'date'); the 'time' column is cast to
    the pandas string dtype.
    """
    frame = source.to_dataframe().rename(columns={'f0_':'date'})
    # Keep 'time' as text so it can be split on ':' during preprocessing
    return frame.assign(time=frame["time"].astype("string", errors='ignore'))


Unnamed: 0,date,time,latitude,longitude
0,3/7/2022,0:00:00,-6.268917,106.780112
1,3/7/2022,0:02:00,-6.268917,106.780112
2,3/7/2022,0:04:00,-6.268917,106.780112
3,3/7/2022,0:06:00,-6.268917,106.780112
4,3/7/2022,0:08:00,-6.268917,106.780112
...,...,...,...,...
5035,3/13/2022,23:50:00,-6.268917,106.779552
5036,3/13/2022,23:52:00,-6.268917,106.779552
5037,3/13/2022,23:54:00,-6.268917,106.779552
5038,3/13/2022,23:56:00,-6.268917,106.779552


## Preprocessing

In [18]:
def preprocess_dataset(data):
    """Return a preprocessed copy of `data`; the input frame is left untouched.

    Expects the columns 'date', 'time' ("H:MM[:SS]" text), 'latitude' and
    'longitude'. The result:
    1. replaces the date with 'year', 'month' and 'day_of_week' columns
    2. converts the time to minutes since midnight (seconds are discarded)
    3. orders the columns as
       year, month, day_of_week, time, latitude, longitude

    The row index of `data` is preserved.
    """
    # Converting date string to datetime
    dates = pd.to_datetime(data["date"])

    # Converting time to cumulative minute
    # source: https://stackoverflow.com/questions/17951820/convert-hhmmss-to-minutes-using-python-pandas
    # credit: Andy Hayden
    minutes = data["time"].str.split(':').apply(
        lambda parts: int(parts[0]) * 60 + int(parts[1]))

    # assign() builds a new frame, so the caller's DataFrame (which may be a
    # slice of a bigger frame) is never modified in place.
    processed = data.assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day_of_week=dates.dt.day_of_week,
        time=minutes,
    )

    # Rearrange columns (this also drops 'date'); copy so the result is an
    # independent frame that callers can safely modify.
    return processed[["year", "month", "day_of_week", "time", "latitude", "longitude"]].copy()

# NOTE(review): this rebinds `data` to the preprocessed frame, so the cell is
# not idempotent -- running it a second time fails because the 'date' column
# no longer exists. Re-run the fetching cell first when re-executing.
data = preprocess_dataset(data)
data

Unnamed: 0,year,month,day_of_week,time,latitude,longitude
0,2022,3,0,0,-6.268917,106.780112
1,2022,3,0,2,-6.268917,106.780112
2,2022,3,0,4,-6.268917,106.780112
3,2022,3,0,6,-6.268917,106.780112
4,2022,3,0,8,-6.268917,106.780112
...,...,...,...,...,...,...
5035,2022,3,6,1430,-6.268917,106.779552
5036,2022,3,6,1432,-6.268917,106.779552
5037,2022,3,6,1434,-6.268917,106.779552
5038,2022,3,6,1436,-6.268917,106.779552


## Splitting

In [19]:
def split_dataset(data, train_split, test_split):
    """Split data sequentially (no shuffling) into train, valid, and test data.

    Args:
        data: any sliceable sequence with a length (DataFrame, list, array)
        train_split: float - fraction of the non-test rows used for training;
            the remaining non-test rows become the validation data
        test_split: float - fraction of all rows used for testing, or
            int - absolute number of rows used for testing

    Returns:
        (train_data, valid_data, test_data); the test data is always taken
        from the end of `data`.
    """
    total = len(data)

    # Split train_valid data and test data
    if isinstance(test_split, float):
        test_len = int(test_split * total)
    else:
        test_len = test_split

    # Slice by an explicit boundary index: with negative slicing a test
    # length of 0 would give `data[:-0]` (empty) and `data[-0:]` (everything),
    # i.e. the exact opposite of the requested split.
    boundary = max(total - test_len, 0)
    train_val_data, test_data = data[:boundary], data[boundary:]

    # Split train data and valid data
    train_len = int(len(train_val_data) * train_split)
    train_data, valid_data = train_val_data[:train_len], train_val_data[train_len:]

    return train_data, valid_data, test_data

train_data, valid_data, test_data = split_dataset(data, TRAIN_SPLIT, TEST_SPLIT)

print("Dataset Shape")
print(f'Train : {train_data.shape}')
print(f'Valid : {valid_data.shape}')
print(f'Test  : {test_data.shape}')

Dataset Shape
Train : (4082, 6)
Valid : (454, 6)
Test  : (504, 6)


## Windowing

In [20]:
def windowed_dataset(data, steps_size, predicts_size, batch_size, shuffle_buffer, num_of_labels=None):
    """Create a windowed, shuffled and batched tf.data dataset of (inputs, labels).

    Args:
        data: 2-D row-major data (rows = time steps, columns = features)
        steps_size: int - number of rows used as model input per window
        predicts_size: int - number of rows to predict per window; also the
            shift between consecutive windows
        batch_size: int - number of windows per batch
        shuffle_buffer: int - buffer size used to shuffle the windows
        num_of_labels: int or None - number of trailing columns used as
            labels; defaults to the module-level NUM_OF_LABELS

    Returns:
        A dataset yielding (x, y) batches where x holds the first
        `steps_size` rows of each window (all columns) and y holds the last
        `predicts_size` rows (last `num_of_labels` columns only).
    """
    if num_of_labels is None:
        num_of_labels = NUM_OF_LABELS
    window_size = steps_size + predicts_size

    # Converting to tfds
    wds = tf.data.Dataset.from_tensor_slices(data)

    # Data shifting; incomplete windows at the end are dropped
    wds = wds.window(window_size, shift=predicts_size, drop_remainder=True)

    # Flatten windows: each window (a nested dataset) becomes a single tensor
    wds = wds.flat_map(lambda window: window.batch(window_size))

    # Create window tuples (inputs, labels)
    wds = wds.map(lambda window: (window[:-predicts_size], window[-predicts_size:, -num_of_labels:]))

    # Shuffle windows
    wds = wds.shuffle(shuffle_buffer)

    # Batch windows
    wds = wds.batch(batch_size).prefetch(1)

    return wds

# Sanity check: build windows from the full preprocessed frame and print the
# shape of the first (inputs, labels) batch only.
wds = windowed_dataset(data, STEPS_SIZE, PREDICTS_SIZE, BATCH_SIZE, SHUFFLE_BUFFER_SIZE)
# `idx` is not used inside the loop; the loop stops after the first batch.
for idx,(x,y) in enumerate(wds):
    print("x = ", x.numpy().shape)
    print("y = ", y.numpy().shape)
    break

x =  (32, 15, 6)
y =  (32, 1, 2)


## Building

In [22]:
def build_dataset(source, **kwargs):
    """Build train-ready datasets: fetch, preprocess, split, then window.

    list of valid kwargs (each falls back to the module-level constant):
    - test_split: float or int - split value for test from whole dataset
    - valid_split: float - split value for valid from train_valid dataset
    - steps_size: int - number of steps used for prediction
    - predicts_size: int - number of predictions
    - batch_size: int - dataset batch size
    - shuffle_buffer_size: int - shuffle buffer size

    `num_of_features` and `num_of_labels` are still accepted for backward
    compatibility but have no effect here: this function never reads them.
    Any other unknown keyword is silently ignored as well.

    Returns:
        (train, valid, test) windowed datasets.
    """
    # BUILD CONSTANTS
    # Data split
    test_split = kwargs.get('test_split', TEST_SPLIT)
    valid_split = kwargs.get('valid_split', VALID_SPLIT)
    train_split = 1 - valid_split

    # Dataset window
    steps_size = kwargs.get('steps_size', STEPS_SIZE)
    predicts_size = kwargs.get('predicts_size', PREDICTS_SIZE)
    batch_size = kwargs.get('batch_size', BATCH_SIZE)
    shuffle_buffer_size = kwargs.get('shuffle_buffer_size', SHUFFLE_BUFFER_SIZE)

    # FETCHING DATASET
    ds = fetch_dataset(source) # use await for later asynchronous usage

    # PREPROCESSING DATASET
    ds = preprocess_dataset(ds)

    # SPLITTING DATASET
    _train_ds, _valid_ds, _test_ds = split_dataset(ds, train_split, test_split)

    # WINDOWING AND RETURNING DATASET
    return \
        windowed_dataset(_train_ds, steps_size, predicts_size, batch_size, shuffle_buffer_size), \
        windowed_dataset(_valid_ds, steps_size, predicts_size, batch_size, shuffle_buffer_size), \
        windowed_dataset(_test_ds, steps_size, predicts_size, batch_size, shuffle_buffer_size)

train_wds, valid_wds, test_wds = build_dataset(query_job)

# Model

## Base Model

In [23]:
def create_model():
    """Create and compile the forecasting model.

    Model used: three stacked LSTM layers followed by dense layers.
    Input shape is (STEPS_SIZE, NUM_OF_FEATURES); the output consists of
    NUM_OF_LABELS values (2 items, latitude and longitude).

    Returns:
        A compiled tf.keras Sequential model.
    """
    tf.keras.backend.clear_session()
    # Generating model
    model = tf.keras.models.Sequential([
        tf.keras.layers.LSTM(64, activation='sigmoid', input_shape=(STEPS_SIZE, NUM_OF_FEATURES), return_sequences=True),
        tf.keras.layers.LSTM(32, activation='sigmoid', return_sequences=True),
        tf.keras.layers.LSTM(16, activation='sigmoid'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(16, activation='sigmoid'),
        tf.keras.layers.Dense(8, activation='sigmoid'),
        tf.keras.layers.Dense(NUM_OF_LABELS, activation='linear')
    ])

    # OPTIMIZER is a single module-level instance. An optimizer keeps
    # per-model state (slot variables, iteration counter), so every model gets
    # its own clone with the same configuration instead of sharing that
    # instance across repeated create_model() calls.
    optimizer = type(OPTIMIZER).from_config(OPTIMIZER.get_config())

    # Compiling model
    model.compile(
        loss=LOSS,
        optimizer=optimizer,
        metrics=METRICS,
    )

    return model
model = create_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 15, 64)            18176     
                                                                 
 lstm_1 (LSTM)               (None, 15, 32)            12416     
                                                                 
 lstm_2 (LSTM)               (None, 16)                3136      
                                                                 
 flatten (Flatten)           (None, 16)                0         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 2)                 1

## Training

In [13]:
# Train on the windowed training set and validate on the validation set.
# NOTE(review): the epoch count is hard-coded to 20 here; the EPOCHS constant
# (1000) from the constants cell is not used.
model.fit(train_wds, epochs=20, validation_data=valid_wds)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x16b77ca3f70>

## Evaluating Model

In [None]:
model.evaluate(test_wds)

## Using Model

### Converting Data

In [None]:
def convert_data(data):
    """Convert the latest rows of a preprocessed DataFrame to model input.

    Takes the last STEPS_SIZE rows of `data`, appends PREDICTS_SIZE all-zero
    placeholder rows (standing in for the unknown labels) and windows the
    result so that it yields exactly one (inputs, labels) window.

    Returns:
        The windowed dataset, or None when `data` has fewer than STEPS_SIZE
        rows.
    """
    # Take last "steps size" rows (slice first; no need to copy the full frame)
    cdata = data[-STEPS_SIZE:]
    if len(cdata) != STEPS_SIZE:
        # Not enough data to do prediction
        return None

    # Add empty row(s) in the bottom. Appending with concat(ignore_index=True)
    # always adds new rows; assigning through `.loc[<row count>]` would instead
    # overwrite a real row whenever that label already exists in the sliced
    # index.
    placeholder = pd.DataFrame(
        np.zeros((PREDICTS_SIZE, NUM_OF_FEATURES)), columns=cdata.columns)
    cdata = pd.concat([cdata, placeholder], ignore_index=True)

    return windowed_dataset(cdata, STEPS_SIZE, PREDICTS_SIZE, BATCH_SIZE, SHUFFLE_BUFFER_SIZE)

# NOTE(review): build_inference_dataset is defined in a later cell of this
# notebook, so on a fresh kernel (Restart & Run All) this line raises
# NameError. The same call is repeated right after that definition; this
# copy looks like a leftover.
predict_data = build_inference_dataset(query_job2)
predict_data

### Build Inference Input (Dataset)

In [29]:
def build_inference_dataset(source, **kwargs):
    """Build inference dataset in the form of prefetch dataset from given source.

    list of valid kwargs (each falls back to the module-level constant):
    - steps_size: int - number of steps used for prediction
    - predicts_size: int - number of predictions
    - batch_size: int - dataset batch size
    - shuffle_buffer_size: int - shuffle buffer size
    - num_of_features: int - number of features (width of the placeholder rows)

    Other keywords (e.g. test_split, valid_split, num_of_labels) are accepted
    but ignored: no splitting happens for inference.

    Returns:
        A windowed dataset holding one window built from the last
        `steps_size` rows of the source, or None when the source has fewer
        rows than that.
    """
    # BUILD CONSTANTS
    # Dataset window
    steps_size = kwargs.get('steps_size', STEPS_SIZE)
    predicts_size = kwargs.get('predicts_size', PREDICTS_SIZE)
    batch_size = kwargs.get('batch_size', BATCH_SIZE)
    shuffle_buffer_size = kwargs.get('shuffle_buffer_size', SHUFFLE_BUFFER_SIZE)

    # Dataset frame
    num_of_features = kwargs.get('num_of_features', NUM_OF_FEATURES)

    # FETCHING DATASET
    ds = fetch_dataset2(source) # use await for later asynchronous usage
    # take only n-steps; copy so preprocessing never works on a slice view
    ds = ds[-steps_size:].copy()
    if len(ds) != steps_size:
        # Not enough data to do prediction
        return None

    # PREPROCESSING DATASET
    ds = preprocess_dataset(ds)

    # CONVERTING TO INFERENCE SHAPE
    # Add empty row(s) as empty label. Appending with concat(ignore_index=True)
    # always adds new rows; assigning through `.loc[<row count>]` would instead
    # overwrite a real row whenever that label already exists in the sliced
    # index (source with 16 to 30 rows for the default steps_size of 15).
    placeholder = pd.DataFrame(
        np.zeros((predicts_size, num_of_features)), columns=ds.columns)
    ds = pd.concat([ds, placeholder], ignore_index=True)

    # WINDOWING AND RETURNING DATASET
    return windowed_dataset(ds, steps_size, predicts_size, batch_size, shuffle_buffer_size)

predict_data = build_inference_dataset(query_job2)
predict_data

<PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 6), dtype=tf.float64, name=None), TensorSpec(shape=(None, None, 2), dtype=tf.float64, name=None))>

### Predicting Data

In [16]:
def predict(model, data):
    """Predict the next coordinate from the latest rows of `data`.

    Args:
        model: object exposing `predict(dataset)` (the trained Keras model)
        data: preprocessed DataFrame; its last STEPS_SIZE rows are the input

    Returns:
        The first prediction returned by the model (latitude, longitude), or
        None when `data` does not hold enough rows.
    """
    steps_data = convert_data(data)
    # Compare against None explicitly: convert_data signals "not enough data"
    # with None, and the truthiness of a dataset object is not a reliable
    # emptiness test.
    if steps_data is None:
        return None
    # Else return prediction (a single window was built, take its prediction)
    return model.predict(steps_data)[0]


# Predict from the first 5000 rows (positional slice) ...
print("\n\n========================================")
print("Predict")
print("========================================\n")
print(predict(model, data[:5000]))

# ... and show the actual coordinates of the row labeled 5000 for comparison.
# NOTE(review): `data[:5000]` is positional while `data.loc[5000]` is
# label-based; they line up only when the index is the default 0..N-1 range.
print("\n\n========================================")
print("Label")
print("========================================\n")
print([data.loc[5000]["latitude"], data.loc[5000]["longitude"]])



Predict

[ -6.2280617 106.81565  ]


Label

[-6.229087622, 106.7979363]


## Trigger

In [17]:
# Constants
max_range = 1000 # in meters

# Trigger function
def calculate_trigger(
        coordinate_1:tuple[float,float],
        coordinate_2:tuple[float,float],
        max_range:float) -> bool:
    """Tell whether two coordinates are further apart than `max_range`.

    Args:
        coordinate_1: (latitude, longitude) in decimal degrees
        coordinate_2: (latitude, longitude) in decimal degrees
        max_range: threshold in meters

    Returns:
        True if the haversine distance is strictly more than max_range.

    The measured distance (in meters) is also printed.
    """
    # Measure once and reuse the value for both the printout and the check
    measured = distance(coordinate_1, coordinate_2)
    print(measured)
    # Convert the NumPy comparison result to the built-in bool the signature promises
    return bool(measured > max_range)

calculate_trigger(
    predict(model, data[:5000]),
    (data.loc[5000]["latitude"], data.loc[5000]["longitude"]),
    max_range)

1961.1701042958343


True

# References

- [Sequences, Time Series and Prediction by DeepLearning.AI](https://www.coursera.org/learn/tensorflow-sequences-time-series-and-prediction)
- [Multi-Variate Time Series Forecasting Tensorflow by Nicholas Jhana](https://www.kaggle.com/code/nicholasjhana/multi-variate-time-series-forecasting-tensorflow/notebook#Visualizing-Predictions)