In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd

from src.paths import DATA_DIR, RAW_DATA_DIR, TEMPORARY_DATA, TRAINING_DATA, PARENT_DIR
from src.data_extraction import load_raw_data
from src.data_transformations import (
  clean_raw_data, transform_cleaned_data_into_ts_data, transform_ts_into_training_data
)

## Get the raw data

In [None]:
# Materialise everything load_raw_data yields for 2023 into a list, so the
# result can be inspected and reused by the following cells.
trips_per_month_2023 = [*load_raw_data(year=2023)]

In [None]:
# Stack all of the 2023 monthly frames that were loaded into one dataframe.
trips = pd.concat(objs=trips_per_month_2023)

## Clean the data

In [None]:
# Clean the data.
# NOTE: `trips` is rebound to the cleaned result, so the raw concatenated frame
# is no longer reachable under this name, and re-running this cell feeds the
# already-cleaned data back into clean_raw_data.
trips = clean_raw_data(trips)

In [None]:
# Bare last expression: renders the current `trips` object as this cell's output.
trips

## Define datasets

In [None]:
# Split the trips into two three-column subsets: one describing where/when each
# trip began, the other where/when it ended.
starts = trips[["start_time", "start_latitude", "start_longitude"]]
stops = trips[["stop_time", "stop_latitude", "stop_longitude"]]

## Transform datasets into time series

In [None]:
# Pass both subsets to transform_cleaned_data_into_ts_data and unpack the pair
# it returns (one result for the starts, one for the stops).
agg_starts, agg_stops = transform_cleaned_data_into_ts_data(
    start_df=starts,
    stop_df=stops,
)

In [None]:
# Keep only the first three columns (by position) of each aggregated frame.
trimmed_agg_starts = agg_starts.iloc[:, :3]
trimmed_agg_stops = agg_stops.iloc[:, :3]

# Persist the trimmed frames as parquet files under TEMPORARY_DATA.
trimmed_agg_starts.to_parquet(TEMPORARY_DATA / "agg_starts.parquet")
trimmed_agg_stops.to_parquet(TEMPORARY_DATA / "agg_stops.parquet")

## Transform time series data into training data

### Start data

In [None]:
# Build the features/target pair for the "start" series.
# input_seq_len is 24 * 28 * 1 = 672 steps per input sequence; presumably this
# is 28 days of hourly data — TODO confirm against transform_ts_into_training_data.
start_features, start_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_starts,
    start_or_stop="start",
    input_seq_len=24 * 28 * 1,
    step_size=24,
)

# Report the shapes of both outputs as a quick sanity check.
print(f"{start_features.shape=}")
print(f"{start_target.shape=}")

In [None]:
# Write the start features (parquet) and the start target (CSV) to TEMPORARY_DATA.
start_features.to_parquet(TEMPORARY_DATA / "start_features.parquet")
start_target.to_csv(TEMPORARY_DATA / "start_target.csv")

#### Saving the final tabular dataset

In [None]:
# Build the final training table as a NEW frame: every feature column plus the
# target stored under "trips_next_hour".
# assign() returns a new DataFrame, so `start_features` itself is left without
# the target column. Plain `start_table = start_features` would only alias the
# same object, and adding the column would then silently put the target into
# the features frame for any later (or re-run) use of `start_features`.
start_table = start_features.assign(trips_next_hour=start_target)

# Persist the combined table as parquet under TRAINING_DATA.
start_table.to_parquet(TRAINING_DATA / "start_table.parquet")

### Stop data

In [None]:
# Build the features/target pair for the "stop" series, using the same
# sequence length (24 * 28 * 1 = 672) and step size (24) as for the start data.
# NOTE(review): in the cells visible here the stop features/target are not
# written to TEMPORARY_DATA the way the start ones are — confirm this is intended.
stop_features, stop_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_stops,
    start_or_stop="stop",
    input_seq_len=24 * 28 * 1,
    step_size=24,
)

# Report the shapes of both outputs as a quick sanity check.
print(f"{stop_features.shape=}")
print(f"{stop_target.shape=}")

#### Saving the final tabular dataset

In [None]:
# Build the final training table as a NEW frame: every feature column plus the
# target stored under "trips_next_hour".
# assign() returns a new DataFrame, so `stop_features` itself is left without
# the target column. Plain `stop_table = stop_features` would only alias the
# same object, and adding the column would then silently put the target into
# the features frame for any later (or re-run) use of `stop_features`.
stop_table = stop_features.assign(trips_next_hour=stop_target)

# Persist the combined table as parquet under TRAINING_DATA.
stop_table.to_parquet(TRAINING_DATA / "stop_table.parquet")