In [None]:
# Load (or reload) IPython's autoreload extension so that edits to the project
# modules imported below (src.*) are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2 of autoreload: re-import modules before each cell is executed
# (per the IPython autoreload documentation).
%autoreload 2

import pandas as pd

from src.paths import DATA_DIR, RAW_DATA_DIR, TEMPORARY_DATA, TRAINING_DATA, PARENT_DIR, TIME_SERIES_DATA
from src.data_extraction import load_raw_data

from src.data_transformations import (
  clean_raw_data, transform_cleaned_data_into_ts_data, transform_ts_into_training_data
)

## Get the raw data

### Only data from June 2023 through the end of January 2024 is considered, due to memory limitations encountered when attempting hyperparameter tuning on the full start dataset.


In [None]:
# Collect everything load_raw_data yields for June through December 2023
# (months 6..12) into a list so it can be concatenated later.
trips_2023 = [
    monthly_trips
    for monthly_trips in load_raw_data(year=2023, months=[6, 7, 8, 9, 10, 11, 12])
]

### January 2024

In [None]:
# Same loader call as for 2023, restricted to the single month January 2024.
jan_2024 = [
    monthly_trips
    for monthly_trips in load_raw_data(year=2024, months=[1])
]

In [None]:
# Stack the 2023 pieces followed by the January 2024 pieces into a single
# pandas object holding every loaded trip.
trips = pd.concat([*trips_2023, *jan_2024])

## Clean the data

In [None]:
# Run the concatenated raw trips through the project's clean_raw_data helper;
# every later cell works from this cleaned result rather than from `trips`.
clean_trips = clean_raw_data(trips)

In [None]:
# Display the cleaned trips as the cell output for a quick visual sanity check.
clean_trips

## Define datasets

In [None]:
# Split the cleaned trips into two three-column frames: one describing where
# and when each trip started, the other where and when it stopped.
starts, stops = (
    clean_trips[columns]
    for columns in (
        ["start_time", "start_latitude", "start_longitude"],
        ["stop_time", "stop_latitude", "stop_longitude"],
    )
)

## Transform datasets into time series

In [None]:
# Turn the start and stop frames into their aggregated time-series
# counterparts; the helper returns them as a (starts, stops) pair.
agg_starts, agg_stops = transform_cleaned_data_into_ts_data(
    start_df=starts,
    stop_df=stops,
)

### Save time series

In [None]:
# Keep only the first three columns of each aggregated frame; these trimmed
# frames are what gets persisted and what feeds the training-data step.
trimmed_agg_starts, trimmed_agg_stops = (
    frame.iloc[:, :3] for frame in (agg_starts, agg_stops)
)

# Persist both trimmed time series as parquet files under TEMPORARY_DATA.
trimmed_agg_starts.to_parquet(TEMPORARY_DATA / "agg_starts.parquet")
trimmed_agg_stops.to_parquet(TEMPORARY_DATA / "agg_stops.parquet")

## Transform time series data into training data

### Start data

In [None]:
# Build the training inputs for trip starts: the helper returns a
# (features, target) pair derived from the trimmed start time series.
start_features, start_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_starts,
    start_or_stop="start",
    input_seq_len=24 * 28 * 1,  # 672 steps; presumably 28 days of hourly values -- confirm
    step_size=24,
)

# Report both shapes, one per line.
print(f"{start_features.shape=}", f"{start_target.shape=}", sep="\n")

In [None]:
# start_table is another name for the very same object as start_features (no
# copy is made), so attaching the target column here is visible through both.
start_table = start_features
start_table["trips_next_hour"] = start_target

### Stop data

In [None]:
# Build the training inputs for trip stops: the helper returns a
# (features, target) pair derived from the trimmed stop time series.
stop_features, stop_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_stops,
    start_or_stop="stop",
    input_seq_len=24 * 28 * 1,  # 672 steps; presumably 28 days of hourly values -- confirm
    step_size=24,
)

# Report both shapes, one per line.
print(f"{stop_features.shape=}", f"{stop_target.shape=}", sep="\n")

In [None]:
# stop_table is another name for the very same object as stop_features (no
# copy is made), so attaching the target column here is visible through both.
stop_table = stop_features
stop_table["trips_next_hour"] = stop_target

### Changing data types to reduce memory load

In [None]:
import numpy as np
from tqdm import tqdm
from src.miscellaneous import change_column_data_type


def _narrowest_signed_int(lowest, highest):
    """Return the smallest signed NumPy integer type that can hold both bounds.

    np.int8 (the width this notebook has always requested) is tried first and a
    wider type is chosen only when the observed values would not fit, so that
    the conversion can never silently wrap around.

    Args:
        lowest: smallest value observed in the columns to convert.
        highest: largest value observed in the columns to convert.

    Returns:
        One of np.int8, np.int16, np.int32, np.int64.

    Raises:
        ValueError: if the bounds do not fit even into np.int64.
    """
    if pd.isna(lowest) or pd.isna(highest):
        # Nothing measurable (e.g. an empty table): keep the original request.
        return np.int8

    for candidate in (np.int8, np.int16, np.int32, np.int64):
        limits = np.iinfo(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return candidate

    raise ValueError(
        f"Trip counts in [{lowest}, {highest}] do not fit any signed integer type"
    )


def _sorted_with_compact_trip_columns(table):
    """Return `table` sorted by index with every "trips" column stored compactly.

    DataFrame.sort_index() returns a NEW frame, so the sorted frame is the one
    that is converted and handed back. The caller must rebind its name to the
    return value; converting a sorted copy and then dropping it leaves the
    table that is saved later unsorted and unconverted.

    Args:
        table: training table whose column names containing "trips" hold counts.

    Returns:
        The index-sorted frame, with the trip columns converted.
    """
    sorted_table = table.sort_index()

    trip_columns = [col for col in sorted_table.columns if "trips" in col]
    if not trip_columns:
        return sorted_table

    # Measure column by column to avoid copying the whole block of trip columns.
    lowest = pd.Series([sorted_table[col].min() for col in trip_columns]).min()
    highest = pd.Series([sorted_table[col].max() for col in trip_columns]).max()
    to_format = _narrowest_signed_int(lowest, highest)

    for col in trip_columns:
        # NOTE(review): change_column_data_type is assumed to convert the column
        # of `data` in place (its return value has never been used) -- confirm.
        change_column_data_type(
            data=sorted_table,
            columns=col,
            to_format=to_format,
        )

    return sorted_table


# Rebind the tables so the sorted, compact frames are what the rest of the
# notebook sees. start_features / stop_features keep pointing at the original,
# unconverted frames.
start_table = _sorted_with_compact_trip_columns(start_table)
stop_table = _sorted_with_compact_trip_columns(stop_table)

#### Saving the final tabular datasets

In [None]:
# Write the final start and stop training tables as parquet files under
# TRAINING_DATA.
start_table.to_parquet(TRAINING_DATA / "starts.parquet")
stop_table.to_parquet(TRAINING_DATA / "stops.parquet")