In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
from src.paths import TRANSFORMED_DATA
from src.data_extraction import load_raw_data
from src.data_transformations import clean_raw_data, transform_cleaned_data_into_ts_data, transform_ts_into_training_data

## Get the raw data

In [2]:
trips_2023 = list(
        load_raw_data(
            year=2023, months = list(range(1,12))
        )
)

The file 202301-divvy-tripdata.zip is already in local storage
The file 202302-divvy-tripdata.zip is already in local storage
The file 202303-divvy-tripdata.zip is already in local storage
The file 202304-divvy-tripdata.zip is already in local storage
The file 202305-divvy-tripdata.zip is already in local storage
The file 202306-divvy-tripdata.zip is already in local storage
The file 202307-divvy-tripdata.zip is already in local storage
The file 202308-divvy-tripdata.zip is already in local storage
The file 202309-divvy-tripdata.zip is already in local storage
The file 202310-divvy-tripdata.zip is already in local storage
The file 202311-divvy-tripdata.zip is already in local storage


In [3]:
# Form a dataframe by concatenating all the available months of 2023
trips = pd.concat(trips_2023)

## Clean the data

In [4]:
# Clean the data
trips = clean_raw_data(trips)

In [5]:
trips

Unnamed: 0,start_time,stop_time,start_latitude,start_longitude,stop_latitude,stop_longitude
0,2023-01-21 20:05:42,2023-01-21 20:16:33,41.924074,-87.646278,41.930000,-87.640000
1,2023-01-10 15:37:36,2023-01-10 15:46:05,41.799568,-87.594747,41.809835,-87.599383
2,2023-01-02 07:51:57,2023-01-02 08:05:11,42.008571,-87.690483,42.039742,-87.699413
3,2023-01-22 10:52:58,2023-01-22 11:01:44,41.799568,-87.594747,41.809835,-87.599383
4,2023-01-12 13:58:01,2023-01-12 14:13:20,41.799568,-87.594747,41.809835,-87.599383
...,...,...,...,...,...,...
362513,2023-11-24 08:39:27,2023-11-24 08:47:03,41.936497,-87.647539,41.935775,-87.663600
362514,2023-11-06 09:07:20,2023-11-06 09:10:00,41.877726,-87.654787,41.877642,-87.649618
362515,2023-11-10 19:35:30,2023-11-10 19:44:28,41.943687,-87.648855,41.935775,-87.663600
362516,2023-11-27 09:11:23,2023-11-27 09:13:23,41.877726,-87.654787,41.877642,-87.649618


## Define datasets

In [6]:
starts = trips[
    ["start_time", "start_latitude", "start_longitude"]
]

stops = trips[
    ["stop_time", "stop_latitude", "stop_longitude"]
]

## Transform datasets into time series

In [7]:
agg_starts, agg_stops = transform_cleaned_data_into_ts_data(start_df = starts, stop_df = stops)

This might take a moment
Computing the hours during which each trip starts
Approximating the coordinates of the start of each trip


100%|██████████| 5489029/5489029 [00:30<00:00, 180689.24it/s]
100%|██████████| 5489029/5489029 [00:28<00:00, 189767.24it/s]


Matching up approximate locations with generated IDs


1832it [00:00, 1483104.60it/s]


Computing the hours during which each trip stops
Approximating the coordinates of the stop of each trip


100%|██████████| 5489029/5489029 [00:29<00:00, 184389.89it/s]
100%|██████████| 5489029/5489029 [00:28<00:00, 189887.87it/s]


Matching up approximate locations with generated IDs


1240it [00:00, 1426868.85it/s]


Aggregating the final data on trip starts


100%|██████████| 1830/1830 [01:17<00:00, 23.63it/s]






Aggregating the final data on trip stops


100%|██████████| 1240/1240 [00:40<00:00, 30.56it/s]








In [8]:
agg_stops.to_parquet(path = TRANSFORMED_DATA/"agg_stops.parquet")
agg_starts.to_parquet(path =TRANSFORMED_DATA/"agg_starts.parquet")

## Transform time series data into training data

### Start data

In [9]:
start_features, start_target = transform_ts_into_training_data(
    ts_data=agg_starts,
    start_or_stop="start",
    input_seq_len=24*28*1, 
    step_size=24
)

print(f"{start_features.shape=}")
print(f"{start_target.shape=}")

100%|██████████| 1830/1830 [08:11<00:00,  3.72it/s]  


start_features.shape=(559980, 674)
start_target.shape=(559980,)


### Stop data

In [10]:
stop_features, stop_target = transform_ts_into_training_data(
    ts_data=agg_stops,
    start_or_stop="stop",
    input_seq_len=24*28*1, 
    step_size=24
)

print(f"{stop_features.shape=}")
print(f"{stop_target.shape=}")

100%|██████████| 1240/1240 [03:38<00:00,  5.66it/s]


stop_features.shape=(380680, 674)
stop_target.shape=(380680,)


## Saving the final tabular datasets

### Starts

In [11]:
start_table = start_features
start_features["trips_next_hour"] = start_target

start_table.to_parquet(TRANSFORMED_DATA/"start_table.parquet")

### Stops

In [12]:
stop_table = stop_features
stop_features["trips_next_hour"] = stop_target

stop_table.to_parquet(TRANSFORMED_DATA/"stop_table.parquet")