In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
from src.paths import TEMPORARY_DATA, TRAINING_DATA
from src.data_extraction import load_raw_data
from src.data_transformations import clean_raw_data, transform_cleaned_data_into_ts_data, transform_ts_into_training_data

## Get the raw data

In [2]:
trips_per_month_2023 = list(load_raw_data(year=2023))

The file 202301-divvy-tripdata.zip is already in local storage
The file 202302-divvy-tripdata.zip is already in local storage
The file 202303-divvy-tripdata.zip is already in local storage
The file 202304-divvy-tripdata.zip is already in local storage
The file 202305-divvy-tripdata.zip is already in local storage
The file 202306-divvy-tripdata.zip is already in local storage
The file 202307-divvy-tripdata.zip is already in local storage
The file 202308-divvy-tripdata.zip is already in local storage
The file 202309-divvy-tripdata.zip is already in local storage
The file 202310-divvy-tripdata.zip is already in local storage
The file 202311-divvy-tripdata.zip is already in local storage
The file 202312-divvy-tripdata.zip is already in local storage


In [3]:
# Form a dataframe by concatenating all the available months of 2023
trips = pd.concat(trips_per_month_2023)

## Clean the data

In [4]:
# Clean the data
trips = clean_raw_data(trips)

In [5]:
trips

Unnamed: 0,start_time,stop_time,start_latitude,start_longitude,stop_latitude,stop_longitude
0,2023-01-21 20:05:42,2023-01-21 20:16:33,41.924074,-87.646278,41.930000,-87.640000
1,2023-01-10 15:37:36,2023-01-10 15:46:05,41.799568,-87.594747,41.809835,-87.599383
2,2023-01-02 07:51:57,2023-01-02 08:05:11,42.008571,-87.690483,42.039742,-87.699413
3,2023-01-22 10:52:58,2023-01-22 11:01:44,41.799568,-87.594747,41.809835,-87.599383
4,2023-01-12 13:58:01,2023-01-12 14:13:20,41.799568,-87.594747,41.809835,-87.599383
...,...,...,...,...,...,...
224068,2023-12-07 13:15:24,2023-12-07 13:17:37,41.874702,-87.649804,41.874640,-87.657030
224069,2023-12-08 18:42:21,2023-12-08 18:45:56,41.874754,-87.649807,41.874640,-87.657030
224070,2023-12-05 14:09:11,2023-12-05 14:13:01,41.874754,-87.649807,41.874640,-87.657030
224071,2023-12-02 21:36:07,2023-12-02 21:53:45,41.881396,-87.674984,41.885492,-87.652289


## Define datasets

In [6]:
starts = trips[
    ["start_time", "start_latitude", "start_longitude"]
]

stops = trips[
    ["stop_time", "stop_latitude", "stop_longitude"]
]

## Transform datasets into time series

In [7]:
agg_starts, agg_stops = transform_cleaned_data_into_ts_data(start_df = starts, stop_df = stops)

This might take a while


Computing the hours during which each trip starts
Approximating the coordinates of the location at which each trip starts


100%|██████████| 5712863/5712863 [00:31<00:00, 184126.53it/s]
100%|██████████| 5712863/5712863 [00:32<00:00, 178237.40it/s]


Matching up approximate locations with generated IDs


10669it [00:00, 1826714.67it/s]




Computing the hours during which each trip stops
Approximating the coordinates of the location at which each trip stops


100%|██████████| 5712863/5712863 [00:32<00:00, 176561.37it/s]
100%|██████████| 5712863/5712863 [00:29<00:00, 193742.53it/s]


Matching up approximate locations with generated IDs


1264it [00:00, 2221030.69it/s]




Aggregating the final data on trip starts


100%|██████████| 10668/10668 [1:38:41<00:00,  1.80it/s]


Aggregating the final data on trip stops


100%|██████████| 1264/1264 [01:35<00:00, 13.27it/s]


In [8]:
trimmed_agg_starts = agg_starts.iloc[:,:3]
trimmed_agg_stops = agg_stops.iloc[:,:3]

trimmed_agg_starts.to_parquet(TEMPORARY_DATA/"agg_starts.parquet")
trimmed_agg_stops.to_parquet(TEMPORARY_DATA/"agg_stops.parquet")

## Transform time series data into training data

### Start data

In [9]:
start_features, start_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_starts,
    start_or_stop="start",
    input_seq_len=24*28*1,                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
    step_size=24
)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

print(f"{start_features.shape=}")
print(f"{start_target.shape=}")

100%|██████████| 10668/10668 [7:38:33<00:00,  2.58s/it]      


start_features.shape=(3595116, 674)
start_target.shape=(3595116,)


In [None]:
start_features.to_parquet(TEMPORARY_DATA/"start_features.parquet")
start_target.to_csv(TEMPORARY_DATA/"start_target.csv")

#### Saving the final tabular dataset

In [15]:
start_table = start_features
start_features["trips_next_hour"] = start_target

start_table.to_parquet(TRAINING_DATA/"start_table.parquet")

### Stop data

In [19]:
stop_features, stop_target = transform_ts_into_training_data(
    ts_data=trimmed_agg_stops,
    start_or_stop="stop",
    input_seq_len=24*28*1, 
    step_size=24
)
print(f"{stop_features.shape=}")
print(f"{stop_target.shape=}")                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

100%|██████████| 1264/1264 [05:34<00:00,  3.78it/s]


stop_features.shape=(427232, 674)
stop_target.shape=(427232,)


#### Saving the final tabular dataset

In [21]:
stop_table = stop_features
stop_features["trips_next_hour"] = stop_target

stop_table.to_parquet(TRAINING_DATA/"stop_table.parquet")