# Transform the data into a complete time series

In [36]:
import pandas as pd
from src.paths import CLEANED_DATA, TRANSFORMED_DATA, GEOGRAPHICAL_DATA
from src.data_transformations import add_missing_slots
from src.miscellaneous import add_rounded_coordinates_to_dataframe, add_column_of_rounded_points, make_new_station_ids,save_dict, add_column_of_ids 

## Import the cleaned 2023 data

In [37]:
trips = pd.read_parquet(path = CLEANED_DATA/"final.parquet")
trips.head()

Unnamed: 0,start_time,stop_time,start_latitude,start_longitude,stop_latitude,stop_longitude
0,2023-01-21 20:05:42,2023-01-21 20:16:33,41.924074,-87.646278,41.93,-87.64
1,2023-01-10 15:37:36,2023-01-10 15:46:05,41.799568,-87.594747,41.809835,-87.599383
2,2023-01-02 07:51:57,2023-01-02 08:05:11,42.008571,-87.690483,42.039742,-87.699413
3,2023-01-22 10:52:58,2023-01-22 11:01:44,41.799568,-87.594747,41.809835,-87.599383
4,2023-01-12 13:58:01,2023-01-12 14:13:20,41.799568,-87.594747,41.809835,-87.599383


## Reduce the time stamps to hour-based values

In [38]:
trips["start_hour"] = trips["start_time"].dt.floor("H")
trips["stop_hour"] = trips["stop_time"].dt.floor("H")

trips.drop(columns = ["start_time", "stop_time"], inplace = True)

## Preparing data for aggregation

### Starts 

In [39]:
starts = trips[
    [
        "start_hour", "start_latitude", "start_longitude"
    ]
] 


stops = trips[
    [
        "stop_hour", "stop_latitude", "stop_longitude"
    ]
]

##### Save original coordinates

In [40]:
starts[["start_latitude", "start_longitude"]].to_parquet(path = GEOGRAPHICAL_DATA/ "original_start_coordinates.parquet") 
stops[["stop_latitude", "stop_longitude"]].to_parquet(path = GEOGRAPHICAL_DATA/ "original_stop_coordinates.parquet") 

##### Round the coordinates to two decimal places to make grouping easier, and remove the old latitude and 

In [41]:
add_rounded_coordinates_to_dataframe(data = starts, decimal_places = 3, start_or_stop = "start")

100%|██████████| 5495778/5495778 [00:34<00:00, 158455.35it/s]
100%|██████████| 5495778/5495778 [00:34<00:00, 159723.89it/s]


In [42]:
add_rounded_coordinates_to_dataframe(data = stops, decimal_places = 3, start_or_stop = "stop")

100%|██████████| 5495778/5495778 [00:34<00:00, 161258.13it/s]
100%|██████████| 5495778/5495778 [00:33<00:00, 163418.29it/s]


##### Remove original coordinates, and put those tuples in a dedicated column

In [43]:
starts = starts.drop(columns = ["start_latitude", "start_longitude"])
stops = stops.drop(columns = ["stop_latitude", "stop_longitude"])

##### Add columns of consisting of tuples of rounded coordinates

In [44]:
add_column_of_rounded_points(data = starts, start_or_stop = "start")
add_column_of_rounded_points(data = stops, start_or_stop = "stop")

In [45]:
starts = starts.drop(columns = ["rounded_start_latitude", "rounded_start_longitude"])
stops = stops.drop(columns = ["rounded_stop_latitude", "rounded_stop_longitude"])

#### Make new location IDs, and associate each of them to a point using a dictionary

In [46]:
origins_and_ids = make_new_station_ids(data = starts, start_or_stop = "start")
destinations_and_ids = make_new_station_ids(data = stops, start_or_stop = "stop")

1833it [00:00, 1318949.95it/s]
1241it [00:00, 1537704.95it/s]


##### Ensuring that any points common to the origins and destinations have the same IDs

###### We have made dictionaries associating the origins and destinations with IDs, but some of the destinations and origins may be the same. And they will have been assigned to different IDs. These common locations (or rather, their coordinates) must be assigned to the same ID in each dictionary.


###### First, let us find out how many of these points are common to these dictionaries.

In [47]:
common_points = [point for point in destinations_and_ids.keys() if point in origins_and_ids.keys()]
len(common_points)

1168

###### There are 2,357 common locations. And they will most likely have been assigned to different IDs in each dictionary.
###### Let us assign ensure that these common points have the same IDs in each dictionary.

In [48]:
for point in common_points:

        destinations_and_ids[point] = origins_and_ids[point]

###### Checking for repetition of origin IDs. The presence of repeated values necessitates a deeper investigation into whether they are shared by two different points.

In [49]:
len(origins_and_ids.values()) == len(set(origins_and_ids.values()))

True

###### There are no repeated origin IDs 

In [50]:
len(destinations_and_ids.values()) == len(set(destinations_and_ids.values()))

False

###### There are some repeated destination IDs. Let us check whether they belong to the same points or not. If they belong to two different points, then that will have to be rectified.

In [51]:
for a,b in zip(destinations_and_ids.keys(), destinations_and_ids.keys()):

    if destinations_and_ids[a] == destinations_and_ids[b] and a != b:

        print((a,b))

###### All clear!

In [52]:
stops

Unnamed: 0,stop_hour,rounded_stop_points
0,2023-01-21 20:00:00,"(41.93, -87.64)"
1,2023-01-10 15:00:00,"(41.81, -87.599)"
2,2023-01-02 08:00:00,"(42.04, -87.699)"
3,2023-01-22 11:00:00,"(41.81, -87.599)"
4,2023-01-12 14:00:00,"(41.81, -87.599)"
...,...,...
362513,2023-11-24 08:00:00,"(41.933, -87.636)"
362514,2023-11-06 09:00:00,"(41.831, -87.627)"
362515,2023-11-10 19:00:00,"(41.925, -87.689)"
362516,2023-11-27 09:00:00,"(41.831, -87.627)"


##### Save these dictionaries

###### This is crucial because it will allow me to recall this particular association of coordinates with IDs later on.

In [53]:
save_dict(
    folder = GEOGRAPHICAL_DATA, 
    dictionary = origins_and_ids, 
    file_name = "rounded_origin_points_and_their_IDs.pkl"
    )

save_dict(
    folder = GEOGRAPHICAL_DATA, 
    dictionary = destinations_and_ids, 
    file_name = "rounded_destination_points_and_their_IDs.pkl"
    )

##### Form a column of said IDs (in the appropriate order)

In [54]:
add_column_of_ids(data = starts, start_or_stop = "start", points_and_ids = origins_and_ids)

In [55]:
add_column_of_ids(data = stops, start_or_stop = "stop", points_and_ids = destinations_and_ids)

## Form Aggregate Datasets

In [56]:
starts = starts.drop("rounded_start_points", axis = 1)
stops = stops.drop("rounded_stop_points", axis = 1)

### Aggregate Starts

In [57]:
agg_starts = starts.groupby(["start_hour", "start_station_id"]).size().reset_index()
agg_starts = agg_starts.rename(columns = {0: "trips"}) 

#### Full Aggregate Starts with Missing Slots

In [58]:
ts_starts = add_missing_slots(agg_data = agg_starts, start_or_stop = "start")

100%|██████████| 1733/1733 [00:55<00:00, 31.10it/s]


In [59]:
ts_starts.to_parquet(path = TRANSFORMED_DATA/"ts_starts.parquet")

### Aggregate Stops

In [60]:
agg_stops = stops.groupby(["stop_hour", "stop_station_id"]).size().reset_index()
agg_stops = agg_stops.rename(columns = {0: "trips"}) 

#### Full Aggregate Stops with Missing Slots

In [61]:
ts_stops = add_missing_slots(agg_data = agg_stops, start_or_stop = "stop")

100%|██████████| 1164/1164 [00:25<00:00, 45.78it/s]


In [62]:
ts_stops.to_parquet(path = TRANSFORMED_DATA/"ts_stops.parquet")