# Prepare Stuttgart demand data for IrConv-LSTM

In [15]:
# imports
import numpy as np
import pandas as pd
from dtaidistance import dtw

from tqdm import tqdm

In [16]:
# parameters
# Pickled pandas DataFrame with the demand table; loaded below via pd.read_pickle.
INPUT_FILE_PATH = '../../../../processed_data/bolt_pickup_demand_h3_hourly.pickle'

# CSV file that receives the DTW-based cell-to-cell similarity matrix.
SIMILARITY_OUTPUT_FILE_PATH = './similarity_matrix.csv'
# .npy file that receives the reshaped demand array written with np.save.
DEMAND_OUTPUT_FILE_PATH = './demand.npy'

In [17]:
# Load the demand table (rows: timestamps, columns: grid cell ids).
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILE_PATH must
# only ever point at a file produced by a trusted pipeline.
df = pd.read_pickle(INPUT_FILE_PATH)
df.head()

grid_index,881faa4485fffff,881faa4487fffff,881faa4493fffff,881faa4497fffff,881faa4499fffff,881faa449bfffff,881faa44a3fffff,881faa44a7fffff,881faa44abfffff,881faa44b1fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae5fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0
2025-02-11 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2025-02-11 16:00:00,0,0,0,0,0,0,0,0,0,0,...,0.0,2.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0
2025-02-11 17:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0
2025-02-11 18:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,2.0,2.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0


In [18]:
# Put the cell columns into sorted label order.
# sort_index(axis=1) moves every column's data together with its label.
# Assigning sorted(df.columns) to df.columns would only rename the columns in
# place and silently attach demand values to the wrong cell ids whenever the
# input is not already sorted.
# The columns-axis name is cleared so the labels stay a plain, unnamed index
# (and the exported CSV header keeps its empty first field).
df = df.sort_index(axis=1).rename_axis(columns=None)
df.head()

Unnamed: 0_level_0,881faa4485fffff,881faa4487fffff,881faa4493fffff,881faa4497fffff,881faa4499fffff,881faa449bfffff,881faa44a3fffff,881faa44a7fffff,881faa44abfffff,881faa44b1fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae5fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0
2025-02-11 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2025-02-11 16:00:00,0,0,0,0,0,0,0,0,0,0,...,0.0,2.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0
2025-02-11 17:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0
2025-02-11 18:00:00,0,0,0,0,0,0,0,0,0,0,...,1.0,2.0,2.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0


In [19]:
# add missing rows
def add_missing_rows(df):
    """Return a copy of ``df`` reindexed onto a gap-free hourly index.

    The new index runs from the smallest to the largest timestamp in
    ``df.index`` in one-hour steps. Timestamps that were absent from ``df``
    become rows of zeros; any NaN already present in ``df`` is also replaced
    by 0. Rows whose timestamp does not fall on that hourly grid are not
    carried over by the reindex.

    Args:
        df: DataFrame indexed by timestamps.

    Returns:
        A new DataFrame; the input frame is left untouched. An empty input
        is returned as an (empty) copy because it has no min/max to span.
    """
    # min()/max() of an empty index are NaT, which date_range rejects.
    if len(df.index) == 0:
        return df.copy()

    # Use an explicit offset object instead of the deprecated 'H' alias.
    all_dates = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq=pd.offsets.Hour(),
    )

    # Reindex introduces NaN rows for the missing hours; fill them with 0
    # without relying on in-place mutation.
    return df.reindex(all_dates).fillna(0)

In [20]:
# Print the row count before and after gap filling to show how many rows
# add_missing_rows inserted.
print(len(df))
# Rebinds df to the gap-free frame returned by add_missing_rows.
df = add_missing_rows(df)
print(len(df))

3023
3050


  all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')


In [21]:
# Bring the demand table into the layout used for the IrConv-LSTM input.
# A hexagonal grid has no (x, y) layout, only a flat list of cells, so the
# table stays [time, cells] and just gains a trailing channel axis.

# Rows are time steps, columns are cells
num_timesteps, num_cells = df.shape

# Append a channel axis of length 1: [time, cells] -> [time, cells, 1]
data_values = df.to_numpy()[:, :, np.newaxis]

print(f"Original data shape: {df.values.shape}")
print(f"Reshaped data shape: {data_values.shape}")

# Write the 3-D array to disk, then display it
np.save(DEMAND_OUTPUT_FILE_PATH, data_values)
data_values

Original data shape: (3050, 291)
Reshaped data shape: (3050, 291, 1)


array([[[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [1.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [1.]]])

In [22]:
def compute_dtw_matrix(counts):
    """Compute the pairwise DTW distance matrix between cell time series.

    Args:
        counts: Array of shape [time, cells, channels]. Only channel 0 is
            used; each cell's series is ``counts[:, cell, 0]``.

    Returns:
        A float ndarray of shape [cells, cells]. The matrix is symmetric and
        its diagonal is left at 0.
    """
    counts = np.asarray(counts)
    _, N, _ = counts.shape

    # dtw.distance_fast is the C implementation and works on contiguous
    # arrays of doubles. Build one C-contiguous float64 row per cell a single
    # time, instead of handing it a strided (and possibly non-float) column
    # slice for each of the N * (N - 1) / 2 pairs.
    series = np.ascontiguousarray(counts[:, :, 0].T, dtype=np.double)

    dtw_dist = np.zeros((N, N), dtype=float)
    for i in tqdm(range(N)):
        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i + 1, N):
            dist = dtw.distance_fast(series[i], series[j])
            dtw_dist[i, j] = dist
            dtw_dist[j, i] = dist
    return dtw_dist

def dtw_to_similarity(dtw_dist):
    """Turn DTW distances into similarity scores using 1 / (1 + distance).

    A distance of 0 maps to a similarity of 1.0, and the score shrinks
    towards 0 as the distance grows. The arithmetic is applied as-is, so a
    NumPy array is transformed elementwise.
    """
    denominator = 1.0 + dtw_dist
    return 1.0 / denominator

In [23]:
# Display the [time, cells, 1] array that is passed to compute_dtw_matrix below.
data_values

array([[[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [1.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [1.],
        [1.]]])

In [24]:
# Pairwise DTW distances between all cells. This is the slow step of the
# notebook: the recorded run took roughly 26 minutes for 291 cells.
dtw_dist_matrix = compute_dtw_matrix(data_values)

# Convert the distances to similarity scores via 1 / (1 + distance).
dtw_similarity_matrix = dtw_to_similarity(dtw_dist_matrix)

100%|██████████| 291/291 [25:52<00:00,  5.34s/it]


In [25]:
# Label both axes with the cell ids from df.columns. This relies on the cell
# axis of data_values having the same order as df.columns, which holds because
# data_values was built directly from df.values.
similarity_df = pd.DataFrame(dtw_similarity_matrix, index=df.columns, columns=df.columns)
# Write the labelled matrix (including the cell-id index) to CSV.
similarity_df.to_csv(SIMILARITY_OUTPUT_FILE_PATH)

In [26]:
# Preview the first rows of the labelled similarity matrix.
similarity_df.head()

Unnamed: 0,881faa4485fffff,881faa4487fffff,881faa4493fffff,881faa4497fffff,881faa4499fffff,881faa449bfffff,881faa44a3fffff,881faa44a7fffff,881faa44abfffff,881faa44b1fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae5fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff
881faa4485fffff,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.015326,0.018871,0.013207,0.01566,0.005679,0.414214,0.08121,0.026186,0.011322,0.023893
881faa4487fffff,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.015326,0.018871,0.013207,0.01566,0.005679,0.414214,0.08121,0.026186,0.011322,0.023893
881faa4493fffff,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.015326,0.018871,0.013207,0.01566,0.005679,0.414214,0.08121,0.026186,0.011322,0.023893
881faa4497fffff,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.015326,0.018871,0.013207,0.01566,0.005679,0.414214,0.08121,0.026186,0.011322,0.023893
881faa4499fffff,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.015326,0.018871,0.013207,0.01566,0.005679,0.414214,0.08121,0.026186,0.011322,0.023893


In [27]:
# First timestamp, the timestamp 200 rows before the end, and the last timestamp.
# NOTE(review): the -200 offset looks like a hold-out boundary; confirm intent.
df.index[0], df.index[-200], df.index[-1]

(Timestamp('2025-02-11 14:00:00'),
 Timestamp('2025-06-10 08:00:00'),
 Timestamp('2025-06-18 15:00:00'))

In [28]:
# One tenth of the number of hourly rows.
# NOTE(review): presumably used to size a data split; confirm intent.
len(df.index) / 10 

305.0

# Model Adaptation Notes

The original IrConv-LSTM model was designed for 2D grid data (with x,y coordinates).
Since we're using a hexagonal grid with a single index per cell, we've reshaped our data to be compatible:

1. Our data is now [time, cells, 1] instead of [time, height, width]
2. The similarity matrix holds pairwise cell-to-cell similarities derived from the DTW distance between the cells' demand time series, computed as `1 / (1 + distance)`