# Prepare Stuttgart demand data for IrConv-LSTM

In [1]:
# imports
import numpy as np
import pandas as pd
from dtaidistance import dtw

from tqdm import tqdm

In [2]:
# parameters
# Pickled DataFrame loaded as `df` by pd.read_pickle; per its file name it holds
# hourly Bolt drop-off demand per H3 cell.
INPUT_FILE_PATH = '../../../../processed_data/bolt_dropoff_demand_h3_hourly.pickle'

# CSV file that receives the cell-by-cell DTW similarity matrix.
SIMILARITY_OUTPUT_FILE_PATH = './similarity_matrix.csv'
# .npy file that receives the demand tensor of shape [time, cells, 1].
DEMAND_OUTPUT_FILE_PATH = './demand.npy'

In [3]:
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILE_PATH must
# point at a file produced by our own preprocessing, never an untrusted source.
df = pd.read_pickle(INPUT_FILE_PATH)
# Preview: one row per timestamp, one column per grid cell id.
df.head()

grid_index,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2025-02-11 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0
2025-02-11 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,2.0,3.0,2.0,3.0,0.0,0.0,2.0,0.0,1.0
2025-02-11 17:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,1.0,2.0,2.0,2.0,0.0,1.0,3.0,0.0,0.0
2025-02-11 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,0.0,1.0,1.0,3.0,0.0,1.0,2.0,0.0,0.0


In [4]:
# Reorder the columns *together with their data* by cell id.
# Assigning `sorted(df.columns)` to `df.columns` only relabels the columns in
# place: whenever the input is not already sorted, every demand series ends up
# attached to the wrong cell id without any error being raised.
# `rename_axis(columns=None)` then clears the columns-axis name (displayed as
# `grid_index` in the preview of the loaded frame), exactly as the previous
# relabelling did, so the header of the exported similarity CSV is unchanged.
df = df.sort_index(axis=1).rename_axis(columns=None)
df.head()

Unnamed: 0_level_0,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2025-02-11 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0
2025-02-11 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,2.0,3.0,2.0,3.0,0.0,0.0,2.0,0.0,1.0
2025-02-11 17:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,1.0,2.0,2.0,2.0,0.0,1.0,3.0,0.0,0.0
2025-02-11 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,0.0,1.0,1.0,3.0,0.0,1.0,2.0,0.0,0.0


In [5]:
# add missing rows
def add_missing_rows(df):
    """Return a copy of ``df`` whose index is a gap-free hourly range.

    The index is expanded to every hour between the first and the last
    timestamp present in ``df``. Rows created for the missing hours, as well
    as any NaN already present in ``df``, are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by unique timestamps. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, one row per hour, and the index name
        of ``df`` preserved.
    """
    # An empty index has NaT bounds, which `pd.date_range` rejects; there is
    # nothing to span, so only apply the NaN fill.
    if len(df.index) == 0:
        return df.fillna(0)

    # Complete hourly range. An offset object is used instead of the 'H'
    # string alias, which newer pandas versions deprecate (FutureWarning).
    all_dates = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq=pd.offsets.Hour(),
        name=df.index.name,  # reindexing would otherwise drop the index name
    )

    # `reindex` already returns a new frame, so chain the fill instead of
    # mutating with `inplace=True`.
    return df.reindex(all_dates).fillna(0)

In [6]:
# Row count before filling the hourly gaps ...
print(df.shape[0])
df = add_missing_rows(df)
# ... and after: the difference is the number of hours that were missing.
print(df.shape[0])

1793
1820


  all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')


In [7]:
# Build the model input tensor for the hexagonal grid.
# Hexagon cells form a flat list (no x/y layout), so the frame's
# [time, cells] values only need a trailing channel axis to be usable
# by the IrConv-LSTM model.

# Frame dimensions: rows are time steps, columns are cells
num_timesteps, num_cells = df.shape

# Append the singleton channel axis -> [time, cells, 1]
data_values = df.values.reshape((num_timesteps, num_cells, 1))

print(f"Original data shape: {df.values.shape}")
print(f"Reshaped data shape: {data_values.shape}")

# Persist the tensor to disk, then display it
np.save(file=DEMAND_OUTPUT_FILE_PATH, arr=data_values)
data_values

Original data shape: (1820, 236)
Reshaped data shape: (1820, 236, 1)


array([[[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [3.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [2.],
        [0.],
        [1.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [2.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]])

In [8]:
def compute_dtw_matrix(counts):
    """Compute the symmetric matrix of pairwise DTW distances between cells.

    Parameters
    ----------
    counts : numpy.ndarray
        Array of shape ``[time, cells, channels]``. Only channel 0 is used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``[cells, cells]``. Entry ``[i, j]`` is the DTW
        distance between the time series of cells ``i`` and ``j``; the
        diagonal is 0.

    Raises
    ------
    ValueError
        If ``counts`` does not have exactly three dimensions.
    """
    _, N, _ = counts.shape

    # Materialise every cell's series once as a contiguous float64 row
    # ([cells, time]) instead of slicing a strided `counts[:, i, 0]` view for
    # every pair inside the O(N^2) loop. `dtw.distance_fast` works on
    # double-precision sequences, so this also lets integer counts through.
    series = np.ascontiguousarray(counts[:, :, 0].T, dtype=np.double)

    dtw_dist = np.zeros((N, N), dtype=float)
    for i in tqdm(range(N)):
        for j in range(i + 1, N):
            # DTW is symmetric: compute the upper triangle and mirror it.
            dist = dtw.distance_fast(series[i], series[j])
            dtw_dist[i, j] = dist
            dtw_dist[j, i] = dist
    return dtw_dist

def dtw_to_similarity(dtw_dist):
    """Map DTW distances to similarity scores via ``1 / (1 + d)``.

    A distance of 0 maps to a similarity of 1.0, and larger non-negative
    distances map to values approaching 0. Works element-wise on arrays.
    """
    denominator = 1.0 + dtw_dist
    return 1.0 / denominator

In [9]:
# Re-display the demand tensor (the array saved to DEMAND_OUTPUT_FILE_PATH) before running DTW.
data_values

array([[[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [3.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [2.],
        [0.],
        [1.]],

       ...,

       [[0.],
        [0.],
        [0.],
        ...,
        [2.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [0.]]])

In [10]:
# Pairwise DTW over all cells. The nested loop visits every cell pair once,
# so cost grows quadratically with the number of cells; the recorded run took
# about 6 minutes for 236 cells.
dtw_dist_matrix = compute_dtw_matrix(data_values)

# Convert distances to similarities with 1 / (1 + d).
dtw_similarity_matrix = dtw_to_similarity(dtw_dist_matrix)

  0%|          | 0/236 [00:00<?, ?it/s]

100%|██████████| 236/236 [06:04<00:00,  1.54s/it]


In [11]:
# Label both axes of the similarity matrix with the cell ids (the demand
# frame's columns, in the same order used to build the tensor), then export.
similarity_df = pd.DataFrame(
    data=dtw_similarity_matrix,
    index=df.columns,
    columns=df.columns,
)
similarity_df.to_csv(path_or_buf=SIMILARITY_OUTPUT_FILE_PATH)

In [12]:
# Preview the labelled similarity matrix; the diagonal is 1.0 because each
# cell's distance to itself is left at 0.
similarity_df.head()

Unnamed: 0,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
881faa44b3fffff,1.0,0.5,0.5,0.5,0.5,0.02914,0.019903,0.03794,0.021739,0.040861,...,0.022116,0.024236,0.021599,0.023789,0.010184,0.133677,0.036699,0.01817,0.036854,0.091747
881faa6143fffff,0.5,1.0,1.0,1.0,1.0,0.029597,0.020012,0.038462,0.021909,0.041148,...,0.022444,0.024873,0.021744,0.024221,0.010193,0.138026,0.037196,0.018237,0.03725,0.093051
881faa614bfffff,0.5,1.0,1.0,1.0,1.0,0.029597,0.020012,0.038462,0.021909,0.041148,...,0.022444,0.024873,0.021744,0.024221,0.010193,0.138026,0.037196,0.018237,0.03725,0.093051
881faa614dfffff,0.5,1.0,1.0,1.0,1.0,0.029597,0.020012,0.038462,0.021909,0.041148,...,0.022444,0.024873,0.021744,0.024221,0.010193,0.138026,0.037196,0.018237,0.03725,0.093051
881faa626dfffff,0.5,1.0,1.0,1.0,1.0,0.029597,0.020012,0.038462,0.021909,0.041148,...,0.022444,0.024873,0.021744,0.024221,0.010193,0.138026,0.037196,0.018237,0.03725,0.093051


In [13]:
# First timestamp, the 200th-from-last timestamp, and the last timestamp.
# NOTE(review): -200 is presumably a candidate train/test split point — confirm.
df.index[0], df.index[-200], df.index[-1]

(Timestamp('2025-02-11 14:00:00'),
 Timestamp('2025-04-20 02:00:00'),
 Timestamp('2025-04-28 09:00:00'))

In [14]:
# One tenth of the number of hourly rows.
# NOTE(review): presumably sizing a 10% hold-out split — confirm.
len(df.index) / 10 

182.0

# Model Adaptation Notes

The original IrConv-LSTM model was designed for 2D grid data (with x,y coordinates).
Since we're using a hexagonal grid with a single index per cell, we've reshaped our data to be compatible:

1. Our data is now [time, cells, 1] instead of [time, height, width]
2. The similarity matrix holds the pairwise similarity between cells, computed as 1 / (1 + DTW distance) between their hourly demand time series