# Prepare Stuttgart demand data for IrConv-LSTM

In [1]:
# imports
import numpy as np
import pandas as pd
from dtaidistance import dtw

from tqdm import tqdm

In [2]:
# parameters
# Pickled demand table that is loaded with pd.read_pickle below.
INPUT_FILE_PATH = '../../../../processed_data/bolt_pickup_demand_h3_hourly.pickle'

# CSV destination for the column-by-column DTW similarity matrix.
SIMILARITY_OUTPUT_FILE_PATH = './similarity_matrix.csv'
# .npy destination for the raw demand values array.
DEMAND_OUTPUT_FILE_PATH = './demand.npy'

In [3]:
# Load the demand table: the preview shows one row per hourly timestamp and
# one column per grid_index label.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by a trusted pipeline.
df = pd.read_pickle(INPUT_FILE_PATH)
df.head()

grid_index,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
2025-02-11 15:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2025-02-11 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,2.0,1.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0
2025-02-11 17:00:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0
2025-02-11 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,2.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0


In [4]:
# Order the columns by label. sort_index(axis=1) moves each column's data
# together with its label; assigning sorted(df.columns) to df.columns would
# only rename the positions and silently mislabel every series whenever the
# incoming column order is not already sorted.
# rename_axis(columns=None) drops the columns-axis name, which the previous
# label assignment also discarded, so the resulting frame layout is unchanged.
df = df.sort_index(axis=1).rename_axis(columns=None)
df.head()

Unnamed: 0_level_0,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-02-11 14:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,1.0
2025-02-11 15:00:00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2025-02-11 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,2.0,1.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0
2025-02-11 17:00:00,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0
2025-02-11 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,2.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0


In [5]:
# add missing rows
def add_missing_rows(df):
    """Return a copy of ``df`` re-indexed onto a gap-free hourly time axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamps; hours between the first and last
        timestamp may be missing.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per hour from ``df.index.min()`` to
        ``df.index.max()`` (inclusive). Rows that were absent, and any other
        missing values, are set to 0. The input frame is not modified.
    """
    # An empty index has NaT bounds, which pd.date_range rejects; there is
    # nothing to fill in that case.
    if len(df.index) == 0:
        return df.copy()

    # Create a complete hourly range. Lowercase 'h' is the current alias;
    # uppercase 'H' is deprecated and emits a FutureWarning.
    all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='h')

    # Reindex to include every hour, then fill the resulting gaps with 0.
    return df.reindex(all_dates).fillna(0)

In [6]:
# Report the row count before and after inserting the missing hourly rows.
print(df.shape[0])
df = add_missing_rows(df)
print(df.shape[0])

1793
1820


  all_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq='H')


In [7]:
# Copy the table's values into a standalone array, write it to
# DEMAND_OUTPUT_FILE_PATH, and show it.
data_values = np.array(df.to_numpy())
np.save(file=DEMAND_OUTPUT_FILE_PATH, arr=data_values)
data_values

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
def compute_dtw_matrix(counts):
    """Compute the symmetric matrix of pairwise DTW distances between columns.

    Parameters
    ----------
    counts : array-like of shape (T, N)
        Each of the N columns is treated as one series of length T.

    Returns
    -------
    numpy.ndarray of shape (N, N), dtype float
        ``out[i, j] == out[j, i]`` is ``dtw.distance_fast`` between columns
        ``i`` and ``j``. The diagonal is left at 0.

    Raises
    ------
    ValueError
        If ``counts`` is not two-dimensional.
    """
    counts = np.asarray(counts, dtype=np.double)
    if counts.ndim != 2:
        raise ValueError(
            f"counts must be 2-D (time, series), got {counts.ndim} dimension(s)"
        )

    # Lay the data out once as one contiguous float64 row per series.
    # A column slice of a row-major array is a strided view, so handing
    # counts[:, i] to the C routine for every pair forces repeated copies;
    # the double cast also lets integer count arrays be used.
    series = np.ascontiguousarray(counts.T)
    N = series.shape[0]

    dtw_dist = np.zeros((N, N), dtype=float)
    for i in tqdm(range(N)):
        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i + 1, N):
            dist = dtw.distance_fast(series[i], series[j])
            dtw_dist[i, j] = dist
            dtw_dist[j, i] = dist
    return dtw_dist

def dtw_to_similarity(dtw_dist):
    """Map DTW distances to similarities via ``1 / (1 + d)``.

    A distance of 0 maps to 1.0, and larger distances map to smaller
    values. Works element-wise on arrays as well as on scalars.
    """
    shifted = 1.0 + dtw_dist
    similarity = 1.0 / shifted
    return similarity

In [9]:
# Re-display the demand array that is passed to compute_dtw_matrix below.
data_values

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Pairwise DTW distances between all demand columns. This is the slow step:
# the recorded progress bar shows roughly six minutes for 233 columns.
dtw_dist_matrix = compute_dtw_matrix(data_values)

# Convert the distances to similarities with 1 / (1 + d).
dtw_similarity_matrix = dtw_to_similarity(dtw_dist_matrix)

100%|██████████| 233/233 [05:53<00:00,  1.52s/it]


In [11]:
# Label both axes of the similarity matrix with the demand column labels,
# then write the table to SIMILARITY_OUTPUT_FILE_PATH.
similarity_df = pd.DataFrame(
    data=dtw_similarity_matrix,
    index=df.columns,
    columns=df.columns,
)
similarity_df.to_csv(SIMILARITY_OUTPUT_FILE_PATH)

In [12]:
# Preview the first rows of the labelled similarity matrix.
similarity_df.head()

Unnamed: 0,881faa44b3fffff,881faa6143fffff,881faa614bfffff,881faa614dfffff,881faa626dfffff,881faa6301fffff,881faa6303fffff,881faa6305fffff,881faa6307fffff,881faa6309fffff,...,881faa7ad9fffff,881faa7adbfffff,881faa7addfffff,881faa7ae1fffff,881faa7ae3fffff,881faa7ae7fffff,881faa7ae9fffff,881faa7aebfffff,881faa7aedfffff,881faa7b13fffff
881faa44b3fffff,1.0,0.5,0.5,0.5,0.5,0.025922,0.020049,0.039703,0.021553,0.038169,...,0.021306,0.023991,0.019452,0.023524,0.009794,0.142857,0.035977,0.017021,0.034982,0.091747
881faa6143fffff,0.5,1.0,1.0,1.0,1.0,0.026204,0.02014,0.040033,0.021723,0.038462,...,0.021537,0.024533,0.01955,0.023977,0.009802,0.148268,0.036345,0.017079,0.035228,0.093051
881faa614bfffff,0.5,1.0,1.0,1.0,1.0,0.026204,0.02014,0.040033,0.021723,0.038462,...,0.021537,0.024533,0.01955,0.023977,0.009802,0.148268,0.036345,0.017079,0.035228,0.093051
881faa614dfffff,0.5,1.0,1.0,1.0,1.0,0.026204,0.02014,0.040033,0.021723,0.038462,...,0.021537,0.024533,0.01955,0.023977,0.009802,0.148268,0.036345,0.017079,0.035228,0.093051
881faa626dfffff,0.5,1.0,1.0,1.0,1.0,0.026204,0.02014,0.040033,0.021723,0.038462,...,0.021537,0.024533,0.01955,0.023977,0.009802,0.148268,0.036345,0.017079,0.035228,0.093051


In [13]:
# Show the first timestamp, the 200th-from-last timestamp, and the last one.
# NOTE(review): the -200 offset presumably marks a hold-out boundary for the
# model — confirm and promote it to a named constant if so.
df.index[0], df.index[-200], df.index[-1]

(Timestamp('2025-02-11 14:00:00'),
 Timestamp('2025-04-20 02:00:00'),
 Timestamp('2025-04-28 09:00:00'))

In [14]:
# One tenth of the number of hourly rows.
# NOTE(review): presumably used to size a 10% split — confirm.
len(df.index) / 10 

182.0