In [1]:
# Copyright (c) Facebook, Inc. and its affiliates.

from pathlib import Path

import pandas as pd
import numpy as np
import reverse_geocoder as rg
from tqdm import tqdm

from download_utils import download_url

DATA_DIR = Path.cwd()
def main():
    np.random.seed(0)

    download_url("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv", DATA_DIR / "covid19")

    df = pd.read_csv(DATA_DIR / "covid19/us-counties.csv")

    # Remove rows with unknown counties.
    df = df[df.county != "Unknown"]

    # Merge with latitude and longitude.
    url = "https://en.m.wikipedia.org/wiki/User:Michael_J/County_table"
    df_county_geoloc = pd.read_html(url)[0]
    df_county_geoloc["Area"] = df_county_geoloc["Total Area km²"]
    df_county_geoloc = df_county_geoloc[["FIPS", "Longitude", "Latitude", "Area"]]

    df_county_geoloc.Latitude = df_county_geoloc.Latitude.map(lambda s: float(s.replace("–", "-").replace("°", "")))
    df_county_geoloc.Longitude = df_county_geoloc.Longitude.map(lambda s: float(s.replace("–", "-").replace("°", "")))
    df_county_geoloc.FIPS = df_county_geoloc.FIPS.map(lambda x: float(x))
    df = df.merge(df_county_geoloc, left_on="fips", right_on="FIPS", how="left")

    # Fill in rows with NaN FIPS.
    df.set_index("county", inplace=True)
    missing_latlong = [
        ["New York City", 40.7128, -74.0060, 783.8],
        ["Kansas City", 39.0997, -94.5786, 815.72],
        ["Joplin", 37.0842, -94.5133, 81.7],
        ["Kusilvak Census Area", 62.1458, -162.8919, 44240],
    ]

    df_na = pd.DataFrame(missing_latlong, columns=["county", "Longitude", "Latitude", "Area"])
    df_na.set_index("county", inplace=True)
    df.update(df_na, overwrite=False)
    df = df.reset_index()

    # Remove Alaska and Hawaii.
    df = df[df.state != "Alaska"]
    df = df[df.state != "Hawaii"]

    # Compute number of new cases/deaths each day instead of cumulative.
    df.sort_values(by=["state", "county", "date"], inplace=True)

    df["new_cases"] = df.groupby(["state", "county"])["cases"].diff().fillna(df["cases"])
    df["new_deaths"] = df.groupby(["state", "county"])["deaths"].diff().fillna(df["deaths"])

    # Select time line from March to June.
    df["date"] = pd.to_datetime(df["date"])
    start_date = pd.Timestamp("2020-03-15")
    end_date = pd.Timestamp("2020-08-01")
    df = df[pd.DatetimeIndex(df.date) >= start_date]
    df = df[pd.DatetimeIndex(df.date) <= end_date]

    # Create numeric time column.
    df["day"] = df["date"].apply(lambda x: float((x - start_date).days))

    # Cases in New Jersey.
    df = df[["day", "Longitude", "Latitude", "Area", "new_cases", "state", "county"]]
    df = df[df.new_cases > 0]
    df = df.loc[df.index.repeat(df.new_cases)]
    df = df[df.state == "New Jersey"]

    # Break into 7 day intervals using a sliding window of 3 days.
    sequences = {}
    interval_length = 7
    for start in range(0, int(df["day"].max()) - interval_length + 1, 3):
        date = start_date + pd.Timedelta(days=start)
        seq_name = f"{date.year}{date.month:02d}" + f"{date.day:02d}"

        df_range = df[df["day"] >= start]
        df_range = df_range[df_range["day"] < start + interval_length]
        df_range["day"] = df_range["day"] - start

        seq = df_range.to_numpy()[:, :4].astype(np.float64)
        counties = df_range.to_numpy()[:, -1]

        t, x = seq[:, 0:1], seq[:, 1:3]
        area = seq[:, 3]

        print(seq_name, seq.shape[0])

        for i in tqdm(range(50)):
            # subsample_idx = np.sort(np.random.choice(seq.shape[0], seq.shape[0] // 200, replace=False))
            subsample_idx = np.random.rand(seq.shape[0]) < (1 / 100)

            while np.sum(subsample_idx) == 0:
                subsample_idx = np.random.rand(seq.shape[0]) < (1 / 100)

            # Uniformly distribute the daily case count.
            _t = add_temporal_noise(t[subsample_idx])

            # Assume each degree of longitude/latitude is ~110km.
            degrees = np.sqrt(area) / 110.0
            _x = add_unif_spatial_noise(x[subsample_idx], degrees[subsample_idx].reshape(-1, 1), counties[subsample_idx])

            sort_idx = np.argsort(_t.reshape(-1))
            sequences[seq_name + f"_{i:03d}"] = np.concatenate([_t, _x], axis=1)[sort_idx]

    np.savez(DATA_DIR / "covid_nj_cases.npz", **sequences)


def add_unif_spatial_noise(coords, width, counties):
    sampled_coords = coords

    match = np.zeros(sampled_coords.shape[0]) > 0
    while not match.all():
        sampled_coords = sampled_coords * match.reshape(-1, 1) + (coords + 2.0 * (np.random.rand(*coords.shape) * width - width / 2)) * ~match.reshape(-1, 1)
        lons, lats = sampled_coords[:, 0], sampled_coords[:, 1]
        queries = list(zip(lats, lons))
        results = rg.search(queries)
        match = np.array([county in res["admin2"] for county, res in zip(counties, results)])

    return sampled_coords


def add_temporal_noise(day):
    return day + np.random.rand(*day.shape)


if __name__ == "__main__":
    main()

104800256it [00:12, 8649839.79it/s]                               


20200315 1111


  0%|          | 0/50 [00:00<?, ?it/s]

Loading formatted geocoded file...


100%|██████████| 50/50 [05:01<00:00,  6.02s/it]


20200318 2763


100%|██████████| 50/50 [06:01<00:00,  7.22s/it]


20200321 5948


100%|██████████| 50/50 [07:44<00:00,  9.30s/it]


20200324 10368


100%|██████████| 50/50 [08:23<00:00, 10.07s/it]


20200327 15326


100%|██████████| 50/50 [08:57<00:00, 10.76s/it]


20200330 23318


100%|██████████| 50/50 [10:04<00:00, 12.10s/it]


20200402 28332


100%|██████████| 50/50 [10:19<00:00, 12.38s/it]


20200405 27038


100%|██████████| 50/50 [10:07<00:00, 12.16s/it]


20200408 24926


100%|██████████| 50/50 [09:41<00:00, 11.62s/it]


20200411 23809


100%|██████████| 50/50 [09:49<00:00, 11.78s/it]


20200414 24756


100%|██████████| 50/50 [09:05<00:00, 10.91s/it]


20200417 24612


100%|██████████| 50/50 [10:41<00:00, 12.83s/it]


20200420 23641


100%|██████████| 50/50 [10:27<00:00, 12.55s/it]


20200423 20827


100%|██████████| 50/50 [09:45<00:00, 11.71s/it]


20200426 18075


100%|██████████| 50/50 [10:53<00:00, 13.08s/it]


20200429 16905


  6%|▌         | 3/50 [00:50<13:14, 16.90s/it]


KeyboardInterrupt: 

In [5]:
df_data = pd.read_csv('data.csv')

In [6]:
df_data

Unnamed: 0,num,date,time,code,pop,elderly
0,0,20230801,0,11110560,18298.6017,2461.0440
1,1,20230801,0,11140625,13906.9668,1747.0685
2,2,20230801,0,11710631,32190.0943,2838.3728
3,3,20230801,0,11680630,30591.9275,1693.9024
4,4,20230801,0,11440720,22091.3105,2050.8839
...,...,...,...,...,...,...
620731,620731,20230930,23,11530730,18130.9735,2418.9675
620732,620732,20230930,23,11110615,28696.5405,1882.4947
620733,620733,20230930,23,11110710,10176.9055,1136.7005
620734,620734,20230930,23,11215870,23514.4348,2310.6715


In [9]:
df_code = pd.read_csv('서울시행정동중심점_2017.csv', encoding='utf-8')

In [10]:
df_code

Unnamed: 0,코드,시도명,시군구명,읍면동명,X,Y
0,11110515,서울특별시,종로구,청운효자동,126.970417,37.584658
1,11110530,서울특별시,종로구,사직동,126.970144,37.574108
2,11110540,서울특별시,종로구,삼청동,126.981114,37.588013
3,11110550,서울특별시,종로구,부암동,126.962557,37.596699
4,11110560,서울특별시,종로구,평창동,126.969274,37.613965
...,...,...,...,...,...,...
419,11740650,서울특별시,강동구,성내제2동,127.127737,37.534365
420,11740660,서울특별시,강동구,성내제3동,127.133797,37.528404
421,11740685,서울특별시,강동구,길동,127.145907,37.539725
422,11740690,서울특별시,강동구,둔촌제1동,127.140435,37.523145


In [11]:
# X와 Y 값을 저장할 빈 열 추가
df_data['X'] = np.nan
df_data['Y'] = np.nan

# code를 기준으로 X, Y 값을 df_code에서 찾아서 할당
for index, row in df_data.iterrows():
    code = row['code']
    # df_code에서 매칭되는 코드의 X, Y 값 찾기
    matching_row = df_code[df_code['코드'] == code]
    if not matching_row.empty:
        df_data.at[index, 'X'] = matching_row['X'].values[0]
        df_data.at[index, 'Y'] = matching_row['Y'].values[0]

# 결과 확인
print(df_data.head())


   num      date  time      code         pop    elderly           X          Y
0    0  20230801     0  11110560  18298.6017  2461.0440  126.969274  37.613965
1    1  20230801     0  11140625  13906.9668  1747.0685  127.008263  37.554703
2    2  20230801     0  11710631  32190.0943  2838.3728  127.108235  37.495329
3    3  20230801     0  11680630  30591.9275  1693.9024  127.054899  37.501661
4    4  20230801     0  11440720  22091.3105  2050.8839  126.911845  37.561982


In [15]:
df_area = pd.read_csv('행정구역(동별)_20241204210153.csv', encoding='utf-8')

In [16]:
df_area

Unnamed: 0,동별(1),동별(2),동별(3),면적,구성비,행정,법정,소계,소계.1
0,서울시,종로구,소계,23.91,3.95,17,87,279,1475
1,서울시,종로구,사직동,1.23,0.20,1,12,16,68
2,서울시,종로구,삼청동,1.49,0.25,1,7,10,42
3,서울시,종로구,부암동,2.27,0.38,1,3,18,98
4,서울시,종로구,평창동,8.87,1.47,1,2,19,147
...,...,...,...,...,...,...,...,...,...
446,서울시,강동구,둔촌1동,0.92,0.15,1,1,23,229
447,서울시,강동구,둔촌2동,1.56,0.26,1,-,29,262
448,서울시,강동구,강일동,2.26,0.37,1,1,40,317
449,서울시,강동구,상일1동,1.82,0.30,1,1,46,397


In [18]:
# X와 Y 값을 저장할 빈 열 추가
df_data['area'] = np.nan
df_code['area'] = np.nan

# code를 기준으로 X, Y 값을 df_code에서 찾아서 할당
for index, row in df_code.iterrows():
    code = row['읍면동명']
    # df_code에서 매칭되는 코드의 X, Y 값 찾기
    matching_row = df_area[df_area['동별(3)'] == code]
    if not matching_row.empty:
        df_code.at[index, 'area'] = matching_row['면적'].values[0]

# code를 기준으로 X, Y 값을 df_code에서 찾아서 할당
for index, row in df_data.iterrows():
    code = row['code']
    # df_code에서 매칭되는 코드의 X, Y 값 찾기
    matching_row = df_code[df_code['코드'] == code]
    if not matching_row.empty:
        df_data.at[index, 'area'] = matching_row['area'].values[0]


# 결과 확인
print(df_data.head())

   num      date  time      code         pop    elderly           X  \
0    0  20230801     0  11110560  18298.6017  2461.0440  126.969274   
1    1  20230801     0  11140625  13906.9668  1747.0685  127.008263   
2    2  20230801     0  11710631  32190.0943  2838.3728  127.108235   
3    3  20230801     0  11680630  30591.9275  1693.9024  127.054899   
4    4  20230801     0  11440720  22091.3105  2050.8839  126.911845   

           Y  area  
0  37.613965  8.87  
1  37.554703  0.51  
2  37.495329  1.34  
3  37.501661  0.73  
4  37.561982   NaN  


In [20]:
df_code

Unnamed: 0,코드,시도명,시군구명,읍면동명,X,Y,area
0,11110515,서울특별시,종로구,청운효자동,126.970417,37.584658,2.57
1,11110530,서울특별시,종로구,사직동,126.970144,37.574108,1.23
2,11110540,서울특별시,종로구,삼청동,126.981114,37.588013,1.49
3,11110550,서울특별시,종로구,부암동,126.962557,37.596699,2.27
4,11110560,서울특별시,종로구,평창동,126.969274,37.613965,8.87
...,...,...,...,...,...,...,...
419,11740650,서울특별시,강동구,성내제2동,127.127737,37.534365,
420,11740660,서울특별시,강동구,성내제3동,127.133797,37.528404,
421,11740685,서울특별시,강동구,길동,127.145907,37.539725,1.61
422,11740690,서울특별시,강동구,둔촌제1동,127.140435,37.523145,


In [21]:
# 각 열별로 NaN 값 개수 확인
nan_counts_per_column = df_code.isnull().sum()

# 데이터프레임 전체에서 NaN 값 개수 확인
total_nan_count = df_code.isnull().sum().sum()

print("NaN counts per column:")
print(nan_counts_per_column)

print(f"\nTotal NaN count: {total_nan_count}")


NaN counts per column:
코드        0
시도명       0
시군구명      0
읍면동명      0
X         0
Y         0
area    187
dtype: int64

Total NaN count: 187


In [25]:
df_data_drop = df_data.dropna().reset_index(drop=True)

In [27]:
df_data_drop['num'] = df_data_drop.index

In [28]:
df_data_drop

Unnamed: 0,num,date,time,code,pop,elderly,X,Y,area
0,0,20230801,0,11110560,18298.6017,2461.0440,126.969274,37.613965,8.87
1,1,20230801,0,11140625,13906.9668,1747.0685,127.008263,37.554703,0.51
2,2,20230801,0,11710631,32190.0943,2838.3728,127.108235,37.495329,1.34
3,3,20230801,0,11680630,30591.9275,1693.9024,127.054899,37.501661,0.73
4,4,20230801,0,11710520,32400.8646,3721.7892,127.110127,37.529850,1.59
...,...,...,...,...,...,...,...,...,...
346963,346963,20230930,23,11530595,8739.3458,767.8214,126.888257,37.482555,0.40
346964,346964,20230930,23,11110540,3457.1579,522.5814,126.981114,37.588013,1.49
346965,346965,20230930,23,11530790,21323.3121,2632.5240,126.826070,37.498277,2.67
346966,346966,20230930,23,11110615,28696.5405,1882.4947,126.989733,37.575077,2.35


In [22]:
# 각 열별로 NaN 값 개수 확인
nan_counts_per_column = df_data.isnull().sum()

# 데이터프레임 전체에서 NaN 값 개수 확인
total_nan_count = df_data.isnull().sum().sum()

print("NaN counts per column:")
print(nan_counts_per_column)

print(f"\nTotal NaN count: {total_nan_count}")


NaN counts per column:
num             0
date            0
time            0
code            0
pop             0
elderly         0
X               0
Y               0
area       273768
dtype: int64

Total NaN count: 273768


In [31]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Step 1: Convert 'date' to datetime and sort values
df_data_drop['date'] = pd.to_datetime(df_data_drop['date'], format='%Y%m%d')
df_data_drop = df_data_drop.sort_values(by=['code', 'date', 'time'])

# Step 2: Calculate daily changes for 'pop' and 'elderly'
df_data_drop['new_pop'] = df_data_drop.groupby(['code'])['pop'].diff().fillna(df_data_drop['pop'])
df_data_drop['new_elderly'] = df_data_drop.groupby(['code'])['elderly'].diff().fillna(df_data_drop['elderly'])

# Step 3: Create a numeric 'day' column
start_date = df_data_drop['date'].min()
df_data_drop['day'] = (df_data_drop['date'] - start_date).dt.days

# Step 4: Filter rows with non-zero changes in 'pop' and valid 'area' values
df_data_filtered = df_data_drop[(df_data_drop['new_pop'] > 0) & (df_data_drop['area'].notna())]

# Step 5: Define function to add spatial noise
def add_unif_spatial_noise(coords, width):
    # Ensure noise matches the shape of the input coordinates
    noise = np.random.rand(len(coords)) * width - (width / 2)
    return coords.flatten() + noise

# Step 6: Process data in chunks to avoid memory errors
chunk_size = 10000  # Adjust chunk size based on your system's memory capacity
chunk_list = []

for i in tqdm(range(0, len(df_data_filtered), chunk_size)):
    chunk = df_data_filtered.iloc[i:i + chunk_size].copy()
    chunk['X_noisy'] = add_unif_spatial_noise(chunk[['X']].values, np.sqrt(chunk['area'].values) / 110)
    chunk['Y_noisy'] = add_unif_spatial_noise(chunk[['Y']].values, np.sqrt(chunk['area'].values) / 110)
    chunk_list.append(chunk)

df_data_filtered_noisy = pd.concat(chunk_list, ignore_index=True)

# Step 7: Break into 1-day intervals
sequences = {}
interval_length = 24  # Length of each interval in hours (1 day)
slide_step = 24  # Step size for sliding window (1 day)

for start in tqdm(range(0, df_data_filtered_noisy['day'].max() - interval_length + 1, slide_step)):
    interval_data = df_data_filtered_noisy[
        (df_data_filtered_noisy['day'] >= start) &
        (df_data_filtered_noisy['day'] < start + interval_length)
    ].copy()
    
    interval_data['day'] -= start  # Reset time within the interval
    seq_name = f"{start_date + pd.Timedelta(days=start // 24):%Y%m%d}_hour{start % 24:02d}"
    
    sequences[seq_name] = interval_data

# Step 8: Save intervals as CSV files
output_dir = 'output_intervals_1day'  # Specify output directory
os.makedirs(output_dir, exist_ok=True)

for seq_name, seq_data in sequences.items():
    seq_data.to_csv(f"{output_dir}/interval_{seq_name}.csv", index=False)

print("Preprocessing complete. Intervals saved.")




100%|██████████| 18/18 [00:00<00:00, 642.70it/s]
100%|██████████| 2/2 [00:00<00:00, 99.97it/s]


Preprocessing complete. Intervals saved.


In [33]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Step 1: Convert 'date' to datetime and sort values
df_data_drop['date'] = pd.to_datetime(df_data_drop['date'], format='%Y%m%d')
df_data_drop = df_data_drop.sort_values(by=['code', 'date', 'time'])

# Step 2: Calculate daily changes for 'pop' and 'elderly'
df_data_drop['new_pop'] = df_data_drop.groupby(['code'])['pop'].diff().fillna(df_data_drop['pop'])
df_data_drop['new_elderly'] = df_data_drop.groupby(['code'])['elderly'].diff().fillna(df_data_drop['elderly'])

# Step 3: Create a numeric 'day' column
start_date = df_data_drop['date'].min()
df_data_drop['day'] = (df_data_drop['date'] - start_date).dt.days

# Step 4: Filter rows with non-zero changes in 'pop' and valid 'area' values
df_data_filtered = df_data_drop[(df_data_drop['new_pop'] > 0) & (df_data_drop['area'].notna())]

# Step 5: Define function to add spatial noise
def add_unif_spatial_noise(coords, width):
    # Ensure noise matches the shape of the input coordinates
    noise = np.random.rand(len(coords)) * width - (width / 2)
    return coords.flatten() + noise

# Step 6: Process data in chunks to avoid memory errors
chunk_size = 10000  # Adjust chunk size based on your system's memory capacity
chunk_list = []

for i in tqdm(range(0, len(df_data_filtered), chunk_size)):
    chunk = df_data_filtered.iloc[i:i + chunk_size].copy()
    chunk['X_noisy'] = add_unif_spatial_noise(chunk[['X']].values, np.sqrt(chunk['area'].values) / 110)
    chunk['Y_noisy'] = add_unif_spatial_noise(chunk[['Y']].values, np.sqrt(chunk['area'].values) / 110)
    chunk_list.append(chunk)

df_data_filtered_noisy = pd.concat(chunk_list, ignore_index=True)

# Step 7: Break into 8-hour intervals with 2-hour overlaps
sequences = {}
interval_length = 8  # Length of each interval in hours (8 hours)
slide_step = 2  # Step size for sliding window (2-hour overlap)

for day in tqdm(df_data_filtered_noisy['day'].unique()):
    day_data = df_data_filtered_noisy[df_data_filtered_noisy['day'] == day]
    max_time = day_data['time'].max()
    
    for start in range(0, max_time - interval_length + 1, slide_step):
        interval_data = day_data[
            (day_data['time'] >= start) &
            (day_data['time'] < start + interval_length)
        ].copy()
        
        interval_data['time'] -= start  # Reset time within the interval
        seq_name = f"{start_date + pd.Timedelta(days=day):%Y%m%d}_hour{start:02d}"
        
        sequences[seq_name] = interval_data

# Step 8: Save intervals as CSV files
output_dir = 'output_intervals_8hours_overlap'  # Specify output directory
os.makedirs(output_dir, exist_ok=True)

for seq_name, seq_data in sequences.items():
    seq_data.to_csv(f"{output_dir}/interval_{seq_name}.csv", index=False)

print("Preprocessing complete. Intervals saved.")


100%|██████████| 18/18 [00:00<00:00, 580.50it/s]
100%|██████████| 61/61 [00:00<00:00, 80.35it/s] 


Preprocessing complete. Intervals saved.


In [34]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Step 1: Convert 'date' to datetime and sort values
df_data_drop['date'] = pd.to_datetime(df_data_drop['date'], format='%Y%m%d')
df_data_drop = df_data_drop.sort_values(by=['code', 'date', 'time'])

# Step 2: Calculate daily changes for 'pop' and 'elderly'
df_data_drop['new_pop'] = df_data_drop.groupby(['code'])['pop'].diff().fillna(df_data_drop['pop'])
df_data_drop['new_elderly'] = df_data_drop.groupby(['code'])['elderly'].diff().fillna(df_data_drop['elderly'])

# Step 3: Create a numeric 'day' column
start_date = df_data_drop['date'].min()
df_data_drop['day'] = (df_data_drop['date'] - start_date).dt.days

# Step 4: Filter rows with non-zero changes in 'pop' and valid 'area' values
df_data_filtered = df_data_drop[(df_data_drop['new_pop'] > 0) & (df_data_drop['area'].notna())]

# Step 5: Define function to add spatial noise
def add_unif_spatial_noise(coords, width):
    # Ensure noise matches the shape of the input coordinates
    noise = np.random.rand(len(coords)) * width - (width / 2)
    return coords.flatten() + noise

# Step 6: Process data in chunks to avoid memory errors
chunk_size = 10000  # Adjust chunk size based on your system's memory capacity
chunk_list = []

for i in tqdm(range(0, len(df_data_filtered), chunk_size)):
    chunk = df_data_filtered.iloc[i:i + chunk_size].copy()
    chunk['X_noisy'] = add_unif_spatial_noise(chunk[['X']].values, np.sqrt(chunk['area'].values) / 110)
    chunk['Y_noisy'] = add_unif_spatial_noise(chunk[['Y']].values, np.sqrt(chunk['area'].values) / 110)
    chunk_list.append(chunk)

df_data_filtered_noisy = pd.concat(chunk_list, ignore_index=True)

# Step 7: Break into 6-hour intervals with 2-hour overlaps
sequences = {}
interval_length = 6  # Length of each interval in hours (6 hours)
slide_step = 2  # Step size for sliding window (2-hour overlap)

for day in tqdm(df_data_filtered_noisy['day'].unique()):
    day_data = df_data_filtered_noisy[df_data_filtered_noisy['day'] == day]
    max_time = day_data['time'].max()
    
    for start in range(0, max_time - interval_length + 1, slide_step):
        interval_data = day_data[
            (day_data['time'] >= start) &
            (day_data['time'] < start + interval_length)
        ].copy()
        
        interval_data['time'] -= start  # Reset time within the interval
        seq_name = f"{start_date + pd.Timedelta(days=day):%Y%m%d}_hour{start:02d}"
        
        sequences[seq_name] = interval_data

# Step 8: Save intervals as CSV files
output_dir = 'output_intervals_6hours_overlap'  # Specify output directory
os.makedirs(output_dir, exist_ok=True)

for seq_name, seq_data in sequences.items():
    seq_data.to_csv(f"{output_dir}/interval_{seq_name}.csv", index=False)

print("Preprocessing complete. Intervals saved.")


100%|██████████| 18/18 [00:00<00:00, 620.53it/s]
100%|██████████| 61/61 [00:00<00:00, 130.31it/s]


Preprocessing complete. Intervals saved.


In [32]:
sequences

{'20230801_hour00':            num       date  time      code         pop    elderly           X  \
 0           67 2023-08-01     0  11110515  14359.6184  1654.0251  126.970417   
 1          291 2023-08-01     1  11110515  14427.5077  1662.1085  126.970417   
 2          623 2023-08-01     2  11110515  14539.6848  1662.1048  126.970417   
 3          962 2023-08-01     4  11110515  14646.1146  1653.8880  126.970417   
 4         1522 2023-08-01     6  11110515  15004.0128  1669.8213  126.970417   
 ...        ...        ...   ...       ...         ...        ...         ...   
 172627  135551 2023-08-24    19  11740685  55906.3491  7578.9524  127.145907   
 172628  135788 2023-08-24    20  11740685  55910.7500  7404.3607  127.145907   
 172629  136037 2023-08-24    21  11740685  56606.0276  7480.6394  127.145907   
 172630  136274 2023-08-24    22  11740685  56914.1554  7365.5950  127.145907   
 172631  136511 2023-08-24    23  11740685  59293.0435  7800.4181  127.145907   
 
       