In [1]:
import pandas as pd
from glob import glob

In [47]:
# List of input files. glob() returns matches in arbitrary, OS-dependent
# order, so sort them: the row order of the concatenated frame (and every
# later "first matching row" lookup) is then reproducible across runs.
files = sorted(glob('data_*.csv'))

# pd.concat() on an empty list fails with an opaque "No objects to
# concatenate"; raise the same exception type with an actionable message.
if not files:
    raise ValueError("No input files matching 'data_*.csv' in the working directory")

# Concatenate along rows; ignore_index=True gives the result a fresh 0..n-1 index
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Columns excluded from rescaling
unmodified_columns = ['hour', 'zip_code', 'date']

# Rescaling: every other column is expressed as a percentage of the single
# largest value found across all of those columns (that value maps to 100).
to_scale = [col for col in df.columns if col not in unmodified_columns]
scale_base = df[to_scale].max().max()

# assign() works on a copy, so the original df is left untouched
df_scaled = df.assign(**{col: 100 * df[col] / scale_base for col in to_scale})

# Get the column (-> max_column) where scale_base was observed: the first
# column, in to_scale order, that contains that value.
max_column = next(
    (col for col in to_scale if (df[col] == scale_base).any()),
    None,
)
if max_column is None:
    # Reached when to_scale is empty or holds only NaN: scale_base is then
    # NaN, and NaN never compares equal to anything.
    raise ValueError('scale_base was not found in any of the rescaled columns')

# Gather info about the (first) row where scale_base was observed.
# The lookup uses max_column explicitly instead of relying on the loop
# variable of the search still holding the right column name.
base_rows = df.loc[df[max_column] == scale_base, ['date', 'hour', 'zip_code']]
base_info = {key: values[0] for key, values in base_rows.to_dict(orient='list').items()}
base_info['hour'] = str(base_info['hour']).zfill(2) + ':00'    # e.g. 7 -> '07:00'
base_info['zip_code'] = str(base_info['zip_code']).zfill(5)    # e.g. 3940 -> '03940'
base_info['category'] = max_column

# Save CSV files. Each frame goes to the file named after its contents:
# the raw concatenated data to 'unscaled_data.csv' and the rescaled copy to
# 'scaled_data.csv' (the two targets were previously swapped).
df.to_csv('unscaled_data.csv', index=False)
df_scaled.to_csv('scaled_data.csv', index=False)

In [49]:
# Show the date, hour, zip code and category of the row where scale_base was observed
base_info

{'date': '2025-05-27',
 'hour': '00:00',
 'zip_code': '03940',
 'category': 'total'}

In [52]:
# Summary statistics of the unscaled data, rounded to 2 decimals
df.describe().round(2)

Unnamed: 0,hour,zip_code,food,restaurants,hotels,health,airlines,services,goods,leisure,gasoline,total,transaction_online,transaction_physical
count,347719.0,347719.0,276381.0,144167.0,38340.0,130825.0,1971.0,185703.0,328718.0,8441.0,31555.0,347719.0,0.0,0.0
mean,12.56,8091.18,20698.89,12627.12,6651.33,7292.07,170620.6,115312.18,92243.72,99077.47,12347.25,148786.8,,
std,7.0,4580.53,276405.21,44956.4,40573.67,25059.24,1408809.61,961586.98,1511280.0,395280.38,21451.85,1651758.0,,
min,0.0,1000.0,0.0,0.01,0.0,0.01,25.0,0.01,0.0,0.01,0.01,0.0,,
25%,7.0,4200.0,568.0,676.1,650.0,429.0,2674.5,1088.0,861.5,720.0,2280.13,1085.5,,
50%,13.0,7740.0,1785.2,2675.36,1588.0,1507.95,12241.34,4307.7,3557.5,3351.0,7606.06,5133.55,,
75%,18.0,11529.0,5860.74,9339.22,4508.09,6130.48,41579.48,17628.61,18450.04,20907.7,16459.49,26942.36,,
max,59.0,16900.0,16823309.68,5085481.32,6551939.42,4636100.78,36419421.0,46935766.44,122308500.0,11088402.49,1055972.22,127481500.0,,


In [53]:
# Summary statistics of the rescaled data, rounded to 2 decimals
df_scaled.describe().round(2)

Unnamed: 0,hour,zip_code,food,restaurants,hotels,health,airlines,services,goods,leisure,gasoline,total,transaction_online,transaction_physical
count,347719.0,347719.0,276381.0,144167.0,38340.0,130825.0,1971.0,185703.0,328718.0,8441.0,31555.0,347719.0,0.0,0.0
mean,12.56,8091.18,0.02,0.01,0.01,0.01,0.13,0.09,0.07,0.08,0.01,0.12,,
std,7.0,4580.53,0.22,0.04,0.03,0.02,1.11,0.75,1.19,0.31,0.02,1.3,,
min,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
25%,7.0,4200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
50%,13.0,7740.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,,
75%,18.0,11529.0,0.0,0.01,0.0,0.0,0.03,0.01,0.01,0.02,0.01,0.02,,
max,59.0,16900.0,13.2,3.99,5.14,3.64,28.57,36.82,95.94,8.7,0.83,100.0,,
