## Preprocessing Pipeline and Seasonal Decomposition

Read in bike count and weather data, preprocess the data, and perform seasonal decomposition via MSTL.

In [2]:
import os
import sys

import pandas as pd

sys.path.append("..")
from utils.preprocessing_utils import *
from utils.mstl_utils import process_city_mstl

### Read in Data

##### Bike data

In [None]:
## Read in .csv file of all bike data if it exists, otherwise download data from URLs and save as .csv
file_path_bike_counts = "../data/full_bike_data.csv"

bike_data_raw = read_in_bike_data(file_path_bike_counts) # Last accessed: 2025-12-31
# Parse 'iso_timestamp' as timezone-aware UTC datetimes; unparseable values become NaT
bike_data_raw['iso_timestamp'] = pd.to_datetime(bike_data_raw['iso_timestamp'], utc=True, errors='coerce')

## View data (uncomment for ad-hoc inspection)
#bike_data_raw.head()
#bike_data_raw.info()

# Cities in the dataset
#unique_cities = bike_data_raw['domain_name'].unique()
#print(f"Number of cities: {len(unique_cities)}")
#print(unique_cities)

# Counters in the dataset
#unique_counters = bike_data_raw['counter_site_id'].unique()
#print(f"Number of counters: {len(unique_counters)}")

# Time range of the dataset
#min_timestamp = bike_data_raw['iso_timestamp'].min()
#max_timestamp = bike_data_raw['iso_timestamp'].max()
#print(f"Time range: {min_timestamp} to {max_timestamp}")

## Clean bike data
# Columns needed by the downstream preprocessing steps
relevant_columns = ['domain_name', 'counter_site', 'longitude', 'latitude',
                    'iso_timestamp', 'channels_all']

# Reduce dataset to the following cities: Freiburg, Tübingen, Stuttgart, Ludwigsburg, Mannheim, Heidelberg, Reutlingen
cities_to_keep = ["Stadt Freiburg", "Stadt Tübingen", "Landeshauptstadt Stuttgart",
                  "Stadt Ludwigsburg", "Stadt Mannheim", "Stadt Heidelberg", "Stadt Reutlingen"]

# Build the cleaned frame in a single non-mutating chain:
# - `.loc` selects rows and columns in one step, so no full copy of the raw frame is needed;
# - `.rename` / `.assign` return new frames, so nothing is modified in place on a filtered
#   slice (avoids SettingWithCopyWarning and keeps the cell idempotent on re-run);
# - 'iso_timestamp' is already tz-aware UTC (parsed above), so it only needs a timezone
#   conversion, not a second parse.
bike_data_cleaned = (
    bike_data_raw
    .loc[bike_data_raw['domain_name'].isin(cities_to_keep), relevant_columns]
    # Rename columns for better clarity
    .rename(columns={'domain_name': 'city', 'channels_all': 'count', 'iso_timestamp': 'timestamp'})
    # Convert timestamp to Europe/Berlin timezone
    .assign(timestamp=lambda df: df['timestamp'].dt.tz_convert('Europe/Berlin'))
)

# Save cleaned data (for figure 1)
bike_data_cleaned.to_csv("../data/full_bike_data_cleaned.csv", index=False)
bike_data_cleaned.head()

##### Weather data

In [4]:
file_path_weather = "../data/weather_per_city.csv"

# Prefer the locally cached CSV; only fall back to downloading the weather data
# that corresponds to the cleaned bike data when no cache file is present.
if not os.path.exists(file_path_weather):
    # Last accessed: 2026-12-31
    # NOTE(review): the bike data was last accessed 2025-12-31 — confirm this year is not a typo.
    weather_data = download_corresponding_weather_data(bike_data_cleaned)
else:
    print("Reading weather data from local CSV file.")
    weather_data = pd.read_csv(file_path_weather)

# Parse 'date' as UTC datetimes (unparseable values become NaT) and store the
# Europe/Berlin local time in a separate 'timestamp' column.
weather_data['timestamp'] = (
    pd.to_datetime(weather_data['date'], utc=True, errors='coerce')
    .dt.tz_convert('Europe/Berlin')
)

## View data
weather_data.head()
#weather_data.info()

Reading weather data from local CSV file.


Unnamed: 0,date,temperature_2m,apparent_temperature,rain,snowfall,forecast_temperature_2m,forecast_apparent_temperature,forecast_rain,forecast_snowfall,city,timestamp
0,2012-12-30 23:00:00+00:00,5.8285,1.619927,0.0,0.0,,,,,Landeshauptstadt Stuttgart,2012-12-31 00:00:00+01:00
1,2012-12-31 00:00:00+00:00,5.8285,1.688656,0.0,0.0,,,,,Landeshauptstadt Stuttgart,2012-12-31 01:00:00+01:00
2,2012-12-31 01:00:00+00:00,5.9285,1.827309,0.0,0.0,,,,,Landeshauptstadt Stuttgart,2012-12-31 02:00:00+01:00
3,2012-12-31 02:00:00+00:00,5.7285,1.757791,0.0,0.0,,,,,Landeshauptstadt Stuttgart,2012-12-31 03:00:00+01:00
4,2012-12-31 03:00:00+00:00,5.4785,1.713898,0.0,0.0,,,,,Landeshauptstadt Stuttgart,2012-12-31 04:00:00+01:00


### Preprocess Bike Data (counterwise)

##### Outliers

In [5]:
# Outlier handling: unusually high counts are capped by the helper, which (per its
# description) works on hour-of-day and day-of-year patterns with an interquartile
# range (IQR) rule: values above Q3 + k · IQR are replaced by that upper bound.
# `cap_const` is passed as the cap constant (presumably k — confirm in the helper).
cap_const = 10
bike_data_capped = cap_outliers_by_time_pattern(
    bike_data_cleaned,
    "count",
    cap_const=cap_const,
)

  cap_counter_by_time_pattern, values_where=values_where, cap_const=cap_const


##### Missing Data

In [6]:
# Count missing (NaN) entries in the 'count' column after outlier capping
nan_counts = bike_data_capped["count"].isnull().sum()
print(f"Found {nan_counts} NaN values in count column.")

Found 0 NaN values in count column.


In [7]:
# Handling of missing data per counter
# - Remove long zero-count intervals (likely counter malfunctions or temporary deactivation)
# - Impute short gaps using linear interpolation
# - Split time series at longer gaps
# - Remove counters with less than 2 years of data

# Set thresholds (in hours)
LONG_ZERO_LIMIT = 168          # max. period of zero counts to be accepted as valid (1 week = 7 * 24 h)
INTERPOLATION_LIMIT = 3        # max. missing hours to interpolate
MIN_TS = 2 * ((365 * 24) + 6)  # min. number of hourly data points per counter (2 years of 365.25 days)

# Remove long zero-count intervals, interpolate short gaps, split time series at longer gaps
bike_data_final, summary_df = handle_missing_data(bike_data_capped,
                                                   long_zero_limit=LONG_ZERO_LIMIT,
                                                   interpolation_limit=INTERPOLATION_LIMIT)

# Remove counters with fewer than MIN_TS non-missing data points
counter_data_counts = bike_data_final.groupby('counter_site')['count'].count()
counters_to_keep = counter_data_counts[counter_data_counts >= MIN_TS].index
bike_data_final = bike_data_final[bike_data_final['counter_site'].isin(counters_to_keep)]
summary_df = summary_df[summary_df['counter_site'].isin(counters_to_keep)]

# Save the final preprocessed data
bike_data_final.to_csv("../data/full_bike_data_preprocessed.csv", index=False)

# Final overview
print("\nPreprocessing complete.")
print(f"Final dataset shape: {bike_data_final.shape}")
print(f"\nCounters remaining: {bike_data_final['counter_site'].nunique()}")
print("\nSummary:")
# Jupyter only renders the *last* expression of a cell, so the summary table must
# come last (a bare `summary_df` followed by other statements is silently dropped).
summary_df


Processing city: Stadt Freiburg
Processing counter: Wiwilibrücke


  df_counter.index.min(),


Processing counter: FR2 Güterbahn Süd / Ferd.-Weiß-Str.


  df_counter.index.min(),


Processing counter: FR1 Dreisam / Otto-Wels-Str.


  df_counter.index.min(),



Processing city: Landeshauptstadt Stuttgart
Processing counter: König-Karls-Brücke Barometer


  df_counter.index.min(),


Processing counter: Böblinger Straße


  df_counter.index.min(),


Processing counter: Taubenheimstraße


  df_counter.index.min(),


Processing counter: Waiblinger Straße


  df_counter.index.min(),


Processing counter: Samaraweg


  df_counter.index.min(),


Processing counter: Waldburgstraße


  df_counter.index.min(),


Processing counter: Lautenschlager Straße
Processing counter: Tübinger Straße


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Inselstraße


  df_counter.index.min(),


Processing counter: Kremmlerstraße


  df_counter.index.min(),


Processing counter: Kirchheimer Straße


  df_counter.index.min(),


Processing counter: Stuttgarter Straße


  df_counter.index.min(),


Processing counter: Solitudestraße


  df_counter.index.min(),


Processing counter: Am Kräherwald


  df_counter.index.min(),


Processing counter: Neckartalstraße


  df_counter.index.min(),



Processing city: Stadt Tübingen
Processing counter: Unterführung Steinlach/Karlstraße Südseite - Steinlachallee


  df_counter.index.min(),


Processing counter: Fuß- & Radtunnel Südportal - Derendinger Allee


  df_counter.index.min(),


Processing counter: Neckartalradweg Hirschau - parallel L371


  df_counter.index.min(),


Processing counter: Radbrücke Mitte - Wöhrdstraße
Processing counter: Radbrücke Ost


  df_counter.index.min(),
  df_counter.index.min(),



Processing city: Stadt Mannheim
Processing counter: Renzstraße


  df_counter.index.min(),


Processing counter: Kurpfalzbrücke
Processing counter: Jungbuschbrücke


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Konrad-Adenauer-Brücke
Processing counter: Lindenhofüberführung


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Neckarauer Übergang -Schwetzinger Str.
Processing counter: Schlosspark Lindenhof (Richtung Jugendherberge)


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Feudenheimstr. stadtauswärts
Processing counter: Luzenbergstr.


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Feudenheimerstr. stadteinwärts
Processing counter: B38. RI. AUS


  df_counter.index.min(),
  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Theodor-Heuss-Anlage. RI. IN.
Processing counter: Theodor-Heuss-Anlage. RI. AUS


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Fernmeldeturm.

Processing city: Stadt Heidelberg
Processing counter: Ernst-Walz-Brücke Querschnitt
Processing counter: Ernst-Walz-Brücke West - alt


  df_counter.index.min(),
  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Plöck
Processing counter: Gaisbergstraße


  df_counter.index.min(),


Processing counter: Mannheimer Straße


  df_counter.index.min(),


Processing counter: Thedor-Heuss-Brücke Querschnitt
Processing counter: Rohrbacher Straße Querschnitt


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Liebermannstraße
Processing counter: Schlierbacher Landstraße


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Ziegelhäuser Landstraße


  df_counter.index.min(),


Processing counter: Kurfürstenanlage Querschnitt


  df_counter.index.min(),


Processing counter: Hardtstraße
Processing counter: Bahnstadtpromenade


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Berliner Straße Querschnitt
Processing counter: Eppelheimer Str. Querschnitt


  df_counter.index.min(),
  df_counter.index.min(),



Processing city: Stadt Ludwigsburg
Processing counter: Marbacher Straße - Favoritepark


  df_counter.index.min(),


Processing counter: Alleenstraße


  df_counter.index.min(),


Processing counter: Marbacher Straße - Neckarbrücke


  df_counter.index.min(),


Processing counter: Schlieffenstraße


  df_counter.index.min(),


Processing counter: Fuchshof


  df_counter.index.min(),


Processing counter: Seestraße


  df_counter.index.min(),


Processing counter: Schlossstraße


  df_counter.index.min(),


Processing counter: Kesseläcker (Verl. Nussackerweg)


  df_counter.index.min(),


Processing counter: Zugwiesen


  df_counter.index.min(),


Processing counter: Solitudeallee


  df_counter.index.min(),


Processing counter: Aldinger Straße


  df_counter.index.min(),


Processing counter: Bottwartalstraße


  df_counter.index.min(),


Processing counter: Bismarckstraße


  df_counter.index.min(),


Processing counter: Königinallee


  df_counter.index.min(),


Processing counter: Friedrich-Ebert-Straße

Processing city: Stadt Reutlingen


  df_counter.index.min(),


Processing counter: Tübinger Tor
Processing counter: Charlottenstraße


  df_counter.index.min(),
  df_counter.index.min(),


Processing counter: Konrad-Adenauer-Straße


  df_counter.index.min(),


Processing counter: Metzgerstraße


  df_counter.index.min(),


Processing counter: Bellinostraße


  df_counter.index.min(),


Processing counter: Hindenburgstraße


  df_counter.index.min(),


Processing counter: Moltkestraße


  df_counter.index.min(),


Processing counter: Unter den Linden


  df_counter.index.min(),



Preprocessing complete.
Final dataset shape: (3678215, 7)

Counters remaining: 77

Summary:


### MSTL

Extract the daily, weekly, and annual seasonality as well as the trend for each counter of a city, and save the results per city.

Warning: This might take a while; the last execution took 16 h 30 min.

In [None]:
# Optional: reload the preprocessed data from disk instead of re-running the cells above
#bike_data_final = pd.read_csv("../data/full_bike_data_preprocessed.csv", low_memory=False)

# Run the MSTL helper once per city, handing it that city's rows and the city name
for city_name, city_frame in bike_data_final.groupby("city"):
    print("Processing city: ", city_name)
    process_city_mstl(city_frame, city_name)
    print("Finished city", city_name)