In [1]:
import pandas as pd
import requests_cache
import numpy as np
import os
import sys
sys.path.append("..")
from utils.preprocessing_utils import detect_intervals_with_missing_data, interpolate_short_gaps, remove_long_zero_intervals, split_long_gaps

### Read in Data

Bike data

In [None]:
## Read in .csv file of all bike data if it exists, otherwise download data from URLs and save as .csv

file_path = "../data/full_bike_data.csv"

if os.path.exists(file_path):
    # Local copy available: reuse it instead of downloading everything again.
    bike_data_raw = pd.read_csv(file_path, low_memory=False)

else:
    # Monthly files are published under the following URL schema:
    # https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz
    # A single cached session is shared by all HEAD requests instead of
    # constructing a new session for every month.
    session = requests_cache.CachedSession()
    url_list = []

    for year in range(2013, 2026):
        for month in range(1, 13):
            yyyymm = f"{year}{month:02d}"
            url = f"https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz"

            # Only keep months for which a file actually exists.
            response = session.head(url)
            if response.status_code == 200:
                url_list.append(url)

    if not url_list:
        raise RuntimeError("No monthly bike counter files were found under the expected URL schema.")

    # Download every monthly file exactly once. The first file defines the
    # reference column layout that all later files must match.
    monthly_frames = []
    general_columns = None
    for url in url_list:
        csv_data = pd.read_csv(url, low_memory=False)
        if general_columns is None:
            general_columns = csv_data.columns.tolist()
        assert list(csv_data.columns) == general_columns, f"Spalten stimmen nicht überein in {url}"
        monthly_frames.append(csv_data)

    # Concatenate once at the end; concatenating inside the loop would copy
    # all previously collected rows on every iteration.
    full_bike_data = pd.concat(monthly_frames, ignore_index=True)

    # Store the combined data locally so later runs can skip the download.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    full_bike_data.to_csv(file_path, index=False)
    bike_data_raw = full_bike_data.copy()

# Per the original note, iso_timestamp is local time and accounts for daylight saving time.
# utc=True normalises the parsed values to UTC; unparseable entries become NaT.
bike_data_raw['iso_timestamp'] = pd.to_datetime(bike_data_raw['iso_timestamp'], utc = True, errors='coerce')
bike_data_raw.head()

In [None]:
# Print the column overview (dtypes, non-null counts, memory usage) of the raw bike data
bike_data_raw.info()

In [None]:
# Overview of the counters in one city
city = "Stadt Heidelberg"
data_city = bike_data_raw.loc[bike_data_raw['domain_name'] == city]

# Columns that together identify a counter
counter_keys = ['counter_site', 'counter_site_id', 'counter_serial']

# Distinct counter identifiers, in order of first appearance
counters = data_city[counter_keys].drop_duplicates().reset_index(drop=True)

# First and last recorded timestamp per counter; dropna=False keeps
# identifier combinations that contain missing values as their own group
tracking = (
    data_city
    .groupby(counter_keys, dropna=False)['iso_timestamp']
    .agg(first_timestamp='min', last_timestamp='max')
    .reset_index()
)
counters_with_tracking = pd.merge(counters, tracking, on=counter_keys)

counters_with_tracking

Weather data

In [None]:
# Load the per-city weather data and add a timezone-aware 'timestamp' column
# parsed from 'date' (noted as UTC in the original); unparseable values become NaT
weather_data = pd.read_csv("../data/weather_per_city.csv")
weather_data = weather_data.assign(
    timestamp=pd.to_datetime(weather_data['date'], utc=True, errors='coerce')
)
weather_data.head()

In [None]:
# Print the column overview (dtypes, non-null counts, memory usage) of the weather data
weather_data.info()

### Preprocess Data

##### Clean Bike Data

In [None]:
## Clean data

# Columns that are needed downstream
relevant_columns = ['domain_name', 'counter_site', 'longitude', 'latitude',
                    'iso_timestamp', 'channels_all']

# Only keep the following cities: Freiburg, Tübingen, Stuttgart, Ludwigsburg, Mannheim, Heidelberg, Reutlingen
cities_to_keep = ["Stadt Freiburg", "Stadt Tübingen", "Landeshauptstadt Stuttgart",
                  "Stadt Ludwigsburg", "Stadt Mannheim", "Stadt Heidelberg", "Stadt Reutlingen"]

# Restrict the raw data to the selected cities and the relevant columns
city_mask = bike_data_raw['domain_name'].isin(cities_to_keep)
data_cleaned = bike_data_raw.loc[city_mask, relevant_columns].copy()

# Parse 'iso_timestamp' as UTC (unparseable values become NaT) and express it
# in Europe/Berlin local time, so daylight saving time is reflected
local_timestamp = pd.to_datetime(
    data_cleaned['iso_timestamp'], utc=True, errors='coerce'
).dt.tz_convert('Europe/Berlin')

# Replace 'iso_timestamp' with the new 'timestamp' column and
# rename columns for better clarity
data_cleaned = (
    data_cleaned
    .assign(timestamp=local_timestamp)
    .drop(columns=['iso_timestamp'])
    .rename(columns={'domain_name': 'city', 'channels_all': 'count'})
)

# Save cleaned data
data_cleaned.to_csv("../data/full_bike_data_cleaned.csv", index=False)

bike_data = data_cleaned.copy()
bike_data.head()

In [None]:
# Print the column overview (dtypes, non-null counts, memory usage) of the cleaned bike data
bike_data.info()

Sanity check: Do channels_in, channels_out and channels_unknown sum up to channels_all? # TODO: Add the other sanity checks we did, e.g. checking that the number of counter names equals the number of counter IDs.

In [None]:
# Sanity check on the raw data: the three directional channel columns should
# add up to the total column.
# The raw frame still uses the original column name 'channels_all'; the name
# 'count' only exists in the cleaned data after the rename.
sum_cols = ['channels_in', 'channels_out', 'channels_unknown']
total_col = 'channels_all'

data_checksum = bike_data_raw.copy()

# Replace 'na' entries in the three channel columns with 0 so that the checksum can be computed
for col in sum_cols:
    data_checksum[col] = np.where(data_checksum[col].eq('na'), 0, data_checksum[col])

# Convert the four columns to integer
data_checksum[sum_cols + [total_col]] = data_checksum[sum_cols + [total_col]].astype(int)

# Check whether the sum of the three channel columns equals the total column
data_checksum['checksum_correct'] = data_checksum[sum_cols].sum(axis=1).eq(data_checksum[total_col])
print(data_checksum['checksum_correct'].value_counts())

##### Missing Data

In [14]:
# Reload the cleaned bike data from the CSV written by the cleaning step.
# No date parsing is requested here; 'timestamp' is parsed again where it is needed.
bike_data = pd.read_csv("../data/full_bike_data_cleaned.csv", low_memory=False)

In [None]:
# Count how many entries of the 'count' column are missing
nan_counts = bike_data['count'].isnull().sum()
print(f'Found {nan_counts} NaN values in count column.')

In [None]:
# Repress warnings for cleaner output
# NOTE: this filter is process-wide and also hides warnings in all later cells.
import warnings
warnings.filterwarnings("ignore")

LONG_ZERO_LIMIT = 168      # hours (1 week)
INTERPOLATION_LIMIT = 3    # hours
MIN_TS = 2 * 365 * 24      # minimum number of hourly observations required (2 years)

all_counters_processed = []
summary_list = []

# Every counter is processed on its own: remove long zero runs, put the data on
# a gap-free hourly grid, interpolate short gaps and split at long gaps.
for city in bike_data['city'].unique():
    print(f"\nProcessing city: {city}\n{'='*80}")
    df_city = bike_data[bike_data['city'] == city]

    for counter in df_city['counter_site'].unique():
        print(f"Processing counter: {counter}")

        df_counter = df_city[
            df_city['counter_site'] == counter
        ].copy()

        # The CSV round trip leaves timestamps and counts as plain values; parse them again.
        df_counter['timestamp'] = pd.to_datetime(
            df_counter['timestamp'], utc=True
        )
        df_counter['count'] = pd.to_numeric(
            df_counter['count'], errors='coerce'
        )

        # -------------------------------------------------
        # Remove long zero-count intervals
        # -------------------------------------------------
        df_counter, removed_count = remove_long_zero_intervals(
            df_counter, LONG_ZERO_LIMIT
        )

        # Rows without a valid timestamp cannot be placed on the hourly grid;
        # several of them would make the reindex below fail on duplicate labels.
        df_counter = df_counter.dropna(subset=['timestamp'])

        # Without any rows there is no time range to build a grid from
        # (pd.date_range cannot be built from NaT bounds).
        if df_counter.empty:
            print(f"Skipping counter {counter}: no rows left after cleaning")
            continue

        # -------------------------------------------------
        # Reindex to hourly frequency
        # -------------------------------------------------
        df_counter = df_counter.set_index('timestamp').sort_index()

        # Lower-case 'h' is the current hourly alias; upper-case 'H' is
        # deprecated since pandas 2.2.
        full_index = pd.date_range(
            df_counter.index.min(),
            df_counter.index.max(),
            freq='h',
            tz='UTC'
        )

        # Hours that were not recorded become rows filled with NaN.
        df_counter = df_counter.reindex(full_index)
        df_counter.index.name = 'timestamp'

        # -------------------------------------------------
        # Detect missing intervals
        # -------------------------------------------------
        missing_intervals = detect_intervals_with_missing_data(
            df_counter,
            column='count',
            mode='missing'
        )

        total_missing = missing_intervals['n_points'].sum()

        # -------------------------------------------------
        # Interpolate short gaps
        # -------------------------------------------------
        df_counter, interpolated_count = interpolate_short_gaps(
            df_counter,
            missing_intervals,
            INTERPOLATION_LIMIT
        )

        # -------------------------------------------------
        # Fill metadata
        # -------------------------------------------------
        # Rows inserted by the reindex have no metadata; the values are constant
        # per counter, so they are propagated from the neighbouring rows.
        meta_cols = ['city', 'counter_site', 'longitude', 'latitude']
        df_counter[meta_cols] = df_counter[meta_cols].ffill().bfill()

        # -------------------------------------------------
        # Split long gaps
        # -------------------------------------------------
        long_gaps = missing_intervals[
            missing_intervals['n_points'] > INTERPOLATION_LIMIT
        ]

        df_counter = split_long_gaps(df_counter, long_gaps)

        # -------------------------------------------------
        # Final validation + collection
        # -------------------------------------------------
        # After the split there can be several groups per counter (the summary
        # output shows names such as "..._2", "..._3"). The statistics below were
        # computed for the whole counter and are repeated for each of its groups.
        for site, g in df_counter.groupby('counter_site'):
            if g['count'].isna().any():
                raise ValueError(
                    f"NaNs remain after processing counter {site}"
                )

            all_counters_processed.append(g)
            summary_list.append({
                'city': city,
                'counter_site': site,
                'total_missing': total_missing,
                'interpolated': interpolated_count,
                'removed zeros': removed_count
            })


# =====================================================
# Final outputs
# =====================================================

bike_data_final = (
    pd.concat(all_counters_processed)
      .sort_index()
      .reset_index()
      .rename(columns={'index': 'timestamp'})
)

summary_df = pd.DataFrame(summary_list)

# =====================================================
# Remove counters with less than 2 years of data
# =====================================================
# The threshold is applied per 'counter_site' value, i.e. per group created above.
# NOTE(review): grouping uses 'counter_site' only; confirm that site names are
# unique across cities.
counter_data_counts = bike_data_final.groupby('counter_site')['count'].count()
counters_to_keep = counter_data_counts[counter_data_counts >= MIN_TS].index
bike_data_final = bike_data_final[bike_data_final['counter_site'].isin(counters_to_keep)]

summary_df = summary_df[summary_df['counter_site'].isin(counters_to_keep)]

print("\nPreprocessing complete.")
print(f"Final dataset shape: {bike_data_final.shape}")
print(f"\nCounters remaining: {bike_data_final['counter_site'].nunique()}")
print("\nSummary:")
summary_df



Processing city: Stadt Freiburg
Processing counter: Wiwilibrücke
Processing counter: FR2 Güterbahn Süd / Ferd.-Weiß-Str.
Processing counter: FR1 Dreisam / Otto-Wels-Str.

Processing city: Landeshauptstadt Stuttgart
Processing counter: König-Karls-Brücke Barometer
Processing counter: Böblinger Straße
Processing counter: Taubenheimstraße
Processing counter: Waiblinger Straße
Processing counter: Samaraweg
Processing counter: Waldburgstraße
Processing counter: Lautenschlager Straße
Processing counter: Tübinger Straße
Processing counter: Inselstraße
Processing counter: Kremmlerstraße
Processing counter: Kirchheimer Straße
Processing counter: Stuttgarter Straße
Processing counter: Solitudestraße
Processing counter: Am Kräherwald
Processing counter: Neckartalstraße

Processing city: Stadt Tübingen
Processing counter: Unterführung Steinlach/Karlstraße Südseite - Steinlachallee
Processing counter: Fuß- & Radtunnel Südportal - Derendinger Allee
Processing counter: Neckartalradweg Hirschau - par

Unnamed: 0,city,counter_site,total_missing,interpolated,removed zeros
0,Stadt Freiburg,Wiwilibrücke,13,13,0
1,Stadt Freiburg,FR2 Güterbahn Süd / Ferd.-Weiß-Str.,12,12,0
2,Stadt Freiburg,FR1 Dreisam / Otto-Wels-Str.,6306,12,0
3,Stadt Freiburg,FR1 Dreisam / Otto-Wels-Str._2,6306,12,0
4,Stadt Freiburg,FR1 Dreisam / Otto-Wels-Str._3,6306,12,0
...,...,...,...,...,...
324,Stadt Reutlingen,Metzgerstraße,5,5,0
325,Stadt Reutlingen,Bellinostraße,5,5,0
326,Stadt Reutlingen,Hindenburgstraße,5,5,0
327,Stadt Reutlingen,Moltkestraße,5,5,0
