In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.cm as cm
import matplotlib.dates as mdates
from scipy.signal import find_peaks
import openmeteo_requests
import requests_cache
from datetime import date, timedelta
import seaborn as sns
import numpy as np
import os
import sys
sys.path.append("..")

### Read in Data

Bike data

In [62]:
## Load the combined bike-counter data from the local CSV cache if it exists;
## otherwise download all available monthly files, combine them and write the cache.

file_path = "../data/full_bike_data.csv"

if os.path.exists(file_path):
    bike_data_raw = pd.read_csv(file_path, low_memory=False)

else:
    # The monthly files follow the schema
    # https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz
    url_template = "https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz"

    # Collect the URLs that actually exist on the server. A single session is
    # reused for all HEAD requests instead of creating a new one per URL.
    url_list = []
    with requests_cache.CachedSession() as session:
        for year in range(2013, 2026):
            for month in range(1, 13):
                url = url_template.format(yyyymm=f"{year}{month:02d}")
                if session.head(url).status_code == 200:
                    url_list.append(url)

    if not url_list:
        raise RuntimeError("No bike counter files were found at the expected URLs.")

    # Download every monthly file once. The first file defines the reference
    # column layout that all other files must match.
    monthly_frames = []
    general_columns = None
    for url in url_list:
        csv_data = pd.read_csv(url, low_memory=False)
        if general_columns is None:
            general_columns = csv_data.columns.tolist()
        assert list(csv_data.columns) == general_columns, f"Spalten stimmen nicht überein in {url}"
        monthly_frames.append(csv_data)

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously collected rows on every iteration.
    full_bike_data = pd.concat(monthly_frames, ignore_index=True)

    # Store the combined data locally so later runs can skip the download
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    full_bike_data.to_csv(file_path, index=False)
    bike_data_raw = full_bike_data.copy()

# Parse 'iso_timestamp' into timezone-aware UTC datetimes; unparseable entries become NaT.
# NOTE(review): the original comment stated that iso_timestamp is local time including
# daylight saving time - the values are normalised to UTC here.
bike_data_raw['iso_timestamp'] = pd.to_datetime(bike_data_raw['iso_timestamp'], utc=True, errors='coerce')
bike_data_raw.head()

Unnamed: 0,operator_name,domain_name,domain_id,counter_site,counter_site_id,counter_serial,longitude,latitude,timezone,iso_timestamp,channels_in,channels_out,channels_unknown,channels_all,site_temperature,site_rain_accumulation,site_snow_accumulation
0,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2012-12-31 23:00:00+00:00,9,10,na,19,5.0,0.0,na
1,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 00:00:00+00:00,15,18,na,33,5.0,0.0,na
2,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 01:00:00+00:00,17,14,na,31,5.0,0.0,na
3,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 02:00:00+00:00,14,26,na,40,5.0,0.0,na
4,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 03:00:00+00:00,13,17,na,30,5.0,0.0,na


In [None]:
# Print a concise summary (columns and dtypes) of the raw bike data
bike_data_raw.info()

In [None]:
# Overview of the counters in one city, including the first and last
# timestamp recorded for each counter
city = "Stadt Heidelberg"
data_city = bike_data_raw.loc[bike_data_raw['domain_name'].eq(city)]

counter_key_cols = ['counter_site', 'counter_site_id', 'counter_serial']

# Distinct counter identifiers in order of first appearance
counters = data_city[counter_key_cols].drop_duplicates().reset_index(drop=True)

# First / last timestamp per counter (dropna=False keeps groups with missing keys)
tracking = (
    data_city
    .groupby(counter_key_cols, dropna=False)['iso_timestamp']
    .agg(first_timestamp='min', last_timestamp='max')
    .reset_index()
)
counters_with_tracking = counters.merge(tracking, on=counter_key_cols)

counters_with_tracking

Weather data

In [None]:
# Load the per-city weather data and add a timezone-aware UTC 'timestamp'
# column parsed from 'date' (unparseable entries become NaT)
weather_data = pd.read_csv("../data/weather_per_city.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame['date'], utc=True, errors='coerce')
)
weather_data.head()

In [None]:
# Print a concise summary (columns and dtypes) of the weather data
weather_data.info()

### Preprocess Data

##### Clean Bike Data

In [140]:
## Clean data

# Columns that are needed for the analysis
relevant_columns = ['domain_name', 'counter_site', 'longitude', 'latitude',
                    'iso_timestamp', 'channels_all']

# Only keep the following cities: Freiburg, Tübingen, Stuttgart, Ludwigsburg, Mannheim, Heidelberg, Reutlingen
cities_to_keep = ["Stadt Freiburg", "Stadt Tübingen", "Landeshauptstadt Stuttgart",
                  "Stadt Ludwigsburg", "Stadt Mannheim", "Stadt Heidelberg", "Stadt Reutlingen"]

data_cleaned = (
    bike_data_raw
    # restrict to the selected cities and the relevant columns
    .loc[bike_data_raw['domain_name'].isin(cities_to_keep), relevant_columns]
    # replace 'iso_timestamp' by a timezone-aware UTC 'timestamp' column
    # (unparseable entries become NaT)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['iso_timestamp'], utc=True, errors='coerce'))
    .drop(columns=['iso_timestamp'])
    # rename columns for better clarity
    .rename(columns={'domain_name': 'city', 'channels_all': 'count'})
)

# Save cleaned data
data_cleaned.to_csv("../data/full_bike_data_cleaned.csv", index=False)

bike_data = data_cleaned.copy()
bike_data.head()

Unnamed: 0,city,counter_site,longitude,latitude,count,timestamp
744,Stadt Freiburg,Wiwilibrücke,7.840753,47.995213,51,2012-12-31 23:00:00+00:00
745,Stadt Freiburg,Wiwilibrücke,7.840753,47.995213,117,2013-01-01 00:00:00+00:00
746,Stadt Freiburg,Wiwilibrücke,7.840753,47.995213,131,2013-01-01 01:00:00+00:00
747,Stadt Freiburg,Wiwilibrücke,7.840753,47.995213,145,2013-01-01 02:00:00+00:00
748,Stadt Freiburg,Wiwilibrücke,7.840753,47.995213,76,2013-01-01 03:00:00+00:00


In [141]:
# Print a concise summary (columns, dtypes, memory usage) of the cleaned bike data
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4164946 entries, 744 to 6115072
Data columns (total 6 columns):
 #   Column        Dtype              
---  ------        -----              
 0   city          object             
 1   counter_site  object             
 2   longitude     float64            
 3   latitude      float64            
 4   count         int64              
 5   timestamp     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(2), int64(1), object(2)
memory usage: 222.4+ MB


Sanity check: Do channels_in, channels_out and channels_unknown sum up to channels_all? TODO: Add the other sanity checks we did, e.g. checking that the number of counter names equals the number of counter IDs.

In [None]:
# Sanity check on the raw data: for every row the three channel columns
# (channels_in, channels_out, channels_unknown) should add up to channels_all.
data_checksum = bike_data_raw.copy()
sum_cols = ['channels_in', 'channels_out', 'channels_unknown']
# The raw frame has no 'count' column (that name only exists after the rename
# in the cleaned data), so the total is read from 'channels_all'.
total_col = 'channels_all'

# Replace 'na' entries in the three channel columns by 0 so the row sum can be computed
for col in sum_cols:
    data_checksum[col] = np.where(data_checksum[col].eq('na'), 0, data_checksum[col])

# Convert the four columns to integers
data_checksum = data_checksum.astype({col: int for col in sum_cols + [total_col]})

# Check whether the sum of the three channel columns equals the total column
data_checksum['checksum_correct'] = data_checksum[sum_cols].sum(axis=1).eq(data_checksum[total_col])
print(data_checksum['checksum_correct'].value_counts())

##### Missing Data

In [None]:
# Reload the cleaned bike data from the CSV file written by the cleaning step.
# 'timestamp' is not parsed here; the preprocessing loop further down converts it
# with pd.to_datetime for each counter.
bike_data = pd.read_csv("../data/full_bike_data_cleaned.csv", low_memory=False)

In [81]:
# Count the rows whose 'count' value is missing and report the total
count_is_missing = bike_data['count'].isna()
nan_counts = count_is_missing.sum()
print(f'Found {nan_counts} NaN values in count column.')

Found 0 NaN values in count column.


In [None]:
# Select the rows that belong to one specific city
city = "Stadt Reutlingen"
is_selected_city = bike_data['city'].eq(city)
bike_data_city = bike_data.loc[is_selected_city]
bike_data_city.head()

Unnamed: 0,city,counter_site,longitude,latitude,count,timestamp
2014093,Stadt Reutlingen,Tübinger Tor,9.209075,48.49085,0,2021-05-11 00:00:00+00:00
2014094,Stadt Reutlingen,Tübinger Tor,9.209075,48.49085,0,2021-05-11 01:00:00+00:00
2014095,Stadt Reutlingen,Tübinger Tor,9.209075,48.49085,0,2021-05-11 02:00:00+00:00
2014096,Stadt Reutlingen,Tübinger Tor,9.209075,48.49085,0,2021-05-11 03:00:00+00:00
2014097,Stadt Reutlingen,Tübinger Tor,9.209075,48.49085,0,2021-05-11 04:00:00+00:00


In [None]:
# Preprocess the hourly counts of every counter in every city:
#   1. remove runs of consecutive zero counts that are longer than one week,
#   2. reindex to a complete hourly timeline and linearly interpolate gaps of
#      up to `threshold` missing hours,
#   3. split the series at longer gaps into separately named counters,
#   4. combine all counters again and store the result as CSV.

# Maximum number of consecutive missing hours that is filled by interpolation
threshold = 3
# Runs of consecutive zero counts with more rows than this are removed (7 days = 168 hours)
max_zero_run_hours = 168
one_hour = pd.Timedelta(hours=1)
# Columns with constant metadata per counter
meta_cols = ['city', 'counter_site', 'longitude', 'latitude']
all_cities_counters_processed = []

# Get unique cities in the data
unique_cities = bike_data['city'].unique()
for city in unique_cities:
    bike_data_city = bike_data[bike_data['city'] == city]
    print(f"\nProcessing city: {city}\n{'='*80}")

    # Get unique counter sites in this city
    unique_counters = bike_data_city['counter_site'].unique()
    print(f'{len(unique_counters)} unique counter sites in {city}:')
    for counter in unique_counters:
        print(counter)

    # Process every counter site separately
    for counter in unique_counters:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of `bike_data` (SettingWithCopyWarning).
        bike_data_counter = bike_data_city[bike_data_city['counter_site'] == counter].copy()
        bike_data_counter['timestamp'] = pd.to_datetime(
            bike_data_counter['timestamp'],
            utc=True,
            errors='coerce'
        )
        bike_data_counter['count'] = pd.to_numeric(bike_data_counter['count'], errors='coerce')
        # The run detection below compares each row with its predecessor, so the
        # rows have to be in chronological order.
        bike_data_counter = bike_data_counter.sort_values('timestamp', kind='stable')
        print(f"\nProcessing counter: {counter}\n")

        ##############################################################################
        # Exclude intervals with more than 1 week of missing data (7 days = 168 hours)
        ##############################################################################

        print("\n##############################################################################")
        print("Check for intervals with more than 1 week of consecutive 0 counts...")
        print("##############################################################################\n")
        # Rows where 'count' is 0
        zero_counts = bike_data_counter['count'] == 0

        # Label runs of consecutive equal values; every change starts a new group
        grp = (zero_counts != zero_counts.shift()).cumsum()
        missing_data = bike_data_counter[zero_counts].assign(group=grp)
        intervals = (
            missing_data
            .groupby(['group'])
            .agg(
                start=('timestamp', 'min'),
                end=('timestamp', 'max'),
                n_points=('timestamp', 'count')
            )
            .reset_index(drop=True)
        )

        print(f'Found {len(intervals)} interval(s) with consecutive timestamps with 0 counts:')

        # Identify intervals longer than one week
        intervals_to_exclude = intervals[intervals['n_points'] > max_zero_run_hours]

        # Mark all timestamps that fall into an interval to exclude
        exclude_mask = pd.Series(False, index=bike_data_counter.index)
        for _, row in intervals_to_exclude.iterrows():
            exclude_mask |= (
                (bike_data_counter['timestamp'] >= row['start']) &
                (bike_data_counter['timestamp'] <= row['end'])
            )

        # Exclude these periods from the data
        bike_data_counter = bike_data_counter.loc[~exclude_mask].copy()
        print(f"Removed {exclude_mask.sum()} row(s) due 0 count intervals longer than {max_zero_run_hours} hours.\n")

        # Without any valid timestamp no hourly timeline can be built
        if not bike_data_counter['timestamp'].notna().any():
            print(f"No data left for counter {counter} after removing long 0 count intervals; skipping.\n")
            continue

        ######################################################################################
        # Interpolate missing 'count' values linearly for intervals with up to 3 missing hours
        #######################################################################################

        ## Check for missing timestamps in the data
        print("\n##############################################################################")
        print("Check for missing timestamps in the data...")
        print("##############################################################################\n")
        # A Timedelta is used as frequency because the 'H' alias is deprecated in recent pandas
        all_timestamps = pd.date_range(
            start=bike_data_counter['timestamp'].min(),
            end=bike_data_counter['timestamp'].max(),
            freq=one_hour,
            tz='UTC'
        )

        # Reindex to the complete hourly timeline; hours without data become NaN rows
        bike_data_counter = (
            bike_data_counter
            .set_index('timestamp')
            .sort_index()
            .reindex(all_timestamps)
        )
        bike_data_counter.index.name = 'timestamp'

        is_missing = bike_data_counter['count'].isna()

        # Label runs of consecutive present / missing hours
        grp = (is_missing != is_missing.shift()).cumsum()

        # One row per run of missing hours: first / last timestamp and length
        missing_intervals = (
            pd.DataFrame({
                'timestamp': bike_data_counter.index[is_missing],
                'group': grp[is_missing].to_numpy()
            })
            .groupby('group')['timestamp']
            .agg(start='min', end='max', n_points='size')
            .reset_index(drop=True)
        )

        print(f'Found {len(missing_intervals)} intervals with missing timestamp(s):')

        # Interpolate only intervals with up to `threshold` missing points
        intervals_to_interpolate = missing_intervals[missing_intervals['n_points'] <= threshold]
        print("\n##############################################################################")
        print(f'Interpolating {len(intervals_to_interpolate)} interval(s) with up to {threshold} missing hours of data.')
        print("##############################################################################\n")

        # Length of the run every row belongs to; short missing runs get interpolated
        gap_length = is_missing.groupby(grp).transform('size')
        interpolate_mask = is_missing & (gap_length <= threshold)

        # Interpolate the whole series, but only write the values of the short gaps back
        interpolated_counts = bike_data_counter['count'].interpolate(method='time')
        bike_data_counter.loc[interpolate_mask, 'count'] = interpolated_counts[interpolate_mask]

        # Forward/backward fill metadata only
        bike_data_counter[meta_cols] = (
            bike_data_counter[meta_cols]
            .ffill()
            .bfill()
        )

        ####################################################
        # Split data for longer gaps into multiple counters
        ####################################################
        # Without long gaps the counter is kept as a single series. Assigning this
        # default is required: otherwise the result of the previous counter would be
        # reused (or the name would be undefined for the very first counter).
        bike_data_counter_split = bike_data_counter

        long_gaps = missing_intervals[missing_intervals['n_points'] > threshold]
        if not long_gaps.empty:
            print("\n##############################################################################")
            print(f"Splitting data into {len(long_gaps) + 1} counters, due to {len(long_gaps)} intervals with >{threshold} missing hours...")
            print("##############################################################################\n")
            counter_base = bike_data_counter['counter_site'].iloc[0]

            segments = []
            segment_start = bike_data_counter.index.min()

            for _, gap in long_gaps.iterrows():
                # Data up to the hour before the gap starts
                segment = bike_data_counter.loc[segment_start:gap['start'] - one_hour].copy()
                if not segment.empty:
                    segments.append(segment)

                # The next segment starts one hour after the gap ends
                segment_start = gap['end'] + one_hour

            # Data after the last gap
            segment = bike_data_counter.loc[segment_start:].copy()
            if not segment.empty:
                segments.append(segment)

            # The first segment keeps the original counter site name,
            # subsequent segments get a numeric suffix (_2, _3, ...)
            for i, seg in enumerate(segments):
                seg['counter_site'] = counter_base if i == 0 else f"{counter_base}_{i+1}"

            # Combine all segments back into a single DataFrame
            bike_data_counter_split = pd.concat(segments).sort_index()

        ######################
        # Final check for NaNs
        #######################
        print("\n\nFinal check for NaNs in 'count' column:")
        nan_counts = bike_data_counter_split['count'].isna().sum()
        if nan_counts > 0:
            raise ValueError(f'Found {nan_counts} NaN values in count column after processing counter {counter}!')
        else:
            print(f'No NaN values found in count column after processing counter {counter}.')

        all_cities_counters_processed.append(bike_data_counter_split)


###################################################
# Combine all counters back into a single DataFrame
###################################################
# rename_axis makes sure the index becomes a column called 'timestamp'
bike_data_final = (
    pd.concat(all_cities_counters_processed)
    .sort_index()
    .rename_axis('timestamp')
    .reset_index()
)
print("\nPreprocessing complete. Final DataFrame ready.")

# Save final processed data
bike_data_final.to_csv("../data/bike_data_processed.csv", index=False)


Processing city: Stadt Freiburg
3 unique counter sites in Stadt Freiburg:
Wiwilibrücke
FR2 Güterbahn Süd / Ferd.-Weiß-Str.
FR1 Dreisam / Otto-Wels-Str.

Processing counter: Wiwilibrücke


##############################################################################
Check for intervals with more than 1 week of consecutive 0 counts...
##############################################################################

Found 112 interval(s) with consecutive timestamps with 0 counts:
Removed 0 row(s) due 0 count intervals longer than 168 hours.


##############################################################################
Check for missing timestamps in the data...
##############################################################################

Found 13 intervals with missing timestamp(s):

##############################################################################
Interpolating 13 interval(s) with up to 3 missing hours of data.
############################################################