In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.cm as cm
import matplotlib.dates as mdates
from scipy.signal import find_peaks
import openmeteo_requests
import requests_cache
from datetime import date, timedelta
import seaborn as sns
import numpy as np
import os
import sys
sys.path.append("..")

### Fahrraddaten

#### Read in Data

In [2]:
file_path = "../data/full_data.csv"

if os.path.exists(file_path):
    #print(f"Datei existiert: {file_path}")
    data = pd.read_csv("../data/full_data.csv", low_memory=False)
    
else:
    #print(f"Datei fehlt: {file_path}")
    url_list = []

    # URLs zu Daten generieren nach folgendem Schema:
    # https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz 

    for year in range(2013, 2026):
        for month in range(1, 13):
            yyyymm = f"{year}{month:02d}"
            url = f"https://mobidata-bw.de/daten/eco-counter/v2/fahrradzaehler_stundenwerten_{yyyymm}.csv.gz"
            
            # Überprüfen, ob die URL existiert
            response = requests_cache.CachedSession().head(url)
            if response.status_code == 200:    
                # in url_list hinzufügen
                url_list.append(url)

    general_columns = pd.read_csv(url_list[1]).columns.tolist()

    # Erstelle CSV-Datei, in der Daten aller URLs gespeichert werden
    full_data = pd.DataFrame()
    for url in url_list:
        csv_data = pd.read_csv(url, low_memory=False)
        assert list(csv_data.columns) == general_columns, f"Spalten stimmen nicht überein in {url}"
        full_data = pd.concat([full_data, csv_data], ignore_index=True)

    # Speichere full_data lokal als CSV-Datei
    full_data.to_csv("../data/full_data.csv", index=False)   
    data = full_data.copy()

data['iso_timestamp'] = pd.to_datetime(data['iso_timestamp'], utc = True, errors='coerce') # Isotimestamp ist lokale Zeit und berücksichtigt Sommerzeit
data.head()

Unnamed: 0,operator_name,domain_name,domain_id,counter_site,counter_site_id,counter_serial,longitude,latitude,timezone,iso_timestamp,channels_in,channels_out,channels_unknown,channels_all,site_temperature,site_rain_accumulation,site_snow_accumulation
0,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2012-12-31 23:00:00+00:00,9,10,na,19,5.0,0.0,na
1,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 00:00:00+00:00,15,18,na,33,5.0,0.0,na
2,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 01:00:00+00:00,17,14,na,31,5.0,0.0,na
3,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 02:00:00+00:00,14,26,na,40,5.0,0.0,na
4,Eco Counter GmbH,Stadt Karlsruhe,752,Erbprinzenstraße,100004165,Y2H16070301,8.402715,49.007286,(UTC+01:00) Europe/Paris DST,2013-01-01 03:00:00+00:00,13,17,na,30,5.0,0.0,na


In [36]:
# Read in csv file with full data
bike_data = pd.read_csv("../data/full_data.csv", low_memory=False)

In [None]:
# Übersicht über die Counters in einer Stadt
city = "Stadt Heidelberg"
data_city = bike_data[bike_data['domain_name'] == city]

counters = data_city[['counter_site', 'counter_site_id', 'counter_serial']].drop_duplicates().reset_index(drop=True)

tracking = data_city.groupby(['counter_site', 'counter_site_id', 'counter_serial'], dropna = False)['iso_timestamp'] \
    .agg(first_timestamp='min', last_timestamp='max') \
    .reset_index()
counters_with_tracking = counters.merge(tracking, on=['counter_site', 'counter_site_id', 'counter_serial'])

counters_with_tracking

Unnamed: 0,counter_site,counter_site_id,counter_serial,first_timestamp,last_timestamp
0,Ernst-Walz-Brücke Querschnitt,100048811,Y2H17082923,2014-05-31 22:00:00+00:00,2025-12-14 22:00:00+00:00
1,Ernst-Walz-Brücke West - alt,100050030,,2014-05-31 22:00:00+00:00,2018-10-10 21:00:00+00:00
2,Plöck,100012161,Y2H22032496,2015-10-31 23:00:00+00:00,2025-12-14 22:00:00+00:00
3,Gaisbergstraße,100012608,YTH15078127,2015-10-31 23:00:00+00:00,2024-12-26 22:00:00+00:00
4,Mannheimer Straße,100013034,Y2G14014962,2015-10-31 23:00:00+00:00,2025-12-14 22:00:00+00:00
5,Thedor-Heuss-Brücke Querschnitt,100048812,Y2H22011700,2018-07-06 04:00:00+00:00,2025-12-14 22:00:00+00:00
6,Rohrbacher Straße Querschnitt,100048813,Y2H17082922,2018-10-10 22:00:00+00:00,2025-12-14 22:00:00+00:00
7,Liebermannstraße,100048814,Y2H17082934,2018-10-10 22:00:00+00:00,2025-12-14 22:00:00+00:00
8,Schlierbacher Landstraße,100049883,Y2H18106691,2019-01-31 23:00:00+00:00,2025-12-14 22:00:00+00:00
9,Ziegelhäuser Landstraße,100049901,Y2H18106689,2019-01-31 23:00:00+00:00,2025-11-21 03:00:00+00:00


#### Preprocess Data

##### Clean Data

In [37]:
## Clean data
data_cleaned = bike_data.copy()
data_cleaned = data_cleaned[['domain_name', 'counter_site', 'counter_site_id', 'longitude', 'latitude',
       'iso_timestamp', 'channels_in', 'channels_out', 'channels_unknown', 'channels_all', 'site_temperature',
       'site_rain_accumulation', 'site_snow_accumulation']]

# 1. Time
# 'Isotimestamp' is local time and considers 'Sommerzeit'. Therefore, we use this for better accuracy in time representation.
# Exchange 'timestamp' with 'iso_timestamp' and convert to datetime with UTC timezone.
# Drop 'timezone' as this is identical for all entries.
data_cleaned['timestamp'] = pd.to_datetime(data_cleaned['iso_timestamp'], utc = True, errors='coerce') 
data_cleaned = data_cleaned.drop(columns=['iso_timestamp'])

# 2. City
# Drop 'operator_name' as this is not of interest.
# Drop 'domain_id' as this is not informative.
# Rename 'domain_name' to 'city' for clarity.
data_cleaned = data_cleaned.rename(columns={'domain_name': 'city'})

# 3. Counter
# Drop 'counter_serial' as this is not informative and has many missing values.
# Rename 'counter_site' to 'counter_site_name' for clarity.
# Note: For further analysis, use 'counter_site_id' to uniquely identify counter sites.
data_cleaned = data_cleaned.rename(columns={'counter_site': 'counter_site_name'})


# 5. Count
# Rename 'channels_all' to 'count' for clarity.
data_cleaned = data_cleaned.rename(columns={'channels_all': 'count'})

# 6. Weather
# Drop wheather columns that are not of interest (we us another weather dataset later).
data_cleaned = data_cleaned.drop(columns=['site_rain_accumulation', 'site_snow_accumulation', 'site_temperature'])

# Save cleaned data
data_cleaned.to_csv("../data/cleaned_full_data.csv", index=False)

data_cleaned.head()

Unnamed: 0,city,counter_site_name,counter_site_id,longitude,latitude,channels_in,channels_out,channels_unknown,count,timestamp
0,Stadt Karlsruhe,Erbprinzenstraße,100004165,8.402715,49.007286,9,10,na,19,2012-12-31 23:00:00+00:00
1,Stadt Karlsruhe,Erbprinzenstraße,100004165,8.402715,49.007286,15,18,na,33,2013-01-01 00:00:00+00:00
2,Stadt Karlsruhe,Erbprinzenstraße,100004165,8.402715,49.007286,17,14,na,31,2013-01-01 01:00:00+00:00
3,Stadt Karlsruhe,Erbprinzenstraße,100004165,8.402715,49.007286,14,26,na,40,2013-01-01 02:00:00+00:00
4,Stadt Karlsruhe,Erbprinzenstraße,100004165,8.402715,49.007286,13,17,na,30,2013-01-01 03:00:00+00:00


In [None]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6125443 entries, 0 to 6125442
Data columns (total 13 columns):
 #   Column                  Dtype              
---  ------                  -----              
 0   city                    object             
 1   counter_site_name       object             
 2   counter_site_id         int64              
 3   longitude               float64            
 4   latitude                float64            
 5   channels_in             object             
 6   channels_out            object             
 7   channels_unknown        object             
 8   count                   int64              
 9   site_temperature        object             
 10  site_rain_accumulation  object             
 11  site_snow_accumulation  object             
 12  timestamp               datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(2), int64(2), object(8)
memory usage: 607.5+ MB


optional, können wir auch weglassen:

In [None]:
data_checksum = data_cleaned.copy()
# ersetze 'na' Werte in den 3 channel spalten durch 0, um Checksum berechnen zu können, Kommentar - Silja: Ich würde NaNs/Na nicht auf 0 setzen, da 0 auch ein valider Wert sein kann
data_checksum['channels_in'] = np.where(data_checksum['channels_in'].eq('na'), 0, data_checksum['channels_in'])
data_checksum['channels_out'] = np.where(data_checksum['channels_out'].eq('na'), int(0), data_checksum['channels_out'])
data_checksum['channels_unknown'] = np.where(data_checksum['channels_unknown'].eq('na'), int(0), data_checksum['channels_unknown'])

# konvertiere die 4 Spalten in Integer
data_checksum[['channels_in', 'channels_out', 'channels_unknown', 'count']] = data_checksum[['channels_in', 'channels_out', 'channels_unknown', 'count']].astype(int)
sum_cols = ['channels_in', 'channels_out', 'channels_unknown']

# Prüfe, ob die Summe der 3 channel Spalten der count Spalte entspricht
data_checksum['checksum_correct'] = data_checksum[sum_cols].sum(axis=1).eq(data_checksum['count'])
print(data_checksum['checksum_correct'].value_counts())

checksum_correct
True    6080705
Name: count, dtype: int64


##### Missing Data

In [27]:
# Read in csv file with full data
bike_data = pd.read_csv("../data/cleaned_full_data.csv", low_memory=False)

In [28]:
# Get data for Freiburg
city = "Stadt Freiburg"
bike_data_city = bike_data[bike_data['city'] == city]
bike_data_city.head()

Unnamed: 0,city,counter_site_name,counter_site_id,longitude,latitude,channels_in,channels_out,channels_unknown,count,timestamp
744,Stadt Freiburg,Wiwilibrücke,100004595,7.840753,47.995213,na,na,51,51,2012-12-31 23:00:00+00:00
745,Stadt Freiburg,Wiwilibrücke,100004595,7.840753,47.995213,na,na,117,117,2013-01-01 00:00:00+00:00
746,Stadt Freiburg,Wiwilibrücke,100004595,7.840753,47.995213,na,na,131,131,2013-01-01 01:00:00+00:00
747,Stadt Freiburg,Wiwilibrücke,100004595,7.840753,47.995213,na,na,145,145,2013-01-01 02:00:00+00:00
748,Stadt Freiburg,Wiwilibrücke,100004595,7.840753,47.995213,na,na,76,76,2013-01-01 03:00:00+00:00


In [29]:
# Get unique counter sites in Freiburg
unique_counters = bike_data_city['counter_site_name'].unique()
print(f'{len(unique_counters)} unique counter sites in {city}:')
for counter in unique_counters:
    print(counter)

3 unique counter sites in Stadt Freiburg:
Wiwilibrücke
FR2 Güterbahn Süd / Ferd.-Weiß-Str.
FR1 Dreisam / Otto-Wels-Str.


In [44]:
import warnings
warnings.filterwarnings("ignore")

# Get data for a specific counter site
bike_data_counter = bike_data_city[bike_data_city['counter_site_name'] == unique_counters[2]]
bike_data_counter['timestamp'] = pd.to_datetime(
    bike_data_counter['timestamp'],
    utc=True,
    errors='coerce'
)

# Get first and last date of data collection
first_date = bike_data_counter['timestamp'].min()
last_date = bike_data_counter['timestamp'].max()
print(f'Data collection for counter site "{unique_counters[0]}" in {city} ranges from {first_date} to {last_date}.')

# Detect intervalls of missing data
bike_data_counter['count'] = pd.to_numeric(bike_data_counter['count'], errors='coerce')
# Filter rows where 'count' is NaN or 0
bad = bike_data_counter['count'].isna() | (bike_data_counter['count'] == 0)
# Detect continuous intervals of bad data
grp = (bad != bad.shift()).cumsum()
missing_data = bike_data_counter[bad].assign(group=grp)
intervals = (
    missing_data
    .groupby(['group'])
    .agg(
        start=('timestamp', 'min'),
        end=('timestamp', 'max'),
        n_points=('timestamp', 'count')
    )
    .reset_index(drop=True)
)

print(f'Found {len(intervals)} intervals with consecutive missing data points:')
intervals

Data collection for counter site "Wiwilibrücke" in Stadt Freiburg ranges from 2014-07-31 22:00:00+00:00 to 2025-12-31 22:00:00+00:00.
Found 245 intervals with consecutive missing data points:


Unnamed: 0,start,end,n_points
0,2014-10-13 01:00:00+00:00,2014-10-13 01:00:00+00:00,1
1,2014-10-27 02:00:00+00:00,2014-10-27 03:00:00+00:00,2
2,2014-11-17 01:00:00+00:00,2014-11-17 01:00:00+00:00,1
3,2014-12-31 02:00:00+00:00,2014-12-31 02:00:00+00:00,1
4,2015-01-21 02:00:00+00:00,2015-01-21 02:00:00+00:00,1
...,...,...,...
240,2025-10-13 23:00:00+00:00,2025-10-14 02:00:00+00:00,4
241,2025-10-14 22:00:00+00:00,2025-10-15 00:00:00+00:00,3
242,2025-10-15 02:00:00+00:00,2025-10-15 02:00:00+00:00,1
243,2025-10-27 02:00:00+00:00,2025-10-27 02:00:00+00:00,1


In [None]:
# List of intervals with missing data
for index, row in intervals.iterrows():
    print(f"From {row['start']} to {row['end']} ({row['n_points']} missing points)")

In [6]:
distribution = (
    intervals['n_points']
    .value_counts()
    .sort_index()
)
print(distribution)

n_points
1      404
2       55
3       23
4       19
5       13
6        7
7        4
9        1
11       2
12       1
102      1
116      1
Name: count, dtype: int64


In [None]:
bike_data_counter['timestamp'] = pd.to_datetime(bike_data_counter['timestamp'], utc=True)
intervals['start'] = pd.to_datetime(intervals['start'], utc=True)
intervals['end'] = pd.to_datetime(intervals['end'], utc=True)

intervals['start_hour'] = intervals['start'].dt.hour
intervals['end_hour'] = intervals['end'].dt.hour


def is_night_hour(h):
    return (h >= 22) | (h <= 5) # go down to 20 pm?

intervals['is_night'] = (
    is_night_hour(intervals['start_hour']) &
    is_night_hour(intervals['end_hour'])
)

intervals['interval_valid'] = (
    (intervals['n_points'] <= 2) |   # short gaps of 2 hour are valid    
    (intervals['is_night']) # gaps during night are valid
)

bike_data_counter['flag'] = 'missing'

for _, row in intervals.iterrows():
    mask = (
        (bike_data_counter['timestamp'] >= row['start']) &
        (bike_data_counter['timestamp'] <= row['end'])
    )
    if row['interval_valid']:
        bike_data_counter.loc[mask, 'flag'] = 'valid'

bike_data_counter.loc[bike_data_counter['count'] > 0, 'flag'] = 'valid' # mark all rows with count > 0 as valid

In [46]:
missing = bike_data_counter['flag'] == 'missing'
grp = (missing != missing.shift()).cumsum()
missing_data = bike_data_counter[missing].assign(group=grp)
missing_intervals = (
    missing_data
    .groupby('group')
    .agg(
        start=('timestamp', 'min'),
        end=('timestamp', 'max'),
        n_points=('timestamp', 'count')
    )
    .reset_index(drop=True)
)
print(f'Found {len(missing_intervals)} intervals with missing data points:')
missing_intervals

Found 1 intervals with missing data points:


Unnamed: 0,start,end,n_points
0,2025-09-29 21:00:00+00:00,2025-09-29 23:00:00+00:00,3


### Wetterdaten

#### Read in Data

In [17]:
# Read in csv file
weather_data = pd.read_csv("../data/weather_per_city.csv")
weather_data.head()

Unnamed: 0,date,temperature_2m,apparent_temperature,rain,snowfall,forecast_temperature_2m,forecast_apparent_temperature,forecast_rain,forecast_snowfall,city
0,2012-12-30 23:00:00+00:00,5.8285,1.619927,0.0,0.0,,,,,Landeshauptstadt Stuttgart
1,2012-12-31 00:00:00+00:00,5.8285,1.688656,0.0,0.0,,,,,Landeshauptstadt Stuttgart
2,2012-12-31 01:00:00+00:00,5.9285,1.827309,0.0,0.0,,,,,Landeshauptstadt Stuttgart
3,2012-12-31 02:00:00+00:00,5.7285,1.757791,0.0,0.0,,,,,Landeshauptstadt Stuttgart
4,2012-12-31 03:00:00+00:00,5.4785,1.713898,0.0,0.0,,,,,Landeshauptstadt Stuttgart


In [26]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2498496 entries, 0 to 2498495
Data columns (total 11 columns):
 #   Column                         Dtype              
---  ------                         -----              
 0   date                           object             
 1   temperature_2m                 float64            
 2   apparent_temperature           float64            
 3   rain                           float64            
 4   snowfall                       float64            
 5   forecast_temperature_2m        float64            
 6   forecast_apparent_temperature  float64            
 7   forecast_rain                  float64            
 8   forecast_snowfall              float64            
 9   city                           object             
 10  timestamp                      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(8), object(2)
memory usage: 209.7+ MB


#### Preprocessing

In [18]:
weather_data['timestamp'] = pd.to_datetime(weather_data['date'], utc = True, errors='coerce') 