## Data Exploration

In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append("..")
from utils.filter import get_daily_counter_site_count, get_normalized_daily_city_count
import plotly.graph_objects as go

# Für interaktiven Plot:
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load the raw counter export and parse its timestamps.
# Per the original note, the raw 'iso_timestamp' values are local time including
# daylight saving. utc=True normalizes every value to UTC, so all later .dt
# accessors (hour, weekday, date, ...) operate on UTC, not on local wall-clock time.
# errors='coerce' turns unparseable entries into NaT instead of raising.
data = pd.read_csv("../alle_fahrradzaehler_daten.csv").assign(
    iso_timestamp=lambda raw: pd.to_datetime(raw['iso_timestamp'], utc=True, errors='coerce')
)
data.head()

Unnamed: 0,timestamp,iso_timestamp,zählstand,stand,standort,channel_name,channel_id,counter_site,counter_site_id,domain_name,domain_id,longitude,latitude,timezone,interval,counter_serial
0,2013-01-01T01:00:00+0000,2013-01-01 00:00:00+00:00,15,0,Stadt Karlsruhe,Erbprinz. West,101004165,Erbprinzenstraße,100004165,Stadt Karlsruhe,752,8.402715,49.007286,(UTC+01:00) Europe/Paris;DST,15,Y2H16070301
1,2013-01-01T02:00:00+0000,2013-01-01 01:00:00+00:00,17,0,Stadt Karlsruhe,Erbprinz. West,101004165,Erbprinzenstraße,100004165,Stadt Karlsruhe,752,8.402715,49.007286,(UTC+01:00) Europe/Paris;DST,15,Y2H16070301
2,2013-01-01T03:00:00+0000,2013-01-01 02:00:00+00:00,14,0,Stadt Karlsruhe,Erbprinz. West,101004165,Erbprinzenstraße,100004165,Stadt Karlsruhe,752,8.402715,49.007286,(UTC+01:00) Europe/Paris;DST,15,Y2H16070301
3,2013-01-01T04:00:00+0000,2013-01-01 03:00:00+00:00,13,0,Stadt Karlsruhe,Erbprinz. West,101004165,Erbprinzenstraße,100004165,Stadt Karlsruhe,752,8.402715,49.007286,(UTC+01:00) Europe/Paris;DST,15,Y2H16070301
4,2013-01-01T05:00:00+0000,2013-01-01 04:00:00+00:00,9,0,Stadt Karlsruhe,Erbprinz. West,101004165,Erbprinzenstraße,100004165,Stadt Karlsruhe,752,8.402715,49.007286,(UTC+01:00) Europe/Paris;DST,15,Y2H16070301


In [3]:
# Restrict the data to the study window 2014-06-01 .. 2024-05-31 (both days inclusive)
# and keep only the cities whose records span that whole window.
start_date = pd.Timestamp('2014-06-01', tz='UTC')
end_date = pd.Timestamp('2024-05-31', tz='UTC')

# end_date is midnight at the START of the last day. Comparing with "<= end_date"
# would drop nearly all of 2024-05-31, so the upper bound is the following
# midnight, exclusive.
one_day = pd.Timedelta(days=1)
in_window = (data['iso_timestamp'] >= start_date) & (data['iso_timestamp'] < end_date + one_day)
windowed = data[in_window]

# A city counts as complete when it has records on both the first and the last
# day of the window. Working at day granularity avoids discarding a city merely
# because the very first hourly record is missing. One groupby pass replaces
# rescanning the whole frame once per city; sort=False keeps order of appearance.
city_span = windowed.groupby('standort', sort=False)['iso_timestamp'].agg(['min', 'max'])
spans_window = (city_span['min'] < start_date + one_day) & (city_span['max'] >= end_date)
valid_cities = city_span.index[spans_window].tolist()

print(f"{len(valid_cities)} Cities with complete data from {start_date.date()} to {end_date.date()}:")
for city in valid_cities:
    print(city)

# .copy() hands later cells an independent frame, so adding columns to it does
# not write into a slice of the loaded data.
data = windowed[windowed['standort'].isin(valid_cities)].copy()

9 Cities with complete data from 2014-06-01 to 2024-05-31:
Stadt Tübingen
Stadt Karlsruhe
Stadt Freiburg
Stadt Lörrach
Stadt Heilbronn
Stadt Mannheim
Stadt Kirchheim unter Teck
Landeshauptstadt Stuttgart
Stadt Heidelberg


# Plots mit nicht normalisierten Daten

In [11]:
# Interactive plot of the raw (non-normalized) counts

# Convenience columns. The time zone is stripped explicitly before the period
# conversion: to_period() discards it anyway, but does so with a UserWarning.
data['date'] = data['iso_timestamp'].dt.date
data['month'] = data['iso_timestamp'].dt.tz_localize(None).dt.to_period('M').dt.to_timestamp()

# Controls
freq_toggle = widgets.ToggleButtons(options=['Hourly', 'Daily', 'Monthly'], description='Frequency', value='Hourly')
min_date = data['iso_timestamp'].dt.date.min()
max_date = data['iso_timestamp'].dt.date.max()
start_picker = widgets.DatePicker(value=min_date, description='Start')
end_picker = widgets.DatePicker(value=max_date, description='End')


def to_utc_ts(d):
    """Return midnight (UTC) of the calendar date ``d`` as a tz-aware Timestamp."""
    return pd.Timestamp(d).tz_localize('UTC')


def update_plot(freq, start, end):
    """Redraw the per-city line chart of summed counts.

    Meant to be driven by ``widgets.interactive_output``, which captures whatever
    is printed or displayed here in its own output area. The function therefore
    does not write into a module-level Output widget; a shared global one would
    be silently redirected as soon as another cell rebinds the same name.

    Parameters
    ----------
    freq : str
        'Hourly', 'Daily' or 'Monthly'.
    start, end : datetime.date or None
        Inclusive date range; a hint is printed if either is missing.
    """
    if start is None or end is None:
        print("Bitte Start- und Enddatum wählen.")
        return

    window_start = to_utc_ts(start)
    window_end = to_utc_ts(end) + pd.Timedelta(days=1)  # exclusive: midnight after the end date
    stamps = data['iso_timestamp']
    df_filtered = data[(stamps >= window_start) & (stamps < window_end)]
    if df_filtered.empty:
        print("Keine Daten für die Auswahl.")
        return

    # pandas offset aliases; 'h' and 'ME' are the non-deprecated spellings (pandas >= 2.2).
    freq_aliases = {'Hourly': 'h', 'Daily': 'D', 'Monthly': 'ME'}
    if freq not in freq_aliases:
        print(f"Unbekannte Frequency: {freq}")
        return

    agg = (df_filtered
           .groupby([pd.Grouper(key='iso_timestamp', freq=freq_aliases[freq]), 'standort'])['zählstand']
           .sum()
           .reset_index()
           .rename(columns={'iso_timestamp': 'period', 'zählstand': 'count'}))

    fig = px.line(agg, x='period', y='count', color='standort', markers=True,
                  title=f"All cities — {freq} counts",
                  labels={'period':'Date','count':'Count','standort':'City'})
    fig.update_layout(hovermode='x unified')
    fig.show()


# Link the controls (frequency + date range) to the plot and show both.
ui = widgets.HBox([freq_toggle, start_picker, end_picker])
controls = {'freq': freq_toggle, 'start': start_picker, 'end': end_picker}
widget = widgets.interactive_output(update_plot, controls)

display(ui, widget)


Converting to PeriodArray/Index representation will drop timezone information.



HBox(children=(ToggleButtons(description='Frequency', options=('Hourly', 'Daily', 'Monthly'), value='Hourly'),…

Output()

Output()

In [5]:
#Plot the Data in polar coordinates
def plot_polar_all_cities(data, time_period='monthly', valuesin='zählstand'):
    """
    Draw a polar line plot with one closed trace per city.

    The values in ``valuesin`` are summed per city ('standort') and per time
    bin. The bin is taken from 'iso_timestamp' exactly as stored; no time-zone
    conversion happens here.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'standort', 'iso_timestamp' (datetime-like)
        and ``valuesin``.
    time_period : str
        Case-insensitive choice of the time bin:
          - 'monthly' / 'month'     : month 1..12 (Jan..Dec)
          - 'weekday' / 'weekdays'  : weekday 0..6 (Mon..Sun)
          - 'hourly' / 'hour' / 'h' : hour 0..23
          - 'dayofyear' / 'doy'     : day of year 1..N, where N is the largest
                                      day of year present in the data
    valuesin : str
        Name of the column to sum, 'zählstand' by default (may be
        'zählstand_normalized').

    Returns
    -------
    plotly.graph_objects.Figure
        The figure. It has already been rendered once via ``fig.show()``.

    Raises
    ------
    ValueError
        If 'iso_timestamp' or ``valuesin`` is missing from ``data``, or if
        ``time_period`` is not one of the accepted values.
    """

    if 'iso_timestamp' not in data.columns:
        raise ValueError("DataFrame enthält keine Spalte 'iso_timestamp'")

    if valuesin not in data.columns:
        raise ValueError(f"Spalte '{valuesin}' nicht im DataFrame")

    kp = time_period.lower()  # accept any capitalisation
    stamps = data['iso_timestamp'].dt

    # Pick the bin of every row, the full list of bins and their axis labels.
    # Every alias set has to be a real tuple: a parenthesised single string
    # turns `in` into a substring test, which would also accept '' or 'on'.
    if kp in ('monthly', 'month'):
        bin_of_row = stamps.month
        bins = list(range(1, 13))
        labels = ['Jan','Feb','Mär','Apr','Mai','Jun','Jul','Aug','Sep','Okt','Nov','Dez']
    elif kp in ('weekday', 'weekdays'):
        bin_of_row = stamps.weekday
        bins = list(range(0, 7))
        labels = ['Mo','Di','Mi','Do','Fr','Sa','So']
    elif kp in ('hourly', 'h', 'hour'):
        bin_of_row = stamps.hour
        bins = list(range(0, 24))
        labels = [f"{h:02d}" for h in bins]
    elif kp in ('dayofyear', 'doy'):
        bin_of_row = stamps.dayofyear
        # 365 or 366 days may occur, so use the largest day present in the data.
        bins = list(range(1, int(bin_of_row.max()) + 1))
        labels = [str(d) for d in bins]
    else:
        raise ValueError(f"Unbekannter time_period-Wert: '{time_period}'")

    # City x bin matrix of sums. reindex adds bins without any data as
    # zero-filled columns and fixes the column order.
    mat = (data.groupby([data['standort'], bin_of_row])[valuesin]
             .sum()
             .unstack(fill_value=0)
             .reindex(columns=bins, fill_value=0))
    theta = labels + [labels[0]]  # repeat the first label to close the loop

    # Build the Plotly figure
    fig = go.Figure()
    for city in mat.index:
        counts = mat.loc[city].astype(float).tolist()
        r = counts + [counts[0]]  # close the loop
        fig.add_trace(go.Scatterpolar(
            r = r,
            theta = theta,
            mode = 'lines+markers',
            name = city,
            hovertemplate = "%{theta}: %{r}<extra>%{fullData.name}</extra>"
        ))

    fig.update_layout(
        title=f"Polar plot — {time_period} aggregiert ({valuesin})",
        polar = dict(radialaxis = dict(nticks=5)),
        legend = dict(orientation="v", x=1.02, y=1)
    )
    fig.show()
    return fig

In [6]:
# Polar plots of the non-normalized data, one per time period.
# The function already renders each figure via fig.show(). Calling it inside a
# loop keeps the returned figure from also becoming the cell's output value,
# which would draw the last plot a second time.
for period in ('monthly', 'weekday', 'hourly', 'dayofyear'):
    plot_polar_all_cities(data, time_period=period)

# Normalisieren der Daten

In [7]:
# Normalize by the number of counters                  #TODO: possibly normalize the counters differently
# For every (city, timestamp) pair, count the distinct counter sites that have
# a row at that time step and store the count on each of those rows.
site_ids_by_city_and_time = data.groupby(['standort', 'iso_timestamp'])['counter_site_id']
data['num_counters'] = site_ids_by_city_and_time.transform('nunique')
#TODO counters that did not register a single bicycle on a given day may have to be excluded (they were probably inactive then)

# Count per counter site: the raw count divided by the number of sites at that time step
data['zählstand_normalized'] = data['zählstand'].div(data['num_counters'])

In [8]:
## Scale the per-counter counts to a value between 0 and 1

# Sanity check: when the smallest value is 0, the minimum can be left out of
# the scaling and dividing by the maximum is sufficient.
per_counter = data['zählstand_normalized']
min_count = per_counter.min()
print(f"Minimum der Zählstände: {min_count}")

# Divide every value by the largest value observed in the same city.
max_counts = per_counter.groupby(data['standort']).transform('max')
data['zählstand_normalized'] = per_counter / max_counts

Minimum der Zählstände: 0.0


# Plots mit normalisierten Daten

In [16]:
# Interactive plot of the normalized data

# Controls
freq_toggle = widgets.ToggleButtons(options=['Hourly', 'Daily', 'Monthly'], description='Frequency', value='Hourly')
min_date = data['iso_timestamp'].dt.date.min()
max_date = data['iso_timestamp'].dt.date.max()
start_picker = widgets.DatePicker(value=min_date, description='Start')
end_picker = widgets.DatePicker(value=max_date, description='End')


def to_utc_ts(d):
    """Return midnight (UTC) of the calendar date ``d`` as a tz-aware Timestamp."""
    return pd.Timestamp(d).tz_localize('UTC')


def update_normalized_plot(freq, start, end):
    """Redraw the per-city line chart of normalized counts.

    Meant to be driven by ``widgets.interactive_output``, which captures whatever
    is printed or displayed here in its own output area. The function therefore
    does not write into a module-level Output widget; a shared global one would
    be silently redirected as soon as another cell rebinds the same name.

    Parameters
    ----------
    freq : str
        'Hourly', 'Daily' or 'Monthly'.
    start, end : datetime.date or None
        Inclusive date range; a hint is printed if either is missing.
    """
    if start is None or end is None:
        print("Bitte Start- und Enddatum wählen.")
        return

    window_start = to_utc_ts(start)
    window_end = to_utc_ts(end) + pd.Timedelta(days=1)  # exclusive: midnight after the end date
    stamps = data['iso_timestamp']
    df_filtered = data[(stamps >= window_start) & (stamps < window_end)]
    if df_filtered.empty:
        print("Keine Daten für die Auswahl.")
        return

    def summed(column, alias):
        # Sum `column` per city and per time bucket of width `alias`.
        return (df_filtered
                .groupby([pd.Grouper(key='iso_timestamp', freq=alias), 'standort'])[column]
                .sum()
                .reset_index()
                .rename(columns={'iso_timestamp': 'period'}))

    # 'ME' is the non-deprecated month-end alias (pandas >= 2.2).
    precomputed_aliases = {'Daily': 'D', 'Monthly': 'ME'}
    if freq in precomputed_aliases:
        # Sum of the precomputed per-row normalized values.
        agg = (summed('zählstand_normalized', precomputed_aliases[freq])
               .rename(columns={'zählstand_normalized': 'normalized_count'}))
    elif freq == 'Hourly':
        # NOTE(review): unlike Daily/Monthly, this branch sums the RAW counts
        # and divides by each city's maximum inside the selected date range, so
        # the scale depends on the chosen window and ignores the counter
        # count. Kept as is; confirm whether that difference is intended.
        agg = summed('zählstand', 'h').rename(columns={'zählstand': 'count'})
        agg['normalized_count'] = agg['count'] / agg.groupby('standort')['count'].transform('max')
    else:
        print(f"Unbekannte Frequency: {freq}")
        return

    fig = px.line(agg, x='period', y='normalized_count', color='standort', markers=True,
                  title=f"Normalisierte Zählungen (alle Städte) — {freq}",
                  labels={'period':'Datum', 'normalized_count':'Normalisierte Summe', 'standort':'Stadt'})
    fig.update_layout(hovermode='x unified')
    fig.show()


# Link the controls to the plot and show both.
ui = widgets.HBox([freq_toggle, start_picker, end_picker])
controls = {'freq': freq_toggle, 'start': start_picker, 'end': end_picker}
widget = widgets.interactive_output(update_normalized_plot, controls)

display(ui, widget)

HBox(children=(ToggleButtons(description='Frequency', options=('Hourly', 'Daily', 'Monthly'), value='Hourly'),…

Output()

Output()

In [10]:
# Polar plots of the normalized data, one per time period.
# The function already renders each figure via fig.show(). Calling it inside a
# loop keeps the returned figure from also becoming the cell's output value,
# which would draw the last plot a second time.
for period in ('monthly', 'weekday', 'hourly', 'dayofyear'):
    plot_polar_all_cities(data, time_period=period, valuesin='zählstand_normalized')