## Import Zürich Ferien

In [1]:
import requests
import pandas as pd
import numpy as np

# ------------------------------
# 1) Schulferien ZH laden (OpenHolidays) – Datum ONLY (ganze Tage)
# ------------------------------
URL = (
    "https://openholidaysapi.org/SchoolHolidays?"
    "countryIsoCode=CH&subdivisionCode=CH-ZH&languageIsoCode=DE"
    "&validFrom=2023-01-01&validTo=2025-09-12"
)
resp = requests.get(URL, headers={"accept": "application/json"})
resp.raise_for_status()
data = resp.json()

hol = pd.json_normalize(data)

# Wichtige Spalten aufräumen
# startDate/endDate sind Tagesgrenzen (lokal), wir interpretieren sie in Europe/Zurich
hol = hol.rename(columns={
    "startDate": "start_date",
    "endDate":   "end_date",
})
# Für lesbaren Namen:
if "name" in hol.columns:
    # name ist Liste von Objekten; nimm den ersten DE-Eintrag, sonst ersten
    def pick_name(row):
        nm = row.get("name", [])
        if isinstance(nm, list) and len(nm):
            # versuche DE zu finden
            for e in nm:
                if e.get("language") == "DE":
                    return e.get("text")
            return nm[0].get("text")
        return None
    hol["holiday_name"] = hol.apply(pick_name, axis=1)
else:
    hol["holiday_name"] = None

# In echte Zeitstempel konvertieren (lokal CH) und als Intervall (ganzer Tag) definieren
tz_local = "Europe/Zurich"
hol["start_local"] = pd.to_datetime(hol["start_date"]).dt.tz_localize(tz_local)
# Ende des Tages inkl. Zeit: 23:59:59.999999999 lokal
hol["end_local"]   = pd.to_datetime(hol["end_date"]).dt.tz_localize(tz_local) + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)

# Auch in UTC vorbereiten (falls eure Daten UTC sind)
hol["start_utc"] = hol["start_local"].dt.tz_convert("UTC")
hol["end_utc"]   = hol["end_local"].dt.tz_convert("UTC")

# ------------------------------
# 2) Funktion: Zeitreihe flaggen (UTC-Timestamps)
# ------------------------------
def mark_school_vacation(df, ts_col="timestamp", start_utc=None, end_utc=None):
    """
    df[ts_col]: UTC timestamps (tz-aware oder naive in UTC)
    start_utc, end_utc: optionaler Zuschnitt auf euren Analysezeitraum (UTC)
    """
    out = df.copy()

    # Timestamp spalte sicher in UTC bringen
    ts = pd.to_datetime(out[ts_col], utc=True)

    # optional auf Zeitraum zuschneiden
    if start_utc is not None:
        ts_min = pd.to_datetime(start_utc, utc=True)
        out = out.loc[ts >= ts_min]
        ts = ts.loc[ts >= ts_min]
    if end_utc is not None:
        ts_max = pd.to_datetime(end_utc, utc=True)
        out = out.loc[ts <= ts_max]
        ts = ts.loc[ts <= ts_max]

    # Vektorisiertes Matching per IntervalIndex
    intervals = pd.IntervalIndex.from_arrays(hol["start_utc"], hol["end_utc"], closed="both")
    # Für jeden ts prüfen, ob er in einem Intervall liegt
    # Trick: sortierter IntervalIndex + .get_indexer(ts, method='pad') funktioniert nur für disjunkte/aufsteigende Intervalle.
    # Alternative: boolean-accumulate (robust, schnell genug bei wenigen Intervallen)
    is_vac = np.zeros(len(ts), dtype=bool)
    for s, e in zip(hol["start_utc"].to_numpy(), hol["end_utc"].to_numpy()):
        is_vac |= ((ts >= s) & (ts <= e)).to_numpy()

    out["is_school_vacation"] = is_vac
    return out

# ------------------------------
# 3) Beispiel-Nutzung mit eurem Zeitraum
# ------------------------------
ANALYSIS_START_UTC = "2023-01-01 05:50:00Z"
ANALYSIS_END_UTC   = "2025-09-12 23:20:00Z"

# Beispiel: Falls du einen 30-Minuten-Grid bauen willst (nur Demo)
# (In der Praxis nutzt du hier deinen existierenden DataFrame)
grid = pd.DataFrame({
    "timestamp": pd.date_range(
        start=pd.Timestamp(ANALYSIS_START_UTC),
        end=pd.Timestamp(ANALYSIS_END_UTC),
        freq="30min"
    )
})

grid_flagged = mark_school_vacation(
    grid,
    ts_col="timestamp",
    start_utc=ANALYSIS_START_UTC,
    end_utc=ANALYSIS_END_UTC
)

print(grid_flagged.head())
print(grid_flagged.tail())
print(grid_flagged["is_school_vacation"].value_counts())


                  timestamp  is_school_vacation
0 2023-01-01 05:50:00+00:00                True
1 2023-01-01 06:20:00+00:00                True
2 2023-01-01 06:50:00+00:00                True
3 2023-01-01 07:20:00+00:00                True
4 2023-01-01 07:50:00+00:00                True
                      timestamp  is_school_vacation
47311 2025-09-12 21:20:00+00:00               False
47312 2025-09-12 21:50:00+00:00               False
47313 2025-09-12 22:20:00+00:00               False
47314 2025-09-12 22:50:00+00:00               False
47315 2025-09-12 23:20:00+00:00               False
is_school_vacation
False    36915
True     10401
Name: count, dtype: int64


## Import alle Ferien

In [2]:
import requests
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)  # Alle Spalten anzeigen
# =========================================
# PARAMETER
# =========================================
TZ_LOCAL = "Europe/Zurich"
VALID_FROM = "2023-01-01"
VALID_TO   = "2025-09-12"

# feste liste aller 26 Kantonskürzel 
CANTONS_26 = [
    "AG","AI","AR","BE","BL","BS","FR","GE","GL","GR","JU","LU","NE","NW",
    "OW","SG","SH","SO","SZ","TG","TI","UR","VD","VS","ZG","ZH"
]

# =========================================
# HILFSFUNKTIONEN
# =========================================
def _pick_name_de(name_list):
    if isinstance(name_list, list) and name_list:
        for e in name_list:
            if e.get("language") == "DE":
                return e.get("text")
        return name_list[0].get("text")
    return None

def _fetch_school_holidays(subdivision_code, start_date, end_date, language="DE"):
    """Lädt Schulferien für eine Subdivision (z. B. CH-ZH) als DF mit UTC-Intervallen."""
    url = (
        "https://openholidaysapi.org/SchoolHolidays?"
        f"countryIsoCode=CH&subdivisionCode={subdivision_code}"
        f"&languageIsoCode={language}&validFrom={start_date}&validTo={end_date}"
    )
    resp = requests.get(url, headers={"accept": "application/json"})
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, list) or len(data) == 0:
        return pd.DataFrame(columns=["start_utc","end_utc","holiday_name"])

    df = pd.json_normalize(data).rename(columns={"startDate":"start_date", "endDate":"end_date"})
    df["holiday_name"] = df.get("name", None).apply(_pick_name_de) if "name" in df.columns else None

    # tagesgrenzen lokal (ganzer tag) -> utc
    df["start_local"] = pd.to_datetime(df["start_date"]).dt.tz_localize(TZ_LOCAL)
    df["end_local"]   = pd.to_datetime(df["end_date"]).dt.tz_localize(TZ_LOCAL) + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
    df["start_utc"]   = df["start_local"].dt.tz_convert("UTC")
    df["end_utc"]     = df["end_local"].dt.tz_convert("UTC")

    return df[["holiday_name","start_utc","end_utc"]]

def _build_holiday_dict_all_cantons(start_date, end_date):
    """
    Baut ein dict mit schlüsseln = kantonskürzel und werten = DF(start_utc,end_utc,holiday_name).
    Nutzt die feste 26er-liste und mappt auf openholidays-subdivision-codes 'CH-XX'.
    """
    hol = {}
    for abbr in CANTONS_26:
        code = f"CH-{abbr}"
        try:
            df = _fetch_school_holidays(code, start_date, end_date, language="DE")
            hol[abbr] = df.reset_index(drop=True)
        except requests.HTTPError as e:
            print(f"Warnung: {abbr} konnte nicht geladen werden ({e}).")
            hol[abbr] = pd.DataFrame(columns=["holiday_name","start_utc","end_utc"])
    return hol

def attach_canton_vacation_flags(df, ts_col="timestamp",
                                 start_date=VALID_FROM, end_date=VALID_TO):
    """
    Hängt an df (UTC timestamps in ts_col) für JEDEN Kanton (26) eine Bool-Spalte (AG, AI, ..., ZH) an.
    """
    out = df.copy()

    # timestamps in UTC normalisieren
    ts = pd.to_datetime(out[ts_col], utc=True)
    out[ts_col] = ts
    ts64 = ts.view("int64")  # ns seit epoch -> schneller & robuster vergleich

    # feriendaten laden
    hol_dict = _build_holiday_dict_all_cantons(start_date, end_date)

    # 26 spalten initialisieren
    for abbr in CANTONS_26:
        out[abbr] = False

    # pro kanton die intervalle anwenden
    for abbr in CANTONS_26:
        dfh = hol_dict.get(abbr)
        if dfh is None or dfh.empty:
            continue
        starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
        ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()

        mask = np.zeros(len(ts64), dtype=bool)
        for s, e in zip(starts, ends):
            mask |= ((ts64 >= s) & (ts64 <= e))
        out[abbr] = mask

    return out

# =========================================
# BEISPIEL-NUTZUNG
# =========================================
# 1) FALL A: du hast schon einen dataframe 'df_lszh' mit UTC-timestamps in spalte 'timestamp'
# df_with_cantons = attach_canton_vacation_flags(df_lszh, ts_col="timestamp")

# 2) FALL B: kurzer test mit einem 30-min grid über deinen analysenzeitraum
if __name__ == "__main__":
    grid = pd.DataFrame({
        "timestamp": pd.date_range("2023-01-01 05:50:00Z", "2025-09-12 23:20:00Z", freq="30min")
    })
    out = attach_canton_vacation_flags(grid)
    # nur timestamp + kantonsspalten zeigen
    cols = ["timestamp"] + CANTONS_26
    print(out[cols].head(40))   # ersten 40 zeilen
    print("\nSpalten:", cols)
    # optional: komplette tabelle sehr groß -> besser exportieren
    # out[cols].to_csv("ch_cantonal_school_vacations_30min.csv", index=False)


  ts64 = ts.view("int64")  # ns seit epoch -> schneller & robuster vergleich


                   timestamp    AG    AI    AR    BE    BL    BS    FR    GE  \
0  2023-01-01 05:50:00+00:00  True  True  True  True  True  True  True  True   
1  2023-01-01 06:20:00+00:00  True  True  True  True  True  True  True  True   
2  2023-01-01 06:50:00+00:00  True  True  True  True  True  True  True  True   
3  2023-01-01 07:20:00+00:00  True  True  True  True  True  True  True  True   
4  2023-01-01 07:50:00+00:00  True  True  True  True  True  True  True  True   
5  2023-01-01 08:20:00+00:00  True  True  True  True  True  True  True  True   
6  2023-01-01 08:50:00+00:00  True  True  True  True  True  True  True  True   
7  2023-01-01 09:20:00+00:00  True  True  True  True  True  True  True  True   
8  2023-01-01 09:50:00+00:00  True  True  True  True  True  True  True  True   
9  2023-01-01 10:20:00+00:00  True  True  True  True  True  True  True  True   
10 2023-01-01 10:50:00+00:00  True  True  True  True  True  True  True  True   
11 2023-01-01 11:20:00+00:00  True  True

  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(dfh["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(dfh["start_utc"], utc=True

## Gesamten Count erstellen der Ferien

In [3]:
import requests
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

# =========================================
# PARAMETER
# =========================================
TZ_LOCAL   = "Europe/Zurich"
VALID_FROM = "2023-01-01"
VALID_TO   = "2025-09-12"
OUTPUT_FILE = "ch_school_vacations_30min.csv"  # Dateiname bleibt so wie gewünscht

# feste Reihenfolge aller 26 Kantone (eine Spalte je Kanton)
CANTONS_26 = [
    "AG","AI","AR","BE","BL","BS","FR","GE","GL","GR","JU","LU","NE","NW",
    "OW","SG","SH","SO","SZ","TG","TI","UR","VD","VS","ZG","ZH"
]

# =========================================
# HILFSFUNKTIONEN
# =========================================
def _pick_name_de(name_list):
    if isinstance(name_list, list) and name_list:
        for e in name_list:
            if e.get("language") == "DE":
                return e.get("text")
        return name_list[0].get("text")
    return None

def _normalize_day_range_to_utc(df, col_start="startDate", col_end="endDate"):
    df = df.copy()
    df["start_local"] = pd.to_datetime(df[col_start]).dt.tz_localize(TZ_LOCAL)
    df["end_local"]   = pd.to_datetime(df[col_end]).dt.tz_localize(TZ_LOCAL) + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
    df["start_utc"]   = df["start_local"].dt.tz_convert("UTC")
    df["end_utc"]     = df["end_local"].dt.tz_convert("UTC")
    return df[["start_utc","end_utc"]]

def _fetch_school_holidays(subdivision_code, start_date, end_date, language="DE"):
    url = (
        "https://openholidaysapi.org/SchoolHolidays?"
        f"countryIsoCode=CH&subdivisionCode={subdivision_code}"
        f"&languageIsoCode={language}&validFrom={start_date}&validTo={end_date}"
    )
    resp = requests.get(url, headers={"accept": "application/json"})
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, list) or len(data) == 0:
        return pd.DataFrame(columns=["start_utc","end_utc"])
    raw = pd.json_normalize(data)
    rng = _normalize_day_range_to_utc(raw, "startDate", "endDate")
    return rng

def _fetch_public_holidays(subdivision_code, start_date, end_date, language="DE"):
    url = (
        "https://openholidaysapi.org/PublicHolidays?"
        f"countryIsoCode=CH&subdivisionCode={subdivision_code}"
        f"&languageIsoCode={language}&validFrom={start_date}&validTo={end_date}"
    )
    resp = requests.get(url, headers={"accept": "application/json"})
    resp.raise_for_status()
    data = resp.json()
    if not isinstance(data, list) or len(data) == 0:
        return pd.DataFrame(columns=["start_utc","end_utc"])
    raw = pd.json_normalize(data)
    start_col = "startDate" if "startDate" in raw.columns else "start"
    end_col   = "endDate"   if "endDate"   in raw.columns else "end"
    rng = _normalize_day_range_to_utc(raw, start_col, end_col)
    return rng

def _build_combined_intervals_all_cantons(start_date, end_date):
    """
    Für jeden Kanton ein DF mit kombinierten Intervallen (Ferien ODER Feiertage):
    combined[ABBR] -> DF('start_utc','end_utc')
    """
    combined = {}
    for abbr in CANTONS_26:
        code = f"CH-{abbr}"
        try:
            vac = _fetch_school_holidays(code, start_date, end_date)
        except requests.HTTPError:
            vac = pd.DataFrame(columns=["start_utc","end_utc"])
        try:
            hol = _fetch_public_holidays(code, start_date, end_date)
        except requests.HTTPError:
            hol = pd.DataFrame(columns=["start_utc","end_utc"])

        df = pd.concat([vac, hol], ignore_index=True)
        if df.empty:
            combined[abbr] = df
            continue

        # Merge evtl. überlappender/angrenzender Intervalle (Union)
        df = df.sort_values("start_utc").reset_index(drop=True)
        merged = []
        cur_s, cur_e = df.loc[0, "start_utc"], df.loc[0, "end_utc"]
        for i in range(1, len(df)):
            s, e = df.loc[i, "start_utc"], df.loc[i, "end_utc"]
            if s <= cur_e + pd.Timedelta(nanoseconds=1):  # überlappend/angrenzend -> zusammenfassen
                cur_e = max(cur_e, e)
            else:
                merged.append((cur_s, cur_e))
                cur_s, cur_e = s, e
        merged.append((cur_s, cur_e))
        combined[abbr] = pd.DataFrame(merged, columns=["start_utc","end_utc"])
    return combined

def _apply_intervals(ts64, intervals_df):
    if intervals_df is None or intervals_df.empty:
        return np.zeros(len(ts64), dtype=bool)
    starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
    ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
    mask = np.zeros(len(ts64), dtype=bool)
    for s, e in zip(starts, ends):
        mask |= ((ts64 >= s) & (ts64 <= e))
    return mask

def attach_combined_flags_one_col_per_canton(df, ts_col="timestamp",
                                             start_date=VALID_FROM, end_date=VALID_TO):
    """
    Hängt an df (UTC-Zeitstempel) je Kanton GENAU EINE Spalte an (AG, AI, ..., ZH),
    die True ist, wenn Schulferien ODER Feiertag vorliegt.
    Zusätzlich:
      - COUNT_ANY: Anzahl Kantone mit True
      - SHARE_ANY: Anteil Kantone mit True
      - ANY: mind. ein Kanton True
    """
    out = df.copy()
    ts = pd.to_datetime(out[ts_col], utc=True)
    ts64 = ts.view("int64")

    intervals = _build_combined_intervals_all_cantons(start_date, end_date)

    for abbr in CANTONS_26:
        out[abbr] = _apply_intervals(ts64, intervals.get(abbr))

    canton_cols = CANTONS_26
    out["COUNT_ANY"] = out[canton_cols].sum(axis=1)
    out["ANY"]       = out["COUNT_ANY"] > 0
    out["SHARE_ANY"] = out["COUNT_ANY"] / len(canton_cols)

    return out

# =========================================
# BEISPIEL – 30min GRID + EXPORT & RELOAD (DEIN STIL)
# =========================================
if __name__ == "__main__":
    grid = pd.DataFrame({
        "timestamp": pd.date_range("2023-01-01 05:50:00Z", "2025-09-12 23:20:00Z", freq="30min")
    })
    out = attach_combined_flags_one_col_per_canton(grid)

    # Nur timestamp + die 26 Kantone + Aggregate exportieren
    cols = ["timestamp"] + CANTONS_26 + ["COUNT_ANY","ANY","SHARE_ANY"]
    out[cols].to_csv(OUTPUT_FILE, index=False)
    df = pd.read_csv(OUTPUT_FILE)
    df


  ts64 = ts.view("int64")
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends   = pd.to_datetime(intervals_df["end_utc"],   utc=True).view("int64").to_numpy()
  starts = pd.to_datetime(intervals_df["start_utc"], utc=True).view("int64").to_numpy()
  ends

In [4]:
out[cols].to_csv("ch_school_vacations_30min.csv", index=False)
df = pd.read_csv("ch_school_vacations_30min.csv")
df

Unnamed: 0,timestamp,AG,AI,AR,BE,BL,BS,FR,GE,GL,GR,JU,LU,NE,NW,OW,SG,SH,SO,SZ,TG,TI,UR,VD,VS,ZG,ZH,COUNT_ANY,ANY,SHARE_ANY
0,2023-01-01 05:50:00+00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,26,True,1.0
1,2023-01-01 06:20:00+00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,26,True,1.0
2,2023-01-01 06:50:00+00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,26,True,1.0
3,2023-01-01 07:20:00+00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,26,True,1.0
4,2023-01-01 07:50:00+00:00,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,26,True,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47311,2025-09-12 21:20:00+00:00,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0.0
47312,2025-09-12 21:50:00+00:00,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0.0
47313,2025-09-12 22:20:00+00:00,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0.0
47314,2025-09-12 22:50:00+00:00,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,0.0
