In [None]:
import pandas as pd
import os
import geopandas
import pickle
from datetime import datetime

In [None]:
# Input locations, relative to the notebook's working directory.
# NOTE: despite the `_dir` suffix, only `measurements_dir` is a directory;
# every other name below holds the path of a single file.
measurements_dir = 'data/measurements/'
grace_dates_dir = "data/grace_dates.pickle"  # pickled list of GRACE date strings

# `input_data_dir` is not referenced by any other visible cell; it is kept so
# that code outside this view relying on it keeps working.
input_data_dir, input_data_manual, input_data_automatic = (
    os.path.join(measurements_dir, file_name)
    for file_name in (
        'input_data.json',
        "GA_5513_1739_2022_quantitative_status_monitoring_manual.json",
        "GA_5513_1739_2022_quantitative_status_monitoring_automatic.json",
    )
)

In [None]:
def prepare_gdf(gdf):
    """Reduce a measurements frame to a single ``value`` column indexed by (date, name).

    The ``geometry`` column and the raw ``value`` / ``height`` columns are
    discarded; ``normalized_value`` is kept and exposed under the name
    ``value``.

    Parameters
    ----------
    gdf : DataFrame-like
        Must contain the columns ``geometry``, ``date``, ``name``, ``value``,
        ``height`` and ``normalized_value``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the ``("date", "name")`` MultiIndex.
    """
    without_geometry = pd.DataFrame(gdf.drop(columns='geometry'))
    return (
        without_geometry
        .reset_index()
        .set_index(["date", "name"])
        # "index" is the column reset_index() created from the previous index.
        .drop(columns=["index"])
        .drop(columns=["value", "height"])
        .rename(columns={"normalized_value": "value"})
    )

In [None]:
# Read both monitoring exports (manual first, then automatic) and reshape each
# one into a (date, name)-indexed frame.
manual_gdf, automatic_gdf = (
    geopandas.read_file(path) for path in (input_data_manual, input_data_automatic)
)
manual_df, automatic_df = (prepare_gdf(frame) for frame in (manual_gdf, automatic_gdf))

# Missing manual values are taken from the automatic series. fillna aligns on
# the (date, name) index, so only rows already present in manual_df are filled.
manual_df["value"] = manual_df["value"].fillna(automatic_df["value"])

# Wide table: one row per date, one column per station name.
df = (
    manual_df
    .reset_index()
    .set_index(["date"])
    .pivot(columns='name', values='value')
)
cols = df.columns.values

In [None]:
# Load the pickled GRACE dates. pickle.load can execute arbitrary code, so the
# file must come from a trusted source.
with open(grace_dates_dir, 'rb') as f:
    grace_dates = pickle.load(f)

# Parse each "YYYY-MM-DD" string into a datetime object.
grace_dates = list(map(lambda text: datetime.strptime(text, "%Y-%m-%d"), grace_dates))

In [None]:
def extract_data_for_station(df, grace_dates, station, max_gap_days=15):
    """Sample one station's measurement series at the given GRACE dates.

    For every date in ``grace_dates`` the first measurement taken on or after
    that date is looked up, then:

    * if it falls exactly on the date, its value is used as is;
    * otherwise, if it lies at most ``max_gap_days`` days after the date, the
      result is the mean of that measurement and the one immediately before it;
    * otherwise the date is skipped.

    Dates that have no measurement on or after them, or no earlier measurement
    to average with, are skipped as well (previously the former raised
    ``IndexError`` and the latter silently averaged with the *last* row of the
    series through negative indexing).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one column per station; the index holds the
        measurement dates (anything ``pd.to_datetime`` accepts). Not modified.
    grace_dates : iterable of datetime.datetime
        Dates at which the series is sampled.
    station : hashable
        Column label of the station to extract.
    max_gap_days : int, default 15
        Largest accepted distance, in whole days, between a requested date and
        the following measurement.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (``'%Y-%m-%d'`` strings) and ``value``, sorted by
        ``date`` with a fresh 0..n-1 index. NaN measurements propagate into
        ``value`` unchanged.
    """
    # Work on a copy so the caller's frame (and any cached column) is untouched.
    series = df[station].copy()
    series.index = pd.to_datetime(series.index)
    # searchsorted below is only meaningful on a sorted index.
    series = series.sort_index()

    measured_on = series.index
    measured = series.to_numpy()

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and row-wise growth is quadratic anyway.
    records = []
    for date in grace_dates:
        target = pd.Timestamp(date)
        # Left insertion point == position of the first measurement >= target.
        idx = measured_on.searchsorted(target)
        if idx >= len(measured_on):
            continue  # nothing measured on or after this date

        following = measured_on[idx]
        if following == target:
            value = measured[idx]
        else:
            if (following - target).days > max_gap_days:
                continue  # next measurement is too far away
            if idx == 0:
                continue  # no earlier measurement to average with
            value = (measured[idx] + measured[idx - 1]) / 2.0

        records.append({"date": date.strftime('%Y-%m-%d'), "value": value})

    output_df = pd.DataFrame(records, columns=["date", "value"])
    return output_df.sort_values(by="date", ignore_index=True)

In [None]:
def sanitize_station_name(station):
    """Return ``station`` with every "/" replaced by "_".

    Station names are used to build file names, where a "/" would be read as
    a path separator.
    """
    return "_".join(station.split("/"))

In [None]:
def extract_all_stations(df, grace_dates, stations, output_dir="data/measurements/"):
    """Extract every station's series and pickle each one to its own file.

    For each label in ``stations`` the series is sampled with
    ``extract_data_for_station`` and written to
    ``<output_dir>/<sanitized station name>.pickle``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one column per station.
    grace_dates : iterable of datetime.datetime
        Dates at which each series is sampled.
    stations : iterable of str
        Column labels of the stations to export.
    output_dir : str, default "data/measurements/"
        Directory receiving the pickle files. The default reproduces the
        previously hard-coded location.

    Returns
    -------
    None
    """
    for station in stations:
        out_df = extract_data_for_station(df, grace_dates, station)
        out_path = os.path.join(output_dir, sanitize_station_name(station) + ".pickle")
        pd.to_pickle(out_df, out_path)

In [None]:
# Export one pickle per station column of the pivoted measurements table.
extract_all_stations(df=df, grace_dates=grace_dates, stations=cols)