# Prepare data from S2S Database at KIT

* load data ensemble-wise for each forecast initial time and filetype(P, SDA, TH, ...)
* compute weekly means
* do this for all forecast initial times

**Warning:** this takes a very long time, depending on the file type; do not try to process all forecast initial times at once.

### Todo: if aggregate_weekly is used with s2s=True, the mapping from time to forecast_time is wrong for variables without a value at lead_time = 0.
Also: rename variables!

Most of the functions used in this notebook can be found in helper_functions.py

In [None]:
## shift coords: flip_antimeridian in helper_functions.py: not used so far

In [None]:
from tensorflow.keras.layers import Input, Dense, Flatten# you don't need tensorflow here, this is here because of problems with my env on windows

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import xarray as xr
xr.set_options(display_style='text')



from dask.utils import format_bytes
import xskillscore as xs

%matplotlib inline 
#so that figures appear again

#for prediction
from scripts import make_probabilistic
from scripts import add_valid_time_from_forecast_reference_time_and_lead_time
from scripts import skill_by_year
from scripts import add_year_week_coords


from helper_functions import  concat_ensemble, aggregate_weekly, get_filenames

In [None]:
# Root folder of the S2S data; alternative location: "../../S2S_data/"
datapath = 'Z:'
# File type to process (passed on as `file_type` to the helper functions)
var = "SDA"

### for one ensemble

In [None]:
# NOTE: h5netcdf throws an error here unless tensorflow has been imported first
ds_s2s = xr.open_dataset(
    f'{datapath}/ecmwf/2017/20170101/TH20170101_00_PF_01',
    engine='h5netcdf',
)

In [None]:
# Display the dataset; the trailing comment keeps a disabled expression that
# sums the non-zero `tp` values at the first time step.
ds_s2s#.tp.isel(time = 0).where( ds_s2s.tp.isel(time = 0) != 0).sum()

In [None]:
# Look up the file names for the first time stamp of ds_s2s.
# Caveat: str(ds_s2s.time[0].values) does not work if ds_s2s contains SDA data,
# since there forecast_time is shifted by one day compared to the filename.
filenames = get_filenames(
    datapath,
    model='ecmwf',
    date=str(ds_s2s.time[0].values),
    file_type=var,
)
filenames

In [None]:
# Open and concatenate the files (this takes a few minutes).
# Note: this rebinds ds_s2s, replacing whatever dataset it referred to before.
ds_s2s = concat_ensemble(filenames, var)

In [None]:
# Display the dataset returned by concat_ensemble
ds_s2s

In [None]:
# Weekly aggregates for each forecast init time, ordered by forecast_time.
# The ensure_attributes step (ds_w.map(ensure_attributes, biweekly=False)) is
# omitted on purpose because it downloads data.
ds_w = ds_s2s.map(aggregate_weekly, s2s=True).sortby('forecast_time')
ds_w

### for multiple forecast initial dates

In [None]:
# Dates for which labels are available: the first 10 characters of each
# forecast_time value's string form (presumably 'YYYY-MM-DD').
cache_path = '../data'
dat = xr.open_zarr(
    f'{cache_path}/ecmwf_hindcast-input_2000-2019_biweekly_deterministic.zarr',
    consolidated=True,
)
thursdays_2000_2019 = [str(stamp)[:10] for stamp in dat.forecast_time.values]
thursdays_2000_2019

### for one forecast initial time

In [None]:
from helper_functions import get_single_forecast

In [None]:
# Use the first entry of the date list as the example initial date.
date = str(thursdays_2000_2019[0]) ##some dates won't be found
# Optional overrides of the settings defined near the top of the notebook:
#datapath = "Z:"#"../../S2S_data/"
#var = 'TH'
date

In [None]:
# This takes a few minutes. It does the same as the functions used above,
# just combined in one function.
ds_weekly_ = get_single_forecast(datapath, date, var)
ds_weekly_

### for multiple forecast initial dates

In [None]:
# Show the first 53 entries of the date list
thursdays_2000_2019[0:53]

Deprecated: superseded by `get_multiple_forecasts` imported from helper_functions.py below.
def get_multiple_forecasts(datapath, date_list, file_type):
    """Load the weekly dataset for several initial dates and combine them.

    For each entry of ``date_list`` (in order) the date is printed and
    ``get_single_forecast(datapath, date, file_type)`` is called. The
    resulting datasets are concatenated along ``'forecast_time'``.

    Parameters
    ----------
    datapath : passed through unchanged to ``get_single_forecast``.
    date_list : iterable of initial dates, processed one at a time.
    file_type : passed through unchanged to ``get_single_forecast``.

    Returns
    -------
    The result of ``xr.concat`` over all per-date datasets.
    """
    def _load_one(init_date):
        # Print the date first so progress is visible during the slow load.
        print(init_date)
        return get_single_forecast(datapath, init_date, file_type)

    per_date = [_load_one(init_date) for init_date in date_list]
    return xr.concat(per_date, dim='forecast_time')

In [None]:
from helper_functions import replace_unavailable_dates, get_multiple_forecasts

In [None]:
# Only the first 3 dates, since this takes VERY long (more than 10 minutes)
ds_weekly_multiple = get_multiple_forecasts(datapath, thursdays_2000_2019[0:3], var)

In [None]:
# Display the result of get_multiple_forecasts
ds_weekly_multiple

In [None]:
# Plot the '2t' variable at the first lead time, averaged over
# 'realization' and 'forecast_time'.
# NOTE(review): assumes the loaded file type provides a '2t' variable - confirm for var = 'SDA'.
ds_weekly_multiple.isel(lead_time = 0).mean(('realization', 'forecast_time'))['2t'].plot()

In [None]:
#ds_weekly_multiple.to_netcdf('/../data/s2s_weekly_hindcasts_ecmwf_t2m_2000.nc')

### construct Monday dates for 2000 - 2017

#### Thursdays

In [None]:
# Open the 2020 forecast input; note that this rebinds cache_path.
cache_path = '../template/data'
dat_2020 = xr.open_zarr(
    f'{cache_path}/ecmwf_forecast-input_2020_biweekly_deterministic.zarr',
    consolidated=True,
)

In [None]:
# Check what a single forecast_time value looks like as a string
str(dat_2020.forecast_time[0].values)

In [None]:
# String form of every 2020 forecast_time value (converted item by item)
dates_2020 = list(map(str, dat_2020.forecast_time.values))

In [None]:
# The Thursdays: characters 5-9 of each time stamp string go into `days`,
# the first four characters into `year`.
days = [str(stamp)[5:10] for stamp in dat_2020.forecast_time.values]
year = [str(stamp)[:4] for stamp in dat_2020.forecast_time.values]

print(days, year, sep='\n')

#### Mondays

In [None]:
# Read the 2020 Mondays from CSV with pandas' default parsing options;
# the trailing comment keeps previously tried options for reference.
mondays_2020 = pd.read_csv("../../S2S_data/Mondays2020.csv")#, index_col = False, sep = ';')#.reset_index(drop=True, inplace=True) 

In [None]:
# First entry of the 'Mondays 2020' column
mondays_2020['Mondays 2020'][0]

In [None]:
# Keep characters 5-9 of every entry in the 'Mondays 2020' column
# (presumably the 'MM-DD' part of each date string).
mondays = mondays_2020['Mondays 2020'].str.slice(5, 10).tolist()

print(mondays)

In [None]:
# One year label (as a string) per slot: each of 2000..2017 repeated 52 times
years = [str(y) for y in np.repeat(np.arange(2000, 2018), 52)]
len(years) / 18  # only 52 mondays in 2020...

In [None]:
# Join every year label with an entry of `mondays`, cycling through the list
monday_dates = ['-'.join((yr, mondays[pos % len(mondays)])) for pos, yr in enumerate(years)]

In [None]:
# Inspect the generated date strings
monday_dates

In [None]:
#commented out to avoid that file is overwritten
#pd.DataFrame(monday_dates).to_csv("../../S2S_data/Mondays2000_2017.csv", index = False, header = False)