# Saving the data in a TXT file with the correct structure for GAMCR

To properly use the GAMCR package, you should have a folder for each site with the following structure:

- this folder should be named `site`
- in this folder, you should have the `data_{site}.txt` file saved
- GAMCR will save the models that you train for that site in this folder
- in this folder, two subfolders will be created and used by GAMCR. 
    * The first subfolder `data` will be created to save the preprocessed data when calling a `save_batch` type method
    * The second subfolder `results` will be created to save some statistics on the results of a trained model when calling the `compute_statistics` method
    
This notebook will create the folder `site` and the txt file `data_{site}.txt` in it. This text file needs to have the following columns:
- `q`: streamflow time series
- `p`: precipitation time series
- `timeyear`: fractional year (e.g. 2022.5 for 2nd July 2022)
- `date`: timestamp of each observation (written as text, e.g. `2014-01-01 00:00:00`, so that it can be parsed into a Python datetime object)
- `pet`: potential evapotranspiration

In [1]:
import numpy as np
import pandas as pd
import os

# Sites (GIS IDs) to export. The IDs are converted to strings before being
# passed to get_feat_space.
all_GISID = [44]
all_GISID = np.array([str(el) for el in all_GISID])

# Catchment geodata table from which the per-site features are read.
path_catchments_geodata = '../data/CH_Catchments_Geodata_MF_20221209.csv'
# Import the one helper we need by name (instead of `import *`) so it is clear
# where `get_feat_space` comes from and the notebook namespace stays clean.
from data_and_visualization.get_feat_space import get_feat_space
# NOTE: `all_GISID` is rebound to the value returned by get_feat_space; the
# later cells use that returned value to index `dffeat`.
feat_space, all_GISID, dffeat = get_feat_space(path_catchments_geodata, all_GISID=all_GISID, get_df=True, normalize=False)

# Load the hourly series of the first site to inspect the raw columns.
GISID = all_GISID[0]
pathdata = '../data/real_data/GISID2hourly_data_withPET/{0}.csv'.format(GISID)
df = pd.read_csv(pathdata, sep=',')
df

Unnamed: 0,discharge,precip,t,datetime,tmin,tmax,tabs,pet
0,0.350167,0.229430,2005.000114,2005-01-01 00:00:00,-2.403909,3.026066,0.379802,0.540212
1,0.351000,0.077918,2005.000228,2005-01-01 01:00:00,-2.403909,3.026066,0.379802,0.540212
2,0.351833,0.094607,2005.000342,2005-01-01 02:00:00,-2.403909,3.026066,0.379802,0.540212
3,0.352500,0.264169,2005.000457,2005-01-01 03:00:00,-2.403909,3.026066,0.379802,0.540212
4,0.353167,0.181210,2005.000571,2005-01-01 04:00:00,-2.403909,3.026066,0.379802,0.540212
...,...,...,...,...,...,...,...,...
136580,0.618667,0.000000,2020.581626,2020-07-31 20:00:00,15.530965,26.443121,21.434227,4.725457
136581,0.611667,0.000000,2020.581740,2020-07-31 21:00:00,15.530965,26.443121,21.434227,4.725457
136582,0.607000,0.000000,2020.581853,2020-07-31 22:00:00,15.530965,26.443121,21.434227,4.725457
136583,0.605000,0.000000,2020.581967,2020-07-31 23:00:00,15.530965,26.443121,21.434227,4.725457


In [2]:
# Parse the timestamp strings of the preview frame into datetime values.
# The column is assigned directly rather than through `df.loc[:, ...]`:
# on pandas >= 2.0 a `.loc[:, col]` assignment writes into the existing
# object column in place, so the column would keep the object dtype.
# NOTE: the export loop further down re-reads the CSV, so this conversion
# only affects the frame currently held in `df`.
df['datetime'] = pd.to_datetime(df['datetime'])

In [3]:
# Build one `data_{site}.txt` file per site, inside a folder named after the site.
for GISID in all_GISID:
    # Build the input path from the current site ID. Reusing a path computed
    # once before the loop would read the first site's file for every site.
    pathdata = '../data/real_data/GISID2hourly_data_withPET/{0}.csv'.format(GISID)
    df = pd.read_csv(pathdata, sep=',')
    df = df.rename(columns={"discharge": "q"})

    # Conversion of discharge data to mm/h.
    # NOTE(review): presumably discharge is in m^3/s and the 'EZG ' column
    # (trailing space is part of the column name) is the catchment area in
    # km^2 -- confirm against the geodata table.
    df['q'] = df['q'] * 3600 * 1000 / (dffeat.loc[GISID, 'EZG '] * 1000000)

    df = df.rename(columns={"t": "timeyear"})
    df = df.rename(columns={"datetime": "date"})
    # Precipitation values at or below 0.1 are treated as no precipitation.
    df.loc[df['precip']<=0.1,'precip'] = 0
    df = df.rename(columns={"precip": "p"})
    # Every remaining missing value (in any column) is replaced by 0.
    df = df.fillna(0)

    ########### Filtering out some dates just to have faster computation for the tutorial
    df = df.loc[df['timeyear']>2014]
    df = df.reset_index(drop=True)

    # Create the site folder if needed; exist_ok makes re-running the cell safe.
    directory = './{0}/'.format(GISID)
    os.makedirs(directory, exist_ok=True)

    df.to_csv(directory+'data_{0}.txt'.format(GISID), index=False)

In [4]:
# Display the processed frame left in `df` by the export loop (the last site handled).
df

Unnamed: 0,q,p,timeyear,date,tmin,tmax,tabs,pet
0,0.048807,0.0,2014.000114,2014-01-01 00:00:00,-4.756859,2.940631,-1.325035,0.582874
1,0.047939,0.0,2014.000228,2014-01-01 01:00:00,-4.756859,2.940631,-1.325035,0.582874
2,0.047121,0.0,2014.000342,2014-01-01 02:00:00,-4.756859,2.940631,-1.325035,0.582874
3,0.046385,0.0,2014.000457,2014-01-01 03:00:00,-4.756859,2.940631,-1.325035,0.582874
4,0.046988,0.0,2014.000571,2014-01-01 04:00:00,-4.756859,2.940631,-1.325035,0.582874
...,...,...,...,...,...,...,...,...
57692,0.037942,0.0,2020.581626,2020-07-31 20:00:00,15.530965,26.443121,21.434227,4.725457
57693,0.037513,0.0,2020.581740,2020-07-31 21:00:00,15.530965,26.443121,21.434227,4.725457
57694,0.037227,0.0,2020.581853,2020-07-31 22:00:00,15.530965,26.443121,21.434227,4.725457
57695,0.037104,0.0,2020.581967,2020-07-31 23:00:00,15.530965,26.443121,21.434227,4.725457
