In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


## Define folders

In [None]:
#Read main path
#(strip surrounding whitespace so that a trailing newline in the text file
# does not end up inside every path derived below)
with open('../path_main.txt', 'r') as file:
    path_main = file.read().strip()

#Folders used by the notebook; path_main is expected to end with a path separator
dir_scripts = f'{path_main}Scripts/'
dir_GSOD    = f'{path_main}Data/GSOD/'
dir_out     = f'{path_main}Data/GSOD/Stations_all_years/'

#Create output folder (including missing parent folders); no error if it already exists
os.makedirs(dir_out, exist_ok=True)


## Get list of all stations

In [None]:
#Define years
years = np.arange(1981, 2021)

#Initialize set to collect stations
all_stations = set()

#Cumulative number of distinct stations found up to and including each year
#(np.nan instead of np.NaN, which was removed in NumPy 2.0)
N = np.full(len(years), np.nan)

#Loop over years
for i, year in enumerate(years):

    #Get files in folder
    dir_files = f'{dir_GSOD}stations_{year}/'
    files     = os.listdir(dir_files)

    #Extract station IDs (part of the file name before the first dot).
    #Only CSV files are considered because the export step below reads '<station>.csv' exclusively.
    STA_IDS = [file.split('.')[0] for file in files if file.endswith('.csv')]

    #Add to set of all stations
    all_stations.update(STA_IDS)
    N[i] = len(all_stations)

#Convert to sorted list
all_stations = sorted(all_stations)

#Plot cumulative number of stations against year
fig, ax = plt.subplots()
ax.plot(years, N)
ax.set_xlabel('Year')
ax.set_ylabel('Cumulative number of distinct stations')
ax.set_title('GSOD stations found up to each year');


## Save data for each station and variable in one file

In [None]:
#Define variables
variables = ['MAX', 'MIN']

#Define output names
vars_out = {'MAX': 'TX', 'MIN': 'TN'}

#Minimum number of valid daily values a station needs in order to be saved (~10 years)
min_valid_days = 10 * 365

#Select variable
for variable in variables:

    #Count stations that are excluded for this variable
    N = 0

    #Loop over all stations
    for station in all_stations:

        #Collect one chunk of dates and data per year; concatenate once after the loop
        dates_per_year = []
        data_per_year  = []
        data_read      = None

        #Loop over years
        for year in years:

            #Define file
            file_in = f'{dir_GSOD}stations_{year}/{station}.csv'

            #Check the file directly instead of listing the whole folder
            #for every combination of variable, station and year
            if os.path.isfile(file_in):

                #Read data
                data_read = pd.read_csv(file_in)

                #Get dates and data and mask missing values
                dates = pd.DatetimeIndex(data_read['DATE'])
                data  = data_read[variable]
                data  = data.where(data<9999)

                #Convert °F to °C
                if variable in ['MAX', 'MIN']:
                    data = (data - 32) * 5/9

            else:

                #Create array with NaNs if data is not available
                #(ISO dates avoid ambiguous day/month parsing;
                # np.nan instead of np.NaN, which was removed in NumPy 2.0)
                dates = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31')
                data  = np.full(len(dates), np.nan)

            #Collect in lists
            dates_per_year.append(np.asarray(dates))
            data_per_year.append(np.asarray(data, dtype=float))

        #Concatenate all years
        dates_coll = np.concatenate(dates_per_year, axis=0)
        data_coll  = np.concatenate(data_per_year, axis=0)

        #Skip stations with too few valid values (checked before building any output objects)
        if np.sum(~np.isnan(data_coll))<min_valid_days:
            N = N + 1
            continue

        #Convert to DataFrame indexed by time
        data_out = pd.DataFrame({vars_out[variable]: data_coll},
                                index=pd.DatetimeIndex(dates_coll, name='time'))

        #Convert to xarray and add lat and lon
        #(coordinates are taken from the first row of the most recently read file of this station)
        data_out = data_out.to_xarray()
        data_out['lat'] = data_read['LATITUDE'].iloc[0]
        data_out['lon'] = data_read['LONGITUDE'].iloc[0]

        #Save in file
        file_out = f'{dir_out}{vars_out[variable]}_STA{station}_{years[0]}-{years[-1]}.nc'
        data_out.to_netcdf(file_out)

    print('Stations with less than 10 years of data are excluded. Number of excluded stations: ' +
          str(N) + ' out of ' + str(len(all_stations)) + ' for ' + variable)
