In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

GHCND_PATH = Path("/mnt/d/climate_data/ghcnd/data/")
DATA_DOC_PATH = Path("./data/dataset_docs/")

In [None]:
def load_stn_tmean(stn_id):
    """
    Load GHCN-d TMEAN data that passed QC into a dataframe with datetime index
    """
    # Load and reindex data
    df = pd.read_csv(GHCND_PATH / f"{stn_id}.csv", low_memory=False, usecols=['DATE', 'TMAX', 'TMIN', 'TMAX_ATTRIBUTES', 'TMIN_ATTRIBUTES'], index_col='DATE', parse_dates=True, na_values=[9999, -9999]).asfreq('D')

    # Remove flagged data
    tmax_split = df['TMAX_ATTRIBUTES'].str.split(',', expand=True)
    tmin_split = df['TMIN_ATTRIBUTES'].str.split(',', expand=True)

    ## Set to NaN if flag1 or flag2 is not empty
    df.loc[(tmax_split[0] != '') | (tmax_split[1] != ''), 'TMAX'] = np.nan
    df.loc[(tmin_split[0] != '') | (tmin_split[1] != ''), 'TMIN'] = np.nan

    df["TMEAN"] = (df["TMAX"] + df["TMIN"]) / 20 # TEMP data stored as tenth of Celsius

    return df.drop(columns=['TMAX', 'TMIN', 'TMAX_ATTRIBUTES', 'TMIN_ATTRIBUTES'])

def get_stn_tmean_summary(stn_id):
    """
    Compute station data summary 
    """
    try:
        temp_df = load_stn_tmean(stn_id)
    except:
        return (np.nan, 0, np.nan)
    POR_Date_Range = temp_df.index[0].strftime("%Y%m%d")+"-"+temp_df.index[-1].strftime("%Y%m%d")
    Num_Years = len(temp_df.index.year.unique())
    PCT_POR_Good = (len(temp_df["TMEAN"].dropna()) / len(temp_df)) * 100
    return (POR_Date_Range, Num_Years, round(PCT_POR_Good, 1))

In [3]:
ghcnd_stations = pd.read_csv(DATA_DOC_PATH / "ghcnd_us_stations.csv").rename(columns={'STATION_ID': 'StnID'})
ghcnd_stations

Unnamed: 0,StnID,LAT,LON,ELEV,STATE,NAME
0,US009052008,43.7333,-96.6333,482.0,SD,SIOUX FALLS (ENVIRON. CANADA)
1,US10RMHS145,40.5268,-105.1113,1569.1,CO,RMHS 1.6 SSW
2,US10adam001,40.5680,-98.5069,598.0,NE,JUNIATA 1.5 S
3,US10adam002,40.5093,-98.5493,601.1,NE,JUNIATA 6.0 SSW
4,US10adam003,40.4663,-98.6537,615.1,NE,HOLSTEIN 0.1 NW
...,...,...,...,...,...,...
65165,USW00096405,60.4731,-145.3542,25.3,AK,CORDOVA 14 ESE
65166,USW00096406,64.5014,-154.1297,78.9,AK,RUBY 44 ESE 7
65167,USW00096407,66.5620,-159.0036,6.7,AK,SELAWIK 28 E 7
65168,USW00096408,63.4519,-150.8747,678.2,AK,DENALI 27 N 7


In [4]:
ghcnd_stations[["POR_Date_Range", "Num_Years", "PCT_POR_Good"]] = ghcnd_stations.apply(lambda row: get_stn_tmean_summary(row["StnID"]), axis=1, result_type='expand')
ghcnd_stations.head(3)

Unnamed: 0,StnID,LAT,LON,ELEV,STATE,NAME,POR_Date_Range,Num_Years,PCT_POR_Good
0,US009052008,43.7333,-96.6333,482.0,SD,SIOUX FALLS (ENVIRON. CANADA),20081008-20160106,9.0,89.1
1,US10RMHS145,40.5268,-105.1113,1569.1,CO,RMHS 1.6 SSW,,0.0,
2,US10adam001,40.568,-98.5069,598.0,NE,JUNIATA 1.5 S,,0.0,


In [5]:
ghcnd_stations.to_csv(DATA_DOC_PATH / "GHCND_TMEAN_station_info.csv", index=False)