# Reading GHCN files

Code adapted from https://gitlab.com/snippets/1838910

In [13]:
import os
import pandas as pd

In [14]:
# Metadata specs #

# One row per fixed-width field of the station metadata file:
# (column name, start offset, end offset, dtype forced on read or None).
# Offsets are the (start, end) pairs handed to pandas.read_fwf as colspecs;
# a dtype of None leaves the column's type to pandas' inference.
_metadata_fields = (
    ("ID", 0, 12, str),
    ("LATITUDE", 12, 21, None),
    ("LONGITUDE", 21, 31, None),
    ("ELEVATION", 31, 38, None),
    ("STATE", 38, 41, str),
    ("NAME", 41, 72, str),
    ("GSN FLAG", 72, 76, str),
    ("HCN/CRN FLAG", 76, 80, str),
    ("WMO ID", 80, 86, str),
)

# Column extents, in file order.
metadata_col_specs = [
    (_start, _end) for _name, _start, _end, _kind in _metadata_fields
]

# Column labels, in file order.
metadata_names = [_name for _name, _start, _end, _kind in _metadata_fields]

# Only the columns with an explicit dtype are listed here.
metadata_dtype = {
    _name: _kind
    for _name, _start, _end, _kind in _metadata_fields
    if _kind is not None
}


# Data specs #

# Header fields that precede the per-day columns of each record.
data_header_names = ["ID", "YEAR", "MONTH", "ELEMENT"]
data_header_col_specs = [(0, 11), (11, 15), (15, 17), (17, 21)]
data_header_dtypes = dict(zip(data_header_names, (str, int, int, str)))

# Each day slot holds four fields, described here as
# (name prefix, start offset within the slot, end offset within the slot, dtype).
_day_fields = (
    ("VALUE", 0, 5, int),
    ("MFLAG", 5, 6, str),
    ("QFLAG", 6, 7, str),
    ("SFLAG", 7, 8, str),
)
_days = range(1, 32)       # 31 day slots per record, numbered from 1
_first_day_offset = 21     # the first day slot starts right after the header
_day_width = 8             # width of one day slot (5 + 1 + 1 + 1)

# Flat column labels: VALUE1, MFLAG1, QFLAG1, SFLAG1, VALUE2, ...
data_col_names = [
    _prefix + str(_day)
    for _day in _days
    for _prefix, _lo, _hi, _kind in _day_fields
]

# The same columns expressed as a (VAR_TYPE, DAY) MultiIndex.
data_replacement_col_names = pd.MultiIndex.from_tuples(
    [
        (_prefix, _day)
        for _day in _days
        for _prefix, _lo, _hi, _kind in _day_fields
    ],
    names=['VAR_TYPE', 'DAY'])

# Column extents of every per-day field, in the same order as data_col_names.
data_col_specs = [
    (
        _first_day_offset + (_day - 1) * _day_width + _lo,
        _first_day_offset + (_day - 1) * _day_width + _hi,
    )
    for _day in _days
    for _prefix, _lo, _hi, _kind in _day_fields
]

# One dtype mapping per day slot ...
data_col_dtypes = [
    {_prefix + str(_day): _kind for _prefix, _lo, _hi, _kind in _day_fields}
    for _day in _days
]
# ... all merged into data_header_dtypes, which therefore ends up covering
# the header columns and every per-day column.
for _per_day_dtypes in data_col_dtypes:
    data_header_dtypes.update(_per_day_dtypes)


In [29]:
def read_station_metadata(filename='data/ghcnd-stations.txt'):
    """Read the ghcnd station metadata file into a DataFrame.

    The file is parsed as fixed-width text using the module-level
    ``metadata_col_specs`` (column extents), ``metadata_names`` (column
    labels) and ``metadata_dtype`` (dtypes forced for selected columns).

    :param filename: path of the ghcnd station metadata file (anything
        ``pandas.read_fwf`` accepts as its first argument).
    :returns: station metadata as a pandas DataFrame with one column per
        entry of ``metadata_names``; ``ID`` is a regular column, not the index.

    """
    # colspecs must be given by keyword: pandas >= 2.0 only accepts
    # filepath_or_buffer positionally and raises TypeError for anything else.
    return pd.read_fwf(
        filename,
        colspecs=metadata_col_specs,
        names=metadata_names,
        dtype=metadata_dtype,
    )


In [28]:
# Load the station table from the default path ('data/ghcnd-stations.txt').
# NOTE(review): the saved output below lists ID as the index with 8 data
# columns, whereas read_station_metadata as defined above returns ID as a
# regular column; the output looks stale, re-run to refresh it.
df = read_station_metadata()

<class 'pandas.core.frame.DataFrame'>
Index: 115082 entries, ACW00011604 to ZI000067991
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   LATITUDE      115082 non-null  float64
 1   LONGITUDE     115082 non-null  float64
 2   ELEVATION     115082 non-null  float64
 3   STATE         71105 non-null   object 
 4   NAME          115082 non-null  object 
 5   GSN FLAG      991 non-null     object 
 6   HCN/CRN FLAG  1451 non-null    object 
 7   WMO ID        8088 non-null    object 
dtypes: float64(3), object(5)
memory usage: 7.9+ MB


In [17]:
# Export the current `df` to an Excel workbook under output/.
output_dir = 'output'
output_filename = 'ghcnd-stations.xlsx'
output_path = os.path.join(output_dir, output_filename)
# Create the destination first so the write does not fail on a fresh checkout
# where the output directory has not been created yet.
os.makedirs(output_dir, exist_ok=True)
df.to_excel(output_path)

In [53]:
def read_ghcn_data_file(filename):
    """Parse a fixed-width GHCN data file into a DataFrame.

    Columns are the header fields (``data_header_names``) followed by the
    per-day fields (``data_col_names``), located through the matching
    module-level column extents and typed with ``data_header_dtypes``.

    :param filename: path of the data file to read.
    :returns: a pandas DataFrame with one column per header and per-day field.

    """
    column_extents = [*data_header_col_specs, *data_col_specs]
    column_labels = [*data_header_names, *data_col_names]
    return pd.read_fwf(
        filename,
        colspecs=column_extents,
        names=column_labels,
        dtype=data_header_dtypes,
    )

In [19]:
# Load one station's .dly file and keep its 2020 PRCP records.
df = read_ghcn_data_file('data/AE000041196.dly')
df2020 = df[df['YEAR'] == 2020]
# Build the ELEMENT mask from df2020 itself: a mask computed on the full `df`
# carries a different index, so pandas has to reindex it and emits a
# "Boolean Series key will be reindexed" UserWarning.
dfprcp = df2020[df2020['ELEMENT'] == 'PRCP'].copy()
dfprcp

TypeError: read_ghcn_data_file() takes 0 positional arguments but 1 was given

In [58]:
# Column labels for the yearly CSV; they are supplied explicitly to read_csv.
column_names = [
    'ID',
    'DATE',
    'ELEMENT',
    'DATA VALUE',
    'M-FLAG',
    'Q-FLAG',
    'S-FLAG',
    'OBS-TIME',
]

# Every column is read as text except 'DATA VALUE', which is read as int.
data_col_dtypes = {
    label: (int if label == 'DATA VALUE' else str)
    for label in column_names
}

filename = 'data/2020.csv'
ghcn_all = pd.read_csv(filename, names=column_names, dtype=data_col_dtypes)

In [59]:
# Display the observations loaded from the yearly CSV file.
ghcn_all

Unnamed: 0,ID,DATE,ELEMENT,DATA VALUE,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME
0,AE000041196,20200101,TMIN,168,,,S,
1,AE000041196,20200101,PRCP,0,D,,S,
2,AE000041196,20200101,TAVG,211,H,,S,
3,AEM00041194,20200101,PRCP,0,,,S,
4,AEM00041194,20200101,TAVG,217,H,,S,
...,...,...,...,...,...,...,...,...
16693194,USW00094911,20200713,SNOW,0,,,H,
16693195,USW00094911,20200713,SNWD,0,,,H,0630
16693196,VQC00671740,20200713,TMAX,311,,,H,0800
16693197,VQC00671740,20200713,TMIN,244,,,H,0800


In [60]:
# Collect the distinct IDs of the stations whose STATE column is 'WI'.
stations = read_station_metadata()
wisc_stations = stations.loc[stations['STATE'].eq('WI')]
wisc_stations_list = wisc_stations['ID'].drop_duplicates().tolist()

In [68]:
# Keep the SNOW rows whose DATE string starts with '202001'.
filtered = ghcn_all.loc[
    ghcn_all['ELEMENT'].eq('SNOW')
    & ghcn_all['DATE'].str.startswith('202001')
]
filtered

Unnamed: 0,ID,DATE,ELEMENT,DATA VALUE,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME
5824,BF1FP000001,20200101,SNOW,0,,,N,
5830,BF1SS000005,20200101,SNOW,0,,,N,
5832,BF1SS000006,20200101,SNOW,0,,,N,
5834,BF1SS000007,20200101,SNOW,0,,,N,
6155,CA001011500,20200101,SNOW,0,,,C,
...,...,...,...,...,...,...,...,...
2892193,USW00094982,20200131,SNOW,13,,,W,
2892205,USW00094985,20200131,SNOW,0,,,W,
2892261,USW00094993,20200131,SNOW,0,,,W,
2892465,VQW00011624,20200131,SNOW,0,,,H,


In [69]:
# Write one Excel file per station in wisc_stations_list that has rows in
# `filtered`.
wi_output_dir = 'output/wi'
# Make sure the destination exists before the first to_excel call.
os.makedirs(wi_output_dir, exist_ok=True)

# Select the wanted stations once and split by ID in a single pass, instead
# of re-scanning the whole of `filtered` for every station in the list.
wi_rows = filtered[filtered['ID'].isin(wisc_stations_list)]
for station, station_df in wi_rows.groupby('ID'):
    # groupby only yields IDs that actually occur, so no empty check is needed.
    output_file = '{}/{}.xlsx'.format(wi_output_dir, station)
    station_df.to_excel(output_file, index=False)

In [65]:
# Display `filtered`.
# NOTE(review): the saved output below shows rows for a single station only,
# which the `filtered` definition above (all SNOW rows dated 202001) would
# not produce, and its execution count precedes the cells above; it looks
# like stale state from an earlier run. Re-run the notebook top to bottom.
filtered

Unnamed: 0,ID,DATE,ELEMENT,DATA VALUE,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME
1634436,US1WIAD0005,20200118,SNOW,104,,,N,
2564032,US1WIAD0005,20200128,SNOW,0,,,N,
