In [1]:
"""
This code converts .dly files from the ghcn daily database to a csv format, after selecting a single metric and splitting the daily data into separate rows for machine learning.
John Moen

Dataset Database: https://www.ncei.noaa.gov/pub/data/ghcn/daily/ (download link: https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd_all.tar.gz)
Dataset Readme: https://www.ncei.noaa.gov/pub/data/ghcn/daily/readme.txt
"""

from pathlib import Path

import pandas as pd

# Element code of the single metric to extract from the station file
METRIC = "TMAX"

# Name of the raw fixed-width station file (including its .dly extension)
STATION = "US1AKAB0003.dly"

# Input path of the raw station file
FILE_PATH = "../data/raw_station_data/" + STATION

# Output path: same base name as the station file, with the 4-character
# ".dly" suffix swapped for ".csv"
OUT_FILE_PATH = "../data/cleaned_station_data/" + STATION[:-4] + ".csv"

In [2]:
"""
  Generate the colspecs (column boundaries) needed to read the fixed-width-format .dly file.
  read_fwf()'s colspecs='infer' option does not work here because some fields are left blank.
  
  From the data set documentation:

    ------------------------------
    Variable   Columns   Type
    ------------------------------
    ID            1-11   Character
    YEAR         12-15   Integer
    MONTH        16-17   Integer
    ELEMENT      18-21   Character
    VALUE1       22-26   Integer
    MFLAG1       27-27   Character
    QFLAG1       28-28   Character
    SFLAG1       29-29   Character
    VALUE2       30-34   Integer
    MFLAG2       35-35   Character
    QFLAG2       36-36   Character
    SFLAG2       37-37   Character
      .           .          .
      .           .          .
      .           .          .
    VALUE31    262-266   Integer
    MFLAG31    267-267   Character
    QFLAG31    268-268   Character
    SFLAG31    269-269   Character
    ------------------------------

  I generate a list of tuples, each representing one entry of the 'Columns' column above, skipping the M, Q, and S flags
"""

# Half-open (start, end) character offsets for ID, YEAR, MONTH and ELEMENT
header_specs = [(0, 11), (11, 15), (15, 17), (17, 21)]

# Each of the 31 daily entries is 8 characters wide: a 5-character VALUE
# followed by three 1-character flags. Keep only the VALUE slice of each entry.
value_specs = [(start, start + 5) for start in range(21, 21 + 31 * 8, 8)]

colspecs = header_specs + value_specs



In [3]:
# Parse the fixed-width station file using the explicit column boundaries.
# The file has no header row, so the columns get integer labels (0, 1, 2, ...).
df = pd.read_fwf(
    FILE_PATH,
    colspecs=colspecs,
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,US1AKAB0003,2007,4,PRCP,-9999,-9999,-9999,-9999,-9999,-9999,...,-9999,-9999,-9999,-9999,-9999,-9999,0,0,0,-9999
1,US1AKAB0003,2007,4,SNOW,-9999,-9999,-9999,-9999,-9999,-9999,...,-9999,-9999,-9999,-9999,-9999,-9999,0,0,0,-9999
2,US1AKAB0003,2007,4,SNWD,-9999,-9999,-9999,-9999,-9999,-9999,...,-9999,-9999,-9999,-9999,-9999,-9999,0,0,0,-9999
3,US1AKAB0003,2007,4,WESD,-9999,-9999,-9999,-9999,-9999,-9999,...,-9999,-9999,-9999,-9999,-9999,-9999,-9999,0,0,-9999
4,US1AKAB0003,2007,4,WESF,-9999,-9999,-9999,-9999,-9999,-9999,...,-9999,-9999,-9999,-9999,-9999,-9999,-9999,0,0,-9999


In [4]:
"""
Convert the data into a machine learning usable format, w/ each day on a different row

"""

# Layout of the raw frame: integer columns 0-3 hold ID, YEAR, MONTH and ELEMENT,
# and columns 4-34 hold the 31 daily values (VALUE1..VALUE31).
FIRST_VALUE_COLUMN = 4
DAYS_PER_RECORD = 31
MISSING_VALUE = -9999  # GHCN-Daily sentinel for "no value" (see dataset readme)

columns = ["station_id", "year", "month", "day", "metric", "value"]

# Collect plain records and build the frame once at the end; appending to a
# DataFrame one row at a time gets slower with every insert.
records = []

# Keep only the monthly records for the chosen METRIC
for raw in df.loc[df[3] == METRIC].itertuples(index=False, name=None):
    station_id, year, month, metric = raw[:FIRST_VALUE_COLUMN]
    daily_values = raw[FIRST_VALUE_COLUMN:FIRST_VALUE_COLUMN + DAYS_PER_RECORD]

    # One output row per day slot (1..31) of the monthly record
    for day, raw_value in enumerate(daily_values, start=1):
        # Store the missing sentinel as NaN instead of scaling it to -999.9.
        # The sample output shows -9999 in the day-31 slot for April, so day
        # slots beyond the end of a shorter month also end up as NaN here.
        # NOTE(review): the /10 scaling matches elements recorded in tenths of a
        # unit (e.g. TMAX in tenths of degrees C per the readme); confirm the
        # divisor before switching METRIC to an element with different units.
        value = float("nan") if raw_value == MISSING_VALUE else raw_value / 10
        records.append([station_id, year, month, day, metric, value])

new_df = pd.DataFrame(records, columns=columns)
new_df.head()


Unnamed: 0,station_id,year,month,day,metric,value


In [5]:
# Save the dataframe to a csv with the same filename.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (needed on a fresh checkout).
Path(OUT_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(OUT_FILE_PATH, index=False)