In [1]:
"""
This code converts .dly files from the ghcn daily database to a csv format, after selecting a single metric, splitting the daily data into separate rows, and assigning values to States for machine learning.
John Moen

Dataset Database: https://www.ncei.noaa.gov/pub/data/ghcn/daily/ (dwnload link: https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd_all.tar.gz)
Dataset Readme: https://www.ncei.noaa.gov/pub/data/ghcn/daily/readme.txt
"""
import pandas as pd
import glob
import os


In [2]:
# This code gets all the filenames from the raw_station_data folder

cwd = os.getcwd()
os.listdir("../data/raw_station_data")
# print(glob.glob(r"C:\Users\lundk\Desktop\ClassWork\Final Project\Dev\data\raw_station_data\*.dly"))

# mydir = r"C:\Users\lundk\Desktop\ClassWork\Final Project\Dev\data\raw_station_data"
file_list = os.listdir("../data/raw_station_data")

# file_list = glob.glob(mydir + "/*.dly") 

# file_list = [os.path.basename(file) for file in file_list]
# for file in file_list: print (file)

# print(mydir)
print(file_list)

['US10deue020.dly', 'US1AKAB0003.dly', 'US1FLAL0072.dly', 'US1MNRW0014.dly', 'US1MNSB0004.dly', 'US1MNSB0005.dly', 'US1MOBY0007.dly', 'US1MOBY0010.dly', 'US1MOBY0012.dly', 'US1MOBY0013.dly', 'US1MOBY0014.dly', 'US1MOBY0018.dly', 'USC00135123.dly', 'USC00135193.dly', 'USC00331497.dly', 'USC00518051.dly', 'USW00024091.dly', 'USW00024101.dly', 'USW00024103.dly', 'USW00024106.dly']


In [3]:
METRIC = "TMAX"
FILE_PATH = "../data/raw_station_data/"
# OUT_FILE_PATH = f"../data/cleaned_station_data/{STATION[:-4]}.csv"

In [4]:
"""
  Generate the colspecs, these are for the .fwf 'fixed width format' filetypes. 
  The .read_fwf()'s  'infer' does not work here as some data is left blank as a value
  
  From the data set documentation:

    ------------------------------
    Variable   Columns   Type
    ------------------------------
    ID            1-11   Character
    YEAR         12-15   Integer
    MONTH        16-17   Integer
    ELEMENT      18-21   Character
    VALUE1       22-26   Integer
    MFLAG1       27-27   Character
    QFLAG1       28-28   Character
    SFLAG1       29-29   Character
    VALUE2       30-34   Integer
    MFLAG2       35-35   Character
    QFLAG2       36-36   Character
    SFLAG2       37-37   Character
      .           .          .
      .           .          .
      .           .          .
    VALUE31    262-266   Integer
    MFLAG31    267-267   Character
    QFLAG31    268-268   Character
    SFLAG31    269-269   Character
    ------------------------------

  I generate a list of of tuples each representing the 'columns' column above, skipping the M,Q, and S flags
"""

# Generate colspecs for reading the dly files into a dataframe

# Initialize colspecs with ID, YEAR, MONTH, and ELEMENT
colspecs = [(0, 11), (11, 15), (15, 17), (17, 21)]

# Generate remaining colspecs for each (31) day, skipping the flags
x = 21
for i in range(31):
    colspecs.append((x,x+5))
    x += 8

print(colspecs)

[(0, 11), (11, 15), (15, 17), (17, 21), (21, 26), (29, 34), (37, 42), (45, 50), (53, 58), (61, 66), (69, 74), (77, 82), (85, 90), (93, 98), (101, 106), (109, 114), (117, 122), (125, 130), (133, 138), (141, 146), (149, 154), (157, 162), (165, 170), (173, 178), (181, 186), (189, 194), (197, 202), (205, 210), (213, 218), (221, 226), (229, 234), (237, 242), (245, 250), (253, 258), (261, 266)]


In [5]:
# Read in lookup table

stations_colspecs = [(0, 11), (13, 20), (22, 30), (32, 37), (38, 40), (41, -1)]
stations_lookup_df = pd.read_fwf('../data/lookup_data/station_state_city.txt', colspecs=stations_colspecs, names=['station_id', 'latitude', 'longitude', 'elevation', 'state_code', 'station_name'])

In [6]:
# Initialize the final dataframe
columns = ["station_id", "state_code", "year", "month", "day", "metric", "value"]
final_df = pd.DataFrame(columns=columns)


# Read in the data one data at a time
for file in file_list:
    
    station_df = pd.read_fwf(f"../data/raw_station_data/{file}", header=None, colspecs=colspecs)

    # # Initialize the Data frame
    # columns = ["station_id", "year", "month", "day", "metric", "value"]
    # temp_df = pd.DataFrame(columns=columns)

    # Loop through raw dataframe, only pulling the chosen METRIC
    # This print if for debugging
    print(METRIC in station_df[3].unique())

    # Check if there are any of the METRIC in the dataset at
    if METRIC in station_df[3].unique():


        # Select only Rows w/ selected METRIC
        for index, row in station_df.iterrows():
            if row[3] == METRIC:
                

                # Look up the state code
                state_code = stations_lookup_df[stations_lookup_df.station_id == station_df[0][0]].state_code

            
                # For each day in the month, create a row
                day_counter = 4
                for i in range(30):

                    year, month, day, value =  row[1], row[2], day_counter - 3, row[day_counter] / 10
                    new_row = [station_df[0][0], state_code, row[1], row[2], day_counter - 3, METRIC, row[day_counter] / 10]

                    final_df.loc[len(final_df.index)] = new_row

                    # test = final_df.loc[(final_df["state_code"] == state_code ) & 
                    #                 (final_df['year'] == year) &
                    #                 (final_df['month'] == month) &
                    #                 (final_df['day'] == day)
                    #                 , "value"]

                    # if final_df.loc[(final_df["state_code"] == state_code ) & 
                    #                 (final_df['year'] == year) &
                    #                 (final_df['month'] == month) &
                    #                 (final_df['day'] == day)
                    #                 , "value"] < value:
                    #     print("found one!")

                    #     final_df.loc[(final_df["state_code"] == state_code ) & 
                    #                 (final_df['year'] == year) &
                    #                 (final_df['month'] == month) &
                    #                 (final_df['day'] == day)
                    #                 , "value"][0] = value
                        





                    # if pd.isnull(final_df[(final_df.state_code == state_code) & 
                    #             (final_df.year == year) & 
                    #             (final_df.month == month) & 
                    #             (final_df.day == day)].value.item()):
                    #     final_df[(final_df.state_code == state_code) & 
                    #             (final_df.year == year) & 
                    #             (final_df.month == month) & 
                    #             (final_df.day == day)].value.item() = value
                    # elif final_df[(final_df.state_code == state_code) & 
                    #             (final_df.year == year) & 
                    #             (final_df.month == month) & 
                    #             (final_df.day == day)].value.item() < value:
                    #         final_df[(final_df.state_code == state_code) & 
                    #             (final_df.year == year) & 
                    #             (final_df.month == month) & 
                    #             (final_df.day == day)].value.item() = value






                    # temp_df.loc[len(temp_df.index)] = [row[0], row[1], row[2], day_counter - 3, row[3], row[day_counter] / 10]
                    day_counter += 1

    # Use lookup table for the state 
final_df.head()
    

False
False
False
False
False
False
False
False
False
False
False
False
True


KeyboardInterrupt: 