<a href="https://colab.research.google.com/github/stevegbrooks/commodify/blob/main/noaa_data_download_and_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Download NCEI NOAA weather data via wget 

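Download the data with wget from a command prompt (Windows builds of wget are available at https://eternallybored.org/misc/wget/). The `{1929..2021}` year range in the command below relies on shell brace expansion, so run it from a shell that supports it (e.g. bash).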

Command:

```bash
wget -r --no-parent --reject "index.html*" https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/{1929..2021}
```


## 1. Import libraries

In [None]:
import os
import pandas as pd
import numpy as np

## 2. User inputs (directories, years to process) 

In [None]:
# User-configurable settings
# Root folder that holds one sub-folder of station CSV files per year
input_dir = r"C:\cygwin64\home\irtx\www.ncei.noaa.gov\data\global-summary-of-the-day\access"
# First and last year to process (both inclusive)
start_year, end_year = 1929, 2021

## 3. Process data 

In [None]:
li = []    # per-station dataframes, one per CSV file of a US station

# Loop to process files in each folder (by year)
for i in range(start_year, end_year + 1):
    # os.path.join replaces the hand-written "\" separator, which was an
    # invalid "\{" escape in a normal string literal and Windows-only.
    filepath = os.path.join(input_dir, str(i))
    print("Processing year {0}/{1}...".format(i, end_year), end='\r')

    for file in os.listdir(filepath):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(filepath, file), index_col=None, header=0)

        # A file with no rows or no NAME column cannot be matched to a country,
        # so it is skipped explicitly instead of via a catch-all "except: pass".
        if df.empty or 'NAME' not in df.columns:
            continue

        # Only append df if station is in USA. The decision is made on the
        # first record only; a missing NAME is read as NaN (not a str) and
        # is treated as "not a US station".
        first_name = df['NAME'].iloc[0]
        if isinstance(first_name, str) and first_name.endswith("US"):
            li.append(df)

# Combine dfs into one main df - 'usa'
usa = pd.concat(li, axis=0, ignore_index=True)
# STATE is the two characters immediately before the last three characters of NAME
# (presumably names look like "..., AK US" - verify against the data)
usa['STATE'] = usa['NAME'].str[-5:-3]
usa['DATE'] = pd.to_datetime(usa['DATE'])
# Index the rows by date (the DATE column is kept). DATE is already datetime64
# at this point, so it is used directly rather than parsed a second time.
usa.index = pd.DatetimeIndex(usa['DATE'])

# Only keep rows with correct STATE values
us_states = ['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
             'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA',
             'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE',
             'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
             'SC', 'SD', 'TN', 'TX', 'UM', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI',
             'WV', 'WY']

# Exact membership test. A regex str.contains() returns NaN for rows whose
# STATE is missing, which makes the boolean mask unusable; isin() maps those
# rows to False so they are simply dropped.
usa = usa[usa['STATE'].isin(us_states)]

# Save out useful attributes (copy, so the replacements below modify this table only)
usa = usa[['DATE','TEMP','DEWP','SLP','WDSP','MXSPD','GUST','PRCP','SNDP','FRSHTT','STATE']].copy()

# Replace placeholder values (9999.9, 999.9, 99.99) with N/A *before* averaging.
# Masking the monthly means afterwards only catches months in which every
# reading was a placeholder; in all other months the placeholders would
# silently inflate the mean.
# Each column maps to a threshold just below its placeholder value.
placeholder_floor = {
    'TEMP': 9999, 'DEWP': 9999, 'SLP': 9999,
    'WDSP': 999, 'MXSPD': 999, 'GUST': 999, 'SNDP': 999,
    'PRCP': 99,
}
for i, floor in placeholder_floor.items():
    # Positional (numpy) mask: the date index contains duplicate labels
    mask = (usa[i] > floor).to_numpy()
    usa.loc[mask, i] = np.nan

# Calculate monthly mean values per state. numeric_only=True keeps the
# non-numeric DATE and STATE columns out of the mean (newer pandas raises on
# them otherwise); STATE is still available as an index level.
# NOTE(review): FRSHTT looks like a packed indicator code rather than a
# measurement - confirm that averaging it is intended.
usa = usa.groupby(by=[usa.index.year, usa.index.month, usa.STATE]).mean(numeric_only=True)
usa.index.set_names(["YEAR", "MONTH", "STATE"], inplace=True)

## 4. Export CSV (monthly means in USA states)

In [None]:
# Export processed file as csv into the input directory.
# os.path.join replaces the hand-written "\" separator, which was an invalid
# "\p" escape in a normal string literal and only valid on Windows.
output_path = os.path.join(input_dir, "processed_usa_{0}-{1}.csv".format(start_year, end_year))
usa.to_csv(output_path)