# Weather Predictions on AWS Autopilot with NOAA Data

https://docs.opendata.aws/noaa-ghcn-pds/readme.html

Download the data from the public NOAA S3 bucket and copy the cleaned-up files to my own bucket (`raw-weather-data`).

In [13]:
%%bash
# Stop at the first failing command so that a failed download or cleanup step
# can never be followed by an upload of a stale or partial stations file.
set -euo pipefail

# get and cleanup the stations file
# (stations_cleanup.py lives outside this notebook; presumably it reads the
#  downloaded .txt and writes ./weather/stations.csv -- verify against the script)
aws s3 cp s3://noaa-ghcn-pds/ghcnd-stations.txt ./weather/ghcnd-stations.txt
python3 stations_cleanup.py

# upload it
aws s3 cp ./weather/stations.csv s3://raw-weather-data/ghcnd-stations.csv

# Remove only the raw download. ./weather/stations.csv is kept on purpose:
# later cells in this notebook read it.
rm -f ./weather/ghcnd-stations.txt

download: s3://noaa-ghcn-pds/ghcnd-stations.txt to weather/ghcnd-stations.txt


Let's try to only generate files for the five core elements:

* PRCP = Precipitation (tenths of mm)
* SNOW = Snowfall (mm)
* SNWD = Snow depth (mm)
* TMAX = Maximum temperature (tenths of degrees C)
* TMIN = Minimum temperature (tenths of degrees C)

In [19]:
%%bash
# Per-year pipeline: download one year of GHCN-Daily observations, drop rows
# that carry a quality flag, split the rest by element, join each core element
# with the stations file and upload the result to S3.
#
# Fail fast: abort on the first failing command, unset variable or failed
# pipeline stage instead of carrying on with missing or partial files.
set -euo pipefail

# The five core elements that get exported
ELEMENTS="PRCP SNOW SNWD TMAX TMIN"

for VARIABLE in 2020; do
    # awk does not create directories, so the per-year output directory must exist
    mkdir -p ./weather/"$VARIABLE"
    # Get the file
    aws s3 cp s3://noaa-ghcn-pds/csv.gz/"$VARIABLE".csv.gz ./weather/"$VARIABLE".csv.gz
    # Decompress the gzip file in place (-f overwrites a leftover from an aborted run)
    gzip -df ./weather/"$VARIABLE".csv.gz
    # Add headers
    { echo 'id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME'; cat ./weather/"$VARIABLE".csv; } > ./weather/"$VARIABLE"_with_headers.csv
    # Filter out the rows with bad data: keep the header line plus every row
    # whose Q-FLAG (6th field) is empty. The file is plain, unquoted CSV, so the
    # field separator has to be a bare comma for $6 to be the Q-FLAG at all.
    awk -F ',' 'NR == 1 || $6 == ""' ./weather/"$VARIABLE"_with_headers.csv > ./weather/"$VARIABLE"_filtered.csv
    # Create a separate file for each core element (third column). The header
    # line is skipped, and ">" truncates on first open so that re-running the
    # cell does not append duplicate rows to files left by an earlier run.
    awk -v year="$VARIABLE" -v elements="$ELEMENTS" -F ',' '
        BEGIN { n = split(elements, names, " "); for (i = 1; i <= n; i++) wanted[names[i]] = 1 }
        NR > 1 && ($3 in wanted) { print > ("./weather/" year "/" $3 ".csv") }
    ' ./weather/"$VARIABLE"_filtered.csv
    # Combine the stations data in and add headers back to the remaining files
    for ELEMENT in $ELEMENTS; do
        # join needs both inputs sorted on the join field using the same
        # collation that join itself uses, hence the explicit key and LC_ALL=C.
        LC_ALL=C join -t, \
            <(LC_ALL=C sort -t, -k1,1 ./weather/"$VARIABLE"/"$ELEMENT".csv) \
            <(sed 1d ./weather/stations.csv | LC_ALL=C sort -t, -k1,1) \
            > ./weather/"$VARIABLE"/"$ELEMENT"_combined.csv
        {
            echo 'id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME,state'
            cat ./weather/"$VARIABLE"/"$ELEMENT"_combined.csv
        } > ./weather/"$VARIABLE"/"$ELEMENT"_with_headers.csv
        # Upload the finished file to its per-element S3 prefix
        aws s3 cp ./weather/"$VARIABLE"/"$ELEMENT"_with_headers.csv s3://raw-weather-data/"$ELEMENT"/"$VARIABLE".csv
    done
    # Delete every intermediate file in the year directory, keeping only *_with_headers.csv
    find ./weather/"$VARIABLE" -maxdepth 1 -type f ! -name '*_with_headers.csv' -delete
    # Remove the raw and header-prefixed copies of the year file;
    # ./weather/<year>_filtered.csv is kept for the preview cells further down.
    rm -f ./weather/"$VARIABLE"_with_headers.csv ./weather/"$VARIABLE".csv*
done

download: s3://noaa-ghcn-pds/csv.gz/2020.csv.gz to weather/2020.csv.gz


## Inspect the stations data combined with each file

In [20]:
import pandas as pd

# Read just the first 1000 rows of the filtered 2020 observations and of the
# stations file -- enough to inspect the columns without loading everything.
_2020, stations = (
    pd.read_csv(csv_path, nrows=1000)
    for csv_path in ('./weather/2020_filtered.csv', './weather/stations.csv')
)

In [22]:
# First 1000 rows of the SNOW file written by the shell pipeline
snow_combined = pd.read_csv(
    filepath_or_buffer='./weather/2020/SNOW_with_headers.csv',
    nrows=1000,
)

In [23]:
# Show the first five rows of the SNOW sample
snow_combined.head(n=5)

Unnamed: 0,id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME
BF1BI000001,20200716,SNOW,0,,,N,900,BH
BF1EX000001,20200107,SNOW,0,,,N,800,BH
BF1EX000001,20200108,SNOW,0,,,N,800,BH
BF1EX000001,20200114,SNOW,0,,,N,800,BH
BF1EX000001,20200115,SNOW,0,,,N,800,BH


In [None]:
# Number of sampled observations per element type
_2020.loc[:, 'element'].value_counts()

In [None]:
import os

import sagemaker

# Upload the quality-filtered observations written by the shell pipeline above.
# The previous path, "./weather/2022.csv", never exists after a fresh run: the
# pipeline only processes 2020 and removes the raw per-year CSV when it finishes.
# NOTE(review): confirm that the filtered year file (rather than one of the
# per-element *_with_headers.csv files) is the intended input.
local_path = "./weather/2020_filtered.csv"
if not os.path.isfile(local_path):
    # Fail with an actionable message before a SageMaker session is created
    raise FileNotFoundError(
        f"{local_path} not found - run the data preparation cells first"
    )

prefix = 'sagemaker/weather-predictions/input'
sess   = sagemaker.Session()

# upload_data returns the S3 URI of the uploaded object
uri = sess.upload_data(path=local_path, key_prefix=prefix)
print(uri)