# Weather Predictions on AWS Autopilot with NOAA Data

Dataset documentation: https://docs.opendata.aws/noaa-ghcn-pds/readme.html

Get the data from the public NOAA S3 bucket (`noaa-ghcn-pds`) and copy it to my own bucket (`raw-weather-data`).

In [8]:
%%bash
# Fetch the GHCN-D station list, convert it to CSV, and publish it to the
# project bucket.
# Abort on the first failing command so a failed download or cleanup script
# never leads to uploading (or deleting) a stale or missing file.
set -euo pipefail

mkdir -p ./weather

# get and cleanup the stations file
aws s3 cp s3://noaa-ghcn-pds/ghcnd-stations.txt ./weather/ghcnd-stations.txt
# NOTE(review): stations_cleanup.py is expected to write ./weather/stations.csv
# (the file uploaded below) — confirm against the script.
python3 stations_cleanup.py

# upload it
aws s3 cp ./weather/stations.csv s3://raw-weather-data/ghcnd-stations.csv

# Remove only the raw download. ./weather/stations.csv is kept because the
# join cell later in this notebook reads it.
rm ./weather/ghcnd-stations.txt

download: s3://noaa-ghcn-pds/ghcnd-stations.txt to weather/ghcnd-stations.txt
upload: weather/stations.csv to s3://raw-weather-data/ghcnd-stations.csv


In [19]:
%%bash
# For each year: download the GHCN-D observations, decompress them, prepend a
# header row, and keep only the rows whose Q-FLAG is empty.
# Abort on the first failing command so later steps never run on partial data.
set -euo pipefail

mkdir -p ./weather

for VARIABLE in 2019 2020; do
    # Get the file
    aws s3 cp s3://noaa-ghcn-pds/csv.gz/"$VARIABLE".csv.gz ./weather/"$VARIABLE".csv.gz
    # Decompress in place (the .gz is replaced by ./weather/$VARIABLE.csv).
    # -f overwrites a CSV left over from a previous run so the cell is re-runnable.
    gzip -df ./weather/"$VARIABLE".csv.gz
    # Add headers
    { echo 'id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME'; cat ./weather/"$VARIABLE".csv; } > ./weather/"$VARIABLE"_with_headers.csv
    # Filter out the rows with bad data: keep the header (NR == 1) plus every
    # row whose Q-FLAG (field 6) is empty; per the NOAA GHCN-D readme an empty
    # Q-FLAG means the observation did not fail a quality check.
    # The file is plain comma-separated, so the separator must be ",". The
    # earlier '","' separator never matched, which left $6 empty on every line
    # and made the filter pass everything.
    awk -F ',' 'NR == 1 || $6 == ""' ./weather/"$VARIABLE"_with_headers.csv > ./weather/"$VARIABLE"_filtered.csv
    # Sync up the contents of the temp directory to S3 prefix (currently disabled)
    #aws s3 cp ./weather/"$VARIABLE"_filtered.csv s3://raw-weather-data/"$VARIABLE".csv
    # Clean up the intermediate headered copy; the raw and filtered CSVs stay on disk
    rm ./weather/"$VARIABLE"_with_headers.csv
done

download: s3://noaa-ghcn-pds/csv.gz/2019.csv.gz to weather/2019.csv.gz
download: s3://noaa-ghcn-pds/csv.gz/2020.csv.gz to weather/2020.csv.gz


In [20]:
import pandas as pd

# Alternative awk filter kept for reference (not executed):
#awk -F '","'  'BEGIN {OFS=","} { if (toupper($6) == "NaN" && $3 ~ /^WT/)  print }' ./weather/2022_with_headers.csv > ./weather/2022_with_headers_filtered.csv

# Load only the first 10,000 rows of the filtered 2020 file for a quick look.
_2020 = pd.read_csv(
    './weather/2020_filtered.csv',
    nrows=10_000,
)

In [21]:
# Show the first five rows of the sample (5 is the default for head()).
_2020.head(n=5)

Unnamed: 0,id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME
0,AE000041196,20200101,TMIN,168,,,S,
1,AE000041196,20200101,PRCP,0,D,,S,
2,AE000041196,20200101,TAVG,211,H,,S,
3,AEM00041194,20200101,PRCP,0,,,S,
4,AEM00041194,20200101,TAVG,217,H,,S,


In [18]:
# Count observations per element type in the sample.
# The header written by the download cell is 'id,date,element,...' with no
# spaces after the commas, so the column key is 'element'; the previous
# ' element' key (leading space) raises KeyError on a fresh run.
_2020['element'].value_counts()

PRCP    5036
TMIN    1446
TMAX    1364
TAVG    1119
SNWD     418
SNOW     206
WDFG     174
WSFG     174
DATX      19
MDTX      19
DATN       5
MDTN       5
MDPR       2
DAPR       2
ADPT       1
ASLP       1
ASTP       1
DWPR       1
AWND       1
RHAV       1
WSF2       1
AWBT       1
RHMX       1
RHMN       1
WDF2       1
Name:  element, dtype: int64

In [None]:
%%bash
# Join the observations with the station metadata on the first comma-separated
# field and write the result, preceded by a header line, to
# ./weather/combined.csv.
set -euo pipefail

# NOTE(review): the download cell in this notebook only fetches 2019 and 2020,
# so ./weather/2022.csv has to come from somewhere else — confirm the year.
OBSERVATIONS=./weather/2022.csv
STATIONS=./weather/stations.csv

# A failure inside <( ... ) is invisible to the shell, so a missing input would
# otherwise silently produce a header-only combined.csv. Check up front, before
# the output file is opened.
for input in "$OBSERVATIONS" "$STATIONS"; do
    if [ ! -r "$input" ]; then
        echo "missing input file: $input" >&2
        exit 1
    fi
done

{
  # NOTE(review): this header has a space after every comma, so a CSV reader
  # that does not strip whitespace sees column names with a leading space; it
  # is also unverified that these six names match the joined columns — confirm.
  echo "id, state, Tmax, Date, Tmin1, Tmax1"
  # join needs both inputs sorted on the join field; sed 1d drops the first
  # (header) line of stations.csv before sorting.
  join -t, <(sort "$OBSERVATIONS") <(sed 1d "$STATIONS" | sort)
} > ./weather/combined.csv

In [None]:
import sagemaker

# Open a SageMaker session and upload the local CSV under the given key prefix.
sess = sagemaker.Session()
prefix = 'sagemaker/weather-predictions/input'

# NOTE(review): the download cell in this notebook only fetches 2019 and 2020,
# so ./weather/2022.csv must be produced elsewhere — confirm the intended file.
uri = sess.upload_data(
    path="./weather/2022.csv",
    key_prefix=prefix,
)
# Presumably the S3 location of the uploaded object — confirm in the SDK docs.
print(uri)