In [97]:
import pandas as pd
import boto3
import json

# Weather Predictions on AWS Autopilot with NOAA Data

https://docs.opendata.aws/noaa-ghcn-pds/readme.html

Get the data from public S3 and copy it to my new bucket

In [13]:
%%bash

# get and cleanup the stations file
aws s3 cp s3://noaa-ghcn-pds/ghcnd-stations.txt ./weather/ghcnd-stations.txt 
python3 stations_cleanup.py

# upload it
aws s3 cp ./weather/stations.csv s3://raw-weather-data/ghcnd-stations.csv

# Clean up the temp files and directory
rm ./weather/ghcnd-stations.txt 

download: s3://noaa-ghcn-pds/ghcnd-stations.txt to weather/ghcnd-stations.txt


Let's try to only generate files for the five core elements:

* PRCP = Precipitation (tenths of mm)
* SNOW = Snowfall (mm)
* SNWD = Snow depth (mm)
* TMAX = Maximum temperature (tenths of degrees C)
* TMIN = Minimum temperature (tenths of degrees C)

In [None]:
%%bash

for VARIABLE in 2010 2011 2012 2013 2014 2015 2016 2018 2019 2020 2021 2022; do
    # Get the file
    aws s3 cp s3://noaa-ghcn-pds/csv.gz/"$VARIABLE".csv.gz ./weather/"$VARIABLE".csv.gz
    # Decompress the zip file into a temp directory
    gzip -d ./weather/"$VARIABLE".csv.gz
    # Add headers
    { echo 'id,date,element,value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME'; cat ./weather/"$VARIABLE".csv; } > ./weather/"$VARIABLE"_with_headers.csv
    # filter out the columns with bad data
    awk -F '","'  'BEGIN {OFS=","} { if ((toupper($6) == ""))  print }' ./weather/"$VARIABLE"_with_headers.csv > ./weather/"$VARIABLE"_filtered.csv
    # create a separate file for each value in the third column
    awk -v year=$VARIABLE -F ',' '{print >> ("./weather/" year "/" $3 ".csv")}' ./weather/"$VARIABLE"_filtered.csv
    # Combine the stations data in and add headers back to the remaining files
    for ELEMENT in PRCP SNOW SNWD TMAX TMIN; do
        {
            join -t, <(sort ./weather/"$VARIABLE"/"$ELEMENT".csv) <(sed 1d ./weather/stations.csv | sort)
        } > ./weather/"$VARIABLE"/"$ELEMENT"_combined.csv
        { 
            echo 'id,date,element,reported_value,M-FLAG,Q-FLAG,S-FLAG,OBS-TIME,location'; cat ./weather/"$VARIABLE"/"$ELEMENT"_combined.csv; 
        } > ./weather/"$VARIABLE"/"$ELEMENT"_with_headers.csv
        {
            cut -d , -f2,4,9 < ./weather/"$VARIABLE"/"$ELEMENT"_with_headers.csv; 
        } > ./weather/"$VARIABLE"/"$ELEMENT"_cut.csv
        # Dates need dashes in them
        sed -r 's/^(.{4})(.{2})/\1-\2-/;s/$//' ./weather/"$VARIABLE"/"$ELEMENT"_cut.csv > ./weather/"$VARIABLE"/"$ELEMENT"_edited.csv
        # Sync up the contents of the temp directory to S3 prefix
        aws s3 cp ./weather/"$VARIABLE"/"$ELEMENT"_edited.csv s3://raw-weather-data/"$ELEMENT"/"$VARIABLE".csv
    done
    # delete all files except those with _with_headers.csv
    ls -d -1 "$PWD/weather/$VARIABLE/"*.* | egrep -v "_edited.csv" | xargs rm
    # Clean up the temp files and directory
    rm ./weather/"$VARIABLE"_with_headers.csv ./weather/"$VARIABLE".csv*
done

In [152]:
pd.read_csv('./weather/2010/SNOW_edited.csv', nrows=100)

Unnamed: 0,date-,r-eported_value,location
0,2010-01-01,0,AS
1,2010-01-02,0,AS
2,2010-01-03,0,AS
3,2010-01-04,0,AS
4,2010-01-05,0,AS
...,...,...,...
95,2010-04-06,0,AS
96,2010-04-07,0,AS
97,2010-04-08,0,AS
98,2010-04-09,0,AS


In [99]:
SNOW_S3_Path = 's3://raw-weather-data/SNOW/'
PRCP_S3_Path = 's3://raw-weather-data/PRCP/'
SNWD_S3_Path = 's3://raw-weather-data/SNWD/'
TMAX_S3_Path = 's3://raw-weather-data/TMAX/'
TMIN_S3_Path = 's3://raw-weather-data/TMIN/'

print('S3 URIs')
print(f'S3 snow URI: {SNOW_S3_Path}')
print(f'S3 snow URI: {PRCP_S3_Path}')
print(f'S3 snow URI: {SNWD_S3_Path}')
print(f'S3 snow URI: {TMAX_S3_Path}')
print(f'S3 snow URI: {TMIN_S3_Path}')

S3 URIs
S3 snow URI: s3://raw-weather-data/SNOW/
S3 snow URI: s3://raw-weather-data/PRCP/
S3 snow URI: s3://raw-weather-data/SNWD/
S3 snow URI: s3://raw-weather-data/TMAX/
S3 snow URI: s3://raw-weather-data/TMIN/


# Amazon Forecast

For this, we'll be using Amazon Forecast:

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/forecast.html

Demo: https://github.com/aws-samples/amazon-forecast-samples/blob/main/notebooks/basic/Getting_Started/Amazon_Forecast_Quick_Start_Guide.ipynb

## Get the IAM Role ARN

In [100]:
role = boto3.client('iam').get_role(RoleName='ForecastNotebookRole')
role_arn = role['Role']['Arn']

In [102]:
client = boto3.client('forecast')
forecastquery = boto3.client(service_name='forecastquery')

## Create the Dataset

```
response = client.create_dataset(
    DatasetName='string',
    Domain='RETAIL'|'CUSTOM'|'INVENTORY_PLANNING'|'EC2_CAPACITY'|'WORK_FORCE'|'WEB_TRAFFIC'|'METRICS',
    DatasetType='TARGET_TIME_SERIES'|'RELATED_TIME_SERIES'|'ITEM_METADATA',
    DataFrequency='string',
    Schema={
        'Attributes': [
            {
                'AttributeName': 'string',
                'AttributeType': 'string'|'integer'|'float'|'timestamp'|'geolocation'
            },
        ]
    },
    EncryptionConfig={
        'RoleArn': 'string',
        'KMSKeyArn': 'string'
    },
    Tags=[
        {
            'Key': 'string',
            'Value': 'string'
        },
    ]
)
```

In [None]:
weather_predictions_schema = {
   "Attributes":[
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"target_value",
         "AttributeType":"integer"
      },
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      }
   ]
}

create_dataset_response = client.create_dataset(
    DatasetName='Weather_Predictions_Time_Series_MSDS_434',
    Domain='CUSTOM',
    DatasetType='RELATED_TIME_SERIES',
    DataFrequency='1D',
    Schema=weather_predictions_schema
)


ts_dataset_arn = create_dataset_response['DatasetArn']


In [110]:
describe_dataset_response = client.describe_dataset(DatasetArn=ts_dataset_arn)

print(f"The Dataset with ARN {ts_dataset_arn} is now {describe_dataset_response['Status']}.")

The Dataset with ARN arn:aws:forecast:us-east-1:669437599565:dataset/Weather_Predictions_Time_Series_MSDS_434 is now ACTIVE.


## Import the Dataset

Example:

```
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"
TS_IMPORT_JOB_NAME = "TAXI_TTS_IMPORT"
TIMEZONE = "EST"

ts_dataset_import_job_response = \
    forecast.create_dataset_import_job(DatasetImportJobName=TS_IMPORT_JOB_NAME,
                                       DatasetArn=ts_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": ts_s3_path,
                                             "RoleArn": role_arn
                                         } 
                                       },
                                       TimestampFormat=TIMESTAMP_FORMAT,
                                       TimeZone = TIMEZONE)

ts_dataset_import_job_arn = ts_dataset_import_job_response['DatasetImportJobArn']
describe_dataset_import_job_response = forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)

print(f"Waiting for Dataset Import Job with ARN {ts_dataset_import_job_arn} to become ACTIVE. This process could take 5-10 minutes.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn))

describe_dataset_import_job_response = forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)
print(f"\n\nThe Dataset Import Job with ARN {ts_dataset_import_job_arn} is now {describe_dataset_import_job_response['Status']}.")
```

In [115]:
TIMESTAMP_FORMAT = "yyyy-MM-dd"
TS_IMPORT_JOB_NAME = "Snow_Prediction_Import_Job"
TIMEZONE = "EST"

ts_dataset_import_job_response = \
    client.create_dataset_import_job(DatasetImportJobName=TS_IMPORT_JOB_NAME,
                                       DatasetArn=ts_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": SNOW_S3_Path,
                                             "RoleArn": role_arn
                                         } 
                                       },
                                       TimestampFormat=TIMESTAMP_FORMAT)#,
                                       #TimeZone = TIMEZONE)

ts_dataset_import_job_arn = ts_dataset_import_job_response['DatasetImportJobArn']
describe_dataset_import_job_response = client.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)

print(f"Waiting for Dataset Import Job with ARN {ts_dataset_import_job_arn} to become ACTIVE. This process could take 5-10 minutes.\n\nCurrent Status:")

status = util.wait(lambda: client.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn))

describe_dataset_import_job_response = client.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)
print(f"\n\nThe Dataset Import Job with ARN {ts_dataset_import_job_arn} is now {describe_dataset_import_job_response['Status']}.")

InvalidInputException: An error occurred (InvalidInputException) when calling the CreateDatasetImportJob operation: Input data has invalid timestamp value: 20100101, Please ensure timestamp values match the specified format: yyyy-MM-dd

### Creating a DatasetGroup

Example:
```
DATASET_GROUP_NAME = "TAXI_DEMO"
DATASET_ARNS = [ts_dataset_arn]

create_dataset_group_response = \
    forecast.create_dataset_group(Domain="CUSTOM",
                                  DatasetGroupName=DATASET_GROUP_NAME,
                                  DatasetArns=DATASET_ARNS)

dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
describe_dataset_group_response = forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")
```

In [None]:
DATASET_GROUP_NAME = "Snow_Forecast"
DATASET_ARNS = [ts_dataset_arn]

create_dataset_group_response = \
    forecast.create_dataset_group(Domain="CUSTOM",
                                  DatasetGroupName=DATASET_GROUP_NAME,
                                  DatasetArns=DATASET_ARNS)

dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
describe_dataset_group_response = forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")

## Traing a Predictor

Example:
```
PREDICTOR_NAME = "TAXI_PREDICTOR"
FORECAST_HORIZON = 24
FORECAST_FREQUENCY = "H"
HOLIDAY_DATASET = [{
        'Name': 'holiday',
        'Configuration': {
        'CountryCode': ['US']
    }
}]

create_auto_predictor_response = \
    forecast.create_auto_predictor(PredictorName = PREDICTOR_NAME,
                                   ForecastHorizon = FORECAST_HORIZON,
                                   ForecastFrequency = FORECAST_FREQUENCY,
                                   DataConfig = {
                                       'DatasetGroupArn': dataset_group_arn, 
                                       'AdditionalDatasets': HOLIDAY_DATASET
                                    },
                                   ExplainPredictor = True)

predictor_arn = create_auto_predictor_response['PredictorArn']
print(f"Waiting for Predictor with ARN {predictor_arn} to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_auto_predictor(PredictorArn=predictor_arn))

describe_auto_predictor_response = forecast.describe_auto_predictor(PredictorArn=predictor_arn)
print(f"\n\nThe Predictor with ARN {predictor_arn} is now {describe_auto_predictor_response['Status']}.")
```


In [None]:
PREDICTOR_NAME = "Snow_Predictor"
FORECAST_HORIZON = 180
FORECAST_FREQUENCY = "1D"

create_auto_predictor_response = \
    forecast.create_auto_predictor(PredictorName = PREDICTOR_NAME,
                                   ForecastHorizon = FORECAST_HORIZON,
                                   ForecastFrequency = FORECAST_FREQUENCY,
                                   DataConfig = {
                                       'DatasetGroupArn': dataset_group_arn
                                    },
                                   ExplainPredictor = True)

predictor_arn = create_auto_predictor_response['PredictorArn']
print(f"Waiting for Predictor with ARN {predictor_arn} to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_auto_predictor(PredictorArn=predictor_arn))

describe_auto_predictor_response = forecast.describe_auto_predictor(PredictorArn=predictor_arn)
print(f"\n\nThe Predictor with ARN {predictor_arn} is now {describe_auto_predictor_response['Status']}.")

### Accuracy Metrics

In [None]:
get_accuracy_metrics_response = forecast.get_accuracy_metrics(PredictorArn=predictor_arn)
wql = get_accuracy_metrics_response['PredictorEvaluationResults'][0]['TestWindows'][0]['Metrics']['WeightedQuantileLosses']
accuracy_scores = get_accuracy_metrics_response['PredictorEvaluationResults'][0]['TestWindows'][0]['Metrics']['ErrorMetrics'][0]

print(f"Weighted Quantile Loss (wQL): {json.dumps(wql, indent=2)}\n\n")

print(f"Root Mean Square Error (RMSE): {accuracy_scores['RMSE']}\n\n")

print(f"Weighted Absolute Percentage Error (WAPE): {accuracy_scores['WAPE']}\n\n")

print(f"Mean Absolute Percentage Error (MAPE): {accuracy_scores['MAPE']}\n\n")

print(f"Mean Absolute Scaled Error (MASE): {accuracy_scores['MASE']}\n")

## Forecast

In [None]:
FORECAST_NAME = "SNOW_FORECAST"

create_forecast_response = \
    forecast.create_forecast(ForecastName=FORECAST_NAME,
                             PredictorArn=predictor_arn)

forecast_arn = create_forecast_response['ForecastArn']
print(f"Waiting for Forecast with ARN {forecast_arn} to become ACTIVE. Depending on data size and predictor settings，it can take several hours to be ACTIVE.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_forecast(ForecastArn=forecast_arn))

describe_forecast_response = forecast.describe_forecast(ForecastArn=forecast_arn)
print(f"\n\nThe Forecast with ARN {forecast_arn} is now {describe_forecast_response['Status']}.")