In [1]:
import pandas as pd
import boto3
import json

# Weather Predictions on AWS with NOAA Data

https://docs.opendata.aws/noaa-ghcn-pds/readme.html

First, check whether the files are already in S3 (see the code below).

If not:
Get the data from public S3 and copy it to my new bucket
1. Run `bash get_stations.bash`
2. Run `bash get_weather_data.bash`

These only generate files for the five core elements:

* PRCP = Precipitation (tenths of mm)
* SNOW = Snowfall (mm)
* SNWD = Snow depth (mm)
* TMAX = Maximum temperature (tenths of degrees C)
* TMIN = Minimum temperature (tenths of degrees C)

In [25]:
# S3 URIs for the folders that hold each core weather element
bucket = 'raw-weather-data'
path_names = ['SNOW', 'PRCP', 'SNWD', 'TMAX', 'TMIN']
# Build one URI per element name. The source of the iteration must be
# path_names: `paths` is the (initially empty) list being produced.
paths = [f's3://{bucket}/{item}/' for item in path_names]

In [32]:
def list_files_in_s3(bucket, path):
    '''
    Query S3 for the objects stored under a key prefix.

    A fresh low-level S3 client is created on every call and the raw
    ``list_objects_v2`` response is handed back unchanged, so callers can
    read each entry's 'Key' and 'Size' from ``response['Contents']``.

    An alternative that only prints the keys of the whole bucket would be
    the resource API:

        s3 = boto3.resource('s3')
        my_bucket = s3.Bucket(bucket)
        for file in my_bucket.objects.all():
            print(file.key)

    NOTE: only a single ``list_objects_v2`` request is issued (no
    pagination), so at most one page of results is returned.

    Parameters
    ----------
    bucket : name of the bucket, sent as ``Bucket``
    path : key prefix to filter on, sent as ``Prefix``

    Returns
    -------
    The response returned by ``list_objects_v2``.
    '''
    request = {'Bucket': bucket, 'Prefix': path}
    return boto3.client('s3').list_objects_v2(**request)

In [33]:
# Print every object key and its size under each element prefix.
for item in path_names:
    print(item)
    # The response carries no 'Contents' key when nothing matches the prefix
    # (i.e. the files have not been copied yet), so fall back to an empty
    # list instead of raising KeyError.
    listing = list_files_in_s3(bucket, item).get('Contents', [])
    print([(i['Key'], i['Size']) for i in listing])

SNOW
[('SNOW/', 0), ('SNOW/2010.csv', 83146517), ('SNOW/2011.csv', 60171285), ('SNOW/2012.csv', 64987919), ('SNOW/2013.csv', 69215906), ('SNOW/2014.csv', 68552703), ('SNOW/2015.csv', 67754976), ('SNOW/2016.csv', 69610408), ('SNOW/2018.csv', 67856373), ('SNOW/2019.csv', 67592700), ('SNOW/2020.csv', 70260824), ('SNOW/2021.csv', 83057402), ('SNOW/2022.csv', 54991998)]
PRCP
[('PRCP/', 0), ('PRCP/2010.csv', 111588229), ('PRCP/2011.csv', 111580532), ('PRCP/2012.csv', 116675791), ('PRCP/2013.csv', 119863280), ('PRCP/2014.csv', 119868139), ('PRCP/2015.csv', 121048582), ('PRCP/2016.csv', 120887972), ('PRCP/2018.csv', 123466948), ('PRCP/2019.csv', 125105128), ('PRCP/2020.csv', 131016264), ('PRCP/2021.csv', 131048385), ('PRCP/2022.csv', 86813486)]
SNWD
[('SNWD/', 0), ('SNWD/2010.csv', 56900773), ('SNWD/2011.csv', 33523394), ('SNWD/2012.csv', 34633900), ('SNWD/2013.csv', 37997768), ('SNWD/2014.csv', 38942839), ('SNWD/2015.csv', 39456045), ('SNWD/2016.csv', 42010806), ('SNWD/2018.csv', 42557831), (

# Amazon Forecast

For this, we'll be using Amazon Forecast:

https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/forecast.html

Demo: https://github.com/aws-samples/amazon-forecast-samples/blob/main/notebooks/basic/Getting_Started/Amazon_Forecast_Quick_Start_Guide.ipynb

## Get the IAM Role ARN

In [34]:
# Fetch the 'ForecastNotebookRole' IAM role and keep its ARN; the ARN is
# later passed as the RoleArn of the dataset import job's S3 config.
iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='ForecastNotebookRole')
role_arn = role['Role']['Arn']

In [35]:
# Two boto3 clients: `forecastquery` for the query service and `client` for
# the Forecast service calls made throughout the rest of the notebook.
forecastquery = boto3.client('forecastquery')
client = boto3.client(service_name='forecast')

## Create the Dataset

```
response = client.create_dataset(
    DatasetName='string',
    Domain='RETAIL'|'CUSTOM'|'INVENTORY_PLANNING'|'EC2_CAPACITY'|'WORK_FORCE'|'WEB_TRAFFIC'|'METRICS',
    DatasetType='TARGET_TIME_SERIES'|'RELATED_TIME_SERIES'|'ITEM_METADATA',
    DataFrequency='string',
    Schema={
        'Attributes': [
            {
                'AttributeName': 'string',
                'AttributeType': 'string'|'integer'|'float'|'timestamp'|'geolocation'
            },
        ]
    },
    EncryptionConfig={
        'RoleArn': 'string',
        'KMSKeyArn': 'string'
    },
    Tags=[
        {
            'Key': 'string',
            'Value': 'string'
        },
    ]
)
```

In [57]:
# Get-or-create the target time series dataset and keep its ARN.
dataset_name = 'Weather_Predictions_Time_Series_MSDS_434'

# List the datasets once and reuse the result for both the existence check
# and the ARN lookup (avoids a second, possibly inconsistent, API call).
# NOTE(review): only the first page of list_datasets() is inspected.
dataset_dicts = client.list_datasets()['Datasets']

existing_dataset_arn = next(
    (item['DatasetArn'] for item in dataset_dicts if item['DatasetName'] == dataset_name),
    None,
)

if existing_dataset_arn is not None:
    print('Dataset Already Exists')
    ts_dataset_arn = existing_dataset_arn
else:
    # Three-column schema: timestamp, integer target value, string item id.
    weather_predictions_schema = {
       "Attributes":[
          {
             "AttributeName":"timestamp",
             "AttributeType":"timestamp"
          },
          {
             "AttributeName":"target_value",
             "AttributeType":"integer"
          },
          {
             "AttributeName":"item_id",
             "AttributeType":"string"
          }
       ]
    }

    # No dataset with this name was found, so create it (daily frequency).
    create_dataset_response = client.create_dataset(
        DatasetName=dataset_name,
        Domain='CUSTOM',
        DatasetType='TARGET_TIME_SERIES',
        DataFrequency='1D',
        Schema=weather_predictions_schema
    )
    ts_dataset_arn = create_dataset_response['DatasetArn']


Dataset Already Exists


In [58]:
# Ask Forecast for the dataset's details and report its current status.
describe_dataset_response = client.describe_dataset(
    DatasetArn=ts_dataset_arn,
)
print(f"The Dataset is now {describe_dataset_response['Status']}.")

The Dataset is now ACTIVE.


## Import the Dataset

Example:

```
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"
TS_IMPORT_JOB_NAME = "TAXI_TTS_IMPORT"
TIMEZONE = "EST"

ts_dataset_import_job_response = \
    forecast.create_dataset_import_job(DatasetImportJobName=TS_IMPORT_JOB_NAME,
                                       DatasetArn=ts_dataset_arn,
                                       DataSource= {
                                         "S3Config" : {
                                             "Path": ts_s3_path,
                                             "RoleArn": role_arn
                                         } 
                                       },
                                       TimestampFormat=TIMESTAMP_FORMAT,
                                       TimeZone = TIMEZONE)

ts_dataset_import_job_arn = ts_dataset_import_job_response['DatasetImportJobArn']
describe_dataset_import_job_response = forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)

print(f"Waiting for Dataset Import Job with ARN {ts_dataset_import_job_arn} to become ACTIVE. This process could take 5-10 minutes.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn))

describe_dataset_import_job_response = forecast.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)
print(f"\n\nThe Dataset Import Job with ARN {ts_dataset_import_job_arn} is now {describe_dataset_import_job_response['Status']}.")
```

In [78]:
# Get-or-create the import job that loads the SNOW CSVs into the dataset.
TIMESTAMP_FORMAT = "yyyy-MM-dd"
TIMEZONE = "UTC"
TS_IMPORT_JOB_NAME = "Snow_Prediction_Import_Job"
# S3 folder holding the SNOW files; defined here so the create branch does
# not depend on a variable left over from an earlier kernel session.
SNOW_S3_Path = f"s3://{bucket}/SNOW/"

# Import jobs that belong to this dataset only.
import_job = client.list_dataset_import_jobs(
    Filters=[
        {
            'Key': 'DatasetArn',
            'Value': ts_dataset_arn,
            'Condition': 'IS'
        },
    ]
)

existing_import_job_arn = next(
    (
        item['DatasetImportJobArn']
        for item in import_job['DatasetImportJobs']
        if item['DatasetImportJobName'] == TS_IMPORT_JOB_NAME
    ),
    None,
)

if existing_import_job_arn is not None:
    print('Already Exists')
    ts_dataset_import_job_arn = existing_import_job_arn
else:
    ts_dataset_import_job_response = client.create_dataset_import_job(
        DatasetImportJobName=TS_IMPORT_JOB_NAME,
        DatasetArn=ts_dataset_arn,
        DataSource={
            "S3Config": {
                "Path": SNOW_S3_Path,
                "RoleArn": role_arn
            }
        },
        TimestampFormat=TIMESTAMP_FORMAT,
        TimeZone=TIMEZONE,
    )

    ts_dataset_import_job_arn = ts_dataset_import_job_response['DatasetImportJobArn']
    describe_dataset_import_job_response = client.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)

    print(f"Waiting for Dataset Import Job with ARN {ts_dataset_import_job_arn} to become ACTIVE. This process could take 5-10 minutes.")


Already Exists


In [79]:
# Report the import job's name, remaining-time estimate (if any) and status.
status = client.describe_dataset_import_job(DatasetImportJobArn=ts_dataset_import_job_arn)
print(status['DatasetImportJobName'])
# The response may omit the estimate (e.g. once the job is ACTIVE); test for
# the key explicitly rather than swallowing every exception.
if "EstimatedTimeRemainingInMinutes" in status:
    print(f'Time Remaining in Minutes (estimated): {status["EstimatedTimeRemainingInMinutes"]}')
print(f'Status: {status["Status"]}')

Snow_Prediction_Import_Job
Status: ACTIVE


### Creating a DatasetGroup

Example:
```
DATASET_GROUP_NAME = "TAXI_DEMO"
DATASET_ARNS = [ts_dataset_arn]

create_dataset_group_response = \
    forecast.create_dataset_group(Domain="CUSTOM",
                                  DatasetGroupName=DATASET_GROUP_NAME,
                                  DatasetArns=DATASET_ARNS)

dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
describe_dataset_group_response = forecast.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")
```

In [85]:
# Get-or-create the dataset group that wraps the time series dataset.
DATASET_ARNS = [ts_dataset_arn]

DATASET_GROUP_NAME = "Snow_Forecast"

# List the groups once and reuse the result for the check and the lookup.
# NOTE(review): only the first page of list_dataset_groups() is inspected.
dataset_groups = client.list_dataset_groups()['DatasetGroups']
existing_group_arn = next(
    (
        item['DatasetGroupArn']
        for item in dataset_groups
        if item['DatasetGroupName'] == DATASET_GROUP_NAME
    ),
    None,
)

if existing_group_arn is not None:
    print('Already Exists')
    dataset_group_arn = existing_group_arn
else:
    create_dataset_group_response = client.create_dataset_group(
        Domain="CUSTOM",
        DatasetGroupName=DATASET_GROUP_NAME,
        DatasetArns=DATASET_ARNS,
    )

    dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
    describe_dataset_group_response = client.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

    print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")

Already Exists


## Training a Predictor

Example:
```
PREDICTOR_NAME = "TAXI_PREDICTOR"
FORECAST_HORIZON = 24
FORECAST_FREQUENCY = "H"
HOLIDAY_DATASET = [{
        'Name': 'holiday',
        'Configuration': {
        'CountryCode': ['US']
    }
}]

create_auto_predictor_response = \
    forecast.create_auto_predictor(PredictorName = PREDICTOR_NAME,
                                   ForecastHorizon = FORECAST_HORIZON,
                                   ForecastFrequency = FORECAST_FREQUENCY,
                                   DataConfig = {
                                       'DatasetGroupArn': dataset_group_arn, 
                                       'AdditionalDatasets': HOLIDAY_DATASET
                                    },
                                   ExplainPredictor = True)

predictor_arn = create_auto_predictor_response['PredictorArn']
print(f"Waiting for Predictor with ARN {predictor_arn} to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE.\n\nCurrent Status:")

status = util.wait(lambda: forecast.describe_auto_predictor(PredictorArn=predictor_arn))

describe_auto_predictor_response = forecast.describe_auto_predictor(PredictorArn=predictor_arn)
print(f"\n\nThe Predictor with ARN {predictor_arn} is now {describe_auto_predictor_response['Status']}.")
```


In [89]:
# Get-or-create the auto predictor trained on the dataset group.
FORECAST_HORIZON = 180       # number of FORECAST_FREQUENCY steps to predict
FORECAST_FREQUENCY = "1D"

PREDICTOR_NAME = "Snow_Predictor"

# List the predictors once and reuse the result for the check and the lookup.
# NOTE(review): only the first page of list_predictors() is inspected.
predictors = client.list_predictors()['Predictors']
existing_predictor_arn = next(
    (
        item['PredictorArn']
        for item in predictors
        if item['PredictorName'] == PREDICTOR_NAME
    ),
    None,
)

if existing_predictor_arn is not None:
    print('Already Exists')
    predictor_arn = existing_predictor_arn
else:
    create_auto_predictor_response = client.create_auto_predictor(
        PredictorName=PREDICTOR_NAME,
        ForecastHorizon=FORECAST_HORIZON,
        ForecastFrequency=FORECAST_FREQUENCY,
        DataConfig={
            'DatasetGroupArn': dataset_group_arn
        },
    )

    predictor_arn = create_auto_predictor_response['PredictorArn']
    print(f"Waiting for Predictor with ARN {predictor_arn} to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE")

Waiting for Predictor with ARN to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE


In [97]:
# Report the predictor's name, remaining-time estimate (if any) and status.
pred_status = client.describe_auto_predictor(PredictorArn=predictor_arn)

print(pred_status['PredictorName'])
# The response may omit the estimate (e.g. once training is ACTIVE); test for
# the key explicitly rather than swallowing every exception.
if "EstimatedTimeRemainingInMinutes" in pred_status:
    print(f'Time Remaining in Minutes (estimated): {pred_status["EstimatedTimeRemainingInMinutes"]}')
print(f'Status: {pred_status["Status"]}')

Snow_Predictor
Status: ACTIVE


### Accuracy Metrics

In [98]:
# Pull the backtest metrics for the predictor and print them. Only the first
# evaluation result and its first test window are read.
get_accuracy_metrics_response = client.get_accuracy_metrics(PredictorArn=predictor_arn)
first_window_metrics = (
    get_accuracy_metrics_response['PredictorEvaluationResults'][0]
    ['TestWindows'][0]
    ['Metrics']
)
wql = first_window_metrics['WeightedQuantileLosses']
accuracy_scores = first_window_metrics['ErrorMetrics'][0]

# Quantile losses are a list of dicts, so pretty-print them as JSON.
print(f"Weighted Quantile Loss (wQL): {json.dumps(wql, indent=2)}\n\n")

# Scalar error metrics, one per line.
print(f"Root Mean Square Error (RMSE): {accuracy_scores['RMSE']}\n\n")
print(f"Weighted Absolute Percentage Error (WAPE): {accuracy_scores['WAPE']}\n\n")
print(f"Mean Absolute Percentage Error (MAPE): {accuracy_scores['MAPE']}\n\n")
print(f"Mean Absolute Scaled Error (MASE): {accuracy_scores['MASE']}\n")

Weighted Quantile Loss (wQL): [
  {
    "Quantile": 0.9,
    "LossValue": 1.6621176413113308
  },
  {
    "Quantile": 0.5,
    "LossValue": 1.0098386699743338
  },
  {
    "Quantile": 0.1,
    "LossValue": 0.23374340045447026
  }
]


Root Mean Square Error (RMSE): 1279.1895539850382


Weighted Absolute Percentage Error (WAPE): 1.8649589831575322


Mean Absolute Percentage Error (MAPE): 0.8804727600326857


Mean Absolute Scaled Error (MASE): 1.939399311943807



## Forecast

In [None]:
# Get-or-create the forecast generated from the trained predictor.
FORECAST_NAME = "SNOW_FORECAST"

# List the forecasts once and reuse the result for the check and the lookup.
# NOTE(review): only the first page of list_forecasts() is inspected.
forecasts = client.list_forecasts()['Forecasts']
existing_forecast_arn = next(
    (
        item['ForecastArn']
        for item in forecasts
        if item['ForecastName'] == FORECAST_NAME
    ),
    None,
)

if existing_forecast_arn is not None:
    print('Already Exists')
    forecast_arn = existing_forecast_arn
else:
    # The Forecast service client in this notebook is named `client`; there
    # is no `forecast` object, so the call must go through `client`.
    create_forecast_response = client.create_forecast(
        ForecastName=FORECAST_NAME,
        PredictorArn=predictor_arn,
    )

    forecast_arn = create_forecast_response['ForecastArn']
    print(f"Waiting for Forecast to become ACTIVE. Depending on data size and predictor settings，it can take several hours to be ACTIVE.")



In [None]:
# Report the forecast's name, remaining-time estimate (if any) and status.
forecast_status = client.describe_forecast(ForecastArn=forecast_arn)

# describe_forecast identifies the resource by 'ForecastName'; its response
# has no 'PredictorName' key (only 'PredictorArn').
print(forecast_status['ForecastName'])
# The response may omit the estimate; test for the key explicitly rather
# than swallowing every exception.
if "EstimatedTimeRemainingInMinutes" in forecast_status:
    print(f'Time Remaining in Minutes (estimated): {forecast_status["EstimatedTimeRemainingInMinutes"]}')
print(f'Status: {forecast_status["Status"]}')