In [64]:
import pandas as pd
import boto3
from datetime import datetime
import subprocess
from collections import defaultdict
import time as t

# Cost Predictions on AWS

Reference: [Boto3 Cost Explorer (`ce`) client documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ce.html)



## Get the Current-Year File Key for Each Weather Element from S3

In [12]:
# Name of the S3 bucket that the cells below list keys from and build the
# Forecast import path from
bucket = 'raw-weather-data'

In [13]:
def get_current_year_key_from_s3(bucket):
    '''
    For each weather element prefix, return the first S3 object key (in
    listing order) whose name contains the current calendar year.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list.

    Returns
    -------
    dict
        Maps each prefix in ('SNOW', 'PRCP', 'SNWD', 'TMAX', 'TMIN') to the
        matching object key.

    Raises
    ------
    KeyError
        If nothing at all is stored under one of the prefixes.
    IndexError
        If objects exist under a prefix but none contains the current year.
    '''
    # One key prefix per weather element
    path_names = ['SNOW', 'PRCP', 'SNWD', 'TMAX', 'TMIN']

    year_token = str(datetime.now().year)
    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page; otherwise a current-year key beyond the first page is never seen.
    paginator = boto3.client('s3').get_paginator('list_objects_v2')

    element_key_dict = {}
    for item in path_names:
        listed_any = False
        matching_key = None
        for page in paginator.paginate(Bucket=bucket, Prefix=item):
            # 'Contents' is absent from the response when nothing matches the prefix
            for obj in page.get('Contents', []):
                listed_any = True
                if year_token in obj['Key']:
                    matching_key = obj['Key']
                    break
            if matching_key is not None:
                break

        if matching_key is None:
            # Same exception types the unguarded lookups used to raise,
            # but with a message that says what is actually missing.
            if not listed_any:
                raise KeyError(f"No objects found under s3://{bucket}/{item}")
            raise IndexError(
                f"No object under s3://{bucket}/{item} contains '{year_token}' in its key")

        element_key_dict[item] = matching_key
    return element_key_dict



# Forecast

In [79]:
class weather_forecast:
    '''
    Runs the Amazon Forecast workflow for a single weather element.

    Instantiating the class does all of the work, in this order:

    1. look up (or create) the target time series dataset and wait for it,
    2. look up (or create) the dataset import job,
    3. look up (or create) the dataset group,
    4. wait for the import job,
    5. retrain from the latest matching predictor, or look up / train a
       predictor by name, and wait for it,
    6. look up (or create) the forecast.

    The constructor does not wait for the forecast itself; call
    check_status('forecast') to poll it.
    '''

    # Seconds to sleep between two status polls
    POLL_INTERVAL_SECONDS = 60

    # A status ending in one of these never turns into ACTIVE
    TERMINAL_STATUS_SUFFIXES = ('_FAILED', '_STOPPED')

    def __init__(self, s3_path, retrain, element, import_job_name,
                forecast_horizon, forecast_frequency, predictor_name,
                forecast_name):
        '''
        Parameters
        ----------
        s3_path : str
            S3 location handed to the dataset import job.
        retrain : bool
            When truthy, the new predictor is created from the most recently
            modified existing predictor whose name contains `element`
            (case-insensitive). Otherwise a predictor called `predictor_name`
            is reused if it exists and trained from scratch if it does not.
        element : str
            Weather element; used to build the dataset and dataset group names.
        import_job_name : str
            Name of the dataset import job to reuse or create.
        forecast_horizon : int
            Forecast horizon passed to create_auto_predictor (fresh training only).
        forecast_frequency : str
            Forecast frequency passed to create_auto_predictor (fresh training only).
        predictor_name : str
            Name given to the new (or looked-up) predictor.
        forecast_name : str
            Name of the forecast to reuse or create.

        Raises
        ------
        ValueError
            If `retrain` is truthy but no existing predictor matches `element`.
        RuntimeError
            If the dataset, import job or predictor reaches a failed/stopped status.
        '''
        self.s3_path = s3_path
        self.role = boto3.client('iam').get_role(RoleName='ForecastNotebookRole')
        self.role_arn = self.role['Role']['Arn']

        self.client = boto3.client('forecast')

        if element == 'Snow':
            self.dataset_name = 'Weather_Predictions_Time_Series_MSDS_434'
        else:
            self.dataset_name = f'Weather_Predictions_Time_Series_{element}'
        self.dataset_arn = self.create_dataset()
        self._wait_until_active('dataset')

        self.import_job_name = import_job_name
        self.dataset_import_job_arn = self.import_dataset()

        # The dataset group is set up while the import job is still running
        self.dataset_group_name = f'{element}_Forecast'
        self.dataset_group_arn = self.dataset_group()

        self._wait_until_active('import')

        self.forecast_name = forecast_name
        self.forecast_horizon = forecast_horizon
        self.forecast_frequency = forecast_frequency
        # Honour the caller's choice (this used to be forced to True)
        self.retrain = retrain
        self.predictor_name = predictor_name
        if self.retrain:
            element_token = element.lower()
            candidates = [item for item in self._list_all('list_predictors', 'Predictors')
                          if element_token in item['PredictorName'].lower()]
            if not candidates:
                raise ValueError(
                    f"retrain was requested but no existing predictor name contains '{element}'")
            # Choose among the element's own predictors only, so another
            # element's predictor with an equal timestamp cannot be picked.
            latest = max(candidates, key=lambda item: item['LastModificationTime'])
            self.retrainedPredictorName = predictor_name
            self.referencePredictorArn = latest['PredictorArn']
        self.predictor_arn = self.train_predictor()

        self._wait_until_active('predictor')

        self.forecast = forecast_name
        self.forecast_arn = self.create_forecast()

    def _list_all(self, operation, result_key, **kwargs):
        '''Return the items of every page of a paginated list_* operation as one list.'''
        items = []
        for page in self.client.get_paginator(operation).paginate(**kwargs):
            items.extend(page.get(result_key, []))
        return items

    @staticmethod
    def _find_arn(items, name_key, name, arn_key):
        '''Return item[arn_key] of the first item whose item[name_key] == name, else None.'''
        return next((item[arn_key] for item in items if item[name_key] == name), None)

    def _wait_until_active(self, describe_type):
        '''Poll check_status(describe_type) until ACTIVE; raise RuntimeError on a dead-end status.'''
        while True:
            status = self.check_status(describe_type)
            if status == 'ACTIVE':
                return
            if status.endswith(self.TERMINAL_STATUS_SUFFIXES):
                # Without this check a failed resource would be polled forever
                raise RuntimeError(
                    f"The {describe_type} reached status {status} and will not become ACTIVE")
            t.sleep(self.POLL_INTERVAL_SECONDS)

    @staticmethod
    def _report_progress(response, name_key):
        '''Print name, optional time estimate and status of a describe_* response; return the status.'''
        print(response[name_key])
        # The estimate is only present in some responses
        if "EstimatedTimeRemainingInMinutes" in response:
            print(f'Time Remaining in Minutes (estimated): {response["EstimatedTimeRemainingInMinutes"]}')
        print(f'Status: {response["Status"]}')
        return response["Status"]

    def create_dataset(self):
        '''
        Return the ARN of the dataset named self.dataset_name, creating it as
        a daily CUSTOM-domain TARGET_TIME_SERIES dataset if it does not exist.
        '''
        existing_arn = self._find_arn(self._list_all('list_datasets', 'Datasets'),
                                      'DatasetName', self.dataset_name, 'DatasetArn')
        if existing_arn is not None:
            print('Dataset Already Exists')
            return existing_arn

        print('Creating New Dataset')
        schema = {
           "Attributes":[
              {
                 "AttributeName":"timestamp",
                 "AttributeType":"timestamp"
              },
              {
                 "AttributeName":"target_value",
                 "AttributeType":"float"
              },
              {
                 "AttributeName":"item_id",
                 "AttributeType":"string"
              }
           ]
        }

        create_dataset_response = self.client.create_dataset(
            DatasetName=self.dataset_name,
            Domain='CUSTOM',
            DatasetType='TARGET_TIME_SERIES',
            DataFrequency='1D',
            Schema=schema
        )
        print('Dataset Create Initiated')
        return create_dataset_response['DatasetArn']

    def import_dataset(self):
        '''
        Return the ARN of the import job named self.import_job_name on
        self.dataset_arn, starting a new import from self.s3_path if no such
        job exists. Does not wait for the job to finish.
        '''
        TIMESTAMP_FORMAT = "yyyy-MM-dd"
        TIMEZONE = "UTC"

        import_jobs = self._list_all(
            'list_dataset_import_jobs', 'DatasetImportJobs',
            Filters=[
                {
                    'Key': 'DatasetArn',
                    'Value': self.dataset_arn,
                    'Condition': 'IS'
                },
            ]
        )
        existing_arn = self._find_arn(import_jobs, 'DatasetImportJobName',
                                      self.import_job_name, 'DatasetImportJobArn')
        if existing_arn is not None:
            print('Already Exists')
            return existing_arn

        import_job_response = self.client.create_dataset_import_job(
            DatasetImportJobName=self.import_job_name,
            DatasetArn=self.dataset_arn,
            DataSource={
                "S3Config": {
                    "Path": self.s3_path,
                    "RoleArn": self.role_arn
                }
            },
            TimestampFormat=TIMESTAMP_FORMAT,
            TimeZone=TIMEZONE)

        print("Waiting for Dataset Import Job with ARN to become ACTIVE. This process could take 5-10 minutes.")
        return import_job_response['DatasetImportJobArn']

    def dataset_group(self):
        '''
        Return the ARN of the dataset group named self.dataset_group_name,
        creating it around self.dataset_arn if it does not exist.
        '''
        existing_arn = self._find_arn(self._list_all('list_dataset_groups', 'DatasetGroups'),
                                      'DatasetGroupName', self.dataset_group_name, 'DatasetGroupArn')
        if existing_arn is not None:
            print('Already Exists')
            return existing_arn

        create_dataset_group_response = self.client.create_dataset_group(
            Domain="CUSTOM",
            DatasetGroupName=self.dataset_group_name,
            DatasetArns=[self.dataset_arn])

        dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
        describe_dataset_group_response = self.client.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

        print(f"The DatasetGroup with ARN {dataset_group_arn} is now {describe_dataset_group_response['Status']}.")
        return dataset_group_arn

    def train_predictor(self):
        '''
        Return the ARN of the predictor to forecast with.

        With self.retrain set, a new predictor named
        self.retrainedPredictorName is created from
        self.referencePredictorArn. Otherwise the predictor named
        self.predictor_name is reused if present and trained on
        self.dataset_group_arn if not. Does not wait for training to finish.
        '''
        if self.retrain:
            create_auto_predictor_response = self.client.create_auto_predictor(
                PredictorName=self.retrainedPredictorName,
                ReferencePredictorArn=self.referencePredictorArn)
            return create_auto_predictor_response['PredictorArn']

        existing_arn = self._find_arn(self._list_all('list_predictors', 'Predictors'),
                                      'PredictorName', self.predictor_name, 'PredictorArn')
        if existing_arn is not None:
            print('Already Exists')
            return existing_arn

        create_auto_predictor_response = self.client.create_auto_predictor(
            PredictorName=self.predictor_name,
            ForecastHorizon=self.forecast_horizon,
            ForecastFrequency=self.forecast_frequency,
            DataConfig={
                'DatasetGroupArn': self.dataset_group_arn
            })

        print("Waiting for Predictor with ARN to become ACTIVE. Depending on data size and predictor setting，it can take several hours to be ACTIVE")
        return create_auto_predictor_response['PredictorArn']

    def create_forecast(self):
        '''
        Return the ARN of the forecast named self.forecast_name, creating it
        from self.predictor_arn if it does not exist. Does not wait for it.
        '''
        existing_arn = self._find_arn(self._list_all('list_forecasts', 'Forecasts'),
                                      'ForecastName', self.forecast_name, 'ForecastArn')
        if existing_arn is not None:
            print('Already Exists')
            return existing_arn

        create_forecast_response = self.client.create_forecast(
            ForecastName=self.forecast_name,
            PredictorArn=self.predictor_arn)

        print("Waiting for Forecast to become ACTIVE. Depending on data size and predictor settings，it can take several hours to be ACTIVE.")
        return create_forecast_response['ForecastArn']

    def check_status(self, describe_type):
        '''
        Describe one resource, print its progress and return its status string.

        Parameters
        ----------
        describe_type : str
            One of 'dataset', 'import', 'predictor' or 'forecast'.

        Raises
        ------
        ValueError
            If `describe_type` is not one of the four supported values.
        '''
        if describe_type == 'dataset':
            describe_dataset_response = self.client.describe_dataset(DatasetArn=self.dataset_arn)
            print(f"The Dataset is now {describe_dataset_response['Status']}.")
            return describe_dataset_response['Status']

        if describe_type == 'import':
            return self._report_progress(
                self.client.describe_dataset_import_job(DatasetImportJobArn=self.dataset_import_job_arn),
                'DatasetImportJobName')

        if describe_type == 'predictor':
            return self._report_progress(
                self.client.describe_auto_predictor(PredictorArn=self.predictor_arn),
                'PredictorName')

        if describe_type == 'forecast':
            return self._report_progress(
                self.client.describe_forecast(ForecastArn=self.forecast_arn),
                'ForecastName')

        # Returning None here would make any polling loop spin forever
        raise ValueError(f"Unknown describe_type: {describe_type!r}")



In [86]:
# Weather element to forecast; its upper-cased form ('SNOW') selects this
# element's entry from the per-element key lookup.
element = 'Snow'
# NOTE(review): `key` is not read by any of the cells visible below - confirm it is still needed.
key = get_current_year_key_from_s3(bucket)[element.upper()]

In [87]:
# S3 location (bucket + upper-cased element prefix) handed to the forecast class below
s3_path = 's3://{}/{}'.format(bucket, element.upper())
s3_path

's3://raw-weather-data/SNOW'

In [88]:
# Kick off the whole pipeline for the selected element. The import job and the
# predictor each get a minute-resolution timestamp suffix in their name.
forecast_output = weather_forecast(
    s3_path=s3_path,
    retrain=True,
    element=element,
    import_job_name="Weather_New_Import_" + datetime.now().strftime('%Y_%m_%d_%H_%M'),
    forecast_horizon=180,
    forecast_frequency="1D",
    predictor_name="Snow_Predictor_" + datetime.now().strftime('%Y_%m_%d_%H_%M'),
    forecast_name="SNOW_FORECAST",
)

Dataset Already Exists
The Dataset is now ACTIVE.
Waiting for Dataset Import Job with ARNto become ACTIVE. This process could take 5-10 minutes.
Already Exists
Weather_New_Import_2022_11_20_15_43
Status: CREATE_PENDING
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 30
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 30
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 29
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 29
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 28
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 28
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remaining in Minutes (estimated): 27
Status: CREATE_IN_PROGRESS
Weather_New_Import_2022_11_20_15_43
Time Remai

NameError: name 'clear_output' is not defined