In [23]:
import boto3
import pandas as pd
import os
from datetime import datetime as dt

In [20]:
!pip install s3fs matplotlib ipywidgets

In [4]:
 os.environ.setdefault('AWS_PROFILE', 'default')

'default'

In [5]:
os.environ.setdefault('AWS_DEFAULT_REGION', 'us-east-1')

'us-east-1'

In [129]:
%%sh 

# aws s3 ls

In [18]:
! pip install kaggle

# ! mkdir ~/.kaggle
! cp ~/Dev/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle datasets list



In [14]:
! ls -ltr ~/.kaggle/

total 8
-rw-------@ 1 sreejith  staff  69 Dec 10 13:53 kaggle.json


In [15]:
!kaggle datasets download twinkle0705/state-wise-power-consumption-in-india
!unzip state-wise-power-consumption-in-india

Downloading state-wise-power-consumption-in-india.zip to /Users/sreejith/Dev/projects/timeseries/aws_forecast
  0%|                                                | 0.00/123k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 123k/123k [00:00<00:00, 1.91MB/s]
Archive:  state-wise-power-consumption-in-india.zip
  inflating: dataset_tk.csv          
  inflating: long_data_.csv          


In [131]:
! ls

aws_forecast_energy_timeseries.ipynb
backup.txt
dataset_tk.csv
energy_dataset.csv
long_data_.csv
maharashtra_energy_dataset_filled.csv
maharashtra_energy_dataset_target_series.csv
state-wise-power-consumption-in-india.zip
[1m[36mvenv[m[m


## Prepare Training Data

In [67]:
data=pd.read_csv('dataset_tk.csv',usecols=['Maharashtra','Unnamed: 0'])

In [68]:
data.head()

Unnamed: 0.1,Unnamed: 0,Maharashtra
0,02/01/2019 00:00:00,428.6
1,03/01/2019 00:00:00,419.6
2,04/01/2019 00:00:00,395.8
3,05/01/2019 00:00:00,411.1
4,06/01/2019 00:00:00,408.6


In [69]:
data['Date']=pd.to_datetime(data['Unnamed: 0'], format='%d/%m/%Y %H:%M:%S').dt.date
data.drop(['Unnamed: 0'],inplace=True,axis=1)

In [70]:
data.head()

Unnamed: 0,Maharashtra,Date
0,428.6,2019-01-02
1,419.6,2019-01-03
2,395.8,2019-01-04
3,411.1,2019-01-05
4,408.6,2019-01-06


In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Maharashtra  503 non-null    float64
 1   Date         503 non-null    object 
dtypes: float64(1), object(1)
memory usage: 8.0+ KB


In [72]:
data.set_index('Date', inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 503 entries, 2019-01-02 to 2020-12-05
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Maharashtra  503 non-null    float64
dtypes: float64(1)
memory usage: 7.9+ KB


In [73]:
data.head()

Unnamed: 0_level_0,Maharashtra
Date,Unnamed: 1_level_1
2019-01-02,428.6
2019-01-03,419.6
2019-01-04,395.8
2019-01-05,411.1
2019-01-06,408.6


In [81]:
data.index

Index([2019-01-02, 2019-01-03, 2019-01-04, 2019-01-05, 2019-01-06, 2019-01-07,
       2019-01-08, 2019-01-09, 2019-01-10, 2019-01-11,
       ...
       2020-11-01, 2020-11-02, 2020-11-03, 2020-11-04, 2020-11-05, 2020-12-01,
       2020-12-02, 2020-12-03, 2020-12-04, 2020-12-05],
      dtype='object', name='Date', length=503)

In [82]:
# Add missing dates
# date_range = pd.date_range(data.index[0], data.index[-1])  # only creates upto 2020-12-05
date_range = pd.date_range(data.index[0], dt(2020, 12, 31))
print(len(date_range))
date_range

730


DatetimeIndex(['2019-01-02', '2019-01-03', '2019-01-04', '2019-01-05',
               '2019-01-06', '2019-01-07', '2019-01-08', '2019-01-09',
               '2019-01-10', '2019-01-11',
               ...
               '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25',
               '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29',
               '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', length=730, freq='D')

In [83]:
# Create a DataFrame containing all dates
df = pd.DataFrame(index=date_range)

In [84]:
df.head()

2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06


In [85]:
df = df.join(data, how='outer')
df.tail()

Unnamed: 0,Maharashtra
2020-12-27,
2020-12-28,
2020-12-29,
2020-12-30,
2020-12-31,


In [86]:
df

Unnamed: 0,Maharashtra
2019-01-02,428.6
2019-01-03,419.6
2019-01-04,395.8
2019-01-05,411.1
2019-01-06,408.6
...,...
2020-12-27,
2020-12-28,
2020-12-29,
2020-12-30,


In [88]:
# Fill missing values

df['Maharashtra'] = df['Maharashtra'].fillna(method='ffill')
df

Unnamed: 0,Maharashtra
2019-01-02,428.6
2019-01-03,419.6
2019-01-04,395.8
2019-01-05,411.1
2019-01-06,408.6
...,...
2020-12-27,470.5
2020-12-28,470.5
2020-12-29,470.5
2020-12-30,470.5


In [89]:
df.to_csv('maharashtra_energy_dataset_filled.csv')

In [90]:
!ls

aws_forecast_energy_timeseries.ipynb
backup.txt
dataset_tk.csv
long_data_.csv
maharashtra_energy_dataset_filled.csv
state-wise-power-consumption-in-india.zip
[1m[36mvenv[m[m


In [93]:
!head -n 5 maharashtra_energy_dataset_filled.csv

,Maharashtra
2019-01-02,428.6
2019-01-03,419.6
2019-01-04,395.8
2019-01-05,411.1


In [94]:
# AWS Forecast requires an column 'item_id'
df['item_id'] = 'maharashtra'

In [95]:
df.head()

Unnamed: 0,Maharashtra,item_id
2019-01-02,428.6,maharashtra
2019-01-03,419.6,maharashtra
2019-01-04,395.8,maharashtra
2019-01-05,411.1,maharashtra
2019-01-06,408.6,maharashtra


In [99]:
# Leave last 30 points for evaluating Forecast predictions
FORECAST_LENGTH = 30
train = df.iloc[:-FORECAST_LENGTH]

In [100]:
print(len(df), len(train))

735 705


In [103]:
# AWS can have a "target time series" and a "related time series"
train_target_series = train  # Here we doesn't have any additional data columns for related series

In [108]:
# Save data to be uploaded to S3
train_target_series.to_csv("maharashtra_energy_dataset_target_series.csv", header=None)

In [106]:
!head -n 5 maharashtra_energy_dataset_target_series.csv

2019-01-02,428.6,maharashtra
2019-01-03,419.6,maharashtra
2019-01-04,395.8,maharashtra
2019-01-05,411.1,maharashtra
2019-01-06,408.6,maharashtra


## Create Dataset Group and upload data to S3

In [110]:
bucket_name = 'energy-prediction-aws-forecast'

In [111]:
role_arn = 'arn:aws:iam::312202024311:role/ForecastRoleS3Access'

In [112]:
s3 = boto3.client('s3')

In [117]:
# s3.list_buckets()['Buckets']

In [123]:
DATASET_FREQUENCY = 'D'
TIMESTAMP_FORMAT = 'yyyy-MM-dd'

PROJECT = 'daily_energy_forecast'
DATA_VERSION = 1

dataset_group = f'{PROJECT}_dataset_group_{DATA_VERSION}'
print(f"Dataset Group Name = {dataset_group}")

Dataset Group Name = daily_energy_forecast_dataset_group_1


In [119]:
forecast_client = boto3.client('forecast')
forecastquery_client = boto3.client('forecastquery')

In [121]:
forecastquery_client

<botocore.client.ForecastQueryService at 0x11f7627d0>

In [None]:
# create a dataset group
dataset_arns = []
create_dataset_group_response = forecast_client.create_dataset_group(Domain='CUSTOM',
                                                                     DatasetGroupName=dataset_group,
                                                                     DatasetArns=dataset_arns)

In [None]:
# wait until its complete!
dataset_group_arn = create_dataset_group_response['DatasetGroupArn']
describe = forecast_client.describe_dataset_group(DatasetGroupArn=dataset_group_arn)

print(describe['Status'])
print(describe['CreationTime'])
print(describe['LastModificationTime'])

## Create Schema

In [None]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
target_schema ={
   "Attributes":[
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"target_value",
         "AttributeType":"float"
      },
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      }
   ]
}

In [124]:
target_dataset_name = f'{PROJECT}_{DATA_VERSION}'
print(f'Dataset name = {target_dataset_name}' )

Dataset name = daily_energy_forecast_1


## Create a Dataset

In [None]:
response = forecast_client.create_dataset(Domain="CUSTOM",
                                          DatasetType='TARGET_TIME_SERIES',
                                          DatasetName=target_dataset_name,
                                          DataFrequency=DATASET_FREQUENCY,
                                          Schema=target_schema)

In [None]:
# Check the response
target_dataset_arn = response['DatasetArn']
describe = forecast_client.describe_dataset(DatasetArn=target_dataset_arn)

print(describe['Status'])
print(describe['CreationTime'])
print(describe['LastModificationTime'])

## Update the dataset group with the datasets we created

In [None]:
dataset_arns = []
dataset_arns.append(target_dataset_arn)
forecast_client.update_dataset_group(DatasetGroupArn=dataset_group_arn, DatasetArns=dataset_arns)

## Upload Data to S3

In [125]:
s3r = boto3.resource('s3')

In [133]:
# s3r.Bucket(bucket_name).Object("maharashtra_energy_dataset_target_series.csv").upload_file("maharashtra_energy_dataset_target_series.csv")