In [102]:
import ast
import sys
from io import StringIO

import boto3
import pandas as pd
from awsglue.transforms import *
from awsglue.transforms import *
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import year, col


### below just for testing - not required if creating glue job script and deploying
# Simulate the command line a deployed Glue job would receive, so that
# getResolvedOptions can be exercised from the notebook.
arg_vals = [
    '--JOB_NAME=test',
    '--database=default',
    '--table=sample_manning_csv',
    '--destination=s3://aws-forecast-demo-examples/',
    '--year_range=[2010, 2014]',
]
sys.argv = [sys.argv[0]]
sys.argv.extend(arg_vals)

# Resolve the named job parameters into a dict keyed by parameter name.
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'database', 'table', 'destination', 'year_range'])
glueContext = GlueContext(SparkContext.getOrCreate())
# A later cell reads the raw CSV through `spark`; define it explicitly instead
# of relying on the notebook kernel to inject it.
spark = glueContext.spark_session


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Fail fast on a malformed --year_range before any Spark work starts.
# The value arrives as a string and must parse to a two-element list of ints.
try:
    _year_range = ast.literal_eval(args['year_range'])
except (ValueError, TypeError, SyntaxError):
    # literal_eval rejects anything that is not a valid Python literal;
    # treat that the same as a wrongly shaped value.
    _year_range = None

# Explicit checks instead of `assert`, which is stripped when Python runs with -O.
if (
    not isinstance(_year_range, (list, tuple))
    or len(_year_range) != 2
    or not all(isinstance(bound, int) for bound in _year_range)
):
    raise ValueError(
        "--year_range needs to have two integer values in str list passed as arg "
        f"'[lower_year, upper_year]'. You passed in '{args['year_range']}'"
    )

                    

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Create a DynamicFrame from the Glue Data Catalog table

In [143]:
# Load the source table from the Glue Data Catalog, using the database and
# table names passed as job arguments rather than hard-coded values.
# if enabling glue job bookmark, need to pass in transformation_ctx for bookmark to track dataframe states
ts_dyf = glueContext.create_dynamic_frame.from_catalog(
    database=args['database'],
    table_name=args['table'],
    transformation_ctx="read data from S3",
)
print("Count: " + str(ts_dyf.count()))
ts_dyf.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Count: 2905
root
|-- index: long
|-- ds: string
|-- y: double

#### Check schema inferred by spark when reading directly from S3 

In [37]:
# Read the same CSV straight from S3 and let Spark infer column types, to
# compare against the schema recorded in the catalog.
csv_reader = spark.read.format("csv").options(header="true", inferSchema="true")
ts_df = csv_reader.load('s3://aws-forecast-demo-examples/sample_manning.csv')
ts_df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- index: integer (nullable = true)
 |-- ds: timestamp (nullable = true)
 |-- y: double (nullable = true)

The Glue crawler that created the catalog table inferred the `ds` column as a string, whereas we need it as a timestamp
for AWS Forecast. The `index` column will be dropped anyway, so its type is irrelevant.

#### Drop index field

In [57]:
# Remove the "index" column, then preview the remaining columns.
dyf_dropped = DropFields.apply(
    frame=ts_dyf,
    paths=["index"],
    transformation_ctx="drop index columnn",
)
dropped_preview_df = dyf_dropped.toDF()
dropped_preview_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+----------------+
|        ds|               y|
+----------+----------------+
|2007-12-10|9.59076113897809|
|2007-12-11|8.51959031601596|
|2007-12-12|8.18367658262066|
|2007-12-13|8.07246736935477|
|2007-12-14| 7.8935720735049|
|2007-12-15|7.78364059622125|
|2007-12-16|8.41405243249672|
|2007-12-17|8.82922635473185|
|2007-12-18|8.38251828808963|
|2007-12-19|8.06965530688617|
|2007-12-20|7.87929148508227|
|2007-12-21|7.76174498465891|
|2007-12-22|7.52940645783701|
|2007-12-23|8.38526052015541|
|2007-12-24|8.62011072542292|
|2007-12-25|7.85243908535751|
|2007-12-26|7.85399308722424|
|2007-12-27| 8.0519780789023|
|2007-12-28|7.92660259918138|
|2007-12-29|7.83834331555712|
+----------+----------------+
only showing top 20 rows

#### Rename columns and cast 

In [59]:
# Each mapping is (source name, source type, target name, target type):
# "ds" is renamed to "timestamp" and cast, "y" is renamed to "target_value".
column_mappings = [
    ("ds", "String", "timestamp", "timestamp"),
    ("y", "double", "target_value", "double"),
]
dyf_applyMapping = ApplyMapping.apply(
    frame=dyf_dropped,
    mappings=column_mappings,
    transformation_ctx="rename and cast columns",
)
dyf_applyMapping.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+----------------+
|          timestamp|    target_value|
+-------------------+----------------+
|2007-12-10 00:00:00|9.59076113897809|
|2007-12-11 00:00:00|8.51959031601596|
|2007-12-12 00:00:00|8.18367658262066|
|2007-12-13 00:00:00|8.07246736935477|
|2007-12-14 00:00:00| 7.8935720735049|
|2007-12-15 00:00:00|7.78364059622125|
|2007-12-16 00:00:00|8.41405243249672|
|2007-12-17 00:00:00|8.82922635473185|
|2007-12-18 00:00:00|8.38251828808963|
|2007-12-19 00:00:00|8.06965530688617|
|2007-12-20 00:00:00|7.87929148508227|
|2007-12-21 00:00:00|7.76174498465891|
|2007-12-22 00:00:00|7.52940645783701|
|2007-12-23 00:00:00|8.38526052015541|
|2007-12-24 00:00:00|8.62011072542292|
|2007-12-25 00:00:00|7.85243908535751|
|2007-12-26 00:00:00|7.85399308722424|
|2007-12-27 00:00:00| 8.0519780789023|
|2007-12-28 00:00:00|7.92660259918138|
|2007-12-29 00:00:00|7.83834331555712|
+-------------------+----------------+
only showing top 20 rows

#### Convert the DynamicFrame to a PySpark DataFrame to use the Spark filter operation

In [74]:
# Parse the "[lower_year, upper_year]" job argument into a Python list.
filter_years = ast.literal_eval(args['year_range'])

# Keep rows whose year lies strictly between the two bounds
# (both comparisons are exclusive, so the bound years themselves are dropped).
timestamp_year = year(col("timestamp"))
after_lower_bound = timestamp_year > filter_years[0]
before_upper_bound = timestamp_year < filter_years[1]
filtered_df = dyf_applyMapping.toDF().where(after_lower_bound & before_upper_bound)
filtered_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------+----------------+
|          timestamp|    target_value|
+-------------------+----------------+
|2011-01-01 00:00:00|9.00969189848934|
|2011-01-02 00:00:00|9.39897529082673|
|2011-01-03 00:00:00|9.99392223000734|
|2011-01-04 00:00:00|9.06149227523977|
|2011-01-05 00:00:00|8.97119446318447|
|2011-01-06 00:00:00|8.94689552388845|
|2011-01-07 00:00:00|9.18696938565294|
|2011-01-08 00:00:00| 9.0980671294934|
|2011-01-09 00:00:00|10.8781037947059|
|2011-01-10 00:00:00|9.38269576445829|
|2011-01-11 00:00:00|9.19897604189713|
|2011-01-12 00:00:00|8.62119278143472|
|2011-01-13 00:00:00|8.61323037961318|
|2011-01-14 00:00:00|8.69517199877606|
|2011-01-15 00:00:00|8.72029728739272|
|2011-01-16 00:00:00|9.50031980347665|
|2011-01-17 00:00:00|9.34757739028127|
|2011-01-18 00:00:00|8.78370269863522|
|2011-01-19 00:00:00|8.70217786562968|
|2011-01-20 00:00:00| 8.6821990260005|
+-------------------+----------------+
only showing top 20 rows

#### Write to S3

Convert back to a DynamicFrame before writing the final output to the S3 destination path passed in through the job arguments.

In [141]:
# Repartition to a single partition before wrapping the Spark DataFrame
# back into a Glue DynamicFrame.
single_partition_df = filtered_df.repartition(1)
final_dyf = DynamicFrame.fromDF(single_partition_df, glueContext, "final_dyf")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [144]:
# Write the prepared frame as CSV to the S3 path given by --destination.
s3_sink_options = {"path": args['destination']}
glueContext.write_dynamic_frame.from_options(
    frame=final_dyf,
    connection_type="s3",
    connection_options=s3_sink_options,
    format="csv",
    transformation_ctx="S3 upload",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<awsglue.dynamicframe.DynamicFrame object at 0x7f20d83f5ac8>

#### Rename glue output filename in S3

By default, the Glue write operation gives the output file a generated name based on the job id. To rename it, we have to do so manually by reading the object from S3 and then putting it back under another key.

The data read from S3 is a botocore streaming body object, so the bytes need to be decoded into a CSV string.
To read it with `pd.read_csv`, wrap the string in an in-memory text stream object.

In [187]:
# Split the destination URI into bucket and optional key prefix, so that a
# destination such as s3://bucket/some/prefix/ is handled as well as s3://bucket/.
destination_path = args['destination'].split('//', 1)[-1].strip('/')
bucket, _, destination_prefix = destination_path.partition('/')
# Glue output keys start with "run-" under the destination prefix (if any).
run_prefix = f"{destination_prefix}/run-" if destination_prefix else 'run-'

client = boto3.client('s3')
response = client.list_objects_v2(
    Bucket=bucket,
    Prefix=run_prefix
)

# "Contents" is absent from the response when nothing matches the prefix.
contents = response.get('Contents', [])
if not contents:
    raise RuntimeError(f"No objects found in bucket '{bucket}' with prefix '{run_prefix}'")

objects = [item['Key'] for item in contents]
print(objects)

# The most recently modified object is the output of the latest write.
latest_item = max(contents, key=lambda item: item['LastModified'])
max_date = latest_item['LastModified']
print(max_date)
key = latest_item['Key']


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['run-1652665904728-part-r-00000', 'run-1652669891142-part-r-00000']
2022-05-16 02:58:21+00:00

In [181]:
# Download the newest Glue output object from the destination bucket.
response = client.get_object(Bucket=bucket, Key=key)
# The body is a streaming object; read it fully into memory as bytes.
bytes_data = response['Body'].read()
csv_string = bytes_data.decode('utf-8')
# Wrap the decoded text in a file-like object so pandas can parse it.
pd.read_csv(StringIO(csv_string)).head(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

               timestamp  target_value
0  2011-01-01 00:00:00.0      9.009692
1  2011-01-02 00:00:00.0      9.398975
2  2011-01-03 00:00:00.0      9.993922
3  2011-01-04 00:00:00.0      9.061492
4  2011-01-05 00:00:00.0      8.971194
5  2011-01-06 00:00:00.0      8.946896
6  2011-01-07 00:00:00.0      9.186969
7  2011-01-08 00:00:00.0      9.098067
8  2011-01-09 00:00:00.0     10.878104
9  2011-01-10 00:00:00.0      9.382696

In [191]:
# Upload the bytes already downloaded from the Glue output object under a
# fixed, readable key.
renamed = 'glue_prep_aws_forecast.csv'
print(f"Renaming file {key} to {renamed}")
upload_request = {"Bucket": bucket, "Key": renamed, "Body": bytes_data}
client.put_object(**upload_request)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Renaming file run-1652669891142-part-r-00000 to glue_prep_aws_forecast.csv
{'ResponseMetadata': {'RequestId': 'HV91JPZYN9ANRWDD', 'HostId': 'bCMUQXf8VUeeVkBnUEu9gOa33HXY4niHd4Ia5VZrsCI1jMwNVdgnMCznBQTwA5HCte2X07pwxss=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'bCMUQXf8VUeeVkBnUEu9gOa33HXY4niHd4Ia5VZrsCI1jMwNVdgnMCznBQTwA5HCte2X07pwxss=', 'x-amz-request-id': 'HV91JPZYN9ANRWDD', 'date': 'Mon, 16 May 2022 04:12:44 GMT', 'etag': '"d7c0b11b765cc0f54a63783ff165775c"', 'server': 'AmazonS3', 'content-length': '0'}, 'RetryAttempts': 0}, 'ETag': '"d7c0b11b765cc0f54a63783ff165775c"'}

Delete the original Glue output objects (every key listed under the `run-` prefix) after renaming.

In [190]:

# Remove every Glue-generated object collected in `objects`, one request per key.
for glue_output_key in objects:
    client.delete_object(Key=glue_output_key, Bucket=bucket)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'ResponseMetadata': {'RequestId': 'AYGZYQWPWH8X31EB', 'HostId': 'qCqKD7iVytMr3XGhTwXsW5nBKtd2906YfHD/GXrjX44HEz4iErl0mz7dSoLdahXPoEfNAG3czCQ=', 'HTTPStatusCode': 204, 'HTTPHeaders': {'x-amz-id-2': 'qCqKD7iVytMr3XGhTwXsW5nBKtd2906YfHD/GXrjX44HEz4iErl0mz7dSoLdahXPoEfNAG3czCQ=', 'x-amz-request-id': 'AYGZYQWPWH8X31EB', 'date': 'Mon, 16 May 2022 04:09:13 GMT', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}
{'ResponseMetadata': {'RequestId': 'AYGQW5WEYT31SSMZ', 'HostId': '1xM+px/2M5OcY6tK2T3yx0PwHoK2PSsvb7sLnqCrHxxtnOSmotUDmue/Zfva/itSeZaKi29WLDA=', 'HTTPStatusCode': 204, 'HTTPHeaders': {'x-amz-id-2': '1xM+px/2M5OcY6tK2T3yx0PwHoK2PSsvb7sLnqCrHxxtnOSmotUDmue/Zfva/itSeZaKi29WLDA=', 'x-amz-request-id': 'AYGQW5WEYT31SSMZ', 'date': 'Mon, 16 May 2022 04:09:13 GMT', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}