# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [17]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the SparkContext that is already running, or create one if none exists.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; the read and write cells below go through it.
glueContext = GlueContext(sc)
# Spark session handle, used later to build DataFrames from RDDs.
spark = glueContext.spark_session
# Glue Job object bound to this context (not referenced by the cells visible here).
job = Job(glueContext)

You are already connected to a glueetl session 2a587f2d-6a92-4d36-81bc-0e79c3e968e7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session 2a587f2d-6a92-4d36-81bc-0e79c3e968e7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 3.0


You are already connected to a glueetl session 2a587f2d-6a92-4d36-81bc-0e79c3e968e7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: G.1X
Setting new worker type to: G.1X


You are already connected to a glueetl session 2a587f2d-6a92-4d36-81bc-0e79c3e968e7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: 5
Setting new number of workers to: 5



## Read

In [18]:
def _read_motionsense_csv(s3_prefix):
    """Read every CSV object under ``s3_prefix`` (recursively) into a Glue DynamicFrame.

    NOTE(review): "header" is passed inside connection_options. The notebook's
    "Fix header" step still treats the first row as the column names, so this
    option does not appear to take effect here. AWS Glue documents the CSV
    header switch as ``withHeader`` under ``format_options`` -- confirm before
    changing, because the downstream header handling depends on the current
    behaviour.
    """
    return glueContext.create_dynamic_frame.from_options(
        connection_type="s3",
        connection_options={
            "paths": [s3_prefix],
            "recurse": True,
            "header": "true",
        },
        format="csv",
    )


# Read the static and the streamed motion data as dynamic frames
motion_static = _read_motionsense_csv("s3://refit-iot/data/motionsense/static/")
motion_streamed = _read_motionsense_csv("s3://refit-iot/data/motionsense/streamed/")

# Convert both dynamic frames to Spark DataFrames
motion_df_static = motion_static.toDF()
motion_df_streamed = motion_streamed.toDF()




## Fix header

In [51]:
# Static: treat the first row as the column names, keep only the rows that
# differ from it, then drop the "Unnamed: 0" and "" columns.
static_rows = motion_df_static.rdd
header = static_rows.first()
static_body = static_rows.filter(lambda row: row != header)
motion_final_static = spark.createDataFrame(static_body, header).drop("Unnamed: 0", "")

# Streamed: same treatment as the static frame.
streamed_rows = motion_df_streamed.rdd
header = streamed_rows.first()
streamed_body = streamed_rows.filter(lambda row: row != header)
motion_final_streamed = spark.createDataFrame(streamed_body, header).drop("Unnamed: 0", "")




In [52]:
from pyspark.sql.functions import col, to_date, to_timestamp

# Column roles used when assigning types.
TIMESTAMP_COLUMN = "time_series_data"
LABEL_COLUMN = "test_type"
TIMESTAMP_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS"


def _apply_motion_types(frame):
    """Return ``frame`` with the timestamp parsed and all other measurements as doubles.

    * ``time_series_data`` is parsed with ``to_timestamp`` using TIMESTAMP_FORMAT.
    * ``test_type`` is passed through unchanged.
    * every remaining column is cast to ``double``.

    The column list is taken from ``frame`` itself, so the static and streamed
    frames do not need to have identical schemas. All conversions are issued in
    a single ``select`` rather than one ``withColumn`` call per column, because
    each ``withColumn`` adds another projection to the query plan.

    Raises:
        ValueError: if ``frame`` has no ``time_series_data`` column.
    """
    if TIMESTAMP_COLUMN not in frame.columns:
        raise ValueError(
            f"Expected a '{TIMESTAMP_COLUMN}' column, found: {frame.columns}"
        )

    typed_columns = []
    for name in frame.columns:
        if name == TIMESTAMP_COLUMN:
            # Time: str to timestamp
            typed_columns.append(to_timestamp(col(name), TIMESTAMP_FORMAT).alias(name))
        elif name == LABEL_COLUMN:
            typed_columns.append(col(name))
        else:
            typed_columns.append(col(name).cast("double").alias(name))
    return frame.select(typed_columns)


motion_final_streamed = _apply_motion_types(motion_final_streamed)
motion_final_static = _apply_motion_types(motion_final_static)




## Write

In [55]:
from awsglue.dynamicframe import DynamicFrame

# Wrap each typed Spark DataFrame back into a Glue DynamicFrame (both are
# registered under the name 'convert').
motion_static_dyf, motion_streamed_dyf = (
    DynamicFrame.fromDF(typed_df, glueContext, 'convert')
    for typed_df in (motion_final_static, motion_final_streamed)
)




In [56]:
import boto3

# Streamed

# Housekeeping
database_name = "motionsense"
table_name = "streamed"
glue_client = boto3.client('glue')

# Define schema: one {"Name", "Type"} entry per field of the dynamic frame
schema = motion_streamed_dyf.schema()
columns = [
    {
        "Name": field.name,
        "Type": field.dataType.typeName()
    }
    for field in schema.fields
]

# Create table configurations (comma-delimited text table whose first line is a header)
create_table_options_streamed = {
    "DatabaseName": database_name,
    "TableInput": {
        "Name": table_name,
        "Description": "Streamed data for motion sense",
        "StorageDescriptor": {
            "Columns": columns,
            "Location": "s3://refit-iot/final_data_landing/motionsense/streamed/",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Compressed": False,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                "Parameters": {
                    "field.delim": ",",
                    "skip.header.line.count": "1"
                }
            }
        },
        "PartitionKeys": []
    }
}

# Check if the streamed table exists; create it only when the catalog reports
# it as missing. Any other failure (permissions, throttling, network) is
# allowed to propagate instead of being mistaken for "table not found".
try:
    response = glue_client.get_table(
        DatabaseName=database_name,
        Name=table_name
    )
    print(f"{table_name} already exists. Directly writing...")
except glue_client.exceptions.EntityNotFoundException:
    print(f"{table_name} does not exist. Creating...")
    response_streamed = glue_client.create_table(**create_table_options_streamed)

# Write the streamed dynamic frame through the catalog table
glueContext.write_dynamic_frame.from_catalog(
    frame=motion_streamed_dyf,
    database=database_name,
    table_name=table_name
)

print(f"Sucessfully wrote to {table_name}")

streamed does not exist. Creating...
Sucessfully wrote to streamed


In [57]:
# Static

# Housekeeping
database_name = "motionsense"
table_name = "static"
glue_client = boto3.client('glue')

# Define schema: one {"Name", "Type"} entry per field of the dynamic frame
schema = motion_static_dyf.schema()
columns = [
    {
        "Name": field.name,
        "Type": field.dataType.typeName()
    }
    for field in schema.fields
]

# Create table configurations (comma-delimited text table whose first line is a header).
# Named *_static so it no longer overwrites the streamed table's configuration.
create_table_options_static = {
    "DatabaseName": database_name,
    "TableInput": {
        "Name": table_name,
        "Description": "Static data for motion sense",
        "StorageDescriptor": {
            "Columns": columns,
            "Location": "s3://refit-iot/final_data_landing/motionsense/static/",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Compressed": False,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                "Parameters": {
                    "field.delim": ",",
                    "skip.header.line.count": "1"
                }
            }
        },
        "PartitionKeys": []
    }
}

# Check if the static table exists; create it only when the catalog reports
# it as missing. Any other failure (permissions, throttling, network) is
# allowed to propagate instead of being mistaken for "table not found".
try:
    response = glue_client.get_table(
        DatabaseName=database_name,
        Name=table_name
    )
    print(f"{table_name} already exists. Directly writing...")
except glue_client.exceptions.EntityNotFoundException:
    print(f"{table_name} does not exist. Creating...")
    response_static = glue_client.create_table(**create_table_options_static)

# Write the static dynamic frame through the catalog table
glueContext.write_dynamic_frame.from_catalog(
    frame=motion_static_dyf,
    database=database_name,
    table_name=table_name
)

print(f"Sucessfully wrote to {table_name}")

static does not exist. Creating...
Sucessfully wrote to static
