####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the SparkContext for this session, reusing an existing one if present.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext (needed later to build and write DynamicFrames).
glueContext = GlueContext(sc)
# Spark session associated with the GlueContext.
spark = glueContext.spark_session
# Glue Job handle bound to the same context; it is not initialised or committed
# in any of the cells visible in this notebook.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 3.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::001898544471:role/glue
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: 2b613577-06fb-4617-83fd-6a4ffb401f96
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
Waiting for session 2b613577-06fb-4617-83fd-6a

## Trips

In [4]:
from pyspark.sql import SparkSession

# Obtain the session handle; getOrCreate() hands back the running session when one exists.
spark = (
    SparkSession.builder
    .appName("Read CSV into PySpark DataFrame")
    .getOrCreate()
)

# Static trips extract: first row is the header, column types are inferred by Spark.
file_path = "s3://usecases-glue-jobs/divvy/static/final_static.csv"
trips_df_static = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# `_c0` is the unnamed leading column of the CSV (presumably an exported row
# index -- confirm); the join cell works from this copy without it.
trips_final_static = trips_df_static.drop("_c0")

# Preview the frame as loaded. NOTE: this is the raw frame, so `_c0` still shows up.
trips_df_static.show(5)

+---+-------------------+-----+-----+
|_c0|         start_time|trips|  zip|
+---+-------------------+-----+-----+
|  0|2013-06-27 01:00:00|    1|60661|
|  1|2013-06-27 11:00:00|    1|60622|
|  2|2013-06-27 11:00:00|    3|60607|
|  3|2013-06-27 12:00:00|    1|60614|
|  4|2013-06-27 12:00:00|    2|60611|
+---+-------------------+-----+-----+
only showing top 5 rows


## Landmark

In [5]:
# Static and Streamed all in 1 DF
from pyspark.sql.functions import col

# Landmark counts per zip code.
file_path = "s3://usecases-glue-jobs/divvy/static/landmark_clean.csv"
landmark_df = (
    spark.read.csv(file_path, header=True, inferSchema=True)
    # Discard the unnamed leading CSV column.
    .drop("_c0")
    # Store the zip code as text rather than the inferred type.
    .withColumn("zip_code", col("zip_code").cast("string"))
)

# Quick look at the first rows.
landmark_df.show(5)

+--------+---------+
|zip_code|landmarks|
+--------+---------+
|   60302|        1|
|   60409|        1|
|   60601|       15|
|   60602|        9|
|   60603|       12|
+--------+---------+
only showing top 5 rows


## Weather

In [6]:
# Static and Streamed all in 1 DF
# Weather observations keyed by a `time` column.
file_path = "s3://usecases-glue-jobs/divvy/static/weather_static.csv"
weather_df_static = (
    spark.read.csv(file_path, header=True, inferSchema=True)
    # Discard the unnamed leading CSV column.
    .drop("_c0")
)

# Preview the data and the inferred column types.
weather_df_static.show(5)
weather_df_static.printSchema()

+-------------------+----+------------+--------+-------------+------+----+----+----------+---------+
|               time|temp|rel_humidity|dewpoint|apparent_temp|precip|rain|snow|cloudcover|windspeed|
+-------------------+----+------------+--------+-------------+------+----+----+----------+---------+
|2013-01-01 00:00:00|-4.2|          66|    -9.7|        -10.2|   0.0| 0.0| 0.0|        79|     15.8|
|2013-01-01 01:00:00|-4.3|          67|    -9.5|        -10.5|   0.0| 0.0| 0.0|        72|     16.1|
|2013-01-01 02:00:00|-4.4|          67|    -9.7|        -10.3|   0.0| 0.0| 0.0|        82|     14.6|
|2013-01-01 03:00:00|-4.6|          67|    -9.8|        -10.5|   0.0| 0.0| 0.0|        80|     14.4|
|2013-01-01 04:00:00|-4.8|          68|    -9.9|        -11.9|   0.0| 0.0| 0.0|        37|     16.3|
+-------------------+----+------------+--------+-------------+------+----+----+----------+---------+
only showing top 5 rows

root
 |-- time: string (nullable = true)
 |-- temp: double (nullab

## Join

In [7]:
# Weather and trips
from pyspark.sql.functions import col, to_timestamp, date_format

# Trips + weather: match each trip row to the weather row whose `time` equals
# the trip's `start_time`. Left join, so trips without a weather match are kept.
wt_static = trips_final_static.join(
    weather_df_static,
    on=trips_final_static["start_time"] == weather_df_static["time"],
    how="left",
)

# (Trips + weather) + landmarks: match on zip code, again keeping every trip row.
wtl_static = wt_static.join(
    landmark_df,
    on=wt_static["zip"] == landmark_df["zip_code"],
    how="left",
)

# Remove the right-hand join keys (they duplicate `zip` / `start_time`), sort
# chronologically, and render start_time as a "yyyy-MM-dd HH:mm:ss" string.
wtl_static_final = (
    wtl_static
    .drop("zip_code", "time")
    .orderBy("start_time")
    .withColumn("start_time", date_format("start_time", "yyyy-MM-dd HH:mm:ss"))
)




In [8]:
import pyspark.sql.functions as F

# Obtain the session handle again; getOrCreate() returns the running session when one exists.
spark = (
    SparkSession.builder
    .appName("Read CSV into PySpark DataFrame")
    .getOrCreate()
)

# One-hot lookup table for zip codes: grouping by `zip` and pivoting on the
# same column yields one row per zip with one column per distinct zip value,
# holding 1 where row and column agree; the remaining cells are filled with 0.
# The split/explode output (`zip_2`) is selected but not referenced afterwards.
df2 = (
    wtl_static_final
    .select(
        'zip',
        F.explode(F.split(F.col('zip'), ', ')).alias('zip_2'),
    )
    .groupBy('zip')
    .pivot('zip')
    .agg(F.lit(1))
    .fillna(0)
)




In [9]:
# Rename the key on the left side so it does not clash with df2's own `zip` column.
# NOTE: this rebinds wtl_static_final; the one-hot cell above selects `zip`
# from it, so that cell must run before this one.
wtl_static_final = wtl_static_final.withColumnRenamed("zip", "zip_code")

# Attach the one-hot zip columns to every row, sort chronologically, then drop
# both join keys so only the indicator columns remain.
wt_ohe = (
    wtl_static_final
    .join(df2, on=wtl_static_final["zip_code"] == df2["zip"], how="left")
    .orderBy("start_time")
    .drop("zip_code", "zip")
)

wt_ohe.show(5)

+-------------------+-----+----+------------+--------+-------------+------+----+----+----------+---------+---------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|         start_time|trips|temp|rel_humidity|dewpoint|apparent_temp|precip|rain|snow|cloudcover|windspeed|landmarks|60201|60202|60208|60301|60302|60304|60601|60602|60603|60604|60605|60606|60607|60608|60609|60610|60611|60612|60613|60614|60615|60616|60617|60618|60619|60620|60621|60622|60623|60624|60625|60626|60628|60629|60630|60632|60636|60637|60638|60640|60641|60642|60643|60644|60645|60646|60647|60649|60651|60653|60654|60657|60659|60660|60661|60696|60804|
+-------------------+-----+----+------------+--------+-------------+------+----+--

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from math import pi

# Make sure start_time is of correct data type
# start_time was rendered as a string in the join cell; turn it back into a timestamp.
wt_ohe = wt_ohe.withColumn("start_time", F.col("start_time").cast(T.TimestampType()))

# Reference instant = start_time of the first row Spark returns. wt_ohe was
# sorted by start_time when it was built, so this is presumably the earliest
# timestamp -- confirm if the upstream ordering ever changes.
start = wt_ohe.select('start_time').first()['start_time']

# Elapsed hours between each row and the reference instant (seconds / 3600);
# the raw timestamp column is dropped once the offset exists.
elapsed_hours = (F.unix_timestamp('start_time') - F.unix_timestamp(F.lit(start))) / 3600
df_new = wt_ohe.withColumn('hours_since_start', elapsed_hours).drop('start_time')

# Cyclical encodings: for each period (in hours) add sin/cos of
# hours_since_start * 2*pi / period, named <Label>_sin and <Label>_cos.
for label, period_hours in (('Year', 365 * 24), ('Week', 7 * 24), ('Day', 24)):
    angle = F.col('hours_since_start') * (2 * pi / period_hours)
    df_new = df_new.withColumn(f'{label}_sin', F.sin(angle))
    df_new = df_new.withColumn(f'{label}_cos', F.cos(angle))

## Spark to Glue

In [None]:
from awsglue.dynamicframe import DynamicFrame

# Convert the engineered Spark DataFrame (df_new) into a Glue DynamicFrame so it
# can be written through glueContext; 'convert' is the name given to the new frame.
wtl_static_dyf = DynamicFrame.fromDF(df_new, glueContext, 'convert')

In [None]:
# Make sure the Glue Data Catalog table for the static dataset exists (creating
# it from the DynamicFrame's schema when it does not), then write the frame
# through that catalog entry.
import boto3

database_name = "divvy"
table_name = "static"
glue_client = boto3.client('glue')

# Build the catalog column list from the DynamicFrame's own schema so the table
# definition always matches the data being written.
# NOTE(review): typeName() yields names such as "integer"; confirm downstream
# query engines accept these as catalog column types.
schema = wtl_static_dyf.schema()
columns = [
    {
        "Name": field.name,
        "Type": field.dataType.typeName()
    }
    for field in schema.fields
]

# Table definition: comma-delimited text files under the static_to_store prefix.
create_table_options_static = {
    "DatabaseName": database_name,
    "TableInput": {
        "Name": table_name,
        "Description": "Static data for divvy bikes",
        "StorageDescriptor": {
            "Columns": columns,
            "Location": "s3://usecases-glue-jobs/divvy/static/static_to_store/",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Compressed": False,
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
                "Parameters": {
                    "field.delim": ","
                }
            }
        },
        "PartitionKeys": []
    }
}

# Only a "table not found" answer should trigger creation. Any other failure
# (permissions, throttling, wrong database, ...) must surface instead of being
# mistaken for a missing table.
try:
    response = glue_client.get_table(
        DatabaseName=database_name,
        Name=table_name
    )
except glue_client.exceptions.EntityNotFoundException:
    print(f"{table_name} does not exist. Creating...")
    response_static = glue_client.create_table(**create_table_options_static)

# Write the static DynamicFrame built in the previous cell. The original code
# referenced `wtl_streamed_dyf`, which is never defined in this notebook, and
# passed `create_dynamic_frame_options`, which is not a documented parameter of
# from_catalog; the output layout comes from the catalog table definition.
glueContext.write_dynamic_frame.from_catalog(
    frame=wtl_static_dyf,
    database=database_name,
    table_name=table_name,
)

print(f"Sucessfully wrote to {table_name}")