# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Session ID: 4f0ce413-f578-40fe-ba00-cba060e70a4f
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
Waiting for session 4f0ce413-f578-40fe-ba00-cba060e70a4f to get into ready status...
Session 4f0ce413-f578-40fe-ba00-cba060e70a4f has been created.



#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [2]:
dyf = glueContext.create_dynamic_frame.from_catalog(database='test-flights-db', table_name='flightscsv')
dyf.printSchema()

root
|-- year: long
|-- quarter: long
|-- month: long
|-- day_of_month: long
|-- day_of_week: long
|-- fl_date: string
|-- unique_carrier: string
|-- airline_id: long
|-- carrier: string
|-- tail_num: string
|-- fl_num: long
|-- origin_airport_id: long
|-- origin_airport_seq_id: long
|-- origin_city_market_id: long
|-- origin: string
|-- origin_city_name: string
|-- origin_state_abr: string
|-- origin_state_fips: long
|-- origin_state_nm: string
|-- origin_wac: long
|-- dest_airport_id: long
|-- dest_airport_seq_id: long
|-- dest_city_market_id: long
|-- dest: string
|-- dest_city_name: string
|-- dest_state_abr: string
|-- dest_state_fips: long
|-- dest_state_nm: string
|-- dest_wac: long
|-- crs_dep_time: long
|-- dep_time: long
|-- dep_delay: long
|-- dep_delay_new: long
|-- dep_del15: long
|-- dep_delay_group: long
|-- dep_time_blk: string
|-- taxi_out: long
|-- wheels_off: long
|-- wheels_on: long
|-- taxi_in: long
|-- crs_arr_time: long
|-- arr_time: long
|-- arr_delay: long
|-- 

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [3]:
df = dyf.toDF()
df.show()

+----+-------+-----+------------+-----------+--------+--------------+----------+-------+--------+------+-----------------+---------------------+---------------------+------+------------------+----------------+-----------------+---------------+----------+---------------+-------------------+-------------------+----+---------------+--------------+---------------+-------------+--------+------------+--------+---------+-------------+---------+---------------+------------+--------+----------+---------+-------+------------+--------+---------+-------------+---------+---------------+------------+---------+-----------------+--------+----------------+-------------------+--------+-------+--------+--------------+---+-------------+-------------+---------+--------------+-------------------+--------------+---------------+-----------------+
|year|quarter|month|day_of_month|day_of_week| fl_date|unique_carrier|airline_id|carrier|tail_num|fl_num|origin_airport_id|origin_airport_seq_id|origin_city_market_id

In [5]:
df.count() 

5248439


In [7]:
df.groupBy("month").avg("weather_delay").show()

+-----+------------------+
|month|avg(weather_delay)|
+-----+------------------+
|    6|3.0652843235918334|
|    7|3.7375463244161584|
|    1| 2.836573460116814|
|    8|2.6909718529118845|
|    2| 3.420119974059663|
|    5| 2.978542477946436|
|    9|2.2802970825613165|
|   10|1.0367928260582884|
|   11|1.8415311529325782|
|    3|2.1046518669234215|
|    4|2.0263035416697943|
|   12|3.2350931047464595|
+-----+------------------+


In [8]:
df.groupBy("carrier").avg("dep_delay").show()

+-------+--------------------+
|carrier|      avg(dep_delay)|
+-------+--------------------+
|     VX|  11.430539700659564|
|     DL|   8.046840033440557|
|     B6|  14.078356804135487|
|     UA|    11.4228323487426|
|     HA|-0.05495938351609...|
|     OO|   7.491768940979489|
|     F9|  13.005154146165209|
|     NK|  12.359647183659032|
|     EV|   9.602250750250084|
|     AS|  0.6186801842342103|
|     AA|   9.788575652992082|
|     WN|    9.50768465033291|
+-------+--------------------+
