# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [2]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

You are already connected to a glueetl session af0ec154-ffe0-4ecb-8ad5-7f60343446cc.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session af0ec154-ffe0-4ecb-8ad5-7f60343446cc.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 5.0


You are already connected to a glueetl session af0ec154-ffe0-4ecb-8ad5-7f60343446cc.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: None
Setting new worker type to: G.1X


You are already connected to a glueetl session af0ec154-ffe0-4ecb-8ad5-7f60343446cc.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: None
Setting new number of workers to: 5



#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
#dyf = glueContext.create_dynamic_frame.from_catalog(database='database_name', table_name='table_name')
#dyf.printSchema()

In [4]:
dyf_emp = glueContext.create_dynamic_frame.from_options(
    connection_type = "s3",
    connection_options = {
        "paths": ["s3://practice-spark-shu/employees_120.csv"]
    },
    format = "CSV",
    format_options = {
    "withHeader" : True
    }

)




In [12]:
dyf_emp = dyf_emp.resolveChoice(specs=[("emp_id","cast:int"),
                                      ("salary","cast:double"),])




In [13]:
dyf_emp.printSchema()

root
|-- emp_id: int
|-- name: string
|-- department: string
|-- salary: double
|-- join_date: string
|-- city: string


In [14]:
df_emp = dyf_emp.toDF()




In [15]:
df_emp.show()

+------+------+----------+--------+----------+---------+
|emp_id|  name|department|  salary| join_date|     city|
+------+------+----------+--------+----------+---------+
|    20| Emp20|        HR|106276.0|2018-05-31|    Delhi|
|    40| Emp40|        HR|119149.0|2019-01-20|     Pune|
|    60| Emp60|   Finance|130434.0|2022-02-15|Bangalore|
|    80| Emp80|     Sales| 48704.0|2019-03-12|   Mumbai|
|   100|Emp100|        IT|136382.0|2019-11-28|   Mumbai|
|   120|Emp120|        HR| 45058.0|2018-04-17|Bangalore|
|     1|  Emp1|        HR|118675.0|2021-04-13|Bangalore|
|    21| Emp21|        IT| 37551.0|2020-01-28|   Mumbai|
|    41| Emp41|        HR| 93352.0|2022-07-05|     Pune|
|    61| Emp61|     Sales| 59468.0|2020-10-02|     Pune|
|    81| Emp81|        IT|147980.0|2021-09-27|Bangalore|
|   101|Emp101|        HR| 81036.0|2023-01-05|Bangalore|
|     2|  Emp2|        HR|123537.0|2023-01-18|    Delhi|
|    22| Emp22|       Ops|127669.0|2022-02-02|   Mumbai|
|    42| Emp42|        HR|11677

In [18]:
dyf_sales = glueContext.create_dynamic_frame.from_options(
    connection_type = 's3',
    connection_options = {
        "paths":["s3://practice-spark-shu/sales_120.csv"]
        
    },
    format = "CSV",
    format_options = {
        "withHeader" : True
    }
)




In [21]:
dyf_sales = dyf_sales.resolveChoice(specs=[("order_id","cast:int"),
                                          ("customer_id","cast:int"),
                                          ("price","cast:double"),
                                          ("quantity","cast:int")])




In [22]:
dyf_sales.printSchema()

root
|-- order_id: int
|-- customer_id: int
|-- product: string
|-- price: double
|-- quantity: int
|-- order_date: string


In [23]:
df_sales = dyf_sales.toDF()




In [24]:
df_sales.show()

+--------+-----------+----------+------+--------+----------+
|order_id|customer_id|   product| price|quantity|order_date|
+--------+-----------+----------+------+--------+----------+
|      20|       null|    Tablet| 100.0|       4|2024-01-12|
|      40|       null|Headphones| 500.0|       1|2024-01-31|
|      60|       null|     Watch| 300.0|       4|2024-01-27|
|      80|       null|     Phone| 300.0|       2|2024-01-03|
|     100|       null|     Watch| 500.0|       4|2024-01-18|
|     120|       null|Headphones| 300.0|       1|2024-01-24|
|       1|       null|     Phone| 200.0|       5|2024-01-22|
|      21|       null|    Tablet| 800.0|       2|2024-01-29|
|      41|       null|    Tablet| 800.0|       1|2024-01-07|
|      61|       null|    Laptop| 300.0|       3|2024-01-25|
|      81|       null|    Laptop| 200.0|       4|2024-01-16|
|     101|       null|    Laptop|1200.0|       1|2024-01-04|
|       2|       null|    Laptop|1200.0|       5|2024-01-31|
|      22|       null|  

In [25]:
dyf_tran = glueContext.create_dynamic_frame.from_options(
    connection_type = "s3",
    connection_options = {
        "paths":["s3://practice-spark-shu/transactions_120.csv"]
    },
    format = "CSV",
    format_options = {"withHeader":True}
)




In [28]:
dyf_tran = dyf_tran.resolveChoice(specs = [("amount","cast:double")])




In [29]:
dyf_tran.printSchema()

root
|-- txn_id: string
|-- user_id: string
|-- amount: double
|-- status: string
|-- txn_date: string


In [30]:
df_tran = dyf_tran.toDF()




In [31]:
df_tran.show()

+------+-------+------+-------+----------+
|txn_id|user_id|amount| status|  txn_date|
+------+-------+------+-------+----------+
|   T20|    U12|1034.0|SUCCESS|2024-02-12|
|   T40|     U8| 628.0|SUCCESS|2024-02-21|
|   T60|    U10|1824.0|SUCCESS|2024-02-13|
|   T80|     U8| 504.0| FAILED|2024-02-12|
|  T100|    U11|1832.0| FAILED|2024-02-04|
|  T120|    U19| 711.0|SUCCESS|2024-02-15|
|    T1|     U9|1990.0| FAILED|2024-02-16|
|   T21|    U17|1696.0| FAILED|2024-02-13|
|   T41|    U19|1681.0| FAILED|2024-02-12|
|   T61|    U16|1624.0| FAILED|2024-02-16|
|   T81|     U9|1511.0|SUCCESS|2024-02-09|
|  T101|     U5| 202.0| FAILED|2024-02-14|
|    T2|    U11| 458.0| FAILED|2024-02-05|
|   T22|    U15|1182.0|SUCCESS|2024-02-04|
|   T42|     U3|1985.0|SUCCESS|2024-02-13|
|   T62|    U20| 287.0|SUCCESS|2024-02-02|
|   T82|     U8| 386.0|SUCCESS|2024-02-11|
|  T102|    U16| 497.0| FAILED|2024-02-10|
|    T3|    U16|1897.0|SUCCESS|2024-02-08|
|   T23|    U21|1769.0| FAILED|2024-02-07|
+------+---