In [1]:
import os

from pyspark.context import SparkContext, SparkConf
from awsglue.dynamicframe import DynamicFrame

In [2]:
# Location of the AWS Glue jars. It can be overridden with the GLUE_JARS_PATH
# environment variable so the notebook is not tied to a single machine; the
# fallback is the original local checkout path.
jars = os.environ.get('GLUE_JARS_PATH', '/Users/joe/aws-glue-libs/jarsv1/*')

# Put the Glue jars on both the driver and the executor classpath.
conf = SparkConf().setAll([
    ('spark.executor.extraClassPath', jars),
    ('spark.driver.extraClassPath', jars)
])

# getOrCreate() reuses an already-running context, so re-running this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Display the effective configuration to confirm the classpath settings.
sc.getConf().getAll()

[('spark.executor.extraClassPath', '/Users/joe/aws-glue-libs/jarsv1/*'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '192.168.0.14'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '65079'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1616068294858'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.driver.extraClassPath', '/Users/joe/aws-glue-libs/jarsv1/*')]

In [3]:
from awsglue.context import GlueContext

# Build the Glue context on top of the SparkContext created above.
glueContext = GlueContext(sc)

# Spark session exposed by the Glue context; used for DataFrame reads below.
spark = glueContext.spark_session

In [4]:
# Load the catalog parquet file into a Spark DataFrame.
catalog_path = 'data/catalog.parquet'
data = spark.read.parquet(catalog_path)

In [5]:
# Wrap the Spark DataFrame as a Glue DynamicFrame named 'datasource'.
datasource = DynamicFrame.fromDF(
    dataframe=data,
    glue_ctx=glueContext,
    name='datasource',
)

In [6]:
def is_location(record):
    """Return True when the record's resourceType is 'Location'."""
    return record['resourceType'] == 'Location'


# Fields kept for each Location record.
location_fields = ['identifier', 'name', 'type', 'address', 'position']

# Keep only Location records, then project down to the fields of interest.
locations = datasource.filter(is_location).select_fields(location_fields)

In [7]:
# Convert the Glue DynamicFrame into a Spark DataFrame for column-level work.
df = locations.toDF()

# Disabled experiment: drop rows whose `type` is null. The schema printed
# further down reports `type` as a null-typed column, so this would likely
# discard every row -- confirm before re-enabling.
# care_sites = df.na.drop(subset=["type"])

In [None]:
# Preview the Location rows before any reshaping.
df.show()

In [8]:
import pyspark.sql.functions as F

In [9]:
# Expand `identifier` so each of its elements gets its own row; the element
# is placed in a temporary `exploded` column.
identifier_entries = F.explode(F.col('identifier'))
df = df.withColumn('exploded', identifier_entries)

In [10]:
# Promote the `value` field of each exploded identifier entry to a
# top-level `id` column.
df = df.withColumn('id', F.col('exploded').getField('value'))

In [11]:
# The temporary column and the original array are no longer needed once
# `id` has been extracted.
df = df.drop('exploded', 'identifier')

In [12]:
# Preview the frame after `identifier` has been replaced by the flat `id` column.
df.show()

+--------------------+----+--------------------+--------------------+--------------------+
|                name|type|             address|            position|                  id|
+--------------------+----+--------------------+--------------------+--------------------+
|[FALMOUTH HOSPITAL,]|null|[[02540, FALMOUTH...|[41.57072, -70.55...|9cf3cd22-2eec-34e...|
|        [PCP142036,]|null|[[02536-5671, TEA...|[41.562579, -70.5...|830716da-523f-3ca...|
+--------------------+----+--------------------+--------------------+--------------------+



In [13]:
# Inspect the nested structure of `name`, `address` and `position` before flattening.
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- string: string (nullable = true)
 |    |-- array: null (nullable = true)
 |-- type: null (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- struct: struct (nullable = true)
 |    |    |-- postalCode: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- line: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |-- array: null (nullable = true)
 |-- position: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- id: string (nullable = true)



In [16]:
# (new column, nested source path) pairs, in the order the columns are added.
address_columns = [
    ('city', 'address.struct.city'),
    ('state', 'address.struct.state'),
    ('zip', 'address.struct.postalCode'),
    ('country', 'address.struct.country'),
]

# Copy each nested address field into its own top-level column.
for column_name, source_path in address_columns:
    df = df.withColumn(column_name, F.col(source_path))

In [18]:
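# Alternative left disabled: exploding the address lines would produce one row
# per line; the next cell reads the lines by position instead.
# df = df.withColumn('exploded', F.explode('address.struct.line'))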

In [22]:
# Take the first two entries of the address `line` array by position. In the
# sample output below, rows with a single line show null for `address_2`.
street_lines = F.col('address.struct.line')
df = (
    df.withColumn('address_1', street_lines.getItem(0))
      .withColumn('address_2', street_lines.getItem(1))
)

In [25]:
# Give the identifier column its final name.
df = df.withColumnRenamed('id', 'location_id')

In [32]:
# Remove the nested source columns now that their contents are flattened.
# 'exploded' was already dropped earlier in the notebook; it is kept in the
# list so this cell still cleans up if that column gets re-created.
df = df.drop('address', 'position', 'exploded', 'name', 'type')

In [33]:
# Preview the final flat location table.
df.show()

+--------------------+---------+-----+----------+-------+--------------------+---------+
|         location_id|     city|state|       zip|country|           address_1|address_2|
+--------------------+---------+-----+----------+-------+--------------------+---------+
|9cf3cd22-2eec-34e...| FALMOUTH|   MA|     02540|     US|67 & 100 TER HEUN...|     null|
|830716da-523f-3ca...|TEATICKET|   MA|02536-5671|     US|270 TEATICKET HWY 1A|     null|
+--------------------+---------+-----+----------+-------+--------------------+---------+



In [34]:
# Convert the flattened DataFrame back into a Glue DynamicFrame named
# 'locations' (this rebinds the earlier `locations` variable).
locations = DynamicFrame.fromDF(
    dataframe=df,
    glue_ctx=glueContext,
    name='locations',
)