# Load geolocation data

In [None]:
import pyspark.sql.functions as F
from functools import reduce

In [None]:
# Delta table locations for the geolocation data
delta_base_path = "/mnt/pinterest_data/delta_tables/"

# raw/original geolocation table
raw_delta_geo_path = f"{delta_base_path}raw/geo"

# transformed/processed geolocation table
transformed_delta_geo_path = f"{delta_base_path}transformed/geo"

In [None]:
# Read the raw geolocation Delta table into df_geo.
df_geo = (
    spark.read
    .format("delta")
    .load(raw_delta_geo_path)
)

# Transformations

In [None]:
# Transformations: build the cleaned geolocation DataFrame from df_geo.
# These are lazy Spark transformations; nothing is computed until an action runs.
transformed_df = (
    df_geo
    # combine latitude and longitude into a single struct column "coordinates"
    .withColumn("coordinates", F.struct("latitude", "longitude"))
    # convert the "timestamp" column from string to timestamp; to_timestamp
    # already yields a timestamp column, so no additional cast is required
    .withColumn("timestamp", F.to_timestamp("timestamp"))
    # keep only the required columns in their final order; because latitude
    # and longitude are not selected, this also removes them from the result
    .select("ind", "country", "coordinates", "timestamp")
)

In [None]:
# Render the transformed DataFrame in the notebook. Displaying it is an action,
# so this is the point where the transformations defined above are executed.
display(transformed_df)

ind,country,coordinates,timestamp
1408,Rwanda,"List(-86.9741, 39.3835)",2018-08-20T12:43:59.000+0000
1409,Saint Pierre and Miquelon,"List(-47.3201, 64.1745)",2020-12-10T04:51:24.000+0000
1410,Lao People's Democratic Republic,"List(-78.0163, -64.578)",2019-04-28T03:51:32.000+0000
1411,Cambodia,"List(-55.4968, -165.517)",2019-10-24T03:10:37.000+0000
1412,Aruba,"List(-45.0466, -58.6816)",2021-09-01T02:54:38.000+0000
1413,Burkina Faso,"List(-4.91336, 116.494)",2019-04-08T15:40:34.000+0000
1414,American Samoa,"List(-81.8896, -153.897)",2018-01-24T17:07:15.000+0000
1415,Cape Verde,"List(-84.8196, -76.5704)",2022-09-23T04:59:53.000+0000
1416,South Africa,"List(-47.3985, 39.0919)",2017-12-09T12:04:26.000+0000
1417,American Samoa,"List(-88.5252, -172.436)",2021-03-06T14:15:24.000+0000


# Write cleaned DataFrame to Delta table

In [None]:
# Persist the cleaned geolocation data in Delta format at the transformed path,
# using "overwrite" mode for the save.
(
    transformed_df.write
    .format("delta")
    .mode("overwrite")
    .save(transformed_delta_geo_path)
)