# Load geolocation data

In [None]:
import pyspark.sql.functions as F
from functools import reduce

In [None]:
# Root directory that holds the Delta tables used by this notebook.
delta_base_path = "/mnt/pinterest_data/delta_tables/"

# Raw (original) geolocation table location.
raw_delta_geo_path = f"{delta_base_path}raw/geo"

# Transformed (processed) geolocation table location.
transformed_delta_geo_path = f"{delta_base_path}transformed/geo"

In [None]:
# Read the raw geolocation Delta table into df_geo.
# `spark` is not defined in this notebook; presumably it is the session
# object injected by the notebook runtime.
df_geo = spark.read.load(raw_delta_geo_path, format="delta")

# Transformations

In [None]:
# Build the cleaned geolocation DataFrame in a single projection.
#
# The select keeps only the four output columns, already in their final
# order, so no separate drop of "latitude"/"longitude" is needed: the two
# columns survive only as the fields of the "coordinates" struct.
transformed_df = df_geo.select(
    "ind",
    "country",
    # combine latitude and longitude into one struct column
    F.struct("latitude", "longitude").alias("coordinates"),
    # convert the "timestamp" column to a timestamp type; with no format
    # argument to_timestamp follows the cast-to-timestamp rules, so a
    # trailing .cast("timestamp") would be a no-op
    F.to_timestamp("timestamp").alias("timestamp"),
)

In [None]:
# Trigger the lazy transformations and print a preview of the result.
# The arguments below are show()'s defaults, written out explicitly:
# at most 20 rows, with long cell values truncated.
transformed_df.show(n=20, truncate=True)

# Write the cleaned DataFrame to a Delta table

In [None]:
# Persist the cleaned DataFrame in Delta format, replacing whatever is
# already stored at the transformed geolocation path.
transformed_df.write.save(
    transformed_delta_geo_path,
    format="delta",
    mode="overwrite",
)