# Load the Pinterest post (pin) data frame from its delta table

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [None]:
# Root folder under which the pin delta tables live
delta_base_path = "/mnt/pinterest_data/delta_tables/"

# Location of the raw/original pin table
raw_delta_pin_path = f"{delta_base_path}raw/pin"

# Location where the transformed/processed pin table is stored
transformed_delta_pin_path = f"{delta_base_path}transformed/pin"

In [None]:
# Load the raw pin delta table into a data frame
df_pin = spark.read.load(raw_delta_pin_path, format="delta")

# Transformations

In [None]:
# define a udf to convert numerical abbreviation to string numeric form
@F.udf(returnType=StringType())
def convert_numeric_abb_to_str_numeric_representation(value: str) -> str:
    """
    Converts a value in numerical abbreviation form to string numeric form.

    A trailing "k" scales the number by one thousand and a trailing "M" by
    one million, e.g. "10k" -> "10000", "5M" -> "5000000" and
    "1.5M" -> "1500000". Values that do not end in one of these suffixes are
    returned unchanged.

    Args:
        value (str): value in numerical abbreviation form

    Returns:
        str: string numeric form of value, or None when value is None,
            empty or not a string
    """
    # number of decimal places each suffix shifts the value by
    suffix_zeros = {"k": 3, "M": 6}

    # missing, empty or non-string input has nothing to expand; return null
    # instead of writing a sentinel error string into the data column
    if not isinstance(value, str) or not value:
        return None

    zeros = suffix_zeros.get(value[-1])
    if zeros is None:
        return value

    # shift the decimal point rather than blindly appending zeros, so that
    # "1.5M" becomes "1500000" and not "1.5000000"
    whole, _, fraction = value[:-1].partition(".")
    padded = fraction.ljust(zeros, "0")
    shifted = whole + padded[:zeros]
    leftover = padded[zeros:]
    return f"{shifted}.{leftover}" if leftover else shifted
    
# placeholder strings in the description column that are treated as irrelevant
irrelevant_data = [
    "No description available Story format",
    "No description available",
]
    
# Replace empty strings with null in every column.
# All columns are rewritten in a single select: starting again from df_pin
# for each column would keep only the last column's replacement, and chaining
# withColumn in a loop grows the query plan with one projection per column.
transformed_df = df_pin.select(
    [
        F.when(F.col(column) == "", None).otherwise(F.col(column)).alias(column)
        for column in df_pin.columns
    ]
)

# other transformations

# values, per column, that stand in for missing data and are replaced by null
placeholders_by_column = {
    "description": irrelevant_data,
    "follower_count": ["User Info Error"],
    "image_src": ['Image src error'],
    "poster_name": ['User Info Error'],
    "tag_list": ["N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"],
    "title": ["No Title Data Available"],
}

# null out the placeholder values, one column at a time
for column_name, placeholders in placeholders_by_column.items():
    current_column = F.col(column_name)
    transformed_df = transformed_df.withColumn(
        column_name,
        F.when(current_column.isin(placeholders), None).otherwise(current_column),
    )

# 'follower_count': expand the numerical abbreviation, then cast to int
follower_count_as_int = convert_numeric_abb_to_str_numeric_representation(
    F.col("follower_count")
).cast("int")

# 'save_location': keep everything from the first "/" onwards
save_location = F.col("save_location")
first_slash_position = F.locate("/", "save_location", 1)
save_location_path = save_location.substr(first_slash_position, F.length(save_location))

transformed_df = (
    transformed_df
    .withColumn("follower_count", follower_count_as_int)
    .withColumn("save_location", save_location_path)
    # rename 'index' column to ind
    .withColumnRenamed("index", "ind")
)

In [None]:
# trigger execution of the lazy transformations by displaying the first rows
transformed_df.show(n=20, truncate=True)

# Write cleaned dataframe as a delta table

In [None]:
# overwrite the transformed pin delta table with the cleaned data frame
transformed_df.write.save(transformed_delta_pin_path, format="delta", mode="overwrite")