# Load the Pinterest pin data frame from its delta table

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

In [None]:
# Root directory under which the pin delta tables are stored
delta_base_path = "/mnt/pinterest_data/delta_tables/"

# Location of the raw/original pin data
raw_delta_pin_path = f"{delta_base_path}raw/pin"

# Location of the transformed/processed pin data
transformed_delta_pin_path = f"{delta_base_path}transformed/pin"

In [None]:
# Load the raw pin data from its delta table
df_pin = spark.read.load(raw_delta_pin_path, format="delta")

# Transformations

In [None]:
# define a udf to convert numerical abbreviation to string numeric form
@F.udf(returnType=StringType())
def convert_numeric_abb_to_str_numeric_representation(value: str) -> str:
    """
    Expand a numerical abbreviation such as "185k" or "1.5M" into plain digits.

    A trailing "k" multiplies the numeric part by 1,000 and a trailing "M"
    multiplies it by 1,000,000. The numeric part is parsed as a decimal so
    that fractional abbreviations scale correctly ("1.5M" -> "1500000")
    instead of having zeros appended to the text ("1.5000000"). Any fraction
    left after scaling is truncated.

    Args:
        value (str): value in numerical abbreviation form; may be None

    Returns:
        str: the expanded whole number as a digit string when the value ends
            in "k" or "M"; the value unchanged when it has neither suffix;
            None when the value is None, empty, not a string, or the part
            before the suffix is not a finite number
    """
    # Local import keeps the function self-contained wherever it is executed.
    from decimal import Decimal

    suffix_multipliers = {"k": 1000, "M": 1000000}

    # Nothing usable to convert: report a null rather than a sentinel text.
    if not isinstance(value, str) or not value:
        return None

    multiplier = suffix_multipliers.get(value[-1])
    if multiplier is None:
        # No recognised suffix, so the value is passed through untouched.
        return value

    try:
        return str(int(Decimal(value[:-1]) * multiplier))
    except (ArithmeticError, ValueError):
        # ArithmeticError covers decimal parsing/overflow errors and the
        # OverflowError from int(Infinity); ValueError covers int(NaN).
        return None
    
# define irrelevant values for the description column
irrelevant_data = ["No description available Story format", "No description available"]

# Replace empty strings with null in every column.
# All columns are rewritten in a single select over df_pin so that every
# replacement is kept; assigning df_pin.withColumn(...) inside a loop would
# restart from df_pin each time and keep only the last column's replacement.
transformed_df = df_pin.select(
    [
        F.when(F.col(column) == "", None).otherwise(F.col(column)).alias(column)
        for column in df_pin.columns
    ]
)

# other transformations

# (column, placeholder strings) pairs: wherever a column holds one of its
# placeholder strings, the value is replaced with null
placeholders_by_column = [
    ("description", [irrelevant_data[0], irrelevant_data[1]]),
    ("follower_count", ["User Info Error"]),
    ("image_src", ['Image src error']),
    ("poster_name", ['User Info Error']),
    ("tag_list", ["N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e"]),
    ("title", ["No Title Data Available"]),
]

for column_name, placeholders in placeholders_by_column:
    current_value = F.col(column_name)

    # build "value == p0 | value == p1 | ..." for this column's placeholders
    is_placeholder = current_value == placeholders[0]
    for extra_placeholder in placeholders[1:]:
        is_placeholder = is_placeholder | (current_value == extra_placeholder)

    transformed_df = transformed_df.withColumn(
        column_name,
        F.when(is_placeholder, None).otherwise(current_value),
    )

# expand the numerical abbreviations in 'follower_count' to plain digits ...
transformed_df = transformed_df.withColumn(
    "follower_count",
    convert_numeric_abb_to_str_numeric_representation(F.col("follower_count")),
)

# ... and then store 'follower_count' as an int
transformed_df = transformed_df.withColumn(
    "follower_count",
    F.col("follower_count").cast("int"),
)

# keep only the part of 'save_location' that starts at its first "/"
save_location_value = F.col("save_location")
first_slash_position = F.locate("/", "save_location", 1)
transformed_df = transformed_df.withColumn(
    "save_location",
    save_location_value.substr(first_slash_position, F.length(save_location_value)),
)

# the 'index' column is exposed under the name 'ind'
transformed_df = transformed_df.withColumnRenamed("index", "ind")

In [None]:
# Render the cleaned DataFrame in the notebook. The transformations above are
# lazy, so this call is the action that actually executes them.
display(transformed_df)

category,description,downloaded,follower_count,image_src,ind,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
art,Material: Canvas,1,2.0,https://i.pinimg.com/originals/04/e1/6f/04e16fea522a2a7bd6af4a1241fa8038.jpg,512,image,Houzedition,/data/art,"Plant Painting,Plant Art,Spray Painting,Watercolor Plants,Watercolor Art,Watercolor Leaves,Leaf Wall Art,Canvas Wall Art,Canvas Prints",Plant Green Leaves Canvas Watercolor Painting - A3 30x42cm No Frame / 5pcs Set,cc095e77-6ee2-4624-9367-e4ed93b7bfce
art,"These 15 artists have used their talents and skills to present natural hair in stunning ways, which any curly can love and appreciate.",1,185000.0,https://i.pinimg.com/originals/5b/97/68/5b9768ca6d191aa23dd979a7e8e98d14.jpg,513,image,NaturallyCurly.com,/data/art,"Black Love Art,Black Girl Art,Art Girl,Black Art Painting,Black Artwork,Lips Painting,Natural Hair Art,Natural Hair Styles,Natural Curls",15 Artists that Show the Beauty and Versatility of Natural Hair,3280903a-06d5-424d-91d6-474e2d301e17
art,"Tara Jane Crandon is a self-taught artist and psychologist from Brisbane, Australia who creates mesmerizing illustrations inspired by nature.",1,2000000.0,https://i.pinimg.com/originals/4d/b0/ac/4db0ac2e4da00e8224b5a77965f10223.png,514,image,Bored Panda,/data/art,"Gouache Painting,Painting & Drawing,Watercolor Paintings,Artist Painting,Nature Paintings,Oil Paintings,Watercolor Trees,Watercolor Artists,Indian Paintings",59 Vibrant Illustrations Inspired By Nature By Australian Artist,ebfe4f29-8e59-4562-aba1-e5d3d2a6b7bd
art,"By Cassie Rief in Featured Artists > Oil Paintings When oil painter Susan Hale speaks of her career in art (and by the way, she’s been an exhibiting artist for more than 30 year…",1,53000.0,https://i.pinimg.com/originals/3d/97/9b/3d979b82b559873e57378412bcccffa6.jpg,515,image,EmptyEasel,/data/art,"Landscape Art,Landscape Paintings,Impressionist Landscape,Landscapes To Paint,Landscape Quilts,Painting Inspiration,Art Inspo,Pintura Hippie,Bel Art",Susan Hale: Gorgeous & Impressionistic Oil Landscape Paintings - EmptyEasel.com,646b0d53-bb70-4bc6-9237-98c1018896c7
art,Create a fantastical symmetrical creature drawing that’s starts with your name in this free video drawing tutorial. #videoart #symmetry #drawinglesson #nameactivities #drawingtu…,1,221000.0,https://i.pinimg.com/videos/thumbnails/originals/be/86/90/be86903d6bf062fb0c6c7fd4dcc07cce.0000001.jpg,516,video,The Kitchen Table Classroom,/data/art,"Name Art Projects,Art Education Projects,Art Education Lessons,Art Projects For Adults,Toddler Art Projects,Art Lessons For Kids,Art Lessons Elementary,Art For Kids,Teaching Elementary Art",What is Symmetry in Art- A Classic Project and a Free Printable - The Kitchen Table Classroom,91185669-c7a7-4ce9-abfb-9f025ba5c238
art,"Fascinating Paintings Of Artist Vanessa Stockard's Cat, Kevin - World's largest collection of cat memes and other animals",1,2000000.0,https://i.pinimg.com/originals/e2/1d/70/e21d70352bccbdb3afe1e3c92be73ac2.png,517,image,Cheezburger,/data/art,"Pretty Art,Cute Art,Black Cat Painting,Wow Art,Aesthetic Art,Artsy Fartsy,Art Inspo,Art Reference,Art Drawings","Fascinating Paintings Of Artist Vanessa Stockard's Cat, Kevin",5d28475c-9d8e-4ff7-86e0-ba57df6874cf
art,Paint this Snowy Winter Scene while mastering basic painting techniques with the help of Christie Hawkins' step-by-step tutorials to create your very own masterpiece. #thesocial…,1,20000.0,https://i.pinimg.com/originals/5b/b5/60/5bb560cad517ac9ae5cf98cfff62fe66.jpg,518,image,The Social Easel Online Paint Studio | Video Painting Tutorials,/data/art,"Basic Painting,Painting Snow,Easy Canvas Painting,Acrylic Painting For Beginners,Winter Painting,Acrylic Painting Tutorials,Painting Studio,Beginner Painting,Diy Painting",Acrylic Painting Tips and Tutorials,b682620e-e5eb-47ac-acf2-74dcd410611a
art,"I've always wanted to learn more about colors and paint. So I decided to give it a go with gouache, a medium I had never used before. Now, I'm halfway down the 100 days challenge!",1,2000000.0,https://i.pinimg.com/originals/57/55/66/5755668b5ed4d5705b2a82bb01f26283.jpg,519,image,Bored Panda,/data/art,"Diy Canvas Art,Acrylic Painting Canvas,Gouche Painting,Guache,Unique Paintings,Painting & Drawing,Plant Painting,Art Challenge,Art Plastique",Painting-100-Days-Challenge-With-Gouache-Thomethis,202abdc0-3512-4ff5-9700-e7659e7304d4
art,"Step-by-step dowry chest preparation, including both traditional and modern ideas, plus suggestions for boys.",1,2000.0,https://i.pinimg.com/originals/c6/16/dd/c616ddf4eb43a9b0b8730bc09cbdd8df.jpg,520,image,WeHaveKids,/data/art,"Art And Illustration,Cool Art Drawings,Art Drawings Sketches,Drawings Of Hands,Drawing Ideas,Arte Inspo,Art Du Croquis,Ap Art,Art Sketchbook",What to Put in a Hope Chest (Dowry Chest): Lists and Modern Considerations,ac04144a-9a8c-410a-be06-047f7157a1fd
art,Van Gogh Art Project for Kids! Create this Starry Night Masterpiece with your students! This Van Gogh inspired Art Project for Kids uses simple materials you already have in you…,1,2000.0,https://i.pinimg.com/originals/8c/52/56/8c52564cc32b1b65f3e1d79c1f611d85.jpg,521,image,‿✿ Carmel ✿⁀,/data/art,"Art History Projects For Kids,Art History Lessons,Art Projects For Adults,Toddler Art Projects,Art Lessons For Kids,Art Lessons Elementary,Art For Kids,Van Gogh For Kids,Artwork For Kids",Van Gogh Art Project for Kids,3b6c5345-b948-41f4-acde-7176dcc96e7b


# Write cleaned dataframe as a delta table

In [None]:
# Save the cleaned pin data in delta format at the transformed path,
# overwriting whatever was written there before
transformed_df.write.save(
    transformed_delta_pin_path,
    format="delta",
    mode="overwrite",
)