### Clean df_pin DataFrame

Transform the `follower_count` column so that every entry is a number, then make sure the column's data type is `int`.

In [None]:
# Expand the "k" and "M" suffixes in follower_count into the matching run of
# zeros ("k" -> "000", "M" -> "000000"), then cast the resulting string to int.
# NOTE(review): a fractional value such as "1.5k" would become "1.5000" rather
# than 1500 - confirm that no such values occur in follower_count.
df_pin = df_pin.withColumn(
    "follower_count",
    regexp_replace(
        regexp_replace("follower_count", "k", "000"), "M", "000000"
    ).cast("int"),
)

In [None]:
# Treat empty / placeholder strings as missing data by replacing them with None.
# No subset is passed, so the replacement is not restricted to specific columns.
irrelevant_values = ["", "NA", "N/A", "null"]
df_pin = df_pin.replace(to_replace=irrelevant_values, value=None)
# Uncomment to inspect the result:
# display(df_pin)

In [None]:
# Remove every occurrence of the text "Local save in " from save_location.
df_pin = df_pin.withColumn(
    "save_location",
    regexp_replace("save_location", r"Local save in ", ""),
)
# Uncomment to inspect the result:
# display(df_pin)

In [None]:
# Rename the "index" column to "ind"; the other DataFrames in this notebook are
# selected by an "ind" column as well.
df_pin = df_pin.withColumnRenamed(existing="index", new="ind")
# Uncomment to inspect the result:
# display(df_pin)

In [None]:
# Reorder the df_pin columns.
# Target left-to-right column order; select() keeps only the columns named here.
new_df_pin_column_order = [
    "ind", "unique_id", "title", "description", "follower_count", "poster_name",
    "tag_list", "is_image_or_video", "image_src", "save_location", "category",
]
# Show the DataFrame as it looks before the reorder is applied.
display(df_pin)
df_pin = df_pin.select(*new_df_pin_column_order)
# Uncomment to inspect the result:
# df_pin.show()

In [None]:

# Print the schema of the cleaned df_pin so the column names, their order and
# their data types (e.g. the int cast of follower_count) can be checked.
df_pin.printSchema()

### Clean df_geo DataFrame

In [None]:
# Load the geo topic data: read the JSON files under the mounted S3 bucket path
# into df_geo and show the result.
file_location = "/mnt/aws-bucket/topics/129bc7e0bd61.geo/partition=0/"
file_type = "json"
infer_schema = "true"
# NOTE(review): inferSchema is documented as a CSV reader option - confirm
# whether it has any effect when the format is JSON.
df_geo = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
display(df_geo)

In [None]:
from pyspark.sql.functions import array, col
# Add a "coordinates" column: an array holding latitude first, longitude second.
df_geo = df_geo.withColumn("coordinates", array("latitude", "longitude"))
display(df_geo)

In [None]:
# Drop the separate latitude and longitude columns from df_geo; their values
# were already combined into the "coordinates" array column.
df_geo = df_geo.drop("latitude", "longitude")
# Inspect the result, both through display() and through show().
display(df_geo)
df_geo.show()


In [None]:
from pyspark.sql.functions import to_timestamp
# Overwrite the "timestamp" column with its value parsed by to_timestamp
# (no explicit format is passed, so the default parsing rules apply).
df_geo = df_geo.withColumn(
    "timestamp",
    to_timestamp(df_geo["timestamp"]),
)

In [None]:
# Put the df_geo columns into their final order; select() keeps only these four.
df_geo = df_geo.select(["ind", "country", "coordinates", "timestamp"])

### Clean df_user DataFrame

In [None]:
# Load the user topic data: read the JSON files under the mounted S3 bucket path
# into df_user and show the result.
file_location = "/mnt/aws-bucket/topics/129bc7e0bd61.user/partition=0/"
file_type = "json"
infer_schema = "true"
# NOTE(review): inferSchema is documented as a CSV reader option - confirm
# whether it has any effect when the format is JSON.
df_user = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)
display(df_user)

In [None]:
from pyspark.sql.functions import concat, col, lit
# Build "user_name" by joining first_name and last_name with a single space.
df_user = df_user.withColumn(
    "user_name",
    concat("first_name", lit(" "), "last_name"),
)
df_user.show()

In [None]:
# Drop the first_name and last_name columns from df_user; their values were
# already concatenated into the "user_name" column.
df_user = df_user.drop("first_name", "last_name")
# Print the resulting rows for inspection.
df_user.show()


In [None]:
# Overwrite "date_joined" with the same values cast to the timestamp data type.
df_user = df_user.withColumn(
    "date_joined",
    df_user["date_joined"].cast("timestamp"),
)
df_user.show()

In [None]:
# Put the df_user columns into their final order; select() keeps only these four.
df_user = df_user.select(["ind", "user_name", "age", "date_joined"])
df_user.show()

In [None]:

# Print the schema of the cleaned df_user so the column names, their order and
# their data types (e.g. the timestamp cast of date_joined) can be checked.
df_user.printSchema()