In [0]:
# Read json files containing Pinterest data into DataFrames
df_pin = spark.read.json("/mnt/0a1667ad2f7f/topics/0a1667ad2f7f.pin/partition=0/")
df_geo = spark.read.json("/mnt/0a1667ad2f7f/topics/0a1667ad2f7f.geo/partition=0/")
df_user = spark.read.json("/mnt/0a1667ad2f7f/topics/0a1667ad2f7f.user/partition=0/")

In [0]:
# Task 1: Clean the df_pin DataFrame

# Replace empty entries and entries that do not contain relevant data in each column with None
cleaned_df_pin = (df_pin.replace({'No description available Story format': None}, subset=['description'])
                    .replace({'No description available': None}, subset=['description'])
                    #.replace({'Untitled': None}, subset=['description']) # Unsure of this one 🤔
                    .replace({'User Info Error': None}, subset=['follower_count'])
                    .replace({'Image src Error.': None}, subset=['image_src'])
                    .replace({'N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e': None}, subset=['tag_list'])
                    .replace({'No Title Data Available': None}, subset=['title'])
                    .replace({'User Info Error': None}, subset=['poster_name']))

# Perform necessary transformations on the follower_count to ensure every entry is a number.
from pyspark.sql.functions import regexp_replace
cleaned_df_pin = cleaned_df_pin.withColumn("follower_count", regexp_replace("follower_count", "k", "000"))
cleaned_df_pin = cleaned_df_pin.withColumn("follower_count", regexp_replace("follower_count", "M", "000000"))
# Make sure the data type of this column is an integer
cleaned_df_pin = cleaned_df_pin.withColumn("follower_count", cleaned_df_pin["follower_count"].cast("integer"))

# Ensure each column containing numeric data has a numeric data type
cleaned_df_pin = cleaned_df_pin.withColumn("downloaded", cleaned_df_pin["downloaded"].cast("integer"))
cleaned_df_pin = cleaned_df_pin.withColumn("index", cleaned_df_pin["index"].cast("integer"))

# Clean the data in the save_location column to include only the save location path
cleaned_df_pin = cleaned_df_pin.withColumn("save_location", regexp_replace("save_location", "Local save in ", ""))

# Rename the index column to ind
cleaned_df_pin = cleaned_df_pin.withColumnRenamed("index", "ind")

# Reorder the DataFrame columns
cleaned_df_pin = cleaned_df_pin.select("ind", "unique_id", "title", "description", "follower_count", "poster_name", "tag_list", "is_image_or_video", "image_src", "save_location", "category")
# This removes the 'download' column

In [0]:
# Task 2: Clean df_geo DataFrame

# Create new column 'coordinates' containing an array based on the 'latitude' and 'longitude' columns
from pyspark.sql.functions import array
cleaned_df_geo = df_geo.withColumn("coordinates", array("latitude", "longitude"))

# Drop the latitude and longitude columns from DataFrame
cleaned_df_geo = (cleaned_df_geo.drop("latitude")
                                .drop("longitude"))

# Convert the timestamp column from a string to a timestamp data type
from pyspark.sql.functions import to_timestamp
cleaned_df_geo = cleaned_df_geo.withColumn("timestamp", to_timestamp("timestamp"))

# Reorder the DataFrame columns
cleaned_df_geo = cleaned_df_geo.select("ind", "country", "coordinates", "timestamp")

In [0]:
# Task 3: Clean df_user DataFrame

# Create new column user_name that concatenates the first_name and last_name columns
from pyspark.sql.functions import concat
cleaned_df_user = df_user.withColumn("user_name", concat("first_name", "last_name"))

# Drop the first_name and last_name columns from the DataFrame
cleaned_df_user = (cleaned_df_user.drop("first_name")
                                    .drop("last_name"))

# Convert the date_joined column from a string to a timestamp data type
from pyspark.sql.functions import to_timestamp
cleaned_df_user = cleaned_df_user.withColumn("date_joined", to_timestamp("date_joined"))

# Reorder the DataFrame columns
cleaned_df_user = cleaned_df_user.select("ind", "user_name", "age", "date_joined")

In [0]:
display(cleaned_df_pin)
display(cleaned_df_geo)
display(cleaned_df_user)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",46000.0,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,/data/christmas,christmas
4357,ccf116e9-9096-4943-a344-1960ce216445,First Steps in Launching Your Own Event Business - Learn About Event Planning,"Updated: January 25, 2017 You’ve organized some events for your family, friends or community and you have gained a budding reputation for knowing how put events together. You’ve…",4000.0,EventPlanning.com | Learn How To Become An Event Planner,"Event Planning Quotes,Event Planning Checklist,Event Planning Business,Business Events,Business Ideas,Business Names,Business Opportunities,Corporate Events,Wedding Event Planner",image,https://i.pinimg.com/originals/c3/2b/c6/c32bc6ad263857cb0eea19f9cd12beb9.jpg,/data/event-planning,event-planning
1967,0b9d5b95-51a6-465e-ae4a-2cb68ceada29,15 Fun & Festive Christmas Porch Ideas,15 unique Christmas porch ideas that will leave you feeling inspired and help you tackle decorating your own entryway for the holidays! It’s almost time to start decorating for…,19000.0,Ashley - Modern Glam,"Exterior Christmas Lights,Front Door Christmas Decorations,Christmas Lights Outside,Christmas House Lights,Decorating With Christmas Lights,Porch Decorating,Christmas Porch Decorations,Front Porch Ideas For Christmas,Christmas Lights Outdoor Trees",image,https://i.pinimg.com/originals/ff/f8/3b/fff83b02aeb29e2e9341a56fc5e63345.png,/data/christmas,christmas
10138,927c4658-cc3f-4b92-9b5c-70743d0c238d,"14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More","This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",10000.0,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️","Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures",image,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,/data/travel,travel
2604,087b0fa9-f901-4262-aa0a-6caf234d1b35,75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",31000.0,Everyday Wholesome,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",image,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,/data/christmas,christmas
3156,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4,Handprint Reindeer Ornaments - Crafty Morning,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",892000.0,Michelle {CraftyMorning.com},"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",image,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,/data/diy-and-crafts,diy-and-crafts
1864,6f1951f0-63be-4c4f-8d21-e4995217f69e,120 Christmas Decorations from the Dollar Store,Love Christmas decorations but hate spending a lot bunch of money? Check out some of these budget DIY decorations you can easily make from the dollar store!,42000.0,Caroline|CarolineVencil.com | Saving & Making Money | Pro Blogger,"Diy Snowman Decorations,Christmas Candle Decorations,Diy Christmas Ornaments,Christmas Ideas,Christmas Christmas,Snowman Ornaments,Christmas Diy Gifts,Vase Decorations,Diy Christmas Decorations For Home",image,https://i.pinimg.com/originals/30/85/21/3085215db77e55770202724268465490.jpg,/data/christmas,christmas
6014,d4c57afb-4775-4482-89c8-71d1bf85b488,Coffee Table Decor Ideas for a Cozy Living Room - Salvaged Living,"Grab these coffee table decor ideas for a cozy living room. This post is awesome, it has a list of must have elements for cozy coffee table styling plus a list of supply ideas f…",40000.0,Salvaged Living,"Coffee Table Decor Living Room,Coffee Table Vignettes,Coffee Table Centerpieces,Coffee Table Styling,Diy Coffee Table,Decorating Coffee Tables,Cozy Living Rooms,Livingroom Table Decor,Living Room Candles",image,https://i.pinimg.com/originals/77/b2/bb/77b2bb477d1164908048dabcd78cabd5.jpg,/data/home-decor,home-decor
10119,40eab9ba-7812-4f26-baca-35a6bed95a9f,How to Afford Family Travel: 10 Mistakes You're Making (and what to do instead) | Our Next Adventure,"See families traveling all the time and wonder, ""how the heck do they afford this?"" Read 10 mistakes you might be making, and what you should do instead.",9000.0,OUR NEXT ADVENTURE | family travel blog,"Family Vacation Destinations,Vacation Trips,Travel Destinations,Vacation Ideas,Cheap Family Vacations,Vacation Travel,Best Family Vacation Spots,Vacation Quotes,Vacation Memories",image,https://i.pinimg.com/originals/0a/49/fb/0a49fbcec746c4219d3a6f30834f378e.jpg,/data/travel,travel


ind,country,coordinates,timestamp
2418,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000
5162,Antarctica (the territory South of 60 deg S),"List(-71.6607, -149.206)",2019-09-27T19:06:43.000+0000
2301,Saint Vincent and the Grenadines,"List(13.4683, 51.7244)",2020-11-14T00:25:28.000+0000
6014,French Southern Territories,"List(-26.6026, 155.206)",2019-04-30T12:33:13.000+0000
10794,Cocos (Keeling) Islands,"List(-89.5236, -154.567)",2022-01-01T02:26:50.000+0000
2074,Central African Republic,"List(-52.3213, -50.11)",2019-11-03T05:41:59.000+0000
2293,British Virgin Islands,"List(-87.7946, -159.647)",2022-03-21T10:46:53.000+0000
10663,Saint Kitts and Nevis,"List(-27.3474, -162.83)",2019-07-25T18:53:51.000+0000
5293,Sao Tome and Principe,"List(-13.1463, -25.9649)",2019-05-31T20:49:36.000+0000
9426,Sao Tome and Principe,"List(-29.8646, -78.1449)",2021-06-30T02:14:47.000+0000


ind,user_name,age,date_joined
2015,ChristopherBradshaw,27,2016-03-08T13:38:37.000+0000
10673,AlexanderCervantes,59,2017-05-12T21:22:17.000+0000
6398,ChristinaDavenport,39,2016-06-29T20:43:59.000+0000
3599,AlexandriaAlvarado,20,2015-10-23T04:13:23.000+0000
4256,AlexandriaAlvarado,20,2015-10-23T04:13:23.000+0000
5076,ChristopherButler,20,2015-12-01T15:08:31.000+0000
7790,MichelleGutierrez,39,2017-07-19T07:12:04.000+0000
10509,BrittanyThompson,49,2016-04-22T20:36:02.000+0000
10119,ChelseaGonzalez,43,2016-07-21T15:25:08.000+0000
3716,CatherineFerrell,21,2017-01-02T03:01:09.000+0000


In [0]:
cleaned_df_pin.printSchema()
cleaned_df_geo.printSchema()
cleaned_df_user.printSchema()

In [0]:
# Task 4: Find the most popular category in each country

# Join pin and geo DataFrames
combined_df_t4 = cleaned_df_pin.join(cleaned_df_geo, cleaned_df_pin["ind"] == cleaned_df_geo["ind"], how="inner")

# Identify most popular category in each country
from pyspark.sql.functions import col, count, rank
from pyspark.sql.window import Window

# # Use groupBy and aggregation functions
# grouped_df_t4 = combined_df_t4.groupBy(["country", "category"]).agg(count("category").alias("category_count"))
# grouped_df_t4 = grouped_df_t4.orderBy(["country", "category_count"], ascending=[True, False])
# # This does not quite work - you can't filter to keep just the top category

# Using window function to present only the most popular category

grouped_df_t4 = (combined_df_t4.groupBy(["country", "category"])
                                .agg(count("category").alias("category_count")))
# Create Window specification
window_spec = Window.partitionBy("country").orderBy(col("category_count").desc())
# Apply window function
grouped_df_t4 = grouped_df_t4.withColumn("rank", rank().over(window_spec))
# Filter to keep only top category in each country
grouped_df_t4 = grouped_df_t4.filter(col("rank") == 1).drop("rank")

In [0]:
display(grouped_df_t4)

country,category,category_count
Afghanistan,education,2
Albania,mens-fashion,10
Algeria,quotes,3
American Samoa,tattoos,2
American Samoa,beauty,2
Andorra,tattoos,1
Andorra,beauty,1
Andorra,quotes,1
Andorra,art,1
Angola,diy-and-crafts,1


In [0]:
# Task 5: Find which was the most popular category each year

# Join pin and geo DataFrames
combined_df_t5 = cleaned_df_pin.join(cleaned_df_geo, cleaned_df_pin["ind"] == cleaned_df_geo["ind"], how="inner")

# Identify most popular category each year
from pyspark.sql.functions import year, col, count, rank
from pyspark.sql.window import Window

# Create post_year column
combined_df_t5 = combined_df_t5.withColumn("post_year", year("timestamp"))

# Filter to restrict post_year to between 2018 and 2022
combined_df_t5 = combined_df_t5.filter((col("post_year") >= 2018) & (col("post_year") <= 2022))

# # Use groupBy and aggregation functions
grouped_df_t5 = (combined_df_t5.groupBy(["post_year", "category"])
                            .agg(count("category").alias("category_count")))

# Create a Window specification
window_spec = Window.partitionBy("post_year").orderBy(col("category_count").desc())

# Apply window function to rank yearly category posts
grouped_df_t5 = grouped_df_t5.withColumn("rank", rank().over(window_spec))

# Filter to keep only top category in each year
grouped_df_t5 = grouped_df_t5.filter(col("rank") == 1).drop("rank")

In [0]:
display(grouped_df_t5)

post_year,category,category_count
2018,art,6
2019,diy-and-crafts,7
2020,mens-fashion,10
2021,finance,10
2022,beauty,7


In [0]:
# Task 6: Find the user with most followers in each country

# Step 1: For each country find the user with the most followers
from pyspark.sql.functions import max, desc
# Join cleaned pin and geo DataFrames
combined_df_t6 = cleaned_df_pin.alias("pin").join(cleaned_df_geo.alias("geo"), cleaned_df_pin["ind"] == cleaned_df_geo["ind"], how="inner")

# Aggregation to find maximum follower count per country
max_follower_per_country = combined_df_t6.groupBy(["geo.country"]).agg(max("pin.follower_count").alias("max_follower_count"))

# Rename country_column in max_follower_per_country to avoid naming conflict later
max_follower_per_country = max_follower_per_country.withColumnRenamed("country", "country_agg")

# Join max_follower_per_country DataFrame with combined_df_t6 to get poster_name for each maximum follower count
df_with_max_follower = combined_df_t6.join(max_follower_per_country,
                                           (combined_df_t6["geo.country"] == max_follower_per_country["country_agg"]) & (combined_df_t6["pin.follower_count"] == max_follower_per_country["max_follower_count"]), how="inner")

# Select desired columns
result_df_step1 = df_with_max_follower.select("country", "poster_name", "follower_count").distinct()

# Step 2: Based on the above query, find the country with the user with most followers
# Return a DataFrame with the following columns: country, follower_count
grouped_df_step2 = result_df_step1.groupBy("country").agg(max("follower_count").alias("follower_count"))

# Order grouped_df_step2 by follower_count
ordered_df_step2 = grouped_df_step2.orderBy(desc("follower_count"))

# Select country with highest follower_count
result_df_step2 = ordered_df_step2.select(["country", "follower_count"]).limit(1)

In [0]:
display(result_df_step2)

country,follower_count
American Samoa,8000000


In [0]:
# Task 7: Find the most popular category for different age groups
#  18-24, 25-35, 36-50, +50
# Query should return a DataFrame containing the following columns;
# age_group, category, category_count

from pyspark.sql.functions import when
# Join pin and user DataFrames
combined_df_t7 = cleaned_df_pin.join(cleaned_df_user, cleaned_df_pin["ind"] == cleaned_df_user["ind"], "inner")
# Select relevant columns
combined_df_t7 = combined_df_t7.select("age", "category")
# Categorise age into ranges
combined_df_t7 = combined_df_t7.withColumn("age_group",
                                           when((col("age") >= 18) & (col("age") <= 24), "18-24")
                                           .when((col("age") >= 25) & (col("age") <= 35), "25-35")
                                           .when((col("age") >= 36) & (col("age") <= 50), "36-50")
                                           .otherwise("+50"))

# Group by age_group and count the categories
grouped_df_t7 = combined_df_t7.groupBy(["age_group", "category"]).agg(count("category").alias("category_count"))

# Find the most popular category within each age group
# Rank and filter each age group

# Create window specification that partitions data by age_group and orders by category_count
window_spec = Window.partitionBy("age_group").orderBy(col("category_count").desc())
# Apply window function
grouped_df_t7 = grouped_df_t7.withColumn("rank", rank().over(window_spec))
# Filter for most popular category for each age group
result_df_t7 = grouped_df_t7.filter(col("rank") == 1).drop("rank")

In [0]:
display(result_df_t7)

age_group,category,category_count
+50,beauty,3
+50,education,3
18-24,mens-fashion,18
25-35,diy-and-crafts,12
36-50,finance,9


In [0]:
# Task 8: Find the median follower count for different age groups
# What is the median follower count for users in the following age groups:
#  18-24, 25-35, 36-50, +50
# Query should return DataFrame containing age_group, median_follower_count

from pyspark.sql.functions import when
# Join pin and user DataFrames
combined_df_t8 = cleaned_df_pin.join(cleaned_df_user, cleaned_df_pin["ind"] == cleaned_df_user["ind"], "inner")
# Categorise age into ranges
combined_df_t8 = combined_df_t8.withColumn("age_group",
                                           when((col("age") >= 18) & (col("age") <= 24), "18-24")
                                           .when((col("age") >= 25) & (col("age") <= 35), "25-35")
                                           .when((col("age") >= 36) & (col("age") <= 50), "36-50")
                                           .otherwise("+50"))

# Calculate median for each age group
age_groups = ["18-24", "25-35", "36-50", "+50"]
medians = []
for group in age_groups:
    # Filter DataFrame for current age group in for loop
    temp_df = combined_df_t8.filter(col("age_group") == group)
    # Calculate median follower count for current age group
    median = temp_df.stat.approxQuantile("follower_count", [0.5], 0.01)
    # Store result
    medians.append((group, median[0]))

# Convert list of medians into DataFrame
median_df = spark.createDataFrame(medians, ["age_group", "median_follower_count"])

# # Select relevant columns
# combined_df_t8 = combined_df_t8.select("age_group", "follower_count")

# # Group by age_group and aggregate by median follower_count
# grouped_df_t8 = combined_df_t8.groupBy("age_group").agg(max("follower_count")-(min("follower_count"))).alias("median_follower_count")

In [0]:
display(median_df)

age_group,median_follower_count
18-24,89000.0
25-35,42000.0
36-50,3000.0
+50,1000.0


In [0]:
# Task 9: Find how many users have joined each year (2015-2020)?
# Query should return DataFrame that contains post_year and number_users_joined

# Join pin and user DataFrames
combined_df_t9 = cleaned_df_pin.join(cleaned_df_user, cleaned_df_pin["ind"] == cleaned_df_user["ind"], "inner")
# Select relevant columns
combined_df_t9 = combined_df_t9.select("date_joined")
# Extract year
task9_df = combined_df_t9.withColumn("post_year", year("date_joined"))
# Group and aggregate
task9_df = task9_df.groupBy("post_year").agg(count("date_joined").alias("number_users_joined"))

In [0]:
display(task9_df)

post_year,number_users_joined
2015,88
2016,91
2017,29


In [0]:
# Task 10: Find the median follower count of users based on their joining year (2015-2020)
#  Query should return DataFrame containing post_year and median_follower_count

from pyspark.sql.functions import year
from pyspark.sql.utils import AnalysisException

# Join pin and user DataFrames
combined_df_t10 = cleaned_df_pin.join(cleaned_df_user, cleaned_df_pin["ind"] == cleaned_df_user["ind"], "inner")
# Select relevant columns
combined_df_t10 = combined_df_t10.select("date_joined", "follower_count")
# Extract year
task10_df = combined_df_t10.withColumn("post_year", year("date_joined")).drop("date_joined")

# Calculate median follower count using approxQuantile
post_years = [2015, 2016, 2017, 2018, 2019, 2020]
medians = []
for year in post_years:
    try:
        # Filter DataFrame for the current posting year
        temp_df = task10_df.filter(col("post_year") == year)

        # Calculate median follower count for current posting year
        median = temp_df.stat.approxQuantile("follower_count", [0.5], 0.01)
        # Store the result
        medians.append((year, median[0]))
    # Append None if median is null
    except IndexError:
        medians.append((year, None))

median_df_t10 = spark.createDataFrame(medians, ["post_year", "median_follower_count"])

In [0]:
display(median_df_t10)

post_year,median_follower_count
2015,51000.0
2016,27000.0
2017,6000.0
2018,
2019,
2020,


In [0]:
# Task 11: Find the median follower count of users based on their joining year and age group
# 2015-2020, [18-24, 25-35, 36-50, +50]
# Your query should return age_group, post_year, median_follower_count

from pyspark.sql.functions import year, col
# Join pin and user DataFrames
combined_df_t11 = (cleaned_df_pin.join(cleaned_df_user, cleaned_df_pin["ind"] == cleaned_df_user["ind"], "inner")
                                .join(cleaned_df_geo, cleaned_df_pin["ind"] == cleaned_df_geo["ind"], "inner"))
# Extract year from timestamp column
combined_df_t11 = combined_df_t11.withColumn("post_year", year("date_joined"))
# Create age_group column
task11_df = combined_df_t11.withColumn("age_group",
                                           when((col("age") >= 18) & (col("age") <= 24), "18-24")
                                           .when((col("age") >= 25) & (col("age") <= 35), "25-35")
                                           .when((col("age") >= 36) & (col("age") <= 50), "36-50")
                                           .otherwise("+50"))
# Select relevant columns
task11_df = task11_df.select("age_group","post_year", "follower_count")

In [0]:
# Calculate median_follower_count using approxQuantile
post_years = [2015, 2016, 2017, 2018, 2019, 2020]
age_groups = ["18-24", "25-35", "36-50", "+50"]
medians = []
for year in post_years:
    # Filter DataFrame for the current posting year
    year_df = task11_df.filter(col("post_year") == year)
    for age in age_groups:
        try:
            # Filter DataFrame for the current age group
            age_df = year_df.filter(col("age_group") == age)

            # Calculate median follower count for current age group
            median = age_df.stat.approxQuantile("follower_count", [0.5], 0.01)

            # Store the result
            medians.append((year, age, median[0]))
        except IndexError:
            medians.append((year, age, None))

median_df_t11 = spark.createDataFrame(medians, ["post_year", "age_group", "median_follower_count"])



In [0]:
display(median_df_t11)

post_year,age_group,median_follower_count
2015,18-24,211000.0
2015,25-35,51000.0
2015,36-50,0.0
2015,+50,196.0
2016,18-24,28000.0
2016,25-35,66000.0
2016,36-50,7000.0
2016,+50,1000.0
2017,18-24,10000.0
2017,25-35,30000.0


In [0]:
display(cleaned_df_pin)
display(cleaned_df_geo)
display(cleaned_df_user)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,…",46000.0,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,/data/christmas,christmas
4357,ccf116e9-9096-4943-a344-1960ce216445,First Steps in Launching Your Own Event Business - Learn About Event Planning,"Updated: January 25, 2017 You’ve organized some events for your family, friends or community and you have gained a budding reputation for knowing how put events together. You’ve…",4000.0,EventPlanning.com | Learn How To Become An Event Planner,"Event Planning Quotes,Event Planning Checklist,Event Planning Business,Business Events,Business Ideas,Business Names,Business Opportunities,Corporate Events,Wedding Event Planner",image,https://i.pinimg.com/originals/c3/2b/c6/c32bc6ad263857cb0eea19f9cd12beb9.jpg,/data/event-planning,event-planning
1967,0b9d5b95-51a6-465e-ae4a-2cb68ceada29,15 Fun & Festive Christmas Porch Ideas,15 unique Christmas porch ideas that will leave you feeling inspired and help you tackle decorating your own entryway for the holidays! It’s almost time to start decorating for…,19000.0,Ashley - Modern Glam,"Exterior Christmas Lights,Front Door Christmas Decorations,Christmas Lights Outside,Christmas House Lights,Decorating With Christmas Lights,Porch Decorating,Christmas Porch Decorations,Front Porch Ideas For Christmas,Christmas Lights Outdoor Trees",image,https://i.pinimg.com/originals/ff/f8/3b/fff83b02aeb29e2e9341a56fc5e63345.png,/data/christmas,christmas
10138,927c4658-cc3f-4b92-9b5c-70743d0c238d,"14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More","This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or…",10000.0,"Wanderlust Chloe ✈️ Travel guides, inspo and adventure travel ✈️","Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures",image,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,/data/travel,travel
2604,087b0fa9-f901-4262-aa0a-6caf234d1b35,75+ Neutral Christmas Home Decor for the Holiday Season in Farmhouse Style using Earth Tones Modern,"My favorite 75+ Neutral Christmas Home Decor for decorating your house during the Holiday Season in earth tones and a farmhouse, rustic style all winter. I love this modern, sim…",31000.0,Everyday Wholesome,"Colorful Christmas Decorations,Colorful Christmas Tree,Christmas Centerpieces,Christmas Colors,Xmas Colors,Winter Decorations,Christmas Trends,Christmas Inspiration,Christmas Home",image,https://i.pinimg.com/originals/86/84/39/868439dd894969e3abd6a2a8a9fe1e9c.jpg,/data/christmas,christmas
3156,fa6e31a4-18c2-4eca-a6d8-e903eee2c2a4,Handprint Reindeer Ornaments - Crafty Morning,"This post may contain affiliate links, read our Disclosure Policy for more information. As an Amazon Associate I earn from qualifying purchases, thank you! Make some cute handpr…",892000.0,Michelle {CraftyMorning.com},"Christmas Gifts For Parents,Christmas Decorations For Kids,Christmas Crafts For Toddlers,Preschool Christmas,Christmas Crafts For Gifts,Christmas Activities,Toddler Crafts,Kids Christmas,Christmas Feeling",image,https://i.pinimg.com/originals/ff/fe/38/fffe384f3ec18a0d87cb2d80cc8c1499.jpg,/data/diy-and-crafts,diy-and-crafts
1864,6f1951f0-63be-4c4f-8d21-e4995217f69e,120 Christmas Decorations from the Dollar Store,Love Christmas decorations but hate spending a lot bunch of money? Check out some of these budget DIY decorations you can easily make from the dollar store!,42000.0,Caroline|CarolineVencil.com | Saving & Making Money | Pro Blogger,"Diy Snowman Decorations,Christmas Candle Decorations,Diy Christmas Ornaments,Christmas Ideas,Christmas Christmas,Snowman Ornaments,Christmas Diy Gifts,Vase Decorations,Diy Christmas Decorations For Home",image,https://i.pinimg.com/originals/30/85/21/3085215db77e55770202724268465490.jpg,/data/christmas,christmas
6014,d4c57afb-4775-4482-89c8-71d1bf85b488,Coffee Table Decor Ideas for a Cozy Living Room - Salvaged Living,"Grab these coffee table decor ideas for a cozy living room. This post is awesome, it has a list of must have elements for cozy coffee table styling plus a list of supply ideas f…",40000.0,Salvaged Living,"Coffee Table Decor Living Room,Coffee Table Vignettes,Coffee Table Centerpieces,Coffee Table Styling,Diy Coffee Table,Decorating Coffee Tables,Cozy Living Rooms,Livingroom Table Decor,Living Room Candles",image,https://i.pinimg.com/originals/77/b2/bb/77b2bb477d1164908048dabcd78cabd5.jpg,/data/home-decor,home-decor
10119,40eab9ba-7812-4f26-baca-35a6bed95a9f,How to Afford Family Travel: 10 Mistakes You're Making (and what to do instead) | Our Next Adventure,"See families traveling all the time and wonder, ""how the heck do they afford this?"" Read 10 mistakes you might be making, and what you should do instead.",9000.0,OUR NEXT ADVENTURE | family travel blog,"Family Vacation Destinations,Vacation Trips,Travel Destinations,Vacation Ideas,Cheap Family Vacations,Vacation Travel,Best Family Vacation Spots,Vacation Quotes,Vacation Memories",image,https://i.pinimg.com/originals/0a/49/fb/0a49fbcec746c4219d3a6f30834f378e.jpg,/data/travel,travel


ind,country,coordinates,timestamp
2418,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000
5162,Antarctica (the territory South of 60 deg S),"List(-71.6607, -149.206)",2019-09-27T19:06:43.000+0000
2301,Saint Vincent and the Grenadines,"List(13.4683, 51.7244)",2020-11-14T00:25:28.000+0000
6014,French Southern Territories,"List(-26.6026, 155.206)",2019-04-30T12:33:13.000+0000
10794,Cocos (Keeling) Islands,"List(-89.5236, -154.567)",2022-01-01T02:26:50.000+0000
2074,Central African Republic,"List(-52.3213, -50.11)",2019-11-03T05:41:59.000+0000
2293,British Virgin Islands,"List(-87.7946, -159.647)",2022-03-21T10:46:53.000+0000
10663,Saint Kitts and Nevis,"List(-27.3474, -162.83)",2019-07-25T18:53:51.000+0000
5293,Sao Tome and Principe,"List(-13.1463, -25.9649)",2019-05-31T20:49:36.000+0000
9426,Sao Tome and Principe,"List(-29.8646, -78.1449)",2021-06-30T02:14:47.000+0000


ind,user_name,age,date_joined
2015,ChristopherBradshaw,27,2016-03-08T13:38:37.000+0000
10673,AlexanderCervantes,59,2017-05-12T21:22:17.000+0000
6398,ChristinaDavenport,39,2016-06-29T20:43:59.000+0000
3599,AlexandriaAlvarado,20,2015-10-23T04:13:23.000+0000
4256,AlexandriaAlvarado,20,2015-10-23T04:13:23.000+0000
5076,ChristopherButler,20,2015-12-01T15:08:31.000+0000
7790,MichelleGutierrez,39,2017-07-19T07:12:04.000+0000
10509,BrittanyThompson,49,2016-04-22T20:36:02.000+0000
10119,ChelseaGonzalez,43,2016-07-21T15:25:08.000+0000
3716,CatherineFerrell,21,2017-01-02T03:01:09.000+0000
