# Batch Processing: Spark on Databricks
## Mount AWS S3 bucket to Databricks

Confirm the location of authentication_credentials.csv

In [0]:
# List the DBFS upload directory to confirm that the credentials CSV is present.
dbutils.fs.ls("/FileStore/tables/")

### Read the csv file containing the AWS keys to Databricks

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import urllib

# Reader settings for the credentials file: comma-separated CSV with a header row.
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Configure the reader first, then load the AWS credentials into a Spark DataFrame.
credentials_reader = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
)
aws_keys_df = credentials_reader.load("/FileStore/tables/authentication_credentials.csv")

In [0]:
# urllib.parse is a submodule: a bare `import urllib` does not guarantee that it
# has been loaded, so import it explicitly before calling urllib.parse.quote.
import urllib.parse

# Read both credentials from the first row of the keys DataFrame in a single collect.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').collect()[0]
ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key (safe="" also encodes "/") so it can be embedded in a URL.
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

## Mount the S3 bucket to local workspace

In [0]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-0ecf5ea19ac5-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/0ecf5ea19ac5_s3_mount"
# Source URL with the access key and the URL-encoded secret key embedded in it
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the bucket only when the mount point is not already in use:
# dbutils.fs.mount raises if the mount point exists, which would break a re-run of this cell.
mounted_points = [mount.mountPoint for mount in dbutils.fs.mounts()]
if MOUNT_NAME not in mounted_points:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

## Confirm that we can read data from the mounted S3 bucket

In [0]:
# List the files of the geo topic's partition 0 on the mount to verify the bucket is readable.
display(dbutils.fs.ls("/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/"))

path,name,size,modificationTime
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000000.json,0ecf5ea19ac5.geo+0+0000000000.json,112,1703700941000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000001.json,0ecf5ea19ac5.geo+0+0000000001.json,106,1703701097000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000002.json,0ecf5ea19ac5.geo+0+0000000002.json,108,1703701109000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000003.json,0ecf5ea19ac5.geo+0+0000000003.json,113,1703701243000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000004.json,0ecf5ea19ac5.geo+0+0000000004.json,126,1703701302000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000005.json,0ecf5ea19ac5.geo+0+0000000005.json,107,1703701389000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000006.json,0ecf5ea19ac5.geo+0+0000000006.json,112,1703692542000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000007.json,0ecf5ea19ac5.geo+0+0000000007.json,108,1703692542000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000008.json,0ecf5ea19ac5.geo+0+0000000008.json,104,1703697445000
dbfs:/mnt/0ecf5ea19ac5_s3_mount/topics/0ecf5ea19ac5.geo/partition=0/0ecf5ea19ac5.geo+0+0000000009.json,0ecf5ea19ac5.geo+0+0000000009.json,108,1703697445000


##  Create the following three dataframes: 
## df_pin, df_geo and df_user from S3 data

Re-run from this point after loading fresh data


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import urllib

# Target DataFrame name -> topic directory on the mounted bucket
topics = {'df_pin': '0ecf5ea19ac5.pin', 'df_geo': '0ecf5ea19ac5.geo', 'df_user': '0ecf5ea19ac5.user'}

# Source format; the inferSchema option is passed through to the reader as before
file_type = "json"
infer_schema = "true"

loaded_frames = {}
for frame_name, topic in topics.items():
    # The *.json glob selects every JSON file in the topic's partition directory
    file_location = f"/mnt/0ecf5ea19ac5_s3_mount/topics/{topic}/partition=0/*.json"
    loaded_frames[frame_name] = (
        spark.read.format(file_type)
        .option("inferSchema", infer_schema)
        .load(file_location)
    )
    # Display the freshly loaded DataFrame to check its content
    display(loaded_frames[frame_name])

# Expose the three DataFrames under the names used by the rest of the notebook
df_pin = loaded_frames['df_pin']
df_geo = loaded_frames['df_geo']
df_user = loaded_frames['df_user']
    

category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
quotes,Here are 15 inspirational quotes to start your day feeling motivated and positive. Inspiring words are a great tool to combat anxiety and fear. They can offer hope and spark you…,1,19k,https://i.pinimg.com/originals/db/ce/e4/dbcee4c357b42cb18fcefa044aacd55a.jpg,8161,image,Dream Dash Journal,Local save in /data/quotes,"Good Vibes Quotes Positivity,Positive Quotes For Life Encouragement,Positive Morning Quotes,Funny Positive Quotes,Positive Uplifting Quotes,Feeling Positive Quotes,Motivational Quotes For Success Positivity,Positive Quotes About Love,Morning Qoutes",15 Inspirational quotes to start your day off feeling motivated and positive. Inspiring words can he,da45e81e-3768-4e7c-862f-a1aae4289cf4
event-planning,Brand Name: HeronsbillOrigin: CN(Origin)is_customized: NoPattern: stripedModel Number: LED-D04Material: CopperOccasion: Wedding & EngagementOccasion: Christening & BaptismOccasi…,1,649,https://i.pinimg.com/originals/47/47/de/4747de8c2310834f982702dda0263ec6.jpg,4678,image,Qfdian,Local save in /data/event-planning,"Party Girlande,Led Curtain Lights,Icicle Lights,Window Lights,Backdrop Lights,Twinkle Lights,Backdrop Photobooth,Wall Lights,Wall Fairy Lights",qfdian 3m 100/200/300 LED Curtain String Light Garland Wedding Party Decorations Table Bachelorette Birthday Christmas New Year Festoon - Colorful With Hook / 3MX2M 200LED,b512bba6-5ce9-4414-ab31-156a3bff6320
quotes,Motivational quotes. Motivational quotes for success. Motivational quotes for women. Affirmations for self-love. Affirmations for positivity. Positive mindset. Law of attraction…,1,45k,https://i.pinimg.com/originals/53/86/6d/53866d35a32393ba29c75582a428bc09.jpg,8468,image,"Boss Babe Chronicles | Personal Development, Finances, Self Care",Local save in /data/quotes,"Positive Self Affirmations,Positive Affirmations Quotes,Affirmation Quotes,Positive Mindset,Postive Quotes,Positive Uplifting Quotes,Affirmations Success,Law Of Attraction Affirmations,Law Of Attraction Quotes",50 Positive Affirmations That Will Change Your Life,bb3ea8aa-c9c0-4164-b050-aa7811c051a0
quotes,"It's possible to reprogram your thoughts and fill your life with more success and happiness. No matter what your current beliefs are, you can change them for the better.",1,17k,https://i.pinimg.com/originals/b1/4a/fa/b14afaae339fca00365b8ba8dea67c57.jpg,8359,image,Lovely Refinement Health & Wellness,Local save in /data/quotes,"Positive Quotes For Life Encouragement,Good Life Quotes,Self Love Quotes,Inspiring Quotes About Life,Quotes To Live By,Quotes Positive,Happy Quotes About Life,Cute Quotes About Happiness,Quotes About Mindset",5 Ways To Reprogram Your Thoughts For More Happiness - Lovely Refinement,1856773e-ee8a-4ab8-8325-2a5bd306d825
christmas,This artificial pre-lit skinny Christmas tree is designed with plenty branches and one branch tip at the top in a small slim pencil tree shape for you to save space! You can DIY…,1,5k,https://i.pinimg.com/originals/d9/ab/c0/d9abc01ad3ffde9365869e472c33c375.jpg,2227,image,Wear24-7,Local save in /data/christmas,"Cashmere Christmas Tree,Skinny Christmas Tree,Pencil Christmas Tree,Christmas Tree With Snow,Flocked Christmas Trees,Christmas Greenery,Christmas Fireplace,Christmas Store,Holiday Ornaments",VEIKOUS 6ft/7.5ft Slim Pencil Christmas Tree Pre-lit with Adjustable Lights and Stand - 7.5 Foot,c2358cc5-7458-4350-a413-7689c2360b3a
diy-and-crafts,"HALLOWEEN CRAFTS FOR KIDS: Spider handprint craft - this is too cute! A perfect Halloween craft for preschoolers, kindergarten or toddlers. #halloween #halloweencrafts #kidscraf…",1,903k,https://i.pinimg.com/videos/thumbnails/originals/78/27/21/782721435dc388129a66ef4dba432b06.0000001.jpg,3421,video,The Best Ideas for Kids,Local save in /data/diy-and-crafts,"Halloween Arts And Crafts,Halloween Crafts For Toddlers,Halloween Tags,Halloween Crafts For Kids,Toddler Crafts,Kids Crafts,Diy For Kids,Halloween Preschool Activities,Decor Crafts",Spider Handprint Craft - The Best Ideas for Kids,49dec45b-e807-4f2c-b442-a92ff4ef0d6c
event-planning,"Being an event planner could easily be described as one of the most stressful yet entertaining jobs in the universe. With all of the food, guests, promoting, tracking, analyzing…",1,4k,https://i.pinimg.com/originals/47/78/6b/47786be4dc12f7401504b68caa56247b.jpg,4494,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Tips,Event Planning Business,Event Ideas,Theme Ideas,Party Planning,Wedding Event Planner,Wedding Events,Tent Wedding,Weddings",10 Tech Tools that Help Event Planners Do Their Job - Learn About Event Planning,32c8ac38-4283-4275-8fd7-8ddc9c04e13b
quotes,"The honeymoon phase is the most romantic stage of a relationship, and it's no wonder it's the one we miss the most it when gone and passed. Look to these romantic love quotes to…",1,942k,https://i.pinimg.com/originals/83/2b/d5/832bd5e2517d84e843e1c183a6c1b860.jpg,7679,image,YourTango,Local save in /data/quotes,"Love Quotes For Him Cute,Love Quotes For Him Boyfriend,Quotes About Moving On From Love,Falling In Love Quotes,Love Quotes Funny,Inspirational Quotes About Love,Romantic Love Quotes,New Quotes,Words Quotes",20 Romantic Love Quotes That Will Make You Fall In Love All Over Again,5f557e9c-ec27-4976-af4c-534190dc0941
education,March is the perfect time to pull out ALL your rainbow and weather activities! Turn your classroom into a seasonal storm with our Made For Me Literacy Weather Unit! This unit is…,1,21k,https://i.pinimg.com/originals/53/f0/f8/53f0f80e2048085e7e5a866cfd558ecd.png,4159,image,Teaching Special Thinkers,Local save in /data/education,"Special Education Classroom,Classroom Community,Special Education Activities,Teaching Math,Preschool Kindergarten,Preschool Education,Science Classroom,Classroom Ideas,Lesson Plan For Kindergarten",Weather Activities For Kids - Teaching Special Thinkers,0c482cf8-b912-4886-83c6-fe57bf9eac00
art,Mit der Puste-Technik kann man einfach wunderschöne Kunstwerke gestalten! Lade dir einfach das Mädchen mit Schirm (auch verfügbar als Junge mit Schirm) herunter und los geht der…,1,71k,https://i.pinimg.com/videos/thumbnails/originals/76/53/0b/76530b1d736cea566c9a68dd8d96d3f7.0000001.jpg,674,video,"Faminino | Basteln mit Kinder, Ratgeber und easy DIY-Ideen",Local save in /data/art,"Crayon Art,Melted Crayon Crafts,Art Drawings Sketches Simple,Colorful Drawings,Diy Canvas Art,Art Club,Art Plastique,Art Activities,Diy Art",Regenbogen-Regen - Tanzendes Mädchen (inkl. Download),db168555-2951-4295-8961-317efe30e0bb


country,ind,latitude,longitude,timestamp
British Indian Ocean Territory (Chagos Archipelago),7216,-86.5675,-149.565,2020-08-19 00:32:59
United States Minor Outlying Islands,9732,-31.2934,-154.838,2021-03-09 15:16:13
Falkland Islands (Malvinas),1644,-47.1523,-151.883,2017-11-15 15:58:54
Bouvet Island (Bouvetoya),8695,-84.3984,-144.933,2021-01-14 01:06:27
Bouvet Island (Bouvetoya),1125,-18.0738,-83.9147,2018-07-02 10:40:58
Slovakia (Slovak Republic),5668,8.92926,-12.1636,2019-04-30 16:25:13
Bouvet Island (Bouvetoya),9784,-88.516,-178.811,2022-02-25 12:30:26
United States of America,10846,38.3898,62.1414,2017-10-23 14:20:19
Bosnia and Herzegovina,6050,-85.2391,-26.6006,2018-10-25 00:03:16
British Virgin Islands,8161,-59.1209,-93.9878,2022-01-08 23:39:59


age,date_joined,first_name,ind,last_name
20,2015-10-23 04:13:23,Alexandria,3798,Alvarado
28,2015-12-18 10:50:10,Jennifer,10903,Arellano
42,2016-12-09 08:27:49,Shelley,10283,Jefferson
43,2016-02-27 17:57:17,Christopher,5211,Keller
60,2016-07-29 12:38:45,Kathleen,2377,Alvarado
42,2016-06-22 14:06:33,Christopher,1199,Adams
20,2015-11-20 09:08:00,Andrew,7669,Alexander
42,2016-07-22 10:18:12,Sherry,10619,Thompson
23,2016-03-10 06:28:23,Lauren,9660,Patterson
26,2016-03-11 23:42:05,Amanda,10580,Anderson


# Data Cleaning

## Data cleaning for df_pin<br>

- Remove duplicate rows in the dataframe
- Rename the column index to ind
- Re-order the column names in the dataframe
- Clean the values of the follower_count column where necessary (e.g. expand the k and M suffixes) so that the column can be converted to an integer data type
- Remove any additional strings from the save_location column
- Replace all the NA with None
- Drop the rows where all columns have null values

In [0]:
# Clean df_pin
df_pin = df_pin.dropDuplicates()
df_pin = df_pin.withColumnRenamed('index', 'ind')
df_pin = df_pin.select("ind", "unique_id", "title", "description", "follower_count", "poster_name",
                       "tag_list", "is_image_or_video", "image_src", "save_location", "category", "downloaded")

# Expand the "k" / "M" suffixes into zeros. The patterns are anchored to the end of
# the value; a character class such as '[%k]' would instead match any single '%' or 'k'.
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', 'k$', '000'))
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', 'M$', '000000'))
# Blank out the error placeholder as a literal phrase so that the integer cast yields null for it.
df_pin = df_pin.withColumn('follower_count', regexp_replace('follower_count', 'User Info Error', ''))
df_pin = df_pin.withColumn('follower_count', df_pin['follower_count'].cast(IntegerType()))

# Strip the "Local save in " prefix so that only the path remains
df_pin = df_pin.withColumn('save_location', regexp_replace('save_location', 'Local save in *', ''))

# DataFrame.na methods return a new DataFrame, so the results must be assigned back.
# Drop fully-null rows BEFORE filling: once any column has been filled, no row is all-null.
df_pin = df_pin.na.drop(how="all")
df_pin = df_pin.na.fill('None', ['is_image_or_video', 'image_src'])
display(df_pin)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category,downloaded
4494,32c8ac38-4283-4275-8fd7-8ddc9c04e13b,10 Tech Tools that Help Event Planners Do Their Job - Learn About Event Planning,"Being an event planner could easily be described as one of the most stressful yet entertaining jobs in the universe. With all of the food, guests, promoting, tracking, analyzing…",4000.0,EventPlanning.com | Learn How To Become An Event Planner,"Event Planning Tips,Event Planning Business,Event Ideas,Theme Ideas,Party Planning,Wedding Event Planner,Wedding Events,Tent Wedding,Weddings",image,https://i.pinimg.com/originals/47/78/6b/47786be4dc12f7401504b68caa56247b.jpg,/data/event-planning,event-planning,1
9977,3719f505-20cc-485b-8b01-ea2aee6f6e62,15 jaw-droppingly beautiful waterfalls in Iceland,Sigöldugljúfur waterfall near Landmannalaugar in Iceland. Click for the full list of the most beautiful waterfalls in Iceland. #Iceland #europe #photography #travel #travelguide…,25000.0,Norman - Luxury travel,"Travel Photography Inspiration,Travel Inspiration,Beautiful Waterfalls,Beautiful Landscapes,Beautiful Scenery,Beautiful Things,Beautiful Pictures,Dream Pictures,Beautiful Beautiful",image,https://i.pinimg.com/originals/1d/78/37/1d78374e72c9eb4a9d27039861bbd355.jpg,/data/travel,travel,1
5303,573cd520-b16e-41f0-a59a-e0d8d04d2ee2,"Credit Sesame Review: The Pros, Cons and Best Features","Credit Sesame offers free access to a version of your credit score + $50,000 in free identity theft insurance, but there are a few downsides to the service.",46000.0,R.J. Weiss at The Ways To Wealth | Personal Finance | Debt Payoff,"What Is Credit Score,Fix Your Credit,Build Credit,Improve Your Credit Score,Rebuilding Credit,Credit Repair Companies,Ideas Prácticas,Paying Off Credit Cards,Planning Budget",image,https://i.pinimg.com/originals/e2/6d/7a/e26d7a597f2f83a21803ead5c401613e.png,/data/finance,finance,1
7679,5f557e9c-ec27-4976-af4c-534190dc0941,20 Romantic Love Quotes That Will Make You Fall In Love All Over Again,"The honeymoon phase is the most romantic stage of a relationship, and it's no wonder it's the one we miss the most it when gone and passed. Look to these romantic love quotes to…",942000.0,YourTango,"Love Quotes For Him Cute,Love Quotes For Him Boyfriend,Quotes About Moving On From Love,Falling In Love Quotes,Love Quotes Funny,Inspirational Quotes About Love,Romantic Love Quotes,New Quotes,Words Quotes",image,https://i.pinimg.com/originals/83/2b/d5/832bd5e2517d84e843e1c183a6c1b860.jpg,/data/quotes,quotes,1
7216,bed8c98e-c3af-458f-91bb-49d9e0ac0bfa,Leisure Motion Standing Collar Warm Cotton Jacket,Product Name: Leisure Motion Standing Collar Warm Cotton Jacket Item NO.: 4356748476465 Weight: 0.8 kg = 1.7637 lb = 28.2192 oz Category: Men's Outerwear > Men's Jackets Tag: 20…,31000.0,joymanmall,"Winter Outfits Men,Stylish Mens Outfits,Casual Outfits,Urban Style Outfits Men,Men Winter Fashion,Outfits For Men,Smart Casual Outfit,Mens Winter,Simple Outfits",image,https://i.pinimg.com/originals/80/78/65/807865c3b7f390a6be457a79bf7b5d2c.jpg,/data/mens-fashion,mens-fashion,1
4678,b512bba6-5ce9-4414-ab31-156a3bff6320,qfdian 3m 100/200/300 LED Curtain String Light Garland Wedding Party Decorations Table Bachelorette Birthday Christmas New Year Festoon - Colorful With Hook / 3MX2M 200LED,Brand Name: HeronsbillOrigin: CN(Origin)is_customized: NoPattern: stripedModel Number: LED-D04Material: CopperOccasion: Wedding & EngagementOccasion: Christening & BaptismOccasi…,649.0,Qfdian,"Party Girlande,Led Curtain Lights,Icicle Lights,Window Lights,Backdrop Lights,Twinkle Lights,Backdrop Photobooth,Wall Lights,Wall Fairy Lights",image,https://i.pinimg.com/originals/47/47/de/4747de8c2310834f982702dda0263ec6.jpg,/data/event-planning,event-planning,1
2227,c2358cc5-7458-4350-a413-7689c2360b3a,VEIKOUS 6ft/7.5ft Slim Pencil Christmas Tree Pre-lit with Adjustable Lights and Stand - 7.5 Foot,This artificial pre-lit skinny Christmas tree is designed with plenty branches and one branch tip at the top in a small slim pencil tree shape for you to save space! You can DIY…,5000.0,Wear24-7,"Cashmere Christmas Tree,Skinny Christmas Tree,Pencil Christmas Tree,Christmas Tree With Snow,Flocked Christmas Trees,Christmas Greenery,Christmas Fireplace,Christmas Store,Holiday Ornaments",image,https://i.pinimg.com/originals/d9/ab/c0/d9abc01ad3ffde9365869e472c33c375.jpg,/data/christmas,christmas,1
4459,c57ab99a-a118-435f-9748-c2b1d6dc19dc,Fun and Cheap DIY Party Decorations For All Celebrations,A party doesn't have to cost an arm and a leg to be amazing. Fun and cheap DIY party decorations are all things that you can make yourself! Give it a try!,42000.0,Caroline|CarolineVencil.com | Saving & Making Money | Pro Blogger,"Trendy Wedding,Dream Wedding,Elegant Wedding,Perfect Wedding,Spring Wedding,Rustic Wedding,Romantic Weddings,Vintage Weddings,Wedding House",image,https://i.pinimg.com/originals/2e/9e/7e/2e9e7eb160b70fccb554a25af077bf5e.jpg,/data/event-planning,event-planning,1
8359,1856773e-ee8a-4ab8-8325-2a5bd306d825,5 Ways To Reprogram Your Thoughts For More Happiness - Lovely Refinement,"It's possible to reprogram your thoughts and fill your life with more success and happiness. No matter what your current beliefs are, you can change them for the better.",17000.0,Lovely Refinement Health & Wellness,"Positive Quotes For Life Encouragement,Good Life Quotes,Self Love Quotes,Inspiring Quotes About Life,Quotes To Live By,Quotes Positive,Happy Quotes About Life,Cute Quotes About Happiness,Quotes About Mindset",image,https://i.pinimg.com/originals/b1/4a/fa/b14afaae339fca00365b8ba8dea67c57.jpg,/data/quotes,quotes,1
674,db168555-2951-4295-8961-317efe30e0bb,Regenbogen-Regen - Tanzendes Mädchen (inkl. Download),Mit der Puste-Technik kann man einfach wunderschöne Kunstwerke gestalten! Lade dir einfach das Mädchen mit Schirm (auch verfügbar als Junge mit Schirm) herunter und los geht der…,71000.0,"Faminino | Basteln mit Kinder, Ratgeber und easy DIY-Ideen","Crayon Art,Melted Crayon Crafts,Art Drawings Sketches Simple,Colorful Drawings,Diy Canvas Art,Art Club,Art Plastique,Art Activities,Diy Art",video,https://i.pinimg.com/videos/thumbnails/originals/76/53/0b/76530b1d736cea566c9a68dd8d96d3f7.0000001.jpg,/data/art,art,1


## Data cleaning for df_geo
- Remove duplicate rows in the dataframe
- Create new column coordinates with the values to be the array of latitude and longitude column and deleting these two columns
- Convert the timestamp column into a timestamp data type
- Re-order the column names in the dataframe

In [0]:
# Clean df_geo: de-duplicate, fold latitude/longitude into a single `coordinates`
# array column, cast `timestamp` to a timestamp type and fix the column order.
df_geo = (
    df_geo.dropDuplicates()
    .withColumn("coordinates", array(col("latitude"), col("longitude")))
    .drop("latitude", "longitude")
    .withColumn("timestamp", col("timestamp").cast(TimestampType()))
    .select("ind", "country", "coordinates", "timestamp")
)
display(df_geo)

ind,country,coordinates,timestamp
7973,Saint Kitts and Nevis,"List(-68.0247, 59.5982)",2021-11-14T10:16:04.000+0000
5668,Slovakia (Slovak Republic),"List(8.92926, -12.1636)",2019-04-30T16:25:13.000+0000
10962,Saint Kitts and Nevis,"List(-27.3474, -162.83)",2022-04-28T18:49:25.000+0000
2644,Afghanistan,"List(-75.7662, -171.892)",2020-08-13T14:26:22.000+0000
4495,French Polynesia,"List(-59.1849, -77.2275)",2018-05-18T15:10:24.000+0000
10619,Saint Helena,"List(-68.9094, 121.026)",2022-09-29T00:05:47.000+0000
8695,Bouvet Island (Bouvetoya),"List(-84.3984, -144.933)",2021-01-14T01:06:27.000+0000
2377,Burkina Faso,"List(38.2875, -173.383)",2021-10-29T02:25:35.000+0000
10846,United States of America,"List(38.3898, 62.1414)",2017-10-23T14:20:19.000+0000
3798,Afghanistan,"List(-88.5478, -174.971)",2018-04-13T02:57:54.000+0000


## Data cleaning for df_user
- Remove duplicate rows in the dataframe
- Create new column user_name by combining the first_name and last_name column and deleting these two columns
- Convert the date_joined column into a timestamp data type
- Re-order the column names in the dataframe

In [0]:
# Clean df_user: de-duplicate, merge first and last name into `user_name`,
# cast `date_joined` to a timestamp type and fix the column order.
df_user = (
    df_user.dropDuplicates()
    .withColumn("user_name", concat(col("first_name"), lit(" "), col("last_name")))
    .drop("first_name", "last_name")
    .withColumn("date_joined", col("date_joined").cast(TimestampType()))
    .select("ind", "user_name", "age", "date_joined")
)
display(df_user)

ind,user_name,age,date_joined
8110,Andrew Alexander,20,2015-11-20T09:08:00.000+0000
6266,Albert Allison,20,2015-10-21T22:27:27.000+0000
10580,Amanda Anderson,26,2016-03-11T23:42:05.000+0000
1644,Douglas Griffin,37,2016-05-18T06:56:32.000+0000
4678,Zachary Elliott,55,2016-07-26T23:44:27.000+0000
10826,Jeremy Peterson,39,2016-01-27T22:19:07.000+0000
1199,Christopher Adams,42,2016-06-22T14:06:33.000+0000
2377,Kathleen Alvarado,60,2016-07-29T12:38:45.000+0000
8037,Aaron Alexander,21,2015-10-25T07:36:08.000+0000
9784,Annette Andrews,28,2015-12-20T22:09:10.000+0000


# Pinterest Business Intelligence


1. Create initial setup for subsequent queries.<br>
2. There are two queries per report, one in pyspark SQL and one in regular SQL  

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

# Register each cleaned DataFrame as a temporary view so it can be queried through spark.sql
for view_name, frame in (
    ("geo_table", df_geo),
    ("pin_table", df_pin),
    ("user_table", df_user),
):
    frame.createOrReplaceTempView(view_name)

1. Find the most popular Pinterest category people post to based on their country


In [0]:
# PySpark DataFrame API

# Window over each country, highest category_count first
windowCountryByCatCount = Window.partitionBy("country").orderBy(col("category_count").desc())

# Count the pins of every (country, category) pair ...
category_counts_by_country = (
    df_pin.join(df_geo, df_pin["ind"] == df_geo["ind"])
    .groupBy("country", "category")
    .agg(count("category").alias("category_count"))
)

# ... then keep only the first-ranked category of each country
top_category_by_country = (
    category_counts_by_country
    .withColumn("rank", row_number().over(windowCountryByCatCount))
    .where(col("rank") == 1)
    .drop("rank")
)
top_category_by_country.show()


In [0]:
# Spark SQL version of the report, run against the geo_table and pin_table temp views.
#
# The Ranked CTE joins geo_table to pin_table on ind, counts pins per
# (country, category) and numbers the categories of each country by that count,
# highest first. The outer query keeps the row numbered 1 for every country.

result_df = spark.sql("""
                      
    WITH Ranked as (
    SELECT 
        geo_table.country AS country, 
        pin_table.category AS category, 
        count(pin_table.category) AS category_count,
        ROW_NUMBER() OVER(PARTITION BY geo_table.country ORDER BY count(pin_table.category) DESC) category_rank
    FROM 
        geo_table
    INNER JOIN 
        pin_table ON geo_table.ind = pin_table.ind
    GROUP BY 
        geo_table.country, 
        pin_table.category 
    )   
        SELECT
            country,
            category,
            category_count
        FROM
            Ranked
        WHERE
            category_rank = 1
""")


# Render the result table
display(result_df)

country,category,category_count
Afghanistan,quotes,2
Albania,art,3
Algeria,quotes,6
American Samoa,tattoos,2
Andorra,quotes,1
Antigua and Barbuda,finance,1
Argentina,home-decor,1
Armenia,vehicles,1
Aruba,christmas,1
Bahamas,education,1


2. Find the most popular category for each year between 2018 and 2022.


In [0]:
# PySpark DataFrame API

# Window over each post year, highest category_count first
windowYearByCatCount = Window.partitionBy("post_year").orderBy(col("category_count").desc())

# Pins joined to their geo record, restricted to post years 2018..2022 (inclusive)
posts_2018_to_2022 = (
    df_pin.join(df_geo, df_pin["ind"] == df_geo["ind"])
    .withColumn("post_year", year("timestamp"))
    .where(col("post_year").between(2018, 2022))
)

# Count pins per (post_year, category) and keep the first-ranked category of each year
top_category_by_year = (
    posts_2018_to_2022
    .groupBy("post_year", "category")
    .agg(count("category").alias("category_count"))
    .withColumn("rank", row_number().over(windowYearByCatCount))
    .where(col("rank") == 1)
    .drop("rank")
)
top_category_by_year.show()


In [0]:
# Spark SQL version of the report.
#
# The Ranked CTE counts pins per (post_year, category) for 2018..2022 and numbers the
# categories of each year by that count, highest first. GROUP BY already yields one
# row per (post_year, category), so no DISTINCT is needed on top of it.
# The outer query keeps the row numbered 1 for every year.
result_df = spark.sql("""
WITH Ranked AS (
    SELECT
        YEAR(geo_table.timestamp) AS post_year,
        pin_table.category AS category,
        COUNT(pin_table.category) AS category_count,
        ROW_NUMBER() OVER(PARTITION BY YEAR(geo_table.timestamp) ORDER BY COUNT(pin_table.category) DESC) AS category_rank
    FROM
        geo_table
    INNER JOIN
        pin_table ON geo_table.ind = pin_table.ind
    WHERE
        YEAR(geo_table.timestamp) BETWEEN 2018 AND 2022
    GROUP BY
        YEAR(geo_table.timestamp), pin_table.category
)
SELECT
    post_year, category, category_count
FROM
    Ranked
WHERE
    category_rank = 1
""")

display(result_df)
     

post_year,category,category_count
2018,quotes,7
2019,tattoos,4
2020,mens-fashion,3
2021,event-planning,4
2022,home-decor,4


3. Step 1: For each country find the user with the most followers.<br>
   Step 2: Based on the above query, find the country with the user with most followers.

In [0]:
# PySpark DataFrame API

# Window over each country, highest follower_count first
windowCountryByFollowers = Window.partitionBy("country").orderBy(col("follower_count").desc())

# Step 1: the first-ranked poster (by follower_count) of every country
pins_with_geo = df_pin.join(df_geo, df_pin["ind"] == df_geo["ind"])
max_followers_by_country = (
    pins_with_geo
    .withColumn("rank", row_number().over(windowCountryByFollowers))
    .where(col("rank") == 1)
    .select("country", "poster_name", "follower_count")
    .orderBy("country")
)

# Highest follower count over all countries, collected to the driver as a plain value
max_followers_all_countries = max_followers_by_country.agg(max("follower_count")).collect()[0][0]

# Step 2: the single country whose top poster has the most followers
country_with_max_followers = (
    max_followers_by_country
    .select("country", "follower_count")
    .orderBy(col("follower_count").desc())
    .limit(1)
)

max_followers_by_country.show()
country_with_max_followers.show()


In [0]:
# Spark SQL version of the report.

# STEP 1: for each country, the poster with the most followers.
# The Ranked CTE joins geo_table to pin_table on ind and numbers the rows of each
# country by follower_count, highest first; the outer query keeps row 1 per country
# and sorts the result by country.
result_df = spark.sql("""
                      
WITH Ranked AS (
    
    SELECT 
        pin_table.poster_name AS poster,
        geo_table.country AS country,
        pin_table.follower_count AS follower_count,
        ROW_NUMBER() OVER (PARTITION BY geo_table.country ORDER BY pin_table.follower_count DESC) AS rank
    FROM
        geo_table
    INNER JOIN pin_table ON geo_table.ind = pin_table.ind
)

    SELECT
        country, poster, follower_count
    FROM
        Ranked
    WHERE
        rank = 1
    ORDER BY 
        country;
        
""")

# Render the result table
display(result_df)

country,poster,follower_count
Afghanistan,Walmart,2000000
Albania,The Minds Journal,5000000
Algeria,Apartment Therapy,5000000
American Samoa,Homesthetics.net,556000
Andorra,The Best Ideas for Kids,903000
Antigua and Barbuda,Kenya. Finances 101,760
Argentina,Jen | Save to Splurge,8000
Armenia,Yanko Design - Form Beyond Function,416000
Aruba,Tim Adam - Car Enthusiast,49000
Bahamas,EventPlanning.com | Learn How To Become An Event Planner,4000


In [0]:
# Spark SQL, STEP 2: the country whose top poster has the most followers.
# Uses the same Ranked CTE as step 1 (row 1 per country by follower_count), then
# sorts those per-country rows by follower_count descending and keeps the first one.
result_df = spark.sql("""
                      
WITH Ranked AS (
    
    SELECT 
        pin_table.poster_name AS poster,
        geo_table.country AS country,
        pin_table.follower_count AS follower_count,
        ROW_NUMBER() OVER (PARTITION BY geo_table.country ORDER BY pin_table.follower_count DESC) AS rank
    FROM
        geo_table
    INNER JOIN pin_table ON geo_table.ind = pin_table.ind
)

    SELECT
        country, follower_count
    FROM
        Ranked
    WHERE
        rank = 1
    ORDER BY 
        follower_count DESC  
    LIMIT 1
    
""")

# Render the result table
display(result_df)

country,follower_count
Albania,5000000


4. Find the most popular category people post to, based on the age groups - 18-24, 25-35, 36-50, +50

In [0]:
# PySpark DataFrame API

# SQL CASE expression that buckets `age` into the report's age groups
# (ages matching no branch get a null age_group, as there is no ELSE)
age_group_case = """case
    when age BETWEEN 18 AND 24 THEN '18-24'
    when age BETWEEN 25 AND 35 THEN '25-35'
    when age BETWEEN 36 AND 50 THEN '36-50'
    when age> 50 THEN '+50'
    END
    """

# Pins joined to their user, tagged with the user's age group
pin_user_age_group = df_pin.join(df_user, 'ind').withColumn('age_group', expr(age_group_case))

# Window over each age group, highest category_count first
windowAgeGroup = Window.partitionBy("age_group").orderBy(col("category_count").desc())

# Count pins per (category, age_group) and keep the first-ranked category of each group
top_category_by_age_group = (
    pin_user_age_group
    .groupBy("category", "age_group")
    .agg(count("category").alias("category_count"))
    .withColumn("rank", row_number().over(windowAgeGroup))
    .where(col("rank") == 1)
    .drop("rank")
)
top_category_by_age_group.show()
    

In [0]:
# Spark SQL version of the report.
#
# Ranked:        pins joined to users on ind, counted per (category, age_group).
# Ranked_Window: numbers the categories of each age group by that count, highest first.
# The outer query keeps the row numbered 1 for every age group.
#
# The former ORDER BY inside the Ranked CTE was dropped: ordering a CTE gives no
# guarantee about the order seen by the window function or the final result.
result_df = spark.sql("""
WITH Ranked AS (

    SELECT
        pin_table.category AS category,
        CASE
            WHEN user_table.age BETWEEN 18 AND 24 THEN '18-24'
            WHEN user_table.age BETWEEN 25 AND 35 THEN '25-35'
            WHEN user_table.age BETWEEN 36 AND 50 THEN '36-50'
            WHEN user_table.age > 50 THEN '+50'
        END AS age_group,
        COUNT(pin_table.category) AS category_count
    FROM
        pin_table
    INNER JOIN
        user_table ON pin_table.ind = user_table.ind
    GROUP BY
        pin_table.category, age_group

),

Ranked_Window AS (

    SELECT
        category, age_group, category_count,
        ROW_NUMBER() OVER (PARTITION BY age_group ORDER BY category_count DESC) AS rank
    FROM
        Ranked
)

    SELECT
        category, age_group, category_count
    FROM
        Ranked_Window
    WHERE
        rank = 1

""")

display(result_df)

category,age_group,category_count
event-planning,+50,4
quotes,18-24,14
education,25-35,5
finance,36-50,3


5. Find the median follower count for users in the age groups, 18-24, 25-35, 36-50, +50

In [0]:
# PySpark DataFrame API

# Approximate median (50th percentile) of follower_count per age group, sorted by age group.
# Reuses pin_user_age_group built for the previous report.
median_followers_by_age_group = (
    pin_user_age_group
    .select("age_group", "follower_count")
    .groupBy("age_group")
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))
    .orderBy("age_group")
)
median_followers_by_age_group.show()
    

In [0]:
# Spark SQL version of the report.
#
# CTE:  non-null follower counts of pins joined to users on ind, tagged with an age group.
# CTE2: splits each age group into two halves twice with NTILE(2), once in ascending and
#       once in descending follower_count order.
# The median is the mean of the largest value of the lower half and the smallest value
# of the upper half (both are the middle value when the group has an odd row count).
#
# The ORDER BY was moved from the CTE, where it has no guaranteed effect, to the outer
# query so the result is actually sorted by age group, like the DataFrame version.
result_df = spark.sql("""
WITH CTE AS (

    SELECT
        CASE
            WHEN user_table.age BETWEEN 18 AND 24 THEN '18-24'
            WHEN user_table.age BETWEEN 25 AND 35 THEN '25-35'
            WHEN user_table.age BETWEEN 36 AND 50 THEN '36-50'
            WHEN user_table.age > 50 THEN '+50'
        END AS age_group,
        pin_table.follower_count AS follower_count
    FROM
        pin_table
    INNER JOIN
        user_table ON pin_table.ind = user_table.ind
    WHERE
        pin_table.follower_count IS NOT NULL

),

CTE2 AS (
    SELECT
        CTE.age_group, CTE.follower_count,
        NTILE(2) OVER(PARTITION BY CTE.age_group ORDER BY CTE.follower_count) AS half1,
        NTILE(2) OVER(PARTITION BY CTE.age_group ORDER BY CTE.follower_count DESC) AS half2
    FROM
        CTE
)

    SELECT  CTE2.age_group,
            ROUND((MAX(CASE WHEN CTE2.half1 = 1 THEN CTE2.follower_count END) +
            MIN(CASE WHEN CTE2.half2 = 1 THEN CTE2.follower_count END)) / 2.0) AS median_follower_count
    FROM    CTE2
    GROUP BY CTE2.age_group
    ORDER BY CTE2.age_group

""")

display(result_df)

age_group,median_follower_count
+50,649
18-24,313500
25-35,21000
36-50,3000


6. Find how many users have joined between 2015 and 2020.

In [0]:
# PySpark DataFrame API

# Number of distinct users (by `ind`) that joined in each year from 2015 to 2020.
(
    df_user
    .groupBy(year('date_joined').alias('post_year'))
    .agg(count_distinct('ind').alias('number_users_joined'))
    # year() yields an integer, so compare against integer bounds rather than
    # relying on an implicit string-to-number cast.
    .where(col('post_year').between(2015, 2020))
    # Grouped output has no guaranteed row order; sort explicitly.
    .orderBy('post_year')
    .show()
)
    

In [0]:
# Standard SQL

# Number of distinct users that joined in each year from 2015 to 2020.
# DISTINCT is applied inside COUNT so that repeated rows for the same `ind`
# are counted once; a DISTINCT on the select list would only de-duplicate the
# already-unique per-year result rows.
result_df = spark.sql("""

    SELECT
        YEAR(date_joined) AS post_year,
        COUNT(DISTINCT ind) AS number_of_users_joined
    FROM
        user_table
    WHERE
        YEAR(date_joined) BETWEEN 2015 AND 2020
    GROUP BY
        post_year
    ORDER BY
        post_year

""")

display(result_df)

post_year,number_of_users_joined
2015,49
2016,40
2017,16


7. Find the median follower count of users who have joined between 2015 and 2020.

In [0]:
# PySpark DataFrame API

# Median follower count per joining year, for users who joined from 2015 to 2020.
# Users are matched to their pin data on `ind`.
(
    df_user.join(df_pin, 'ind')
    .groupBy(year('date_joined').alias('post_year'))
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))
    # year() yields an integer, so compare against integer bounds rather than
    # relying on an implicit string-to-number cast.
    .where(col('post_year').between(2015, 2020))
    # Grouped output has no guaranteed row order; sort explicitly.
    .orderBy('post_year')
    .show()
)
    

In [0]:
# Standard SQL

# Median follower count per joining year (2015 to 2020), computed without a
# percentile function. Within each year the rows are split in half twice with
# NTILE(2): once ordered ascending (half1) and once ordered descending (half2).
# The largest value of the ascending first half and the smallest value of the
# descending first half are the two middle values, so their average is the median.
result_df = spark.sql("""

WITH CTE AS (
    SELECT
        YEAR(user_table.date_joined) AS post_year,
        pin_table.follower_count AS follower_count
    FROM
        user_table
    INNER JOIN
        pin_table ON user_table.ind = pin_table.ind
    WHERE
        YEAR(user_table.date_joined) BETWEEN 2015 AND 2020
        -- NULL counts would otherwise occupy NTILE slots and shift the halves.
        AND pin_table.follower_count IS NOT NULL
),

CTE2 AS (
    SELECT
        CTE.post_year, CTE.follower_count,
        NTILE(2) OVER(PARTITION BY CTE.post_year ORDER BY CTE.follower_count) AS half1,
        NTILE(2) OVER(PARTITION BY CTE.post_year ORDER BY CTE.follower_count DESC) AS half2
    FROM
        CTE
)

    SELECT  CTE2.post_year,
            ROUND((MAX(CASE WHEN CTE2.half1 = 1 THEN CTE2.follower_count END) +
            MIN(CASE WHEN CTE2.half2 = 1 THEN CTE2.follower_count END)) / 2.0) AS median_follower_count
    FROM    CTE2
    GROUP BY CTE2.post_year
    -- Sort here: an ORDER BY inside a CTE does not guarantee the order of the final rows.
    ORDER BY CTE2.post_year

""")

display(result_df)

post_year,median_follower_count
2015,190000
2016,16000
2017,7000


8. Find the median follower count of users who joined between 2015 and 2020, broken down by the age group they belong to.

In [0]:
# PySpark DataFrame API

# Median follower count per (joining year, age group), for users who joined
# from 2015 to 2020. Pin data is matched to users on `ind`; ages that match no
# bucket (under 18 or NULL) get a NULL age_group.
(
    df_pin.join(df_user, 'ind')
    .withColumn('age_group', expr("""case
        when age BETWEEN 18 AND 24 THEN '18-24'
        when age BETWEEN 25 AND 35 THEN '25-35'
        when age BETWEEN 36 AND 50 THEN '36-50'
        when age> 50 THEN '+50'
        END
        """))
    .groupBy(year('date_joined').alias('post_year'), 'age_group')
    .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))
    # year() yields an integer, so compare against integer bounds rather than
    # relying on an implicit string-to-number cast.
    .where(col('post_year').between(2015, 2020))
    .orderBy('post_year', 'age_group')
    .show()
)


In [0]:
# Standard SQL

# Median follower count per (joining year, age group) for users who joined
# from 2015 to 2020, computed without a percentile function. Within each
# (year, age group) partition the rows are split in half twice with NTILE(2):
# once ordered ascending (half1) and once ordered descending (half2). The
# largest value of the ascending first half and the smallest value of the
# descending first half are the two middle values, so their average is the median.
result_df = spark.sql("""
    WITH CTE AS (
        -- Ages matching no bucket (under 18 or NULL) fall through to a NULL age_group.
        SELECT
            YEAR(user_table.date_joined) AS post_year,
            CASE
                WHEN user_table.age BETWEEN 18 AND 24 THEN '18-24'
                WHEN user_table.age BETWEEN 25 AND 35 THEN '25-35'
                WHEN user_table.age BETWEEN 36 AND 50 THEN '36-50'
                WHEN user_table.age > 50 THEN '+50'
            END AS age_group,
            pin_table.follower_count AS follower_count
        FROM
            pin_table
        INNER JOIN
            user_table ON pin_table.ind = user_table.ind
        WHERE
            YEAR(user_table.date_joined) BETWEEN 2015 AND 2020
            -- NULL counts would otherwise occupy NTILE slots and shift the halves.
            AND pin_table.follower_count IS NOT NULL
     ),

    CTE2 AS (
        SELECT
            CTE.post_year, CTE.age_group, CTE.follower_count,
            NTILE(2) OVER(PARTITION BY CTE.post_year, CTE.age_group ORDER BY CTE.follower_count) AS half1,
            NTILE(2) OVER(PARTITION BY CTE.post_year, CTE.age_group ORDER BY CTE.follower_count DESC) AS half2
        FROM
            CTE
    )

    SELECT  CTE2.post_year, CTE2.age_group,
            ROUND((MAX(CASE WHEN CTE2.half1 = 1 THEN CTE2.follower_count END) +
            MIN(CASE WHEN CTE2.half2 = 1 THEN CTE2.follower_count END)) / 2.0) AS median_follower_count
    FROM    CTE2
    GROUP BY CTE2.post_year, CTE2.age_group
    -- Sort here: an ORDER BY inside a CTE does not guarantee the order of the final rows.
    ORDER BY CTE2.post_year, CTE2.age_group

""")

display(result_df)

post_year,age_group,median_follower_count
2015,18-24,494000
2015,25-35,31500
2015,36-50,5500
2016,+50,598
2016,18-24,49000
2016,25-35,23000
2016,36-50,3000
2017,+50,760
2017,18-24,22500
2017,25-35,5000
