
# Batch data processing & querying

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
%run /Users/peter.galuska@proton.me/AWS_S3_mounting_Notebook

## Data ingestion

Reading the data from the pin topic into a table and checking its content.

In [0]:
# Load the Pinterest posts ('pin') topic into a DataFrame and preview five rows.
# make_df_tables is not defined in this notebook; presumably it comes from the
# mounting notebook pulled in via %run -- TODO confirm its argument contract.
df_pin = make_df_tables('df_pin', 'pin')
df_pin.show(5)


Reading the data from the geo topic into a table and checking its content.

In [0]:
# Load the geolocation ('geo') topic into a DataFrame and preview five rows.
# make_df_tables is not defined in this notebook; presumably it comes from the
# mounting notebook pulled in via %run -- TODO confirm its argument contract.
df_geo = make_df_tables('df_geo', 'geo')
df_geo.show(5)


Reading the data from the user topic into a table and checking its content.

In [0]:
# Load the 'user' topic into a DataFrame and preview five rows.
# make_df_tables is not defined in this notebook; presumably it comes from the
# mounting notebook pulled in via %run -- TODO confirm its argument contract.
df_user = make_df_tables('df_user', 'user')
df_user.show(5)

## Data cleaning

### Cleaning the pinterest posts dataset

In [0]:
# Placeholder strings that stand for "no data", grouped by the column they occur in.
# Keying by column (rather than by placeholder) lets the same placeholder appear
# under several columns -- 'User Info Error' shows up in both poster_name and
# follower_count -- so every replacement is handled by the single loop below.
# NOTE(review): 'Untitled' is cleaned from description, not title -- confirm intended.
to_replace = {
    'description': [
        'No description available',
        'No description available Story format',
        'Untitled',
    ],
    'image_src': ['Image src error.'],
    'poster_name': ['User Info Error'],
    'follower_count': ['User Info Error'],
    'tag_list': ['N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e'],
    'title': ['No Title Data Available'],
}

# Replace each placeholder with null, restricted to the column it belongs to.
for column, placeholders in to_replace.items():
    df_pin = df_pin.replace(
        {placeholder: None for placeholder in placeholders},
        subset=[column],
    )

# Drop rows that are identical across every column.
df_pin = df_pin.dropDuplicates()

In [0]:
# Convert follower_count strings such as '613k' or '5M' into whole numbers.
# The numeric part and the optional k/M suffix are extracted separately and
# multiplied, so fractional values scale correctly (a plain text substitution
# would turn '1.5k' into '1.5000', which casts to 1 instead of 1500).
# Values that do not match the pattern yield an empty extract and end up null.
follower_pattern = r'^(\d+(?:\.\d+)?)([kM]?)$'
follower_text = F.trim(F.col('follower_count'))
follower_number = F.regexp_extract(follower_text, follower_pattern, 1).cast('double')
follower_suffix = F.regexp_extract(follower_text, follower_pattern, 2)
follower_scale = (
    F.when(follower_suffix == 'k', 1_000)
     .when(follower_suffix == 'M', 1_000_000)
     .otherwise(1)
)
# Round before the integer cast so floating-point noise (e.g. 2.3 * 1000 =
# 2299.999...) is not truncated to the wrong value.
df_pin = df_pin.withColumn(
    'follower_count',
    F.round(follower_number * follower_scale).cast('int'),
)

# Cast the remaining numeric columns to integers.
# 'downloaded' is cast here but is not part of the final column selection below.
df_pin = df_pin.withColumn('downloaded', df_pin['downloaded'].cast('int'))
df_pin = df_pin.withColumn('index', df_pin['index'].cast('int'))

# Strip the 'Local save in ' prefix so save_location holds only the path.
df_pin = df_pin.withColumn(
    'save_location',
    F.regexp_replace(df_pin['save_location'], 'Local save in ', ''),
)

# Rename index to 'ind', the key name used when joining with the geo and user tables.
df_pin = df_pin.withColumnRenamed('index', 'ind')

# Keep only the columns needed downstream, in their final order.
df_pin = df_pin.select(
    'ind',
    'unique_id',
    'title',
    'description',
    'follower_count',
    'poster_name',
    'tag_list',
    'is_image_or_video',
    'image_src',
    'save_location',
    'category',
)


### Cleaning the geolocation dataset

In [0]:
# Combine latitude and longitude into a single 'coordinates' array column.
df_geo = df_geo.withColumn('coordinates', F.array(df_geo['latitude'], df_geo['longitude']))

# The separate latitude and longitude columns are no longer needed.
df_geo = df_geo.drop('latitude', 'longitude')

# Parse the timestamp, then render it in a uniform 'yyyy-MM-dd HH:mm:ss' layout.
# date_format returns a string column, so 'timestamp' ends up as formatted text.
df_geo = df_geo.withColumn('timestamp', F.to_timestamp('timestamp'))
df_geo = df_geo.withColumn('timestamp', F.date_format('timestamp', 'yyyy-MM-dd HH:mm:ss'))

# Drop rows that are identical across every column.
# This must target df_geo: de-duplicating df_pin here leaves duplicate
# geolocation rows in place, which then multiply rows in the joins below.
df_geo = df_geo.dropDuplicates()

# Keep only the columns needed downstream, in their final order.
df_geo = df_geo.select(
    'ind',
    'country',
    'coordinates',
    'timestamp',
)


### Cleaning the users dataset

In [0]:
# Build a single 'user_name' column from the first and last name.
df_user = df_user.withColumn(
    'user_name',
    F.concat(df_user['first_name'], F.lit(' '), df_user['last_name']),
)

# The separate first_name and last_name columns are no longer needed.
df_user = df_user.drop('first_name', 'last_name')

# Parse date_joined, then render it in a uniform 'yyyy-MM-dd HH:mm:ss' layout.
# date_format returns a string column, so 'date_joined' ends up as formatted text.
df_user = df_user.withColumn('date_joined', F.to_timestamp('date_joined'))
df_user = df_user.withColumn('date_joined', F.date_format('date_joined', 'yyyy-MM-dd HH:mm:ss'))

# Drop rows that are identical across every column.
# This must target df_user: de-duplicating df_pin here leaves duplicate
# user rows in place, which then multiply rows in the joins below.
df_user = df_user.dropDuplicates()

# Keep only the columns needed downstream, in their final order.
df_user = df_user.select(
    'ind',
    'user_name',
    'age',
    'date_joined',
)


## Data querying

#### 1. What is the most popular Pinterest category people post to in each country?

In [0]:
# Pair each post with its geolocation record; rows match on the shared 'ind' key.
# Both 'ind' columns are kept because the join uses an explicit equality condition.
df_joined = df_pin.join(
    other=df_geo,
    on=(df_pin['ind'] == df_geo['ind']),
    how='inner',
)

In [0]:
# Number of posts for every (country, category) pair.
q1_temp = (
    df_joined
    .groupBy('country', 'category')
    .agg(F.count('category').alias('category_count'))
)

# Keep, per country, the category (or tied categories) with the highest count.
w = Window.partitionBy('country')
q1 = (
    q1_temp
    .withColumn('max_count', F.max('category_count').over(w))
    .where(F.col('category_count') == F.col('max_count'))
    .drop('max_count')
    # Sort last: an ordering applied before the window/filter stage is not
    # guaranteed to survive it, so the result is ordered here instead.
    .orderBy('country', 'category')
)
display(q1)

country,category,category_count
Afghanistan,education,3
Albania,mens-fashion,5
Algeria,quotes,6
American Samoa,beauty,2
American Samoa,education,2
American Samoa,tattoos,2
Andorra,quotes,2
Angola,diy-and-crafts,3
Anguilla,diy-and-crafts,2
Anguilla,home-decor,2


#### 2. How many posts did each category have between 2018 and 2022?

In [0]:
# Derive the posting year from the geolocation timestamp.
df_joined = df_joined.withColumn('post_year', F.year(F.col('timestamp')))

# Posts per category for each year in the 2018-2022 range,
# newest year first and, within a year, the largest category first.
q2 = (
    df_joined
    .where("post_year BETWEEN '2018' AND '2022'")
    .groupBy('post_year', 'category')
    .agg(F.count('*').alias('category_count'))
    .orderBy(F.col('post_year').desc(), F.col('category_count').desc())
)
display(q2)

post_year,category,category_count
2022,beauty,8
2022,christmas,7
2022,quotes,6
2022,diy-and-crafts,5
2022,tattoos,4
2022,mens-fashion,4
2022,home-decor,4
2022,art,3
2022,vehicles,3
2022,education,2


#### 3. Who are the most followed users in each country? 

In [0]:
# Combine posts, geolocation and user records on the shared 'ind' key,
# then keep a single row per distinct geolocation timestamp.
# NOTE(review): de-duplicating on 'timestamp' alone would also discard distinct
# posts that happen to share a timestamp -- confirm this is the intended key.
df_joined = (
    df_pin
    .join(df_geo, on=(df_pin['ind'] == df_geo['ind']), how='inner')
    .join(df_user, on=(df_pin['ind'] == df_user['ind']), how='inner')
    .dropDuplicates(['timestamp'])
)

ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category,ind.1,country,coordinates,timestamp,ind.2,user_name,age,date_joined
7234,c05f564d-2783-4bec-b205-d3f756276296,16 Amazing Casual Outfit Grids For Guys,Your spring summer wardrobe inspiration..,613000.0,Mens Fashion - LIFESTYLE BY PS,"Mode Outfits,Casual Outfits,Men Casual,Fashion Outfits,Fashion Clothes,Hijab Casual,Gentleman Mode,Gentleman Style,Mode Masculine",image,https://i.pinimg.com/originals/0e/cf/eb/0ecfeb441e7a3559aa41f11b94cdd6ef.jpg,/data/mens-fashion,mens-fashion,7234,Algeria,"List(-86.4791, -169.547)",2017-10-21 14:31:02,7234,Angela Allen,20,2015-10-26 04:12:32
7738,17fd85fd-a790-4c91-b6b2-f4cfabb6a798,25 Simple Living Quotes to Inspire you to Declutter & Simplify your Life!,25 of my favourite simple living quotes & minimalist quotes to inspire and encourage you to declutter your home and simplify your life!,31000.0,Simple Lionheart Life,"Great Quotes,Me Quotes,Inspirational Quotes,Wisdom Quotes,Unique Quotes,Truth Quotes,Fact Quotes,Minimalist Quotes,Life Quotes To Live By",image,https://i.pinimg.com/originals/b6/3e/ad/b63eadd0568cdb0310c817c09d39898f.png,/data/quotes,quotes,7738,Comoros,"List(66.8478, 22.777)",2017-11-12 03:45:21,7738,Alexis George,34,2016-02-26 00:38:01
7801,55f282f7-3038-450c-9b72-366072583f1c,"Art Print: Wilson's Be Stronger Than Your Excuses, 32x24in.","Size: 32x24in Be Stronger Than Your ExcusesWe have more Brett Wilson Posters. Choose from our catalog of over 500,000 posters! This art print displays sharp, vivid images with a…",72000.0,AllPosters,"Motivation Positive,Fitness Motivation Quotes,Motivational Workout Quotes,Quotes About Fitness,Motivational Quotes For Working Out,Health Fitness Quotes,Gym Fitness,Motivating Quotes,Motivational Quotes For Athletes",image,https://i.pinimg.com/originals/d3/49/5e/d3495e71e97c681086ef12fbb10e4eb7.jpg,/data/quotes,quotes,7801,Afghanistan,"List(-75.9161, -27.6285)",2017-11-18 22:54:58,7801,Nancy Clay,36,2016-01-12 18:33:12
2048,18574885-466a-45b5-acd8-131467643d3c,Outdoor Christmas Decorations to Give Your Yard Holiday Cheer,All I want for Christmas is a beautifully decorated home.,1000000.0,Country Living Magazine,"Noel Christmas,Winter Christmas,Magical Christmas,Christmas Garlands,Christmas 2019,Christmas Displays,Office Christmas,Christmas Vacation,Christmas Movies",image,https://i.pinimg.com/originals/92/65/29/926529446fc63379a274a946aa85f9b8.jpg,/data/christmas,christmas,2048,Antigua and Barbuda,"List(-89.4008, -142.186)",2017-11-20 06:33:50,2048,Ann Chung,22,2015-11-18 23:11:15
159,841a161a-47b8-4161-884d-adeb67a28b1e,Valentine's Day Bee Directed Drawing {Art Project},This bee directed drawing and associated pages will help you create a fun and creative Valentine's Day Directed Drawing Art Project activity for your class.Choose to do a painti…,1000000.0,Teachers Pay Teachers,"Classroom Art Projects,School Art Projects,Art Classroom,Art Projects For Kindergarteners,Spring Art Projects,Classroom Posters,Valentines Art Lessons,Valentines Day Activities,Grade 1 Art",image,https://i.pinimg.com/originals/49/ff/2e/49ff2e83c0cefdd37213f6084c6f0566.jpg,/data/art,art,159,Andorra,"List(-88.0812, -166.603)",2017-11-20 21:14:56,159,Alison Bell,21,2016-01-07 08:11:35
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends…,"Features: Material:Lint Size:48ｘ18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C…",5000.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,/data/christmas,christmas,1706,Aruba,"List(-71.5025, -179.257)",2017-11-24 23:36:46,1706,Amy Adams,20,2015-10-24 05:05:28
8978,b98e5485-de15-45e7-b4b1-c45351d84f30,50 Elephant Tattoo Designs for Women Stylish Picture,Attractive Geometric Elephant Tattoo Female Will Love Today elephant tattoo for women are popular for…,6000.0,Travel Pins Design Ideas,"Cute Elephant Tattoo,Elephant Tattoo Design,Small Elephant Tattoos,Elephant Thigh Tattoo,Elephant Tattoo Meaning,Elephant Outline,Tattoo Oma,Tigh Tattoo,Small Thigh Tattoos",image,https://i.pinimg.com/originals/f9/14/11/f9141167d060e6a949e585f1b877a949.jpg,/data/tattoos,tattoos,8978,Faroe Islands,"List(-2.01499, 3.78198)",2017-12-01 13:26:06,8978,Mark Jones,34,2017-08-22 00:01:29
10371,8af04cdf-818b-4748-b0e0-83d002260581,Car Photos,1930 Duesenberg Model J Convertible Sedan by Murphy 2015 Auburn-Cord-Duesenberg Automotive Museum,5000.0,Andrew Martin,"Classy Cars,Sexy Cars,Duesenberg Car,Vintage Cars,Antique Cars,Ford Mustang Car,Ford Mustangs,Automobile,Old Classic Cars",image,https://i.pinimg.com/originals/f6/c2/42/f6c2421e227a29cea0ff110335cac415.jpg,/data/vehicles,vehicles,10371,Bahamas,"List(44.5638, 101.933)",2017-12-17 12:54:54,10371,Lori Campos,48,2016-04-26 11:49:56
8488,92524b9c-4c6e-4af7-8f95-357fa1358a6f,Top 77 Best Teacup Tattoo Ideas - [2021 Inspiration Guide],Teacup tattoos are great tribute to friendship and utilized in other cool designs. Check out the top 77 best teacup ideas from small to old school and weird,800000.0,Next Luxury,"Blue Ink Tattoos,Dope Tattoos,Pretty Tattoos,Black Tattoos,Body Art Tattoos,Sleeve Tattoos,Creative Tattoos,Unique Tattoos,Small Tattoos",image,https://i.pinimg.com/originals/98/d8/74/98d87429da639f3d047e4be71462e21b.jpg,/data/tattoos,tattoos,8488,Argentina,"List(-89.4739, -176.154)",2017-12-20 03:09:10,8488,Andrew Anderson,23,2015-11-28 11:52:37
6380,71f3e739-268c-4b3b-84b0-c20e1ab865a2,Most Beautiful Christmas Cottage Decor Ideas - Dagmar's Home,Christmas farmhouse and cottage decor ideas. #homedecor #homedecorideas #DIYhomedecor #farmhousestyle,105000.0,Dagmar Bleasdale {Dagmar's Home},"Farmhouse Kitchen Cabinets,Kitchen Redo,New Kitchen,Kitchen White,Rustic Cabinets,Farmhouse Kitchens,Kitchen Shelves,Wood Cabinets,Kitchen With Blue Walls",image,https://i.pinimg.com/originals/0b/8b/09/0b8b09823d0897347d8aad0d3193f997.jpg,/data/home-decor,home-decor,6380,Argentina,"List(-57.051, -133.377)",2017-12-21 21:46:23,6380,Matthew Newman,41,2016-04-03 05:17:47


In [0]:
# For every country, keep the user(s) whose follower count equals the
# country-wide maximum; ties are retained, repeated (country, user) pairs are not.
w = Window.partitionBy('country')
q3_step1 = (
    df_joined
    .withColumn('max_follower', F.max('follower_count').over(w))
    .filter(F.col('follower_count') == F.col('max_follower'))
    .drop('max_follower')
    .select('country', 'user_name', 'follower_count')
    .dropDuplicates(['country', 'user_name'])
)
display(q3_step1)

country,user_name,follower_count
Afghanistan,Amanda Carlson,3000000
Albania,Aaron Anderson,5000000
Algeria,Aaron Abbott,942000
American Samoa,Abigail Bates,8000000
Andorra,Alison Bell,1000000
Angola,April Brown,8000000
Anguilla,Corey Andrews,92000
Antarctica (the territory South of 60 deg S),Benjamin Campbell,1000000
Antigua and Barbuda,Ann Chung,1000000
Argentina,Andrew Anderson,800000


In [0]:
# Highest follower count per country, largest first.
# Builds on q3_step1 (the top user(s) per country computed in the previous cell)
# instead of repeating the identical window/filter pipeline a second time.
q3 = (
    q3_step1
    .groupBy('country')
    .agg(F.max('follower_count').alias('follower_count'))
    .orderBy('follower_count', ascending=False)
)
display(q3)

country,follower_count
American Samoa,8000000
Angola,8000000
Azerbaijan,6000000
Bouvet Island (Bouvetoya),5000000
Albania,5000000
Bangladesh,4000000
Afghanistan,3000000
Christmas Island,3000000
Antigua and Barbuda,1000000
Martinique,1000000


#### 4. What are the most popular categories by age group?

In [0]:
# Bucket users into age groups. The conditions are evaluated in order, so each
# upper bound implicitly excludes the previous bucket.
# NOTE(review): a null age matches none of the conditions and therefore falls
# through to '50+' -- confirm that is acceptable for this dataset.
df_joined = df_joined.withColumn(
    'age_group',
    F.when(F.col('age') <= 24, '18-24')
     .when(F.col('age') <= 35, '25-35')
     .when(F.col('age') <= 50, '36-50')
     .otherwise('50+'),
)

# Number of posts for every (age_group, category) pair.
q4_temp = (
    df_joined
    .groupBy('age_group', 'category')
    .agg(F.count('*').alias('category_count'))
)

# Keep, per age group, the category (or tied categories) with the highest count.
w = Window.partitionBy('age_group')
q4 = (
    q4_temp
    .withColumn('max_count', F.max('category_count').over(w))
    .where(F.col('category_count') == F.col('max_count'))
    .drop('max_count')
    # Sort last: an ordering applied before the window/filter stage is not
    # guaranteed to survive it, so the result is ordered here instead.
    .orderBy('age_group', 'category')
)

display(q4)

age_group,category,category_count
18-24,art,14
25-35,finance,11
36-50,quotes,8
50+,beauty,3
50+,education,3
50+,vehicles,3


#### 5. What is the median follower count for users by age group?

In [0]:
# Approximate median (50th percentile) of follower_count within each age group.
q5 = (
    df_joined
    .groupBy('age_group')
    .agg(
        F.percentile_approx('follower_count', 0.5).alias('median_follower_count')
    )
)
display(q5)

age_group,median_follower_count
50+,3000
36-50,6000
18-24,60000
25-35,31000


#### 6. How many users joined each year?

In [0]:
# Derive the joining year from date_joined; later queries reuse this column.
df_joined = df_joined.withColumn('join_year', F.year(df_joined['date_joined']))

# Distinct unique_id values per joining year, for years 2015-2020.
# The alias is attached to the aggregate column itself: aliasing the DataFrame
# returned by agg() would leave the column with its generated name.
# join_year is the grouping key, so it adds nothing to the distinct count.
# NOTE(review): unique_id is selected from the pin (posts) table above --
# confirm it is the right identifier for counting users.
q6 = (
    df_joined
    .filter("join_year BETWEEN '2015' AND '2020'")
    .groupBy('join_year')
    .agg(F.count_distinct('unique_id').alias('number_users_joined'))
)
display(q6)

join_year,"count(unique_id, join_year)"
2015,98
2016,132
2017,47


#### 7. What is the median follower count of users based on their joining year?

In [0]:
# Approximate median (50th percentile) of follower_count per joining year,
# restricted to users who joined between 2015 and 2020.
q7 = (
    df_joined
    .where("join_year BETWEEN '2015' AND '2020'")
    .groupBy('join_year')
    .agg(
        F.percentile_approx('follower_count', 0.5).alias('median_follower_count')
    )
)
display(q7)

join_year,median_follower_count
2015,67000
2016,23000
2017,6000


#### 8. What is the median follower count of users based on their joining year and age group?

In [0]:
# Approximate median (50th percentile) of follower_count for every
# (age_group, join_year) pair, restricted to joining years 2015-2020
# and listed by age group, then year.
q8 = (
    df_joined
    .where("join_year BETWEEN '2015' AND '2020'")
    .groupBy('age_group', 'join_year')
    .agg(
        F.percentile_approx('follower_count', 0.5).alias('median_follower_count')
    )
    .orderBy('age_group', 'join_year')
)
display(q8)

age_group,join_year,median_follower_count
18-24,2015,211000
18-24,2016,40000
18-24,2017,11000
25-35,2015,42000
25-35,2016,27000
25-35,2017,8000
36-50,2015,6000
36-50,2016,9000
36-50,2017,3000
50+,2015,196
