# Data Wrangling with DataFrames Coding Quiz

Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import desc
# Quiz setup checklist:
#   1) extra imports, if any, belong with the imports above
#   2) start (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName('Data Wrangling')
    .getOrCreate()
)
#   3) load the Sparkify event log; the path is relative to the notebook's
#      working directory
path = "data/sparkify_log_small.json"
user_log = spark.read.json(path)
#   4) the quiz questions are answered in the cells below

In [None]:
# Print every distinct value of the `page` column (show() prints at most 20 rows).
distinct_pages = user_log.select('page').distinct()
distinct_pages.show()

# Question 1

Which page did user id "" (empty string) NOT visit?

In [2]:
# Question 1, step 1: one row per distinct page name found in the log.
pages = user_log.select('page').distinct()

In [4]:
# Question 1, step 2: collect the page names that the empty-string user
# did visit, as a plain Python list on the driver.
empty_user_rows = (
    user_log
    .where(user_log.userId == '')
    .select('page')
    .dropDuplicates()
    .collect()
)
empty_user_pages_as_list = [row.page for row in empty_user_rows]

# Keep only the pages that are absent from that list, i.e. the pages the
# empty-string user never visited.
visited_by_empty_user = pages['page'].isin(empty_user_pages_as_list)
not_visit_page = pages.filter(~visited_by_empty_user)
not_visit_page.collect()

[Row(page='Submit Downgrade'),
 Row(page='Downgrade'),
 Row(page='Logout'),
 Row(page='Save Settings'),
 Row(page='Settings'),
 Row(page='NextSong'),
 Row(page='Upgrade'),
 Row(page='Error'),
 Row(page='Submit Upgrade')]

# Question 2 - Reflect

What type of user does the empty string user id most likely refer to?


In [None]:
# Question 2 exploration. Hypothesis: the empty-string user id belongs to
# visitors who are either guests or logged out.
# This counts events whose auth state is 'Logged Out' or 'Guest' but whose
# userId is NOT empty; a result of 0 would be consistent with the hypothesis.
is_anonymous_auth = user_log.auth.isin('Logged Out', 'Guest')
has_user_id = user_log.userId != ''
user_log.filter(is_anonymous_auth & has_user_id).count()

# Question 3

How many female users do we have in the data set?

In [5]:
# Question 3: number of distinct (userId, gender) pairs whose gender is 'F'.
female_users = (
    user_log
    .where(col('gender') == 'F')
    .select('userId', 'gender')
    .distinct()
)
female_users.count()

462

# Question 4

How many songs were played from the most played artist?

In [8]:
# Question 4: find the most played artist and its play count.
# Rows without an artist are dropped with an explicit NULL check. Comparing
# against the string "null" only removed them as a side effect of SQL
# three-valued logic, and would also have discarded an artist literally
# named "null".
plays_per_artist = (
    user_log
    .filter(col('artist').isNotNull())
    .groupby('artist')
    .count()
)
# Highest play count first; take(1) returns a one-element list holding the
# (artist, count) Row of the top artist.
plays_per_artist.orderBy(desc('count')).take(1)

[Row(artist='Coldplay', count=83)]

# Question 5 (challenge)

How many songs do users listen to on average between visiting our home page? Please round your answer to the closest integer.



In [None]:
# Question 5: average number of songs played between two consecutive visits
# to the Home page.
# window with range and descending order: https://stackoverflow.com/questions/59571231/how-spark-rangebetween-works-with-descending-order
from pyspark.sql import Window
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.functions import count as Fcount
from pyspark.sql.functions import avg as Favg
from pyspark.sql.functions import udf
from pyspark.sql.functions import when
from pyspark.sql.types import IntegerType

# Mark each Home page hit with 1 and every other event with 0. A native
# column expression is used instead of a Python UDF so the work stays in
# the JVM.
user_log_with_marked_homepage = user_log.withColumn(
    'isHomepageVisit', when(col('page') == 'Home', 1).otherwise(0))

# Running count of Home visits per user, walking from the NEWEST event to
# the oldest (descending ts). For any event, `phase` is therefore the number
# of Home visits at or after it:
#   phase 0 -> events after the user's last Home visit
#   phase 1 -> the last Home visit and the events leading up to it, back to
#              (but excluding) the previous Home visit
#   phase k -> the k-th Home visit counted from the end, and the events
#              leading up to it
user_window_by_timestamp = (
    Window.partitionBy('userId')
    .orderBy(desc('ts'))
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
user_with_phase = user_log_with_marked_homepage.withColumn(
    'phase', Fsum('isHomepageVisit').over(user_window_by_timestamp))

# Total number of Home visits per user. It identifies the oldest phase,
# which has no Home visit before it and so is not "between" two visits.
user_with_phase_and_total = user_with_phase.withColumn(
    'homeVisitTotal',
    Fsum('isHomepageVisit').over(Window.partitionBy('userId')))

# Count NextSong events in every interval enclosed by two Home visits:
# 1 <= phase < homeVisitTotal. Previously only phase 1 (the single most
# recent interval of each returning user) was counted, so all earlier
# intervals were left out of the average.
# Intervals without any NextSong event produce no row and therefore do not
# contribute a zero to the average.
songs_per_interval = (
    user_with_phase_and_total
    .filter((col('page') == 'NextSong')
            & (col('phase') >= 1)
            & (col('phase') < col('homeVisitTotal')))
    .groupBy('userId', 'phase')
    .count()
)

# Average over all (user, interval) pairs; round the shown value to the
# nearest integer for the quiz answer.
songs_per_interval.select(Favg('count')).show()