In [1]:
# Imports

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, udf, col
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = SparkSession.builder.appName("Quiz data wrangling").getOrCreate()

# Load the Sparkify event log (JSON) into a DataFrame
logs_df = spark.read.json("../data/sparkify_log_small.json")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/31 21:15:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Print the schema of the loaded log records: column names, types and nullability.
logs_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [8]:
# Question 1
#
# Which page did user id "" (empty string) NOT visit?

# Distinct pages requested by events whose userId is the empty string
blank_pages_df = (
    logs_df
    .where(col("userId") == "")
    .select(col("page").alias("blank_pages"))
    .distinct()
)

# Distinct pages requested by anyone
all_pages_df = logs_df.select("page").distinct()

# pyspark Row is a tuple subclass, so rows are compared by value and the
# differing column names ("page" vs "blank_pages") do not affect the difference.
every_page = set(all_pages_df.collect())
blank_user_pages = set(blank_pages_df.collect())

for row in every_page - blank_user_pages:
    print(row.page)

Save Settings
Settings
Logout
Upgrade
Submit Downgrade
NextSong
Submit Upgrade
Downgrade
Error


In [10]:
# Question 3
#
# How many female users do we have in the data set?

# One row per distinct (userId, gender) pair among events with gender "F"
female_users_df = (
    logs_df
    .where(col("gender") == "F")
    .select("userId", "gender")
    .distinct()
)

print(female_users_df.count())

462


In [11]:
# Question 4
#
# How many songs were played from the most played artist?

# Count NextSong events per artist and show only the artist with the most plays
(
    logs_df
    .where(col("page") == "NextSong")
    .select("Artist")
    .groupBy("Artist")
    .agg({"Artist": "count"})
    .withColumnRenamed("count(Artist)", "Playcount")
    .orderBy(desc("Playcount"))
    .show(1)
)

+--------+---------+
|  Artist|Playcount|
+--------+---------+
|Coldplay|       83|
+--------+---------+
only showing top 1 row



In [12]:
# Question 5 (challenge)
#
# How many songs do users listen to on average between visiting our home page?
# Please round your answer to the closest integer.

# Per-user running frame ordered newest-first. With descending `ts`, a sum over
# this frame at a given row covers that row plus every row of the same user
# with a later (or equal) timestamp.
user_window = (
    Window
    .partitionBy('userID')
    .orderBy(desc('ts'))
    .rangeBetween(Window.unboundedPreceding, 0)
)


def ishome(page):
    """Build an integer Column flagging 'Home' page events.

    Uses a native Column expression instead of a Python UDF, so Spark does not
    have to ship every row to a Python worker for a plain equality check.

    Args:
        page: a Column, or the name of a column, holding the page name.

    Returns:
        An IntegerType Column that is 1 where the page equals 'Home' and 0
        otherwise. The comparison is null-safe, so a null page yields 0.
    """
    page_col = col(page) if isinstance(page, str) else page
    return page_col.eqNullSafe('Home').cast(IntegerType())


# Keep only NextSong and Home events and flag each Home visit with 1.
# `period` is the running sum of those flags over `user_window`, so all songs
# that share a (userID, period) pair lie between the same two Home visits.
cusum = (
    logs_df
    .filter((logs_df.page == 'NextSong') | (logs_df.page == 'Home'))
    .select('userID', 'page', 'ts')
    .withColumn('homevisit', ishome(col('page')))
    .withColumn('period', Fsum('homevisit').over(user_window))
)

# This will only show 'Home' in the first several rows due to default sorting
cusum.show(300)

# Count the songs in each (userID, period) group, then average those counts
(
    cusum
    .filter(cusum.page == 'NextSong')
    .groupBy('userID', 'period')
    .agg({'period': 'count'})
    .agg({'count(period)': 'avg'})
    .show()
)

+------+--------+-------------+---------+------+
|userID|    page|           ts|homevisit|period|
+------+--------+-------------+---------+------+
|      |    Home|1513846494284|        1|     1|
|      |    Home|1513845761284|        1|     2|
|      |    Home|1513845132284|        1|     3|
|      |    Home|1513845055284|        1|     4|
|      |    Home|1513844251284|        1|     5|
|      |    Home|1513844026284|        1|     6|
|      |    Home|1513843602284|        1|     7|
|      |    Home|1513843098284|        1|     8|
|      |    Home|1513842367284|        1|     9|
|      |    Home|1513841138284|        1|    10|
|      |    Home|1513841121284|        1|    11|
|      |    Home|1513839824284|        1|    12|
|      |    Home|1513838865284|        1|    13|
|      |    Home|1513838857284|        1|    14|
|      |    Home|1513838835284|        1|    15|
|      |    Home|1513838141284|        1|    16|
|      |    Home|1513838110284|        1|    17|
|      |    Home|151