In [None]:
import calendar
import configparser
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType

# Start Spark session

In [None]:
# Create the Spark session (or reuse the active one). The hadoop-aws package
# is requested so that tables stored under s3a:// paths can be read.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

# Read data from Parquet

In [None]:
# Root location of the Parquet tables; each table is read from a
# sub-directory of this path. Set it to wherever the tables were written.
# Examples:
#   S3 bucket      : db_path = "s3a://mybucket/mydir"
#   HDFS directory : db_path = "hdfs:///user/mydir"
db_path = "./OUT/"

In [None]:
# Load every table from its own sub-directory of db_path.
# ARTISTS is read from db_path/ARTISTS like the other tables: db_path already
# designates the output directory, so an additional "OUT/" prefix would
# resolve to a nested <db_path>/OUT/ARTISTS location.
df_artists = spark.read.parquet(os.path.join(db_path, "ARTISTS"))
df_songs = spark.read.parquet(os.path.join(db_path, "SONGS"))
df_users = spark.read.parquet(os.path.join(db_path, "USERS/"))
df_timestamps = spark.read.parquet(os.path.join(db_path, "TIMESTAMPS/"))
df_songplays = spark.read.parquet(os.path.join(db_path, "SONGPLAYS/"))

## Number of rows per table

In [None]:
# Print the number of rows of each table as "<table> : <count>".
for label, table in (
    ("artists :", df_artists),
    ("songs :", df_songs),
    ("users :", df_users),
    ("timestamps :", df_timestamps),
    ("songplays :", df_songplays),
):
    print(label, table.count())

## Database schemas

In [None]:
# Print a heading followed by the schema of each table.
for heading, table in (
    ("Artists schema :", df_artists),
    ("Songs schema :", df_songs),
    ("Users schema :", df_users),
    ("Timestamps schema :", df_timestamps),
    ("Songplays schema :", df_songplays),
):
    print(heading)
    table.printSchema()

# Example queries

## Artists with most songs

In [None]:
# Ten artists with the largest number of songs: join songs to artists,
# count songs per artist, sort by count (descending) then name (ascending).
(
    df_songs
    .join(df_artists, df_songs["artist_id"] == df_artists["artist_id"])
    .groupBy(df_artists["artist_id"], "name")
    .count()
    .orderBy(col("count").desc(), col("name").asc())
    .select(col("name").alias("artist name"), "count")
    .limit(10)
    .toPandas()
)


## Biggest song consumers

In [None]:
# Ten users with the largest number of songplays: join songplays to users,
# count plays per user, sort by count (descending) then last name (ascending).
(
    df_songplays
    .join(df_users, df_songplays["user_id"] == df_users["user_id"])
    .groupBy(df_users["user_id"], "first_name", "last_name")
    .count()
    .orderBy(F.desc("count"), F.asc("last_name"))
    .select("first_name", "last_name", "count")
    .limit(10)
    .toPandas()
)

## Average usage per week day

In [None]:
# Weekday names in the order used by the `calendar` module (index 0 is Monday).
# `calendar` is imported in the notebook's import cell.
list(calendar.day_name)

In [None]:
@udf(returnType=StringType())
def weekdaynumToName(daynum):
    '''
    Convert a day-of-week number to the corresponding day name.

    daynum : index into calendar.day_name, from 0 ("Monday") to 6 ("Sunday").

    Returns the day name as a string, or None when daynum is None, so that a
    null weekday stays null instead of raising inside the UDF and failing
    the Spark job.

    NOTE(review): this is Python's weekday numbering (0 = Monday); confirm
    that the "weekday" column was produced with the same convention.
    '''
    if daynum is None:
        return None
    return calendar.day_name[daynum]

In [None]:
# drop columns year and months from songplays (duplicate from timestamps)
# and join on ts (start time of songplay)
join_plays_timestamps = df_songplays.withColumnRenamed("start_time", "ts")\
        .drop("year", "month")\
        .join(df_timestamps, ["ts"])
join_plays_timestamps.printSchema()

In [None]:
# nb of distinct dates for a given weekday in the database
num_weeks = join_plays_timestamps \
        .select("weekday", "year", "month", "day")\
        .groupBy("weekday")\
        .agg( F.countDistinct("year", "month","day"))\
        .orderBy("weekday")\
        .select(weekdaynumToName("weekday").alias("week day"))\
        .toPandas()
    
join_plays_timestamps.groupBy(df_timestamps["weekday"])\
    .count()\
    .orderBy("weekday")\
    .select(weekdaynumToName("weekday").alias("week day"), "count")\
    .toPandas()
    

## Percentage of songs in the database that users listen to

In [None]:
# Total number of songs in the songs table.
nb_songs = df_songs.count()

# Number of distinct songs that appear in at least one songplay. Rows with a
# null song_id are filtered out first: distinct() would otherwise keep null as
# one more value and count it as a listened-to song.
nb_songplays = (
    df_songplays
    .select("song_id")
    .where(col("song_id").isNotNull())
    .distinct()
    .count()
)

# An empty songs table yields 0.0 instead of a ZeroDivisionError.
percentage_listened = 100. * nb_songplays / nb_songs if nb_songs else 0.0
print("Percentage of songs in the database which are listened to :",
      percentage_listened)