# Spark SQL Examples

Run the code cells below. This is the same code from the previous screencast.

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum

import datetime

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
spark = SparkSession \
    .builder \
    .appName("Data wrangling with Spark SQL") \
    .getOrCreate()

In [3]:
path = "data/sparkify_log_small.json"
user_log = spark.read.json(path)

In [4]:
user_log.take(1)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=1513720872284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046')]

In [5]:
user_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



# Create a View And Run Queries

The code below creates a temporary view against which you can run SQL queries.

In [4]:
user_log.createOrReplaceTempView("user_log_table")

In [7]:
spark.sql("SELECT * FROM user_log_table LIMIT 2").show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [8]:
spark.sql('''
          SELECT * 
          FROM user_log_table 
          LIMIT 2
          '''
          ).show()

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

In [9]:
spark.sql('''
          SELECT COUNT(*) 
          FROM user_log_table 
          '''
          ).show()

+--------+
|count(1)|
+--------+
|   10000|
+--------+



In [10]:
spark.sql('''
          SELECT userID, firstname, page, song
          FROM user_log_table 
          WHERE userID == '1046'
          '''
          ).collect()

[Row(userID='1046', firstname='Kenneth', page='NextSong', song='Christmas Tears Will Fall'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Be Wary Of A Woman'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Public Enemy No.1'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Reign Of The Tyrants'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Father And Son'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='No. 5'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Seventeen'),
 Row(userID='1046', firstname='Kenneth', page='Home', song=None),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='War on war'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Killermont Street'),
 Row(userID='1046', firstname='Kenneth', page='NextSong', song='Black & Blue'),
 Row(userID='1046', firstname='Kenneth', page='Logout', song=None),
 Row(userID='1046', firstname='Kenneth'

In [11]:
spark.sql('''
          SELECT DISTINCT page
          FROM user_log_table 
          ORDER BY page ASC
          '''
          ).show()

+----------------+
|            page|
+----------------+
|           About|
|       Downgrade|
|           Error|
|            Help|
|            Home|
|           Login|
|          Logout|
|        NextSong|
|   Save Settings|
|        Settings|
|Submit Downgrade|
|  Submit Upgrade|
|         Upgrade|
+----------------+



# User Defined Functions

In [12]:
spark.udf.register("get_hour", lambda x: int(datetime.datetime.fromtimestamp(x / 1000.0).hour))

<function __main__.<lambda>(x)>

In [13]:
spark.sql('''
          SELECT *, get_hour(ts) AS hour
          FROM user_log_table 
          LIMIT 1
          '''
          ).collect()

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=1513720872284, userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046', hour='22')]

In [14]:
songs_in_hour = spark.sql('''
          SELECT get_hour(ts) AS hour, COUNT(*) as plays_per_hour
          FROM user_log_table
          WHERE page = "NextSong"
          GROUP BY hour
          ORDER BY cast(hour as int) ASC
          '''
          )

In [15]:
songs_in_hour.show()

+----+--------------+
|hour|plays_per_hour|
+----+--------------+
|   0|           456|
|   1|           454|
|   2|           382|
|   3|           302|
|   4|           352|
|   5|           276|
|   6|           348|
|   7|           358|
|   8|           375|
|   9|           249|
|  10|           216|
|  11|           228|
|  12|           251|
|  13|           339|
|  14|           462|
|  15|           479|
|  16|           484|
|  17|           430|
|  18|           362|
|  19|           295|
+----+--------------+
only showing top 20 rows



# Converting Results to Pandas

In [16]:
songs_in_hour_pd = songs_in_hour.toPandas()

In [17]:
print(songs_in_hour_pd)

   hour  plays_per_hour
0     0             456
1     1             454
2     2             382
3     3             302
4     4             352
5     5             276
6     6             348
7     7             358
8     8             375
9     9             249
10   10             216
11   11             228
12   12             251
13   13             339
14   14             462
15   15             479
16   16             484
17   17             430
18   18             362
19   19             295
20   20             257
21   21             248
22   22             369
23   23             375


## How long on avg an user listening to songs daily?

In [None]:
# convert spark df to pandas df

In [14]:
log_pd = user_log.toPandas()

In [16]:
log_pd['ts'] = pd.to_datetime(log_pd['ts'], unit='ms')

In [19]:
log_pd['ts_dt'] = log_pd['ts'].dt.date
log_pd['ts_month'] = log_pd['ts'].dt.month

In [20]:
print(log_pd.head())

                                      artist       auth  firstName gender  \
0                              Showaddywaddy  Logged In    Kenneth      M   
1                                 Lily Allen  Logged In  Elizabeth      F   
2  Cobra Starship Featuring Leighton Meester  Logged In       Vera      F   
3                                 Alex Smoke  Logged In     Sophee      F   
4                                       None  Logged In     Jordyn      F   

   itemInSession   lastName     length level  \
0            112   Matthews  232.93342  paid   
1              7      Chase  195.23873  free   
2              6  Blackwell  196.20526  paid   
3              8     Barker  405.99465  paid   
4              0      Jones        NaN  free   

                                        location method      page  \
0              Charlotte-Concord-Gastonia, NC-SC    PUT  NextSong   
1                    Shreveport-Bossier City, LA    PUT  NextSong   
2                                     Rac

In [None]:
# convert pandas df to spark df

In [31]:
from pyspark.context import SparkContext as sc
from pyspark.sql import SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)


spark_log = sqlContext.createDataFrame(log_pd)

In [32]:
spark_log.take(1)

[Row(artist='Showaddywaddy', auth='Logged In', firstName='Kenneth', gender='M', itemInSession=112, lastName='Matthews', length=232.93342, level='paid', location='Charlotte-Concord-Gastonia, NC-SC', method='PUT', page='NextSong', registration=1509380319284.0, sessionId=5132, song='Christmas Tears Will Fall', status=200, ts=datetime.datetime(2017, 12, 19, 22, 1, 12, 284000), userAgent='"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"', userId='1046', ts_dt=datetime.date(2017, 12, 19), ts_month=12)]

In [None]:
# register the spark df as a table

In [35]:
spark_log.registerTempTable("spark_log_table")

In [44]:
spark.sql('''
          SELECT AVG(avg_daily_length)
          FROM
          (
              SELECT userId, AVG(daily_length) as avg_daily_length
              FROM
              (
                  SELECT userId, ts_dt, SUM(length) AS daily_length
                  FROM spark_log_table
                  WHERE page = "NextSong"
                  GROUP BY userId, ts_dt
              ) t
              GROUP BY userId
          ) t1
          '''
          ).show()

+---------------------+
|avg(avg_daily_length)|
+---------------------+
|   1973.4758263747538|
+---------------------+



## number of songs played per day (undedup) for an user on avg

In [46]:
spark.sql('''
          SELECT AVG(avg_songcnt)
          FROM
          (
              SELECT userId, AVG(songcnt) as avg_songcnt
              FROM
              (
                  SELECT userId, ts_dt, COUNT(*) AS songcnt
                  FROM spark_log_table
                  WHERE page = "NextSong"
                  GROUP BY userId, ts_dt
              ) t
              GROUP BY userId
          ) t1
          '''
          ).show()

+-----------------+
| avg(avg_songcnt)|
+-----------------+
|7.886587771203155|
+-----------------+



## MAU of the past month (listened at least 1 song)

In [47]:
spark.sql('''
          SELECT distinct ts_dt, ts_month
          FROM spark_log_table
          WHERE page = "NextSong"
          '''
          ).show()

+----------+--------+
|     ts_dt|ts_month|
+----------+--------+
|2017-12-21|      12|
|2017-12-19|      12|
|2017-12-20|      12|
+----------+--------+



In [49]:
spark.sql('''
          SELECT COUNT(distinct userId) as MAU
          FROM spark_log_table
          WHERE page = "NextSong"
              AND ts_month = 12
          '''
          ).show()

+---+
|MAU|
+---+
|845|
+---+



## DAU of the past month, % of MAU

In [57]:
spark.sql('''
          SELECT ts_dt, COUNT(distinct userId) as DAU, ROUND(COUNT(distinct userId)/845*100, 0) as percent_of_MAU
          FROM spark_log_table
          WHERE page = "NextSong"
              AND ts_month = 12
          GROUP BY ts_dt
          '''
          ).show()

+----------+---+--------------+
|     ts_dt|DAU|percent_of_MAU|
+----------+---+--------------+
|2017-12-19| 67|           8.0|
|2017-12-20|694|          82.0|
|2017-12-21|229|          27.0|
+----------+---+--------------+



In [None]:
# number of paid, free users

In [58]:
spark.sql('''
          SELECT level, COUNT(distinct userId) as user_cnt
          FROM spark_log_table
          GROUP BY level
          '''
          ).show()

+-----+--------+
|level|user_cnt|
+-----+--------+
| free|     816|
| paid|     160|
+-----+--------+

