In [1]:
pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 48 kB/s s eta 0:00:01
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 39.9 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=8bf29cdf6dbb66cc1175f87ad2cafd8357712ea32bc5b3c41120ee41953fd8d6
  Stored in directory: /home/emr-notebook/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
import os
import configparser

In [2]:
spark = SparkSession.builder \
                     .config("spark.jars.packages","com.amazonaws:aws-java-sdk-bundle:1.12.262,org.apache.hadoop:hadoop-aws:3.3.4") \
                     .getOrCreate()

In [3]:
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date
song_schema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dbl()),
    Fld("artist_location",Str()),
    Fld("artist_longitude",Dbl()),
    Fld("artist_name",Str()),
    Fld("duration",Dbl()),
    Fld("num_songs",Int()),
    Fld("song_id",Str()),
    Fld("title",Str()),
    Fld("year",Int()),
])

In [4]:
df_songs = spark.read.option("recursiveFileLookup","true") \
                .json("s3a://udacity-dend/song_data/", schema=song_schema)

In [5]:
df_songs.count()

14896

In [8]:
import pyspark.sql.functions as f

df_songs.printSchema()
df_songs = df_songs.withColumn("artist_name",f.lower(f.col("artist_name")))
df_songs = df_songs.withColumn("title",f.lower(f.col("title")))
df_songs.limit(5).toPandas()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: integer (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,billy idol,233.22077,1,SOVIYJY12AF72A4B00,the dead next door (digitally remastered 99),1983
1,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,billy idol,287.92118,1,SOVYXYL12AF72A3373,rebel yell (1999 digital remaster),1983
2,ARQ846I1187B9A7083,,,,yvonne s. moriarty / walt fowler / ladd mcinto...,196.04853,1,SOEPTVC12A67ADD0DA,"to zucchabar [""gladiator"" - music from the mot...",0
3,AR4T2IF1187B9ADBB7,63.96027,"<a href=""http://billyidol.net"" onmousedown='Un...",10.22442,billy idol,247.53587,1,SOLQYSZ12AB0181F97,mony mony (live),1987
4,AR3TZ691187FB3DBB1,,,,russell watson / pino palladino / robbie mcint...,273.44934,1,SOVPFJK12A6701CB16,barcelona - (friends until the end),2000


In [11]:
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, TimestampType as Timestamp, LongType as Long
event_schema = R([
    Fld("artist",Str()),
    Fld("auth",Str()),
    Fld("firstName",Str()),
    Fld("gender",Str()),
    Fld("itemInSession",Int()),
    Fld("lastName",Str()),
    Fld("length",Dbl()),
    Fld("level",Str()),
    Fld("location",Str()),
    Fld("method",Str()),
    Fld("page",Str()),
    Fld("registration",Dbl()),
    Fld("sessionId",Int()),
    Fld("song",Str()),
    Fld("status",Str()),
    Fld("ts",Long()),
    Fld("userAgent",Str()),
    Fld("userId",Str())
])

In [16]:
df_events = spark.read.option("recursiveFileLookup","true") \
                .json("s3a://udacity-dend/log_data/", schema=event_schema)
df_events.printSchema()
df_events = df_events.withColumn("artist",f.lower(f.col("artist")))
df_events = df_events.withColumn("song",f.lower(f.col("song")))
df_events.limit(5).toPandas()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: integer (nullable = true)
 |-- song: string (nullable = true)
 |-- status: string (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,the prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,the big gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,marry me,200,1542242741796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,1542247071796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,1542252577796,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [17]:
df_events.count()

8056

In [18]:
from pyspark.sql.functions import udf
import pyspark.sql.functions as F

@udf
def parseTimestamp(ts):
    from datetime import datetime
    date_string = str(datetime.fromtimestamp(ts/1000))
    return date_string

In [19]:
df_events = df_events.withColumn("ts", parseTimestamp("ts"))
df_events.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,sehr kosmisch,200,2018-11-15 00:30:26.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,the prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,the big gundown,200,2018-11-15 00:41:21.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,marry me,200,2018-11-15 00:45:41.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,2018-11-15 01:57:51.796000,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,2018-11-15 03:29:37.796000,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [20]:
from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, TimestampType as Timestamp, LongType as Long
df_events = df_events.withColumn("userId", df_events["userId"].cast(Int()))
df_events.limit(5).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,sehr kosmisch,200,2018-11-15 00:30:26.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,the prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,the big gundown,200,2018-11-15 00:41:21.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
2,train,Logged In,Ryan,M,2,Smith,205.45261,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,marry me,200,2018-11-15 00:45:41.796000,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
3,,Logged In,Wyatt,M,0,Scott,,free,"Eureka-Arcata-Fortuna, CA",GET,Home,1540872000000.0,563,,200,2018-11-15 01:57:51.796000,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7....,9
4,,Logged In,Austin,M,0,Rosales,,free,"New York-Newark-Jersey City, NY-NJ-PA",GET,Home,1541060000000.0,521,,200,2018-11-15 03:29:37.796000,Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20...,12


In [21]:
df_events.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: integer (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: integer (nullable = true)
 |-- song: string (nullable = true)
 |-- status: string (nullable = true)
 |-- ts: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: integer (nullable = true)



In [23]:
songs_table = df_songs.select("song_id", "title","artist_id", "year", "duration")
songs_table.limit(5).toPandas()

Unnamed: 0,song_id,title,artist_id,year,duration
0,SOVIYJY12AF72A4B00,the dead next door (digitally remastered 99),AR4T2IF1187B9ADBB7,1983,233.22077
1,SOVYXYL12AF72A3373,rebel yell (1999 digital remaster),AR4T2IF1187B9ADBB7,1983,287.92118
2,SOEPTVC12A67ADD0DA,"to zucchabar [""gladiator"" - music from the mot...",ARQ846I1187B9A7083,0,196.04853
3,SOLQYSZ12AB0181F97,mony mony (live),AR4T2IF1187B9ADBB7,1987,247.53587
4,SOVPFJK12A6701CB16,barcelona - (friends until the end),AR3TZ691187FB3DBB1,2000,273.44934


In [24]:
songs_table.count()

14896

In [28]:
from pyspark.sql.functions import desc

unique_songs = songs_table.groupBy("song_id").count().orderBy(desc("count"))

In [29]:
unique_songs.limit(5).toPandas()

Unnamed: 0,song_id,count
0,SOIEUCF12A6D4F9889,1
1,SOQEBML12A8C136AA4,1
2,SOEPTVC12A67ADD0DA,1
3,SODTEMK12AF72A7210,1
4,SOROAMT12A8C13C6D0,1


In [30]:
unique_songs.count()

14896

In [31]:
songs_data = songs_table.dropDuplicates(["song_id"])

In [32]:
songs_data.count()

14896

In [34]:
artists_table_fields = ["artist_id", "artist_name","artist_location", "artist_latitude", "artist_longitude"]
artists_table_new_fields = ["artist_id", "name","location", "latitude", "longitude"]
artists_table_exprs = [ "{} as {}".format(oldField, newField) for (oldField, newField) in zip(artists_table_fields, artists_table_new_fields) ]

artists_data = df_songs.selectExpr(*artists_table_exprs).dropDuplicates(["artist_id"])
artists_data.limit(5).toPandas()

Unnamed: 0,artist_id,name,location,latitude,longitude
0,AR00B1I1187FB433EB,eagle-eye cherry,"Stockholm, Sweden",,
1,AR00FVC1187FB5BE3E,panda,"Monterrey, NL, México",25.67084,-100.30953
2,AR00LNI1187FB444A5,bruce becvar,,,
3,AR00TGQ1187B994F29,paula toller,,,
4,AR016P51187B98E398,indian ropeman,,,


In [35]:
from pyspark.sql.functions import desc
users_table_fields = ["userId", "firstName","lastName", "gender", "level"]
users_table_new_fields = ["user_id", "first_name","last_name", "gender", "level"]
users_table_exprs = [ "{} as {}".format(oldField, newField) for (oldField, newField) in zip(users_table_fields, users_table_new_fields) ]
users_table = df_events.selectExpr(*users_table_exprs)
users_table.count()

8056

In [36]:
# Dropping duplicates users with user_id
users_data = users_table.dropDuplicates(["user_id"])
users_data.count()

98

In [38]:
from pyspark.sql.functions import desc
time_table_fields = ["ts"]
time_table_new_fields = ["start_time"]
time_table_exprs = [ "{} as {}".format(oldField, newField) for (oldField, newField) in zip(time_table_fields, time_table_new_fields) ]
time_table = df_events.selectExpr(*time_table_exprs)
time_table.count()


8056

In [39]:
# Dropping Duplicate time entries
time_table = time_table.dropDuplicates(["start_time"])
time_table = time_table.withColumn("hour", F.hour("start_time"))
time_table = time_table.withColumn("day", F.dayofweek("start_time"))
time_table = time_table.withColumn("week", F.weekofyear("start_time"))
time_table = time_table.withColumn("month", F.month("start_time"))
time_table = time_table.withColumn("year", F.year("start_time"))
# Clear parantheses for logical operators is necessary
time_table = time_table.withColumn("weekday", ((F.dayofweek("start_time") > 0) & (F.dayofweek("start_time") < 6)) )
time_table.count()

8023

In [40]:
time_table.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,2018-11-15 11:22:06.796000,11,5,46,11,2018,True
1,2018-11-15 18:09:32.796000,18,5,46,11,2018,True
2,2018-11-15 18:59:14.796000,18,5,46,11,2018,True
3,2018-11-15 19:01:55.796000,19,5,46,11,2018,True
4,2018-11-21 03:57:19.796000,3,4,47,11,2018,True


In [41]:
# Inner Join between songs and events
df_events = df_events.filter(df_events.page == "NextSong")
condition = (( df_events["artist"] == df_songs["artist_name"]) & (df_events["song"] == df_songs["title"]) & (df_events["length"] == df_songs["duration"]) )
songplays = df_events.join(df_songs, condition, "inner").select(df_events["ts"], df_events["userId"], df_events["level"], df_songs["song_id"], df_songs["artist_id"], df_events["sessionId"], df_events["location"], df_events["userAgent"])
songplays.limit(5).toPandas()

Unnamed: 0,ts,userId,level,song_id,artist_id,sessionId,location,userAgent
0,2018-11-21 08:25:43.796000,88,paid,SOCHPTV12A6BD53113,ARN8NCB1187FB49652,744,"Sacramento--Roseville--Arden-Arcade, CA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
1,2018-11-29 16:58:01.796000,49,paid,SOGXSWA12A6D4FBC99,ARPFHN61187FB575F6,1041,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...
2,2018-11-28 08:18:57.796000,58,paid,SOJWCWM12A8C13B664,ARM6T8I1187FB36CC8,887,"Augusta-Richmond County, GA-SC","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK..."
3,2018-11-28 23:34:43.796000,24,paid,SOHRHCN12AB018B0F4,ARHQBRZ1187FB3BDA2,984,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
4,2018-11-26 15:37:14.796000,88,paid,SOARTQC12A58A77F0C,ARCE0IX1187FB528B4,900,"Sacramento--Roseville--Arden-Arcade, CA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


In [42]:
songplays.count()

319

In [43]:
# LEFT Outer join between songs and events
df_events = df_events.filter(df_events.page == "NextSong")
condition = (( df_events["artist"] == df_songs["artist_name"]) & (df_events["song"] == df_songs["title"]) & (df_events["length"] == df_songs["duration"]) )
songplays = df_events.join(df_songs, condition, "left_outer").select(df_events["ts"], df_events["userId"], df_events["level"], df_songs["song_id"], df_songs["artist_id"], df_events["sessionId"], df_events["location"], df_events["userAgent"])
songplays.limit(5).toPandas()

Unnamed: 0,ts,userId,level,song_id,artist_id,sessionId,location,userAgent
0,2018-11-15 00:30:26.796000,26,free,,,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
1,2018-11-15 00:41:21.796000,26,free,,,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
2,2018-11-15 00:45:41.796000,26,free,,,583,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
3,2018-11-15 03:44:09.796000,61,free,,,597,"Houston-The Woodlands-Sugar Land, TX","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
4,2018-11-15 05:48:55.796000,80,paid,,,602,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


In [44]:
songplays.count()

6820

In [None]:
songplays = songplays.withColumnRenamed("ts", "start_time")\
                        .withColumnRenamed("userId", "user_id") \
                        .withColumnRenamed("sessionId", "session_id") \
                        .withColumnRenamed("userAgent", "user_agent")