In [88]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek, monotonically_increasing_id
from pyspark.sql.types import TimestampType

In [89]:
config = configparser.ConfigParser()
config.read('dl.cfg')

['dl.cfg']

In [4]:
os.environ['AWS_ACCESS_KEY_ID']=config.get('AWS','AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY']=config.get('AWS','AWS_SECRET_ACCESS_KEY')

In [5]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

In [6]:
# input_data = "s3a://udacity-dend/"
input_data = 'data/'

In [7]:
song_data = '{}{}'.format(input_data, 'song_data/*/*/*/*.json')
print(song_data)

# song_data = '{}song_data/*.json'.format(input_data)
# print(song_data)

data/song_data/*/*/*/*.json


In [8]:
df = spark.read.json(song_data)

In [9]:
df.count()

71

In [10]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [11]:
# song_id, title, artist_id, year, duration
songs = df.select("song_id", "title", "artist_id", "year", "duration")

In [12]:
songs.show(5)

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOBAYLL12A8C138AF9|Sono andati? Fing...|ARDR4AC1187FB371A1|   0|511.16363|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|AREBBGV1187FB523D2|   0|173.66159|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOAOIBZ12AB01815BE|I Hold Your Hand ...|ARPBNLO1187FB3D52F|2000| 43.36281|
|SONYPOM12A8C13B2D7|I Think My Wife I...|ARDNS031187B9924F0|2005|186.48771|
+------------------+--------------------+------------------+----+---------+
only showing top 5 rows



In [13]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [14]:
# artists - artists in music database
# artist_id, name, location, lattitude, longitude

artists = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", 
                        "artist_latitude as lattitude", "artist_longitude as longitude")

In [15]:
artists.show(5)

+------------------+--------------------+-----------------+---------+---------+
|         artist_id|                name|         location|lattitude|longitude|
+------------------+--------------------+-----------------+---------+---------+
|ARDR4AC1187FB371A1|Montserrat Caball...|                 |     null|     null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|      Houston, TX|     null|     null|
|ARMAC4T1187FB3FA4C|The Dillinger Esc...|Morris Plains, NJ| 40.82624|-74.47995|
|ARPBNLO1187FB3D52F|            Tiny Tim|     New York, NY| 40.71455|-74.00712|
|ARDNS031187B9924F0|          Tim Wilson|          Georgia| 32.67828|-83.22295|
+------------------+--------------------+-----------------+---------+---------+
only showing top 5 rows



In [101]:
log_data = '{}{}'.format(input_data, 'log-data/*.json')
print(song_data)

data/song_data/*/*/*/*.json


In [102]:
df = spark.read.json(log_data)
df.show(5)
# df.toPandas()

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|           song|status|           ts|           userAgent|userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+---------------+------+-------------+--------------------+------+
|   Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|  Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|The Prodigy|Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|The Big Gundown|

In [103]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [104]:
df = df.filter(df.page == "NextSong")

In [105]:
df.show(5)

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|                song|status|           ts|           userAgent|userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|   Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|       Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|The Prodigy|Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      

In [106]:
# users - users in the app
# user_id, first_name, last_name, gender, level

users = df.selectExpr("userId as user_id", "firstName as first_name", "lastName as last_name", "gender", "level").distinct()

In [107]:
users.show(4)

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     26|      Ryan|    Smith|     M| free|
|      7|    Adelyn|   Jordan|     F| free|
|     71|    Ayleen|     Wise|     F| free|
|     81|    Sienna|    Colon|     F| free|
+-------+----------+---------+------+-----+
only showing top 4 rows



In [108]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [109]:
get_datetime = udf(lambda x: datetime.fromtimestamp(int(x)/1000.0), TimestampType())
df = df.withColumn("timestamp", get_datetime(df.ts))

df = df.withColumn("hour", hour(df.timestamp)) \
       .withColumn("month", month(df.timestamp)) \
       .withColumn("year", year(df.timestamp)) \
       .withColumn("week", weekofyear(df.timestamp)) \
       .withColumn("day", dayofmonth(df.timestamp)) \
       .withColumn("weekday", dayofweek(df.timestamp))

In [111]:
# get_start_time = udf(lambda x: to_date(x), TimestampType())
# df = df.withColumn("start_time", get_datetime(df.ts))
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- weekday: integer (nullable = true)



In [112]:
# time - timestamps of records in songplays broken down into specific units
# start_time, hour, day, week, month, year, weekday

df_time = df.selectExpr("timestamp as start_time", "hour" , "day", "week", "month", "year", "weekday")
df_time.show(5)

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-15 00:30:...|   0| 15|  46|   11|2018|      5|
|2018-11-15 00:41:...|   0| 15|  46|   11|2018|      5|
|2018-11-15 00:45:...|   0| 15|  46|   11|2018|      5|
|2018-11-15 03:44:...|   3| 15|  46|   11|2018|      5|
|2018-11-15 05:48:...|   5| 15|  46|   11|2018|      5|
+--------------------+----+---+----+-----+----+-------+
only showing top 5 rows



In [113]:
songs_table = spark.read.parquet('{}songs/*/*/*'.format(input_data))
artists_table = spark.read.parquet('{}artists/*'.format(input_data))

In [114]:
songs_table.printSchema()
artists_table.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- lattitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [156]:
# read in song data to use for songplays table
# song_df = df.join(songs_table.alias('s'), (s['title'] == df.song) & (s['duration'] == df.length)) \
#             .join(artists_table.alias('a'), a.name == df.artist).select('df.*')
# songplays - records in log data associated with song plays i.e. records with page NextSong
# songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

song_df = df.join(songs_table, (songs_table.title == df.song) & (songs_table.duration == df.length)) \
            .join(artists_table, artists_table.name == df.artist) \
            .select(monotonically_increasing_id().alias("songplay_id"), 
                    df.timestamp, df.userId, df.level, songs_table.song_id, 
                    artists_table.artist_id, df.sessionId, df.location, 
                    df.userAgent, df.year, df.month)

In [157]:
song_df = song_df.selectExpr("songplay_id", "timestamp as start_time", "userId as user_id", 
                             "level", "song_id", "artist_id", "sessionId as session_id", 
                             "location", "userAgent as user_agent", "year", "month")

In [158]:
song_df.printSchema()

root
 |-- songplay_id: long (nullable = false)
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

