In [1]:
from pyspark.sql import SparkSession
import os
import configparser

In [2]:
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()

In [3]:
%ls data

log-data.zip  song-data.zip


In [4]:
#!unzip data/song-data.zip

In [5]:
%cat song_data/A/A/C/TRAACCG128F92E8A55.json

{"num_songs": 1, "artist_id": "AR5KOSW1187FB35FF4", "artist_latitude": 49.80388, "artist_longitude": 15.47491, "artist_location": "Dubai UAE", "artist_name": "Elena", "song_id": "SOZCTXZ12AB0182364", "title": "Setanta matins", "duration": 269.58322, "year": 0}

In [6]:
df = spark.read.json("song_data/A/A/C/TRAACCG128F92E8A55.json")

In [7]:
df.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [8]:
df = spark.read.json("song_data/A/A/A")

In [9]:
%ls song_data/A/A/A/

TRAAAAW128F429D538.json  TRAAAFD128F92F423A.json  TRAAARJ128F9320760.json
TRAAABD128F429CF47.json  TRAAAMO128F1481E7F.json  TRAAAVG12903CFA543.json
TRAAADZ128F9348C2E.json  TRAAAMQ128F1460CD3.json  TRAAAVO128F93133D4.json
TRAAAEF128F4273421.json  TRAAAPK128E0786D96.json


In [10]:
df5 = spark.read.json("song_data/{A,B,C}/{A,B,C}/{A,B,C}")

In [11]:
df5.count()

71

In [12]:
df5.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [13]:
type(df5)

pyspark.sql.dataframe.DataFrame

In [14]:
df_artists = df5.select("artist_id", 
                        "artist_name",
                        "artist_location",
                        "artist_latitude",
                        "artist_longitude").\
        groupBy("artist_id", "artist_name", "artist_location", 
                "artist_latitude", "artist_longitude")


In [15]:
df_artists.count().show()

+------------------+--------------------+--------------------+---------------+----------------+-----+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude|count|
+------------------+--------------------+--------------------+---------------+----------------+-----+
|ARPBNLO1187FB3D52F|            Tiny Tim|        New York, NY|       40.71455|       -74.00712|    1|
|ARBEBBY1187B9B43DB|           Tom Petty|     Gainesville, FL|           null|            null|    1|
|AR0IAWL1187B9A96D0|        Danilo Perez|              Panama|         8.4177|       -80.11278|    1|
|ARMBR4Y1187B9990EB|        David Martin|     California - SF|       37.77916|      -122.42005|    1|
|ARD0S291187B9B7BF5|             Rated R|                Ohio|           null|            null|    1|
|AR0RCMP1187FB3F427|    Billie Jo Spears|        Beaumont, TX|       30.08615|       -94.10158|    1|
|ARKRRTF1187B9984DA|    Sonora Santanera|                    |           null|    

In [16]:
df_artists = df5.select("artist_id", 
                        "artist_name",
                        "artist_location",
                        "artist_latitude",
                        "artist_longitude").\
        distinct()

In [17]:
df_artists.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_longitude: double (nullable = true)



In [18]:
df_artists.show(10)

+------------------+--------------------+--------------------+---------------+----------------+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+--------------------+---------------+----------------+
|AR3JMC51187B9AE49D|     Backstreet Boys|         Orlando, FL|       28.53823|       -81.37739|
|AR0IAWL1187B9A96D0|        Danilo Perez|              Panama|         8.4177|       -80.11278|
|ARWB3G61187FB49404|         Steve Morse|      Hamilton, Ohio|           null|            null|
|AR47JEX1187B995D81|        SUE THOMPSON|          Nevada, MO|       37.83721|       -94.35868|
|ARHHO3O1187B989413|           Bob Azzam|                    |           null|            null|
|ARAGB2O1187FB3A161|Pucho & His Latin...|                    |           null|            null|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|           null|            null|
|ARGSAFR1269FB35070|          Blingtones

In [19]:
#song_id, title, artist_id, year, duration
df_songs = df5.select("song_id", 
                      "title", 
                      "artist_id", 
                      "year", 
                      "duration").\
    distinct()

In [20]:
df_songs.show()

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOGOSOV12AF72A285E|   ¿Dónde va Chichi?|ARGUVEV1187B98BA17|1997|313.12934|
|SOTTDKS12AB018D69B|It Wont Be Christmas|ARMBR4Y1187B9990EB|   0|241.47546|
|SOBBUGU12A8C13E95D|Setting Fire to S...|ARMAC4T1187FB3FA4C|2004|207.77751|
|SOIAZJW12AB01853F1|          Pink World|AR8ZCNI1187B9A069B|1984|269.81832|
|SONYPOM12A8C13B2D7|I Think My Wife I...|ARDNS031187B9924F0|2005|186.48771|
|SOYMRWW12A6D4FAB14|The Moon And I (O...|ARKFYS91187B98E58F|   0| 267.7024|
|SOAOIBZ12AB01815BE|I Hold Your Hand ...|ARPBNLO1187FB3D52F|2000| 43.36281|
|SOBCOSW12A8C13D398|  Rumba De Barcelona|AR7SMBG1187B9B9066|   0|218.38322|
|SOWTBJW12AC468AC6E|Broken-Down Merry...|ARQGYP71187FB44566|   0|151.84934|
|SOQHXMF12AB0182363|     Young Boy Blues|ARGSJW91187B9B1D6B|   0|218.77506|
|SOGDBUF12A8

## Write 

In [21]:
!mkdir OUT

mkdir: cannot create directory ‘OUT’: File exists


In [22]:
df_songs.write.partitionBy("year", "artist_id").mode("overwrite").parquet("./OUT/songs.parquet")

In [23]:
!ls OUT/songs.parquet/year\=1987/artist_id\=ARD842G1187B997376

part-00059-9f3bd952-49fc-4f8e-93d7-7f72d1f2bc34.c000.snappy.parquet


# log data

In [24]:
#!mkdir log_data
#!cd ./log_data; unzip ../data/log-data.zip
#####!mv 2018-11*json log_data

In [25]:
df_log = spark.read.json("log_data/")

In [26]:
df_log.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [27]:
# user_id, first_name, last_name, gender, level
df_users = df_log.select("userId", 
                         "firstName", 
                         "lastName", 
                         "gender", 
                         "level")\
                    .distinct()\
                    .orderBy("userId")

In [28]:
df_users.printSchema()

root
 |-- userId: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [29]:
df_users.show()

+------+---------+--------+------+-----+
|userId|firstName|lastName|gender|level|
+------+---------+--------+------+-----+
|      |     null|    null|  null| paid|
|      |     null|    null|  null| free|
|    10|   Sylvie|    Cruz|     F| free|
|   100|    Adler| Barrera|     M| free|
|   101|   Jayden|     Fox|     M| free|
|    11|Christian|  Porter|     F| free|
|    12|   Austin| Rosales|     M| free|
|    13|      Ava|Robinson|     F| free|
|    14| Theodore|  Harris|     M| free|
|    15|     Lily|    Koch|     F| free|
|    15|     Lily|    Koch|     F| paid|
|    16|    Rylan|  George|     M| free|
|    16|    Rylan|  George|     M| paid|
|    17| Makinley|   Jones|     F| free|
|    18|    Jacob|  Rogers|     M| free|
|    19|  Zachary|  Thomas|     M| free|
|     2|  Jizelle|Benjamin|     F| free|
|    20|    Aiden| Ramirez|     M| paid|
|    21|  Preston| Sanders|     M| free|
|    22|     Sean|  Wilson|     F| free|
+------+---------+--------+------+-----+
only showing top

In [30]:
#songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
df_log.printSchema()


root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [31]:
df_log.select("ts").show(10)

+-------------+
|           ts|
+-------------+
|1542241826796|
|1542242481796|
|1542242741796|
|1542247071796|
|1542252577796|
|1542253449796|
|1542253460796|
|1542260074796|
|1542260277796|
|1542260935796|
+-------------+
only showing top 10 rows



In [32]:
import pyspark.sql.functions as F
import datetime

In [33]:
#ts_to_date = F.udf(lambda ts : F.from_unixtime(F.floor( ts/1000 ) ) )
#ts_to_date = F.udf(lambda ts : F.from_unixtime(ts ) )
ts_to_date = F.udf(lambda ts : datetime.datetime.fromtimestamp(ts/1000.))

In [34]:
df_log = df_log.withColumn("tsdate", ts_to_date(df_log.ts))

In [35]:
df_log.select("ts", "tsdate").limit(10).toPandas()

Unnamed: 0,ts,tsdate
0,1542241826796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
1,1542242481796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
2,1542242741796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
3,1542247071796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
4,1542252577796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
5,1542253449796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
6,1542253460796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
7,1542260074796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
8,1542260277796,"java.util.GregorianCalendar[time=?,areFieldsSe..."
9,1542260935796,"java.util.GregorianCalendar[time=?,areFieldsSe..."


In [36]:
df_log2 = df_log.withColumn("year", F.udf(lambda x:x.year)(df_log.tsdate) ).\
    select("year", "ts").show(10)

+----+-------------+
|year|           ts|
+----+-------------+
|2018|1542241826796|
|2018|1542242481796|
|2018|1542242741796|
|2018|1542247071796|
|2018|1542252577796|
|2018|1542253449796|
|2018|1542253460796|
|2018|1542260074796|
|2018|1542260277796|
|2018|1542260935796|
+----+-------------+
only showing top 10 rows



In [37]:
#raw_timestamp = df_log.withColumn("date2", df_log.tsdate.year).select("date2", "tsdate")

In [55]:
#cvt_date_2 = F.udf(lambda x : F.from_unixtime(x))
import pyspark.sql.types as Ptype
cvt_date_3 = F.udf(lambda x : F.from_unixtime(x/1000) )
cvt_date_4 = F.udf(lambda x : x /1000, Ptype.FloatType())
cvt_date_5 = F.udf(lambda x : floor(x /1000), Ptype.IntegerType())

In [60]:
t1 = df_log.withColumn("datetime", F.from_unixtime(df_log.ts/1000))
t1

DataFrame[artist: string, auth: string, firstName: string, gender: string, itemInSession: bigint, lastName: string, length: double, level: string, location: string, method: string, page: string, registration: double, sessionId: bigint, song: string, status: bigint, ts: bigint, userAgent: string, userId: string, tsdate: string, datetime: string]

In [63]:
t1.select("datetime").show(10)

+-------------------+
|           datetime|
+-------------------+
|2018-11-15 00:30:26|
|2018-11-15 00:41:21|
|2018-11-15 00:45:41|
|2018-11-15 01:57:51|
|2018-11-15 03:29:37|
|2018-11-15 03:44:09|
|2018-11-15 03:44:20|
|2018-11-15 05:34:34|
|2018-11-15 05:37:57|
|2018-11-15 05:48:55|
+-------------------+
only showing top 10 rows



In [None]:
df_log.select("ts").show(5)