In [None]:
from pyspark.sql import SparkSession
import os
import configparser

In [None]:
# Create (or reuse) the SparkSession for this notebook. The hadoop-aws
# package is requested via spark.jars.packages; the cells in this notebook
# only read local paths, so it is presumably kept for S3 access elsewhere.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [None]:
# List the contents of the local ./data directory.
%ls data

In [None]:
# One-time setup: uncomment to extract the song dataset (presumably creates ./song_data).
#!unzip data/song-data.zip

In [None]:
# Print one raw song JSON file to see what a single record looks like.
%cat song_data/A/A/C/TRAACCG128F92E8A55.json

In [None]:
# Load a single song file so the inferred schema can be inspected.
df = spark.read.json("song_data/A/A/C/TRAACCG128F92E8A55.json")

In [None]:
# Show the schema Spark inferred for `df`.
df.printSchema()

In [None]:
# Load every JSON file under one leaf directory. Note that this rebinds `df`,
# replacing the single-file DataFrame created earlier.
df = spark.read.json("song_data/A/A/A")

In [None]:
# List the files in the leaf directory that was just loaded.
%ls song_data/A/A/A/

In [None]:
# Load a subset of the song tree. Each {A,B,C} is a path-glob alternation,
# so this covers the leaf directories A/A/A through C/C/C that exist on disk.
df5 = spark.read.json("song_data/{A,B,C}/{A,B,C}/{A,B,C}")

In [None]:
# Number of rows loaded into df5.
df5.count()

In [None]:
# Show the schema Spark inferred for the multi-directory load.
df5.printSchema()

In [None]:
# Check the Python type of the object returned by spark.read.json.
type(df5)

In [None]:
# Group the song rows by the full set of artist attributes. The result of
# groupBy is a grouped object (not a DataFrame); it must be aggregated,
# e.g. with count(), before it can be shown.
artist_cols = [
    "artist_id",
    "artist_name",
    "artist_location",
    "artist_latitude",
    "artist_longitude",
]
df_artists = df5.select(*artist_cols).groupBy(*artist_cols)


In [None]:
# Here df_artists is the result of groupBy(...), so count() produces a
# DataFrame with one row per artist tuple and its number of occurrences.
# NOTE: this cell only works while df_artists is still the grouped object;
# once df_artists is rebound to a DataFrame (via distinct()), count() returns
# an int and .show() fails.
df_artists.count().show()

In [None]:
# Artists table: the distinct combinations of the artist attributes found
# in the song data. This rebinds df_artists to a DataFrame.
artist_cols = [
    "artist_id",
    "artist_name",
    "artist_location",
    "artist_latitude",
    "artist_longitude",
]
df_artists = df5.select(*artist_cols).distinct()

In [None]:
# Show the schema of the artists table.
df_artists.printSchema()

In [None]:
# Preview up to 10 artist rows.
df_artists.show(10)

In [None]:
# Songs table: distinct (song_id, title, artist_id, year, duration) rows
# taken from the song data.
song_cols = ["song_id", "title", "artist_id", "year", "duration"]
df_songs = df5.select(*song_cols).distinct()

In [None]:
# Preview the songs table (show() prints at most 20 rows by default).
df_songs.show()

## Write the songs table to Parquet

In [None]:
!mkdir OUT

In [None]:
# Persist the songs table as Parquet, partitioned on disk by year and then
# artist_id. mode("overwrite") replaces any previous output at this path.
(
    df_songs.write
    .partitionBy("year", "artist_id")
    .mode("overwrite")
    .parquet("./OUT/songs.parquet")
)

In [None]:
# Inspect one partition directory produced by partitionBy("year", "artist_id").
# The year and artist_id values are hard-coded, so this only lists files if
# that combination exists in the loaded data.
!ls OUT/songs.parquet/year\=1987/artist_id\=ARD842G1187B997376

# Log data

In [None]:
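# One-time setup: uncomment to extract the log dataset into ./log_data.
#!mkdir log_data
#!cd ./log_data; unzip ../data/log-data.zip
# Alternative (disabled): move already-extracted log files instead.
#!mv 2018-11*json log_data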

In [None]:
# Load all JSON event logs from the log_data directory.
df_log = spark.read.json("log_data/")

In [None]:
# Show the schema Spark inferred for the log events.
df_log.printSchema()

In [None]:
# Users table (target columns: user_id, first_name, last_name, gender, level):
# distinct user attribute rows from the logs, sorted by userId.
user_cols = ["userId", "firstName", "lastName", "gender", "level"]
df_users = (
    df_log.select(*user_cols)
    .distinct()
    .orderBy("userId")
)

In [None]:
# Show the schema of the users table.
df_users.printSchema()

In [None]:
# Preview the users table (show() prints at most 20 rows by default).
df_users.show()

In [None]:
# Target songplays columns:
#   songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent
# Re-check the log schema to see which source fields can feed them.
df_log.printSchema()


In [None]:
# Look at raw values of the ts column before converting them.
df_log.select("ts").show(10)

In [None]:
import pyspark.sql.functions as F
import pyspark.sql.types as Ptype


In [None]:
# Add a "datetime" column derived from ts. ts is divided by 1000 before
# being handed to from_unixtime, i.e. it is treated as epoch milliseconds.
ts_seconds = df_log["ts"] / 1000
t1 = df_log.withColumn("datetime", F.from_unixtime(ts_seconds))
t1

In [None]:
# Compare the converted datetime with the raw ts for a few rows.
t1.select("datetime", "ts").show(10)

In [None]:
# Derive calendar parts from the "datetime" column.
#
# The chain is wrapped in parentheses rather than using backslash
# continuations: the original ended with a dangling "\" after the last call,
# which can fail to parse at end of cell or silently join whatever line is
# added next.
#
# Re-running this cell is safe: withColumn replaces a column that already
# exists under the same name.
t1 = (
    t1.withColumn("hour", F.hour("datetime"))
    .withColumn("day", F.dayofmonth("datetime"))
    .withColumn("week", F.weekofyear("datetime"))
    .withColumn("month", F.month("datetime"))
    .withColumn("year", F.year("datetime"))
    # dayofweek numbers days 1 (Sunday) through 7 (Saturday).
    .withColumn("weekday", F.dayofweek("datetime"))
)

In [None]:
# Time table: the raw ts plus the calendar parts derived from it.
time_cols = ["ts", "hour", "day", "week", "month", "year", "weekday"]
df_timestamp = t1.select(*time_cols)

In [None]:
# Preview 10 rows as a pandas DataFrame for richer notebook rendering.
df_timestamp.limit(10).toPandas()

### Fact table: songplays

In [None]:
# Target columns for the songplays fact table:
#   songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

In [None]:
# Log schema again, for reference while building the songplays query.
df_log.printSchema()

In [None]:
# Preview the two log fields used below to match events to songs and artists.
df_log.select("song", "artist").limit(10).toPandas()

In [None]:
# Register the DataFrames as temporary views so they can be queried with
# spark.sql. createOrReplaceTempView makes re-running this cell safe.
for view_name, frame in (
    ("table_log", df_log),
    ("table_artists", df_artists),
    ("table_songs", df_songs),
):
    frame.createOrReplaceTempView(view_name)

In [None]:
# Artists schema, for reference while building the songplays query.
df_artists.printSchema()

In [None]:
# Songplays fact table.
# Target columns: songplay_id, start_time, user_id, level, song_id,
#                 artist_id, session_id, location, user_agent
# (songplay_id is not generated by this query.)
#
# A log event is matched to an artist by name and to a song by title. The
# song join also requires the song to belong to the matched artist
# (sg.artist_id = art.artist_id). The original repeated the artist-name
# condition there instead, so a title shared by several artists was joined
# to every one of them.
#
# NOTE(review): table_artists is distinct over all artist attributes, so an
# artist_id that appears with more than one location/coordinate set would
# still duplicate songplay rows -- confirm against the data.
df_songplays = spark.sql("""
    SELECT lg.ts AS start_time,
        lg.userId AS user_id,
        lg.level AS level,
        sg.song_id,
        art.artist_id,
        lg.sessionId AS session_id,
        lg.location,
        lg.userAgent AS user_agent
    FROM table_log AS lg
    JOIN table_artists AS art ON art.artist_name = lg.artist
    JOIN table_songs AS sg ON sg.title = lg.song AND sg.artist_id = art.artist_id
""")

In [None]:
# Preview 10 songplay rows as a pandas DataFrame.
df_songplays.limit(10).toPandas()

In [None]:
# Show the schema of the songs table.
df_songs.printSchema()