In [1]:
import configparser
import os
from pyspark.sql import SparkSession

In [2]:
config = configparser.ConfigParser()
config.read('dl.cfg')

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

#### read in data for 5 tables and show their schema

In [5]:
input_data = 's3a://aws-logs-************-us-west-2/output_data/'

In [19]:
songs = os.path.join(input_data, "songs")
artists = os.path.join(input_data, "artists")
users = os.path.join(input_data, "users")
time = os.path.join(input_data, "time")
songplays = os.path.join(input_data, "songplays")

In [14]:
songs_table = spark.read.parquet(songs)
songs_table.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)



In [15]:
artists_table = spark.read.parquet(artists)
artists_table.printSchema()

root
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- artist_id: string (nullable = true)



In [20]:
users_table = spark.read.parquet(users)
users_table.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)
 |-- user_id: integer (nullable = true)



In [17]:
time_table = spark.read.parquet(time)
time_table.printSchema()

root
 |-- start_time: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [18]:
songplays_table = spark.read.parquet(songplays)
songplays_table.printSchema()

root
 |-- songplay_id: long (nullable = true)
 |-- start_time: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: integer (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



#### sample queries

In [12]:
songs_table.createOrReplaceTempView("songs")

spark.sql("""
    SELECT COUNT(DISTINCT title) AS num_songs
    FROM songs
""").show()

+---------+
|num_songs|
+---------+
|       24|
+---------+



In [21]:
users_table.createOrReplaceTempView("users")

spark.sql("""
    SELECT gender, COUNT(DISTINCT user_id) AS cnt
    FROM users
    GROUP BY gender
""").show()

+------+---+
|gender|cnt|
+------+---+
|     F|  8|
|     M| 14|
+------+---+



In [22]:
artists_table.createOrReplaceTempView("artists")
songplays_table.createOrReplaceTempView("songplays")

spark.sql("""
    SELECT a.name, count(a.name) AS num_songplays
    FROM songplays s
    LEFT JOIN artists a 
    ON s.artist_id = a.artist_id
    GROUP BY a.name
    ORDER BY num_songplays DESC
    LIMIT 3
""").show()

+----+-------------+
|name|num_songplays|
+----+-------------+
|null|            0|
+----+-------------+

