In [1]:
import os

import findspark

# Make the Spark installation importable as `pyspark`.
# Honour SPARK_HOME when it is already set so the notebook is not tied to one
# machine's directory layout; otherwise fall back to the original location.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.4.3-bin-hadoop2.7'))

In [2]:
import configparser
import os
from pyspark.sql import SparkSession


In [3]:
# S3 locations, written with the s3a:// scheme.
# output_data is the prefix the table folders ("songs", "artists", "users",
# "time", "songplays") are read from further down. It has no trailing slash;
# the table name is appended with os.path.join, which inserts "/" on POSIX.
output_data = "s3a://sougata-dend"
# input_data is not referenced by any cell visible in this part of the notebook.
input_data = "s3a://udacity-dend/"

In [4]:
# Read the AWS credentials from dl.cfg (section [AWS]) instead of writing them
# in the notebook, then copy them into this process's environment.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing dl.cfg fails with a
# clear message here rather than an opaque KeyError: 'AWS' on the lines below.
if not config.read('dl.cfg'):
    raise FileNotFoundError(
        "dl.cfg was not found or could not be read in " + os.getcwd()
    )

os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [5]:
def create_spark_session():
    """Return a SparkSession configured with the hadoop-aws package.

    Uses ``getOrCreate()``, so an already-running session is returned instead
    of a second one being started.

    NOTE(review): the hadoop-aws version is pinned to 2.7.0 — confirm it
    matches the Hadoop version bundled with the Spark build in use.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [6]:
# Session used by every spark.read call below.
spark = create_spark_session()

**Testing with tables created from a single song file and a single log file**

In [7]:
# Load the songs table from the "songs" parquet folder under output_data.
songs = spark.read.parquet(os.path.join(output_data, "songs"))

In [8]:
# Preview the table.
# NOTE(review): the recorded row has year = 0, which looks like a placeholder
# for an unknown release year — confirm before filtering or aggregating on year.
songs.show()

+------------------+-------------------+---------+----+------------------+
|           song_id|              title| duration|year|         artist_id|
+------------------+-------------------+---------+----+------------------+
|SOUPIRU12A6D4FA1E1|Der Kleine Dompfaff|152.92036|   0|ARJIE2Y1187B994AB7|
+------------------+-------------------+---------+----+------------------+



In [9]:
# Row count; the recorded run returned 1 (test run on a single song file).
songs.count()

1

In [10]:
# Print column names and types as read back from parquet.
songs.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)



In [11]:
# Load the artists table from the "artists" parquet folder under output_data.
artists= spark.read.parquet(os.path.join(output_data, "artists"))

In [12]:
# Preview the table.
# NOTE(review): in the recorded row, location is an empty string while
# latitude/longitude are null — missing values are encoded two different ways.
artists.show()

+------------------+-----------+--------+--------+---------+
|         artist_id|       name|location|latitude|longitude|
+------------------+-----------+--------+--------+---------+
|ARJIE2Y1187B994AB7|Line Renaud|        |    null|     null|
+------------------+-----------+--------+--------+---------+



In [12]:
# Row count; the recorded run returned 1 (test run on a single song file).
artists.count()

1

In [13]:
# Print column names and types as read back from parquet.
# NOTE(review): the recorded schema types latitude and longitude as string,
# not a numeric type — presumably they need a cast in the job that writes
# this table; confirm.
artists.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [13]:
# Load the users table from the "users" parquet folder under output_data.
users = spark.read.parquet(os.path.join(output_data, "users"))

In [14]:
# Preview the table; show() without arguments prints only the first 20 rows,
# so this is not the full table.
users.show()

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| free|
|     29|Jacqueline|    Lynch|     F| paid|
|     14|  Theodore|   Harris|     M| free|
|     55|    Martin|  Johnson|     M| free|
|     66|     Kevin| Arellano|     M| free|
|     25|    Jayden|   Graves|     M| paid|
|     83|   Stefany|    White|     F| free|
|      6|   Cecilia|    Owens|     F| free|
|     60|     Devin|   Larson|     M| free|
|     44|    Aleena|    Kirby|     F| paid|
|     37|    Jordan|    Hicks|     F| free|
|     49|     Chloe|   Cuevas|     F| free|
|     76|    Jayden|    Duffy|     F| free|
|     80|     Tegan|   Levine|     F| paid|
|     95|      Sara|  Johnson|     F| paid|
|     97|      Kate|  Harrell|     F| paid|
|     67|      Colm|  Santana|     M| free|
|     63|      Ayla|  Johnson|     F| free|
|     50|       Ava| Robinson|     F| free|
|     94|      Noah|   Chavez|  

In [15]:
# Row count; the recorded run returned 25.
users.count()

25

In [24]:
# Print column names and types as read back from parquet.
# NOTE(review): user_id is typed as string although the previewed values are
# all digits — ordering on it would be lexical, not numeric.
users.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



In [16]:
# Load the time table from the "time" parquet folder under output_data.
# NOTE(review): the variable name `time` is also the name of the standard
# library module; a later `import time` would replace this DataFrame.
time = spark.read.parquet(os.path.join(output_data, "time"))

In [17]:
# Preview the table.
# NOTE(review): 2018-11-13 was a Tuesday and the recorded rows show weekday = 2,
# so the numbering looks like Monday = 1. That differs from Spark's dayofweek()
# (Sunday = 1) — confirm which convention downstream queries expect.
time.show()

+-------------------+----+---+----+-------+----+-----+
|         start_time|hour|day|week|weekday|year|month|
+-------------------+----+---+----+-------+----+-----+
|2018-11-13 13:10:19|  13| 13|  46|      2|2018|   11|
|2018-11-13 13:47:41|  13| 13|  46|      2|2018|   11|
|2018-11-13 20:19:14|  20| 13|  46|      2|2018|   11|
|2018-11-13 20:57:33|  20| 13|  46|      2|2018|   11|
|2018-11-13 05:00:06|   5| 13|  46|      2|2018|   11|
|2018-11-13 09:29:08|   9| 13|  46|      2|2018|   11|
|2018-11-13 16:02:20|  16| 13|  46|      2|2018|   11|
|2018-11-13 17:09:49|  17| 13|  46|      2|2018|   11|
|2018-11-13 21:01:09|  21| 13|  46|      2|2018|   11|
|2018-11-13 08:43:13|   8| 13|  46|      2|2018|   11|
|2018-11-13 10:34:37|  10| 13|  46|      2|2018|   11|
|2018-11-13 18:38:03|  18| 13|  46|      2|2018|   11|
|2018-11-13 20:54:41|  20| 13|  46|      2|2018|   11|
|2018-11-13 23:22:55|  23| 13|  46|      2|2018|   11|
|2018-11-13 10:49:44|  10| 13|  46|      2|2018|   11|
|2018-11-1

In [46]:
# Row count; the recorded run returned 339.
time.count()

339

In [23]:
# Print column names and types as read back from parquet.
time.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [19]:
# Load the songplays table from the "songplays" parquet folder under output_data.
songplays = spark.read.parquet(os.path.join(output_data, "songplays"))

In [20]:
# Preview the table.
# NOTE(review): song_id and artist_id are null in every row visible in the
# recorded output — presumably because this test run loaded only one song
# file, so no log event matched it; confirm against a full run.
songplays.show()

+-------------------+-------+-----+-------+---------+----------+--------------------+--------------------+-----------+----+-----+
|         start_time|user_id|level|song_id|artist_id|session_id|            location|          user_agent|songplay_id|year|month|
+-------------------+-------+-----+-------+---------+----------+--------------------+--------------------+-----------+----+-----+
|2018-11-13 00:40:37|     66| free|   null|     null|       514|Harrisburg-Carlis...|"Mozilla/5.0 (Mac...|          0|2018|   11|
|2018-11-13 01:12:29|     51| free|   null|     null|       510|Houston-The Woodl...|"Mozilla/5.0 (Win...|          1|2018|   11|
|2018-11-13 03:19:02|      9| free|   null|     null|       379|Eureka-Arcata-For...|Mozilla/5.0 (Wind...|          2|2018|   11|
|2018-11-13 03:51:52|     49| free|   null|     null|       506|San Francisco-Oak...|Mozilla/5.0 (Wind...|          3|2018|   11|
|2018-11-13 05:00:06|     94| free|   null|     null|       492|Ogden-Clearfield, UT|Mozil

In [54]:
# Row count; the recorded run returned 339, the same as the time table's count.
songplays.count()

339

In [22]:
# Print column names and types as read back from parquet.
# In the recorded schema user_id is string here as well, matching the users
# table, while session_id and songplay_id are long.
songplays.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- songplay_id: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

