# Udacity Data Engineering - Project 4 - Data Lake

Read each table back from the analytics data lake and check that it contains the expected data.

## Create Spark Session and configs

In [1]:
# import libraries
import os
import configparser
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, from_unixtime

In [2]:
# AWS environment configuration.
# Credentials and the S3 location are read from the [AWS] section of the
# local `dl.cfg` file, so no secrets are hardcoded in the notebook.
config = configparser.ConfigParser()
# `ConfigParser.read` silently skips files it cannot open, which would only
# surface further down as an opaque `KeyError: 'AWS'`. Opening the file
# explicitly makes a missing or unreadable dl.cfg fail right here instead.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

# Export the credentials as environment variables
# (presumably picked up by the S3A connector -- confirm against the cluster setup).
os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['SECRET_ACCESS_KEY']

# Value substituted into the `s3a://{}/<table>/` paths when reading the tables.
my_aws_path = config['AWS']['S3_FOLDER']

In [3]:
# Build the Spark session for this notebook, or reuse one that already exists.
# The hadoop-aws package is requested through `spark.jars.packages`
# (presumably required for the s3a:// paths used when reading the tables).
spark = (
    SparkSession.builder
    .appName("Pyspark DataLake")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

## Read Artists Table Data

In [5]:
# Read the artists table (parquet) back from S3 and print its first rows.
artists_table_path = "s3a://{}/artists_table/".format(my_aws_path)
artists_table_df = spark.read.parquet(artists_table_path)
artists_table_df.show()

+------------------+--------------------+--------------------+---------------+----------------+
|         artist_id|         artist_name|     artist_location|artist_latitude|artist_longitude|
+------------------+--------------------+--------------------+---------------+----------------+
|ARNF6401187FB57032|   Sophie B. Hawkins|New York, NY [Man...|       40.79086|       -73.96644|
|AROUOZZ1187B9ABE51|         Willie Bobo|New York, NY [Spa...|       40.79195|       -73.94512|
|AREBBGV1187FB523D2|Mike Jones (Featu...|         Houston, TX|           null|            null|
|ARD842G1187B997376|          Blue Rodeo|Toronto, Ontario,...|       43.64856|       -79.38533|
|ARDR4AC1187FB371A1|Montserrat Caball...|                    |           null|            null|
|ARPFHN61187FB575F6|         Lupe Fiasco|         Chicago, IL|       41.88415|       -87.63241|
|AR9AWNF1187B9AB0B4|Kenny G featuring...|Seattle, Washingt...|           null|            null|
|ARIG6O41187B988BDD|     Richard Souther

## Read Time Table Data

In [6]:
# Read the time table (parquet) back from S3 and print its first rows.
time_table_path = 's3a://{}/time_table/'.format(my_aws_path)
time_table_df = spark.read.parquet(time_table_path)
time_table_df.show()

+----------------+----+---+----+-------+----+-----+
|             ts_|hour|day|week|weekday|year|month|
+----------------+----+---+----+-------+----+-----+
|1.543069415796E9|  14| 24|  47|      6|2018|   11|
|1.543074987796E9|  15| 24|  47|      6|2018|   11|
|1.543524187796E9|  20| 29|  48|      4|2018|   11|
|1.543527094796E9|  21| 29|  48|      4|2018|   11|
|1.542642234796E9|  15| 19|  47|      1|2018|   11|
|1.541839459796E9|   8| 10|  45|      6|2018|   11|
|1.541878233796E9|  19| 10|  45|      6|2018|   11|
|1.542314996796E9|  20| 15|  46|      4|2018|   11|
|1.542317497796E9|  21| 15|  46|      4|2018|   11|
|1.542321920796E9|  22| 15|  46|      4|2018|   11|
|1.542801099796E9|  11| 21|  47|      3|2018|   11|
|1.542842644796E9|  23| 21|  47|      3|2018|   11|
|1.542165470796E9|   3| 14|  46|      3|2018|   11|
|1.542183561796E9|   8| 14|  46|      3|2018|   11|
|1.542210687796E9|  15| 14|  46|      3|2018|   11|
|1.543193525796E9|   0| 26|  48|      1|2018|   11|
|1.543215367

## Read Songs Table Data

In [7]:
# Read the songs table (parquet) back from S3 and print its first rows.
songs_table_path = 's3a://{}/songs_table/'.format(my_aws_path)
songs_table_df = spark.read.parquet(songs_table_path)
songs_table_df.show()

+------------------+--------------------+---------+----+------------------+
|           song_id|               title| duration|year|         artist_id|
+------------------+--------------------+---------+----+------------------+
|SOAOIBZ12AB01815BE|I Hold Your Hand ...| 43.36281|2000|ARPBNLO1187FB3D52F|
|SONYPOM12A8C13B2D7|I Think My Wife I...|186.48771|2005|ARDNS031187B9924F0|
|SODREIN12A58A7F2E5|A Whiter Shade Of...|326.00771|   0|ARLTWXK1187FB5A3F8|
|SOYMRWW12A6D4FAB14|The Moon And I (O...| 267.7024|   0|ARKFYS91187B98E58F|
|SOWQTQZ12A58A7B63E|Streets On Fire (...|279.97995|   0|ARPFHN61187FB575F6|
|SOUDSGM12AC9618304|Insatiable (Instr...|266.39628|   0|ARNTLGG11E2835DDB9|
|SOPEGZN12AB0181B3D|Get Your Head Stu...| 45.66159|   0|AREDL271187FB40F44|
|SOBBUGU12A8C13E95D|Setting Fire to S...|207.77751|2004|ARMAC4T1187FB3FA4C|
|SOBAYLL12A8C138AF9|Sono andati? Fing...|511.16363|   0|ARDR4AC1187FB371A1|
|SOOLYAZ12A6701F4A6|Laws Patrolling (...|173.66159|   0|AREBBGV1187FB523D2|
|SOFFKZS12AB

## Read Users Table Data

In [9]:
# Read the users table (parquet) back from S3 and print its first rows.
users_table_path = 's3a://{}/users_table/'.format(my_aws_path)
users_table_df = spark.read.parquet(users_table_path)
users_table_df.show()

+------+----------+---------+------+-----+
|userId| firstName| lastName|gender|level|
+------+----------+---------+------+-----+
|    88|  Mohammad|Rodriguez|     M| paid|
|    88|  Mohammad|Rodriguez|     M| free|
|    68|    Jordan|Rodriguez|     F| free|
|    29|Jacqueline|    Lynch|     F| free|
|    11| Christian|   Porter|     F| free|
|    53|   Celeste| Williams|     F| free|
|    69|  Anabelle|  Simpson|     F| free|
|    75|    Joseph|Gutierrez|     M| free|
|    40|    Tucker| Garrison|     M| free|
|     2|   Jizelle| Benjamin|     F| free|
|    14|  Theodore|   Harris|     M| free|
|    52|  Theodore|    Smith|     M| free|
|    56|    Cienna|  Freeman|     F| free|
|    12|    Austin|  Rosales|     M| free|
|    19|   Zachary|   Thomas|     M| free|
|    23|    Morris|  Gilmore|     M| free|
|    55|    Martin|  Johnson|     M| free|
|    66|     Kevin| Arellano|     M| free|
|    64|    Hannah|  Calhoun|     F| free|
|   100|     Adler|  Barrera|     M| free|
+------+---

## Read Songplays Table Data

In [10]:
# Read the songplays table (parquet) back from S3 and print its first rows.
songplays_table_path = 's3a://{}/songplays_table/'.format(my_aws_path)
songplays_table_df = spark.read.parquet(songplays_table_path)
songplays_table_df.show()

+----------------+------+-----+-------+---------+---------+--------------------+--------------------+
|             ts_|userId|level|song_id|artist_id|sessionId|            location|           userAgent|
+----------------+------+-----+-------+---------+---------+--------------------+--------------------+
|1.543551178796E9|    24| paid|   null|     null|     1051|Lake Havasu City-...|"Mozilla/5.0 (Win...|
|1.543595117796E9|    16| paid|   null|     null|     1076|Birmingham-Hoover...|"Mozilla/5.0 (Mac...|
|1.542391266796E9|    36| paid|   null|     null|      461|Janesville-Beloit...|"Mozilla/5.0 (Win...|
|1.542398700796E9|    36| paid|   null|     null|      461|Janesville-Beloit...|"Mozilla/5.0 (Win...|
|1.542673837796E9|    25| paid|   null|     null|      594|    Marinette, WI-MI|"Mozilla/5.0 (Win...|
|1.542680795796E9|    85| paid|   null|     null|      658|       Red Bluff, CA|"Mozilla/5.0 (Mac...|
|1.542708529796E9|    15| paid|   null|     null|      716|Chicago-Napervill...|"M