## Import Libraries

In [1]:
import configparser
import os

import logging
import boto3
from botocore.exceptions import ClientError

import pandas as pd

from pyspark.sql import SparkSession
from pyspark import SparkContext

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

## Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it AdministratorAccess, From Attach existing policies directly Tab
- Take note of the access key and secret
- Edit the file dwh.cfg in the same folder as this notebook and fill
<font color='red'>
<BR>
[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>

## Read the Configuration file

In [2]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

['dwh.cfg']

## Read the AWS User Access Key ID and Secret Key

In [3]:
os.environ['AWS_ACCESS_KEY_ID']=config['AWS']['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY']=config['AWS']['AWS_SECRET_ACCESS_KEY']

KEY= config['AWS']['AWS_ACCESS_KEY_ID']
SECRET= config['AWS']['AWS_SECRET_ACCESS_KEY']

## Create a Local Spark Session

In [4]:
def create_spark_session():
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    return spark

In [5]:
spark = create_spark_session()

In [6]:
spark

## Read the Song data JSON file stored in S3 bucket

In [7]:
INPUT_DATA = config['S3']['SONG_DATA']

song_data = INPUT_DATA + "/A/B/C/*.json"
song_data

's3a://udacity-dend/song_data/A/B/C/*.json'

## Read the JSON file into a Spark DataFrame

In [8]:

song_f1 = spark.read.json(song_data)
song_f1.limit(2).toPandas()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARLTWXK1187FB5A3F8,32.74863,"Fort Worth, TX",-97.32925,King Curtis,326.00771,1,SODREIN12A58A7F2E5,A Whiter Shade Of Pale (Live @ Fillmore West),0
1,ARIOZCU1187FB3A3DC,,"Hamlet, NC",,JOHN COLTRANE,220.44689,1,SOCEMJV12A6D4F7667,Giant Steps (Alternate Version_ Take 5_ Altern...,0


## Read the User log data JSON file stored in S3 bucket

In [9]:
INPUT_DATA = config['S3']['LOG_DATA']

log_data = INPUT_DATA + "/2018/11/*.json"
log_data

's3a://udacity-dend/log_data/2018/11/*.json'

In [10]:
log_f1 = spark.read.json(log_data)
log_f1.limit(2).toPandas()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,Harmonia,Logged In,Ryan,M,0,Smith,655.77751,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,Sehr kosmisch,200,1542241826796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26
1,The Prodigy,Logged In,Ryan,M,1,Smith,260.07465,free,"San Jose-Sunnyvale-Santa Clara, CA",PUT,NextSong,1541017000000.0,583,The Big Gundown,200,1542242481796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",26


## Read the DataWareHouse Configuration details to create a Redshift Cluster

In [11]:
DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_ENDPOINT           = config.get("DWH", "DWH_ENDPOINT")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")
DWH_ROLE_ARN           = config.get("IAM_ROLE", "ARN")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)


pd.DataFrame({"Param":
["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME", "DWH_ROLE_ARN"],
              "Value":
[DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME, DWH_ROLE_ARN]
             })


Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dwh
5,DWH_DB_USER,dwhuser
6,DWH_DB_PASSWORD,Passw0rd
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole
9,DWH_ROLE_ARN,arn:aws:iam::501327643673:role/dwhRole


## Create clients for EC2, S3, IAM, and RedShift

In [12]:
ec2 = boto3.resource("ec2", 
                     region_name="us-east-1",
                     aws_access_key_id=KEY,
                     aws_secret_access_key=SECRET)

s3 = boto3.resource("s3",
                    region_name="us-east-1",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

iam = boto3.client("iam",
                    region_name="us-east-1",
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

redshift = boto3.client("redshift",
                          region_name="us-east-1",
                          aws_access_key_id=KEY,
                          aws_secret_access_key=SECRET)

## Filter the columns from the songs data dataframe to create dimension table

### Schema of the DataFrame

In [13]:
song_f1.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



### Create songs DataFrame for creating songs dimension table

- songs table: columns
- song_id, title, artist_id, year, duration

In [70]:
songs_table = song_f1.select(["song_id", "title", "artist_id", "year", "duration"]).dropDuplicates()
songs_table.show()

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOQFYBD12AB0182188|               Intro|ARAADXM1187FB3ECDB|1999| 67.63057|
|SOFIUVJ12A8C13C296|Will You Tell Me ...|AR9OEB71187B9A97C6|2005|397.16526|
|SOGDBUF12A8C140FAA|               Intro|AR558FS1187FB45658|2003| 75.67628|
|SOBHXUU12A6D4F5F14|National Emblem (...|ARBDJHO1252CCFA6FC|   0|188.73424|
|SOIKLJM12A8C136355|           Eso Duele|AR7AE0W1187B98E40E|2003|196.25751|
|SOEVPWF12A58A7D254|         Astral eyes|ARCJWLU1187B9ADD36|1997|274.54649|
|SODWBIK12AB017F87D|               Aüita|ARSMG8X1187B99CA99|2009| 210.1024|
|SOUPIRU12A6D4FA1E1| Der Kleine Dompfaff|ARJIE2Y1187B994AB7|   0|152.92036|
|SOHTCZS12A6D4FC402|  The Christmas Song|AROSPS51187B9B481F|1965|197.95546|
|SODAUVL12A8C13D184|           Prognosis|ARWB3G61187FB49404|2000|363.85914|
|SOYLILV12A8

### Create artists DataFrame for creating artists dimension table

- artists table: columns
- artist_id, name, location, lattitude, longitude

In [71]:
artists_table = song_f1.select(["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]).dropDuplicates()
artists_table.limit(10).show()

+------------------+----------------+------------------+---------------+----------------+
|         artist_id|     artist_name|   artist_location|artist_latitude|artist_longitude|
+------------------+----------------+------------------+---------------+----------------+
|AR0IAWL1187B9A96D0|    Danilo Perez|            Panama|         8.4177|       -80.11278|
|ARWB3G61187FB49404|     Steve Morse|    Hamilton, Ohio|           null|            null|
|ARJIE2Y1187B994AB7|     Line Renaud|                  |           null|            null|
|ARVBRGZ1187FB4675A|    Gwen Stefani|                  |           null|            null|
|ARCKOJF1241B9C75B4|    Eddie Sierra|                  |           null|            null|
|ARLTWXK1187FB5A3F8|     King Curtis|    Fort Worth, TX|       32.74863|       -97.32925|
|AR5S9OB1187B9931E3|     Bullet Boys|   Los Angeles, CA|       34.05349|      -118.24532|
|ARPFHN61187FB575F6|     Lupe Fiasco|       Chicago, IL|       41.88415|       -87.63241|
|ARZGTK711

## Filter the columns from the user log data dataframe to create dimension table

### Schema of the user log dataframe

In [16]:
log_f1.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



### Create users DataFrame for creating users dimension table

- users - users in the app
- user_id, first_name, last_name, gender, level

In [17]:
users = log_f1.select(['userId', 'firstName', 'lastName', 'gender', 'level'])
users.limit(5).toPandas()

Unnamed: 0,userId,firstName,lastName,gender,level
0,26,Ryan,Smith,M,free
1,26,Ryan,Smith,M,free
2,26,Ryan,Smith,M,free
3,9,Wyatt,Scott,M,free
4,12,Austin,Rosales,M,free


### Create time DataFrame for creating time dimension table

- time - timestamps of records in songplays broken down into specific units
- start_time, hour, day, week, month, year, weekday

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql import functions as f
from pyspark.sql import types as t

from pyspark.sql.types import TimestampType, DateType

from datetime import datetime

In [19]:
time = log_f1.select(['ts'])
time.limit(5).toPandas()

Unnamed: 0,ts
0,1542241826796
1,1542242481796
2,1542242741796
3,1542247071796
4,1542252577796


In [20]:
def format_datetime(ts):
    return datetime.fromtimestamp(ts/1000.0)

In [21]:
get_timestamp = udf(lambda x: datetime.fromtimestamp(x/1000.0),TimestampType())

In [22]:
time = time.withColumn("timestamp", get_timestamp(time.ts))

In [23]:
get_datetime = udf(lambda x: datetime.fromtimestamp(x/1000.0), DateType())

In [24]:
time = time.withColumn("datetime", get_datetime(time.ts))

In [25]:
time.limit(5).show()

+-------------+--------------------+----------+
|           ts|           timestamp|  datetime|
+-------------+--------------------+----------+
|1542241826796|2018-11-14 19:30:...|2018-11-14|
|1542242481796|2018-11-14 19:41:...|2018-11-14|
|1542242741796|2018-11-14 19:45:...|2018-11-14|
|1542247071796|2018-11-14 20:57:...|2018-11-14|
|1542252577796|2018-11-14 22:29:...|2018-11-14|
+-------------+--------------------+----------+



In [67]:
time_table = time.select(col('ts').alias('start_time'),
                        f.hour(time.timestamp).alias('hour'),
                        f.dayofweek(time.datetime).alias('day'),
                        f.weekofyear(time.datetime).alias('week'),
                        f.month(time.datetime).alias('month'), 
                        f.year(time.datetime).alias('year'),
                        date_format(time.datetime, 'E').alias('weekday')).dropDuplicates()

In [68]:
time_table.limit(5).toPandas()

Unnamed: 0,start_time,hour,day,week,month,year,weekday
0,1542285613796,7,5,46,11,2018,Thu
1,1542290569796,9,5,46,11,2018,Thu
2,1542292733796,9,5,46,11,2018,Thu
3,1542810192796,9,4,47,11,2018,Wed
4,1542833047796,15,4,47,11,2018,Wed


In [69]:
time_table.count()

8023

### Create songs DataFrame for creating songplays fact table

- songs - songs in music database
- song_id, title, artist_id, year, duration

In [73]:
songs_df = songs_table

- songplays - records in log data associated with song plays i.e. records with page NextSong
- songplay_id, start_time, user_id, level, song_id, artist_id, session_id, location, user_agent

In [None]:
songplays = log_f1.select([''])

In [28]:
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", KEY)
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", SECRET)

spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3.amazonaws.com")

## Create a S3 Bucket for a given region

In [29]:
def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """

    # Create bucket
    try:
        if region is None:
            s3_client = boto3.client('s3')
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client = boto3.client('s3', region_name=region)
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True

### Create a bucket to store the dimension tables

In [None]:
S3 = create_bucket("deshagvrk")

## Save the songs DataFrame (table) in Parquet format in S3 bucket

In [30]:
spark.conf.set("spark.sql.parquet.compression.codec", "gzip")

In [31]:
type(songs)

pyspark.sql.dataframe.DataFrame

songs.write.parquet("s3a://deshgvrk1/parquet/songsparquet", mode="overwrite")