## Validate the data (tables) stored in Amazon S3.

In [101]:
import configparser
import os
from datetime import datetime

#import logging
#import boto3
#from botocore.exceptions import ClientError

import pandas as pd

from pyspark.sql import SparkSession
from pyspark import SparkContext

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

from pyspark.sql.types import StringType, IntegerType, TimestampType, DateType

from pyspark.sql.functions import desc
from pyspark.sql.functions import asc
from pyspark.sql.functions import sum as Fsum
from pyspark.sql import functions as f

from pyspark.sql import SQLContext

## Read the AWS credentials from the config file

In [2]:
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Without this check a missing dl.cfg would only
# surface later as a confusing KeyError on config['AWS'].
loaded_files = config.read('dl.cfg')
if not loaded_files:
    raise FileNotFoundError("Could not read configuration file 'dl.cfg'")
loaded_files

['dl.cfg']

In [3]:
# Look the [AWS] section up once, keep the credentials in KEY / SECRET, and
# export the same values through the process environment.
aws_section = config['AWS']

KEY = aws_section['AWS_ACCESS_KEY_ID']
SECRET = aws_section['AWS_SECRET_ACCESS_KEY']

os.environ['AWS_ACCESS_KEY_ID'] = KEY
os.environ['AWS_SECRET_ACCESS_KEY'] = SECRET

In [4]:
def create_spark_session():
    """Return a SparkSession configured to pull in the hadoop-aws package.

    The session is obtained with ``getOrCreate()``, so an already-active
    session is reused instead of a second one being started.
    """
    session_builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return session_builder.getOrCreate()

In [5]:
spark = create_spark_session()

In [6]:
spark

## Read the data from S3 bucket

In [8]:
# Base S3 locations: `output` is the prefix the parquet tables are read from,
# `input` is the prefix the source JSON (song_data / log_data) is read from.
# NOTE(review): `input` shadows Python's built-in input() for the rest of the
# notebook; consider renaming it (e.g. input_data) together with its later uses.
output = "s3a://desh1gvrk/parquet"
input =  "s3a://udacity-dend/"

## Validate the records for a given year and month (songplays fact table partition) to the log data.

<font color='red'><b>***** Complete this validation *****</b></font>

### Read the songplays fact table

In [11]:
# Load the songplays parquet table and record its row count; the count is
# compared against the log data later in the notebook.
songplays = "{}/songplays".format(output)
songplays_df = spark.read.parquet(songplays)
tot_fact_records = songplays_df.count()

In [13]:
print("Number of records in Fact Table: {}".format(tot_fact_records))

Number of records in Fact Table: 6820


### Validate the following
- Validate the artist attributes of a unique log data record.
- Validate the users attributes of a unique log data record.
- Validate the song attributes of a unique log data record.

### Validate the song attributes of a unique log data record.

### Filter all the records which have Non NULL song_id

In [16]:
fact_songNN = songplays_df.filter(songplays_df.song_id.isNotNull())

In [126]:
# Spot-check a single start_time value among the fact rows with a non-NULL song_id.
# NOTE(review): this returned no rows. Either no fact row with a song_id has
# this start_time, or start_time is not stored as epoch milliseconds in the
# parquet table — confirm with songplays_df.printSchema().
fact_songNN.filter(col('start_time') == 1542241826796).show()

+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+
|start_time|user_id|level|song_id|artist_id|session_id|location|user_agent|year|month|
+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+
+----------+-------+-----+-------+---------+----------+--------+----------+----+-----+



### Create View of the Fact Table

In [17]:
fact_songNN.createOrReplaceTempView('fact_songNN')

### Read the songs_data (Input) table 

In [18]:
# Only the A/A/ prefix of song_data is globbed here, not every song file.
song_data_sql = "{}song_data/A/A/*/*.json".format(input)

# Load the matching song JSON files into a DataFrame.
songs_data = spark.read.json(song_data_sql)

### Create View of the songs_data (Input) table

In [19]:
songs_data.createOrReplaceTempView('songs_data')

### Create View of the songs dimension (Output) table

In [58]:
songs = spark.read.parquet(output + '/songs')

In [61]:
songs.createOrReplaceTempView('songs')

### Filter the distinct songs (song_id) in the songs_data table and extract the matching song_ids from the fact table (rows with non-NULL song_id)

In [64]:
# Join the fact rows (non-NULL song_id) to the songs dimension view and keep
# only song_ids that also appear in the songs_data input view. The selected
# columns all come from the songs table (s.*).
# NOTE(review): the outer SELECT has no DISTINCT, so the same song may appear
# once per matching fact row despite the variable name — confirm intent.
distinct_songs = spark.sql("select s.* from fact_songNN f \
                           inner join songs s \
                           on f.song_id = s.song_id \
                           where f.song_id in \
                           (select distinct(song_id) from songs_data)")

In [65]:
type(distinct_songs)

pyspark.sql.dataframe.DataFrame

In [66]:
### Get the first row of the filtered result

In [67]:
# first() fetches a single row, whereas collect()[0] pulls every joined row to
# the driver just to read one. first() returns None for an empty result, so
# fail here with a clear message instead of a later error on value[0].
first_row = distinct_songs.first()
if first_row is None:
    raise ValueError("distinct_songs is empty: no fact-table song_id was found in songs_data")

In [72]:
value = first_row
value

Row(song_id='SOFVOQL12A6D4F7456', title='The Boy With The Thorn In His Side', duration=196.67546, year=1985, artist_id='ARPN0Y61187B9ABAA0')

### Find the song attributes in the songs_data table

#### Construct SQL Query

In [73]:
sql_query = "select * from songs_data where song_id = '{}'".format(value[0])
sql_query

"select * from songs_data where song_id = 'SOFVOQL12A6D4F7456'"

In [74]:
d = spark.sql(sql_query)

In [75]:
print(d.collect()[0])

Row(artist_id='ARPN0Y61187B9ABAA0', artist_latitude=53.4796, artist_location='Manchester, England', artist_longitude=-2.24881, artist_name='The Smiths', duration=196.67546, num_songs=1, song_id='SOFVOQL12A6D4F7456', title='The Boy With The Thorn In His Side', year=1985)


## SONGS ATTRIBUTES MATCH

### Create View of the artists dimension (Output) table

In [78]:
artists = spark.read.parquet(output + '/artists')

In [79]:
artists.createOrReplaceTempView('artists')

#### Construct SQL Query

In [83]:
value = 'ARPN0Y61187B9ABAA0'

In [84]:
sql_query = "select * from artists where artist_id = '{}'".format(value)
sql_query

"select * from artists where artist_id = 'ARPN0Y61187B9ABAA0'"

In [85]:
d = spark.sql(sql_query)

In [86]:
print(d.collect()[0])

Row(artist_id='ARPN0Y61187B9ABAA0', artist_name='The Smiths', artist_location='Manchester, England', artist_latitude=53.4796, artist_longitude=-2.24881)


## ARTISTS ATTRIBUTES MATCH

In [51]:
spark.sql("select * from songs where song_id ='SOZCTXZ12AB0182364'").show()

+------------------+--------------+---------+----+------------------+
|           song_id|         title| duration|year|         artist_id|
+------------------+--------------+---------+----+------------------+
|SOZCTXZ12AB0182364|Setanta matins|269.58322|   0|AR5KOSW1187FB35FF4|
+------------------+--------------+---------+----+------------------+



## Validate the total number of records match between the log data and fact table

In [87]:
log_data = input + 'log_data/*/*/*.json'

In [88]:
# Read the raw log JSON, keep only rows whose page is 'NextSong', and count
# them for comparison with the fact-table row count.
logdata_df = spark.read.json(log_data).filter(col('page') == 'NextSong')
tot_log_records = logdata_df.count()

In [89]:
print(tot_fact_records == tot_log_records)

True


In [90]:
logdata_df.createOrReplaceTempView('logdata_df')

### Create View of the users dimension (Output) table

In [91]:
users = spark.read.parquet(output + '/users')

In [92]:
users.createOrReplaceTempView('users')

In [93]:
logdata_df.where(col('userId').isNotNull()).limit(5).show()

+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|     artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page|     registration|sessionId|                song|status|           ts|           userAgent|userId|
+-----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-----------------+---------+--------------------+------+-------------+--------------------+------+
|   Harmonia|Logged In|     Ryan|     M|            0|   Smith|655.77751| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      583|       Sehr kosmisch|   200|1542241826796|"Mozilla/5.0 (X11...|    26|
|The Prodigy|Logged In|     Ryan|     M|            1|   Smith|260.07465| free|San Jose-Sunnyval...|   PUT|NextSong|1.541016707796E12|      

In [98]:
logdata_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [94]:
users.printSchema()

root
 |-- userId: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)



#### Construct SQL Query

In [96]:
### Check the user attributes for userId = 26
value = 26

In [97]:
sql_query = "select * from users where userId = '{}'".format(value)
sql_query

"select * from users where userId = '26'"

In [99]:
d = spark.sql(sql_query)

In [100]:
print(d.collect()[0])

Row(userId='26', firstName='Ryan', lastName='Smith', gender='M', level='free')


## USERS ATTRIBUTES MATCH

<font color=red> <b>Find the time it takes to search a song record with and without using a partition key</b> </font>

In [152]:
sql_query = "select u.userId from users u \
                where u.firstName = 'Ryan' and u.lastName = 'Smith'"

In [153]:
start = datetime.now()
start

datetime.datetime(2020, 5, 26, 1, 31, 20, 569332)

In [154]:
# Take the start timestamp in the same cell as the query. A start captured in
# a separate cell makes the elapsed time include the delay between running
# the two cells, not just the query itself.
start = datetime.now()
d = spark.sql(sql_query).collect()  # collect() forces the query to execute
end = datetime.now()

In [155]:
tp1 = end - start
tp1

datetime.timedelta(seconds=28, microseconds=288266)

In [156]:
print(d)

[Row(userId='26')]


In [160]:
# Look up the same user directly in the log-data view. The original string
# contained a duplicated WHERE keyword ("where where"), which is at best parsed
# as a stray table alias and at worst rejected as a syntax error.
sql_query1 = "select userId from logdata_df where firstName = 'Ryan' and lastName = 'Smith'"

In [161]:
start1 = datetime.now()
start1

datetime.datetime(2020, 5, 26, 1, 33, 21, 370540)

In [162]:
# Take the start timestamp in the same cell as the query. A start captured in
# a separate cell makes the elapsed time include the delay between running
# the two cells, not just the query itself.
start1 = datetime.now()
d1 = spark.sql(sql_query1).collect()  # collect() forces the query to execute
end1 = datetime.now()

In [163]:
tp2 = end1 - start1
tp2

datetime.timedelta(seconds=4, microseconds=837886)