# Assignment 2.1: Data Lake - Exercise
Sinthuja Bates

### Setup notebook

In [24]:
# import libraries
import boto3
import os
import pandas as pd
import sagemaker

from IPython.core.display import display, HTML
from pyathena import connect

  from IPython.core.display import display, HTML


In [2]:
# check pre-requisites are completed
%store

Stored variables and their in-db values:
setup_dependencies_passed             -> True
setup_s3_bucket_passed                -> True


In [7]:
# save Amazon information
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

### Upload data to S3

In [19]:
# save path to local datset
current_directory = os.getcwd()
local_path = os.path.join(current_directory, 'data', 'dataset.csv')
print(local_path)

/root/aai-540-homework/homework-2-1/data/dataset.csv


In [9]:
# save path to S3 bucket
s3_path = "s3://{}".format(bucket)
print(s3_path)

s3://sagemaker-us-east-1-711667138246


In [20]:
# copy data from local to s3
!aws s3 cp "$local_path" $s3_path/

upload: data/dataset.csv to s3://sagemaker-us-east-1-711667138246/dataset.csv


In [23]:
# check it uploaded
display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/sagemaker-{}-{}/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

### Setup Athena

In [31]:
# set database name
database_name = "awsdata"

# set S3 staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# create connection
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# create database if it doesn't exist
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [32]:
# verify the database has been created
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,awsdata
1,default


### Create csv data table in Athena

In [53]:
# set table name
table_name_csv = "dataset_csv"

# create table
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         track_id string,
         artists string,
         album_name string,
         track_name string,
         popularity int,
         duration_ms int,
         explicit boolean,
         danceability float,
         energy float,
         key int,
         loudness float,
         mode int,
         speechiness float,
         acousticness float,
         instrumentalness float,
         liveness float,
         valence float,
         tempo float,
         time_signature int,
         track_genre string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table_name_csv, s3_path
)

pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [54]:
# verify table has been created
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,dataset
1,dataset_csv


In [56]:
# run sample query
statement = """SELECT * FROM {}.{}
    WHERE artists = 'data'""".format(
    database_name, table_name_csv
)
df = pd.read_sql(statement, conn)
df

  df = pd.read_sql(statement, conn)


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre


### Create Parquet table in Athena

In [60]:
# set table name
table_name_parquet = "dataset_parquet"

# set S3 path to Parquet data
s3_path_parquet = "s3://{}/parquet".format(bucket)

# create table
statement = """CREATE TABLE IF NOT EXISTS {}.{}
WITH (format = 'PARQUET', external_location = '{}', partitioned_by = ARRAY['track_genre']) AS
SELECT track_id,
        artists,
        album_name,
        track_name,
        popularity,
        duration_ms,
        explicit,
        danceability,
        energy,
        key,
        loudness,
        mode,
        speechiness,
        acousticness,
        instrumentalness,
        liveness,
        valence,
        tempo,
        time_signature,
        track_genre
FROM {}.{}""".format(
    database_name, table_name_parquet, s3_path_parquet, database_name, table_name_csv
)
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


DatabaseError: Execution failed on sql: CREATE TABLE IF NOT EXISTS awsdata.dataset_parquet
WITH (format = 'PARQUET', external_location = 's3://sagemaker-us-east-1-711667138246/parquet', partitioned_by = ARRAY['track_genre']) AS
SELECT track_id,
        artists,
        album_name,
        track_name,
        popularity,
        duration_ms,
        explicit,
        danceability,
        energy,
        key,
        loudness,
        mode,
        speechiness,
        acousticness,
        instrumentalness,
        liveness,
        valence,
        tempo,
        time_signature,
        track_genre
FROM awsdata.dataset_csv
HIVE_TOO_MANY_OPEN_PARTITIONS: Exceeded limit of 100 open writers for partitions/buckets. You may need to manually clean the data at location 's3://sagemaker-us-east-1-711667138246/athena/staging/tables/0af3c079-c233-4f46-aa24-c35a12ccbe5c' before retrying. Athena will not delete data in your account.
unable to rollback

In [57]:
# review the new table in Athena
display(
    HTML(
        '<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">Athena</a></b>'.format(
            region
        )
    )
)

### Shut down notebook resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}