In [3]:
import boto3
import sagemaker
import pandas as pd
from pyathena import connect
# from csv2tsv import to_csv

#### Auth with AWS

In [4]:

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

#### Convert csv to tsv and move to S3 

In [5]:
s3_private_data_path = "s3://{}/w2-musicData/tsv".format(bucket)
print(s3_private_data_path)

s3://sagemaker-us-east-1-106006112223/w2-musicData/tsv


In [6]:
%store s3_private_data_path

Stored 's3_private_data_path' (str)


In [None]:
to_csv('dataset.csv')

In [19]:
!aws s3 cp "dataset_tsv.csv" $s3_private_data_path/

upload: ./dataset_tsv.csv to s3://sagemaker-us-east-1-106006112223/w2-musicData/tsv/dataset_tsv.csv


In [20]:
!aws s3 ls $s3_private_data_path/

2024-09-15 20:52:01   20103228 dataset_tsv.csv


#### Create DB in Athena for queries

In [6]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
print(s3_staging_dir)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

s3://sagemaker-us-east-1-106006112223/athena/staging


In [22]:
database_name = "w2_music_db"
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [7]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,w2_music_db


#### Create tables in DB and schemas

In [8]:
table_name_tsv = 'music_ds_tsv2'
database_name = "w2_music_db"
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         track_id string,
         artists string,
         album_name string,
         track_name string,
         popularity int,
         duration_ms int,
         explicit string,
         danceability decimal,
         energy decimal,
         key int,
         loudness decimal,
         mode int,
         speechiness decimal,
         acousticness decimal,
         instrumentalness decimal,
         liveness decimal,
         valence decimal,
         tempo decimal,
         time_signature int,
         track_genre string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(
    database_name, table_name_tsv,s3_private_data_path
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS w2_music_db.music_ds_tsv2(
         track_id string,
         artists string,
         album_name string,
         track_name string,
         popularity int,
         duration_ms int,
         explicit string,
         danceability decimal,
         energy decimal,
         key int,
         loudness decimal,
         mode int,
         speechiness decimal,
         acousticness decimal,
         instrumentalness decimal,
         liveness decimal,
         valence decimal,
         tempo decimal,
         time_signature int,
         track_genre string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-106006112223/w2-musicData/tsv'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


In [9]:
statement = "SHOW TABLES IN W2_MUSIC_DB"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,music_ds_tsv
1,music_ds_tsv2


In [10]:
category = "Jason Mraz"

statement = """SELECT * FROM {}.{}
    WHERE album_name = '{}' LIMIT 10""".format(
    database_name, table_name_tsv, category
)
print(statement)
df = pd.read_sql(statement, conn)
display(df.head(10))

# pandas way
statement = """SELECT * FROM {}.{}
    LIMIT 200""".format(
    database_name, table_name_tsv
)
print(statement)
full_df = pd.read_sql(statement, conn)
display(full_df.head(10))


SELECT * FROM w2_music_db.music_ds_tsv2
    WHERE album_name = 'Jason Mraz' LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,7,1EzrEOXmMH3G43AXT1y7pA,Jason Mraz,We Sing. We Dance. We Steal Things.,,80,242946,,1.0,0,11.0,-9,1.0,0.0,1.0,0.0,0.0,1.0,150,4
1,11,5ivF4eQBqJiVL5IAE9jRyl,Jason Mraz,Love Is a Four Letter Word,,69,240165,,0.0,0,4.0,-10,1.0,0.0,1.0,0.0,0.0,0.0,133,3
2,20,3S0OXQeoh0w6AY8WQVckRW,Jason Mraz,We Sing. We Dance. We Steal Things.,,75,242946,,1.0,0,11.0,-9,1.0,0.0,1.0,0.0,0.0,1.0,150,4
3,23,0BUuuEvNa5T4lMaewyiudB,Jason Mraz,Coffee Moment,,0,216386,,1.0,0,3.0,-10,1.0,0.0,0.0,0.0,0.0,1.0,140,4
4,24,3Hn3LfhrQOaKihdCibJsTs,Jason Mraz,Human - Best Adult Pop Tunes,,0,231266,,1.0,0,5.0,-4,0.0,0.0,0.0,0.0,0.0,1.0,97,4


### Homework queries

#### 1. List artist, track_name, and popularity for songs that have a popularity greater than or equal to 99

In [None]:
statement = """SELECT artists, track_name, popularity FROM {}.{}
    WHERE popularity >= 99 LIMIT 10""".format(
    database_name, table_name_tsv
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(10)

# pandas
pd_df = full_df[full_df['popularity'] >= 99][['artists', 'track_name', 'popularity']]
display(pd_df.head(10))


#### 2. List artists with an average popularity of 92


In [None]:
# SELECT artists
# FROM w2_music_db
# GROUP BY artists
# HAVING AVG(popularity) = 92;


statement = """SELECT artists FROM {}.{}
    GROUP BY artists HAVING AVG(popularity) = 92""".format(
    database_name, table_name_tsv
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(10)

# pandas
# artists_avg_popularity = df.groupby('artists').filter(lambda x: x['popularity'].mean() == 92)
# artists_avg_popularity_list = artists_avg_popularity['artists'].unique()
# print(artists_avg_popularity_list)


#### 3. List the Top 10 most energetic genres
  

In [None]:
# SELECT track_genre, AVG(energy) AS avg_energy
# FROM w2_music_db.tracks
# GROUP BY track_genre
# ORDER BY avg_energy DESC
# LIMIT 10;

statement = """SELECT track_genre, AVG(energy) AS avg_energy FROM {}.{}
    GROUP BY track_genre 
    ORDER BY avg_energy DESC LIMIT 10""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
df.head(10)

# pandas
# top_energetic_genres = df.groupby('track_genre')['energy'].mean().sort_values(ascending=False).head(10)
# print(top_energetic_genres)


#### 4. How many tracks is Bad Bunny on?

In [None]:
# SELECT COUNT(*) AS track_count
# FROM w2_music_db.tracks
# WHERE artists LIKE '%Bad Bunny%';

statement = """SELECT COUNT(*) AS track_count FROM {}.{}
    WHERE artists LIKE '%Bad Bunny%'""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
df.head(10)

# bad_bunny_tracks_count = df[df['artists'].str.contains('Bad Bunny')].shape[0]
# print(bad_bunny_tracks_count)


#### 5. Show the top 10 genres in terms of popularity sorted by their most popular track

In [None]:
# SELECT track_genre, MAX(popularity) AS max_popularity
# FROM w2_music_db.tracks
# GROUP BY track_genre
# ORDER BY max_popularity DESC
# LIMIT 10;

statement = """SELECT track_genre, MAX(popularity) AS max_popularity FROM {}.{}
    GROUP BY track_genre
    ORDER BY max_popularity DESC
    LIMIT 10""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
df.head(10)

# pandas
# top_genres_by_popularity = df.groupby('track_genre')['popularity'].max().sort_values(ascending=False).head(10)
# print(top_genres_by_popularity)

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}