<!-- instructions -->
The housing data set contains information about houses and their values, and the Google Maps raw data set contains information about addresses and their designations. Imagine we are building an ML tool to predict housing prices. To aid with prediction, we want to create a Neighborhood feature group. We can envision this neighborhood feature group helping us predict house prices by giving us a bucket to group new houses into.

In [30]:
import boto3
import sagemaker
import pandas as pd
import time
import datetime
import numpy as np
from time import gmtime, strftime, sleep
from sagemaker.feature_store.feature_group import FeatureGroup


#### Auth with AWS

In [3]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")
boto_session = boto3.Session(region_name=region)

In [None]:
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)
feature_store_session = boto3.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

### Get Data

In [12]:
# locally 

gmaps_houses_df = pd.read_csv('/Users/Steve/dev/aiMasters/aai-540-homework/homework-3-1/housing_gmaps_data_raw.csv')
housing_df = pd.read_csv('/Users/Steve/dev/aiMasters/aai-540-homework/homework-3-1/housing.csv')

display(gmaps_houses_df.head())
display(housing_df.head())

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,...,,,,,,,,,,
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,...,,,,,,,,,,
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,...,,,,,,,,,,
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,...,,,,,,,,,,
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,...,,,,,,,,,,


In [72]:
df = pd.merge(housing_df, gmaps_houses_df, on=['longitude', 'latitude'], how='inner')

df['neighborhood'] = df['neighborhood-political']
print("find wharf" , list(df['neighborhood'].unique()).count("Fisherman's Wharf"))
df = df.dropna(subset=['neighborhood'])
df = df.drop(columns=['neighborhood-political'])
df['event_time'] = datetime.datetime.now()
df['ocean_proximity'] = df['ocean_proximity'].str.replace(' ', '_')


ocean_proximity_dummies = pd.get_dummies(df['ocean_proximity'], dtype=int)
df = pd.concat([df, ocean_proximity_dummies], axis=1)

df['median_house_value'] = df.groupby('neighborhood')['median_house_value'].transform('mean')
df['median_house_value'] = df['median_house_value'].clip(upper=500000)

# Average 'median_house_age', into 10-year bins
df['median_house_age'] = df.groupby('neighborhood')['housing_median_age'].transform('mean')
df['median_house_age'] = pd.cut(df['median_house_age'], bins=np.arange(0, 101, 10), right=False, labels=[f"{i}-{i+9}" for i in range(0, 100, 10)])

# Total households (average per neighborhood, rounded up)
df['total_households'] = df.groupby('neighborhood')['households'].transform('mean').apply(np.ceil).astype(int)
# Bedrooms per household (average and impute missing values)
df['total_bedrooms'] = df.groupby('neighborhood')['total_bedrooms'].transform(
    lambda x: x.fillna(x.mean()) if x.mean() > 0 else x.fillna(0)
)

df['bedrooms_per_household'] = df['total_bedrooms'] / df['households']

# Replacing locaility code with postal code - unsure from directions 
df['locality_code'] = df['postal_code']

# Selecting the final columns
final_cols = ['neighborhood', 'event_time', '<1H_OCEAN', 'INLAND', 'INLAND', 'NEAR_BAY', 'NEAR_OCEAN',
               'median_house_value', 'median_house_age', 'total_households', 'bedrooms_per_household', 'locality_code']

# Generate the final dataframe
final_df = df[final_cols]
display(final_df.head())


find wharf 1
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'street_number', 'route',
       'locality-political', 'administrative_area_level_2-political',
       'administrative_area_level_1-political', 'country-political',
       'postal_code', 'address', 'postal_code_suffix',
       'establishment-point_of_interest-transit_station',
       'establishment-park-point_of_interest', 'premise',
       'establishment-point_of_interest-subway_station-transit_station',
       'airport-establishment-finance-moving_company-point_of_interest-storage',
       'subpremise',
       'bus_station-establishment-point_of_interest-transit_station',
       'establishment-park-point_of_interest-tourist_attraction',
       'establishment-natural_feature',
       'airport-establishment-point_of_interest',
       'political-sublocality-sublocality_level_1',
       'adm

Unnamed: 0,neighborhood,event_time,<1H_OCEAN,INLAND,INLAND.1,NEAR_BAY,NEAR_OCEAN,median_house_value,median_house_age,total_households,bedrooms_per_household,locality_code
1,Merriewood,2024-09-23 15:40:15.320654,0,0,0,1,0,328500.0,30-39,797,0.97188,94611.0
2,Upper Rockridge,2024-09-23 15:40:15.320654,0,0,0,1,0,377557.285714,40-49,358,1.073446,94618.0
3,Rockridge,2024-09-23 15:40:15.320654,0,0,0,1,0,292483.333333,50-59,425,1.073059,94618.0
4,Rockridge,2024-09-23 15:40:15.320654,0,0,0,1,0,292483.333333,50-59,425,1.081081,94618.0
5,Rockridge,2024-09-23 15:40:15.320654,0,0,0,1,0,292483.333333,50-59,425,1.103627,94618.0


### Define Feature Group

In [None]:

neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)

current_time_sec = int(round(time.time()))

# def cast_object_to_string(data_frame):
#     for label in data_frame.columns:
#         if data_frame.dtypes[label] == "object":
#             data_frame[label] = data_frame[label].astype("str").astype("string")

# cast_object_to_string(gmaps_houses_df)
# cast_object_to_string(housing_df)

In [None]:
# record identifier and event time feature names
record_identifier_feature_name = "neighborhood"
event_time_feature_name = "event_time"

current_time_sec = int(round(time.time()))

# append EventTime feature
final_df[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(final_df), dtype="float64"
)

# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
neighborhood_feature_group.load_feature_definitions(data_frame=final_df)

#### Create FeatureGroups in SageMaker FeatureStore

In [None]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

s3_private_data_path = "s3://{}/feature_groups/".format(bucket)

neighborhood_feature_group.create(
    s3_uri= s3_private_data_path,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

In [None]:
# Validate feature group
neighborhood_feature_group.describe()
sagemaker_client.list_feature_groups()

In [None]:
# Put Records into feature group 
neighborhood_feature_group.ingest(data_frame=final_df, max_workers=5, wait=True)

In [None]:
# retreive a record from the online store
# Brooktree, Fisherman’s Wharf, Los Osos

# change this 
record_identifier_value = 'Brooktree'

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

In [None]:
transaction_id = str(3450774)


# Helper to parse the feature value from the record.
def get_feature_value(record, feature_name):
    return str(list(filter(lambda r: r["FeatureName"] == feature_name, record))[0]["ValueAsString"])


transaction_response = featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name, RecordIdentifierValueAsString=transaction_id
)
transaction_record = transaction_response["Record"]


get_feature_value(transaction_record, "TransactionDT")

----------------

#### Convert csv to tsv and move to S3 

In [6]:
s3_private_data_path = "s3://{}/w2-musicData/csv".format(bucket)
print(s3_private_data_path)

s3://sagemaker-us-east-1-106006112223/w2-musicData/csv


In [11]:
!aws s3 cp "dataset_clean.csv" $s3_private_data_path/

upload: ./dataset_clean.csv to s3://sagemaker-us-east-1-106006112223/w2-musicData/csv/dataset_clean.csv


In [12]:
!aws s3 ls $s3_private_data_path/

2024-09-17 05:56:57   16931936 dataset_clean.csv


#### Create DB in Athena for queries

In [4]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
database_name = "w2_music_db"
table_name_tsv = 'music_ds_tsv10'
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)
print(s3_staging_dir)
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

s3://sagemaker-us-east-1-106006112223/athena/staging


In [7]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
pd.read_sql(statement, conn)

statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  pd.read_sql(statement, conn)
  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,w2_music_db


#### Create tables in DB and schemas

In [7]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         track_id string,
         artists string,
         track_name string,
         popularity int,
         duration_ms int,
         explicit string,
         danceability float,
         energy float,
         key int,
         loudness float,
         mode int,
         speechiness float,
         acousticness float,
         instrumentalness float,
         liveness float,
         valence float,
         tempo float,
         time_signature int,
         track_genre string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')""".format(
    database_name, table_name_tsv, s3_private_data_path
)

print(statement)

pd.read_sql(statement, conn)

CREATE EXTERNAL TABLE IF NOT EXISTS w2_music_db.music_ds_tsv10(
         track_id string,
         artists string,
         track_name string,
         popularity int,
         duration_ms int,
         explicit boolean,
         danceability float,
         energy float,
         key int,
         loudness float,
         mode int,
         speechiness float,
         acousticness float,
         instrumentalness float,
         liveness float,
         valence float,
         tempo float,
         time_signature int,
         track_genre string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-106006112223/w2-musicData/csv'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')


  pd.read_sql(statement, conn)


In [8]:
statement = "SHOW TABLES IN W2_MUSIC_DB"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,music_ds_tsv10
1,music_ds_tsv8
2,music_ds_tsv9


In [9]:
# first test query to get all data via athena

statement = """SELECT * FROM {}.{} LIMIT 5""".format(
    database_name, table_name_tsv
)
print(statement)
sql_df = pd.read_sql(statement, conn)
display(sql_df.head(5))

SELECT * FROM w2_music_db.music_ds_tsv10 LIMIT 5


  sql_df = pd.read_sql(statement, conn)


Unnamed: 0,track_id,artists,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [12]:
# reading local csv file using pandas
full_df = pd.read_csv('dataset_clean.csv')
full_df = full_df.dropna()
display(full_df.head())

Unnamed: 0,track_id,artists,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Homework queries

#### 1. List artist, track_name, and popularity for songs that have a popularity greater than or equal to 99

In [10]:
statement = """SELECT artists, track_name, popularity FROM {}.{}
    WHERE popularity >= 99""".format(
    database_name, table_name_tsv
)

# CAST(popularity AS INTEGER) >= 99
print(statement)

df = pd.read_sql(statement, conn)
df.head(10)

SELECT artists, track_name, popularity FROM w2_music_db.music_ds_tsv10
    WHERE popularity >= 99


  df = pd.read_sql(statement, conn)


Unnamed: 0,artists,track_name,popularity
0,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),100
1,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),100


In [13]:
# pandas
pd_df = full_df[full_df['popularity'] >= 99][['artists','track_name','popularity']]
display(pd_df.head())

Unnamed: 0,artists,track_name,popularity
20001,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),100
51664,Bizarrap;Quevedo,"Quevedo: Bzrp Music Sessions, Vol. 52",99
81051,Sam Smith;Kim Petras,Unholy (feat. Kim Petras),100


#### 2. List artists with an average popularity of 92


In [20]:
statement = """SELECT artists FROM {}.{}
    GROUP BY artists HAVING AVG(popularity) = 92""".format(
    database_name, table_name_tsv
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(10)

SELECT artists FROM w2_music_db.music_ds_tsv9
    GROUP BY artists HAVING AVG(popularity) = 92


  df = pd.read_sql(statement, conn)


Unnamed: 0,artists
0,Harry Styles
1,Rema;Selena Gomez


In [21]:
# pandas
artists_avg_popularity = full_df.groupby('artists').filter(lambda x: x['popularity'].mean() == 92)
display(artists_avg_popularity.head())
artists_avg_popularity_list = artists_avg_popularity['artists'].unique()
print(artists_avg_popularity_list)


Unnamed: 0,track_id,artists,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
81052,4LRPiXqCikLlN15c3yImP7,Harry Styles,As It Was,95,167303,False,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,4,pop
81100,0WtM2NBVQNNJLh6scP13H8,Rema;Selena Gomez,Calm Down (with Selena Gomez),92,239317,False,0.801,0.806,11,-5.206,1,0.0381,0.382,0.000669,0.114,0.802,106.999,4,pop
81158,6UelLqGlWMcVH1E5c4H7lY,Harry Styles,Watermelon Sugar,89,174000,False,0.548,0.816,0,-4.209,1,0.0465,0.122,0.0,0.335,0.557,95.39,4,pop
81205,4Dvkj6JhhA12EX05fT7y2e,Harry Styles,As It Was,92,167303,False,0.52,0.731,6,-5.338,0,0.0557,0.342,0.00101,0.311,0.662,173.93,4,pop


['Harry Styles' 'Rema;Selena Gomez']


#### 3. List the Top 10 most energetic genres
  

In [22]:
statement = """SELECT DISTINCT track_genre
    FROM {}.{}
    LIMIT 10;""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
display(df.head(10))

# Error in schema / parsing, track_genre is all messed up

SELECT DISTINCT track_genre
    FROM w2_music_db.music_ds_tsv10
    LIMIT 10;


  df = pd.read_sql(statement, conn)


Unnamed: 0,track_genre
0,acoustic
1,4
2,3
3,163.99
4,124.157
5,193.395
6,afrobeat
7,alt-rock
8,alternative
9,ambient


In [18]:
statement = """SELECT track_genre, AVG(energy) AS avg_energy FROM {}.{}
    GROUP BY track_genre 
    ORDER BY avg_energy DESC""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
df.head(10)

SELECT track_genre, AVG(energy) AS avg_energy FROM w2_music_db.music_ds_tsv10
    GROUP BY track_genre 
    ORDER BY avg_energy DESC


  df = pd.read_sql(statement, conn)


Unnamed: 0,track_genre,avg_energy
0,0.797,1174026.0
1,0.556,691306.0
2,0.492,542000.0
3,0.45,538160.0
4,0.347,526706.0
5,0.0761,502786.0
6,0.0903,449813.0
7,0.035,440310.0
8,0.483,371160.0
9,0.147,355693.0


In [23]:
# pandas
top_energetic_genres = full_df.groupby('track_genre')['energy'].mean().sort_values(ascending=False).head(10)
display(top_energetic_genres)

track_genre
death-metal      0.931470
grindcore        0.924201
metalcore        0.914485
happy            0.910971
hardstyle        0.901246
drum-and-bass    0.876635
black-metal      0.874897
heavy-metal      0.874003
party            0.871237
j-idol           0.868677
Name: energy, dtype: float64

#### 4. How many tracks is Bad Bunny on?

In [26]:
# SELECT COUNT(*) AS track_count
# FROM w2_music_db.tracks
# WHERE artists LIKE '%Bad Bunny%';

statement = """SELECT COUNT(*) AS track_count FROM {}.{}
    WHERE artists LIKE '%Bad Bunny%'""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
print(df)

  df = pd.read_sql(statement, conn)


SELECT COUNT(*) AS track_count FROM w2_music_db.music_ds_tsv10
    WHERE artists LIKE '%Bad Bunny%'
   track_count
0          416


In [27]:
bad_bunny_tracks_count = full_df[full_df['artists'].str.contains('Bad Bunny')].shape[0]
print(bad_bunny_tracks_count)

416


#### 5. Show the top 10 genres in terms of popularity sorted by their most popular track

In [27]:
# SELECT track_genre, MAX(popularity) AS max_popularity
# FROM w2_music_db.tracks
# GROUP BY track_genre
# ORDER BY max_popularity DESC
# LIMIT 10;

statement = """SELECT track_genre, MAX(popularity) AS max_popularity FROM {}.{}
    GROUP BY track_genre
    ORDER BY max_popularity DESC
    LIMIT 10""".format(
    database_name, table_name_tsv
)

print(statement)
df = pd.read_sql(statement, conn)
df.head(10)

# noticed slight difference in return... hip hop genre got ereased?

SELECT track_genre, MAX(popularity) AS max_popularity FROM w2_music_db.music_ds_tsv10
    GROUP BY track_genre
    ORDER BY max_popularity DESC
    LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,track_genre,max_popularity
0,dance,100
1,pop,100
2,latin,98
3,reggaeton,98
4,latino,98
5,edm,98
6,reggae,98
7,piano,96
8,rock,96
9,chill,93


In [28]:

# pandas
top_genres_by_popularity = full_df.groupby('track_genre')['popularity'].max().sort_values(ascending=False).head(10)
print(top_genres_by_popularity)

track_genre
dance        100
pop          100
hip-hop       99
latin         98
edm           98
latino        98
reggaeton     98
reggae        98
rock          96
piano         96
Name: popularity, dtype: int64


In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}