Song popularity prediction based on audio features (for now).

In [71]:
# Import packages
import os
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from datetime import date

%matplotlib inline
sns.set() # Setting seaborn as default style even if use only matplotlib

In [72]:
# constants
dataset_filename = 'Spotify_Dataset_V3_local.csv'
audio_features_col_names = ['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence']

In [73]:
spotify_filepath = os.path.join(os.getcwd(),'data', dataset_filename)
print(spotify_filepath)

spotify_data = pd.read_csv(spotify_filepath, delimiter=";")
spotify_data.head()

/Users/neo/Documents/1-app-ml/aml-code/AML-spotify/data/Spotify_Dataset_V3_local.csv


Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id,Song URL
0,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",29/05/2023,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 1,Eslabon Armado,Nationality 1,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
1,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",29/05/2023,0.668,0.758,-5176.0,0.033,0.483,0.0,0.834,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...
2,2,WHERE SHE GOES,Bad Bunny,29/05/2023,0.652,0.8,-4019.0,0.061,0.143,0.629,0.234,Artist 1,Bad Bunny,Nationality 1,Puerto Rico,Latin-America,199,199.0,7ro0hRteUMfnOioTFI5TG1,https://open.spotify.com/track/7ro0hRteUMfnOio...
3,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",29/05/2023,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 1,Yng Lvcas,Nationality 1,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...
4,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",29/05/2023,0.812,0.479,-5678.0,0.333,0.213,0.0,0.559,Artist 2,Peso Pluma,Nationality 2,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...


## data cleaning:
- Assume that 'Song URL' column is not needed
- all values are non-null, no need to remove rows
- should change "Date" column type to datetime
- sort dates via chronological order
- could rename columns (standard variable names)
- Scale the audio features
    - All features [0, 1]
    - Loudness : Log base 10 (of the absolute value) then divde by 60.

*TODO:*
- ~~verify all years full except 2023?~~
    --> total, use 2023

- **for different values (in the audio features) for the same song id:**
    - average
        - currently just taking the first one

In [74]:
spotify_data.columns

Index(['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy',
       'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
       'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality',
       'Nationality', 'Continent', 'Points (Total)',
       'Points (Ind for each Artist/Nat)', 'id', 'Song URL'],
      dtype='object')

In [75]:
# Assume that 'Song URL' column is not needed
spotify_data.drop(['Song URL'], axis=1, inplace=True, errors='ignore')

# Rename columns
spotify_data = spotify_data.rename(columns={
    "# of Artist": "Num_of_artist", "Artist (Ind.)": "Artist_ind", "# of Nationality": "Num_of_nationality", 
    "Points (Total)": "Points_total", "Points (Ind for each Artist/Nat)": "Points_ind", "Song URL": "Song_URL"
})

In [76]:
# make "Date" column datetime type
spotify_data["Date"] = pd.to_datetime(spotify_data["Date"], dayfirst=True)
spotify_data.info()
# spotify_data.loc[0]["Date"]

# reverse dates
spotify_data = spotify_data.sort_values(by="Date")
# spotify_data.head()

# Keep in mind, ranks are also reversed now...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651936 entries, 0 to 651935
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Rank                651936 non-null  int64         
 1   Title               651936 non-null  object        
 2   Artists             651936 non-null  object        
 3   Date                651936 non-null  datetime64[ns]
 4   Danceability        651936 non-null  float64       
 5   Energy              651936 non-null  float64       
 6   Loudness            651936 non-null  float64       
 7   Speechiness         651936 non-null  float64       
 8   Acousticness        651936 non-null  float64       
 9   Instrumentalness    651936 non-null  float64       
 10  Valence             651936 non-null  float64       
 11  Num_of_artist       651936 non-null  object        
 12  Artist_ind          651936 non-null  object        
 13  Num_of_nationality  651936 no

In [77]:
# Scale the audio features.
spotify_data['Loudness'] = spotify_data['Loudness'].apply(lambda x: np.log10(np.abs(x)) / 60)
spotify_data.head()

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Num_of_artist,Artist_ind,Num_of_nationality,Nationality,Continent,Points_total,Points_ind,id
651935,200,Ni**as In Paris,"JAY-Z, Kanye West",2017-01-01,0.757,0.882,0.063118,0.248,0.076,0.0,0.684,Artist 2,Kanye West,Nationality 2,United States,Anglo-America,1,1.0,2KpCpk6HjXXLb7nnXoXA5O
651770,62,Sexual,"NEIKED, Dyo",2017-01-01,0.803,0.569,0.064479,0.074,0.062,0.0,0.809,Artist 1,NEIKED,Nationality 1,Sweden,Europe,139,70.0,3AsAuGTaDQzavZZThyYlop
651769,61,"How Far I'll Go - From ""Moa""",Alessia Cara,2017-01-01,0.314,0.555,0.066372,0.37,0.157,0.0,0.159,Artist 1,Alessia Cara,Nationality 1,Canada,Anglo-America,140,140.0,5hYTyyh2odQKphUbMqc5gN
651768,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.0,0.392,Artist 2,Rihanna,Nationality 2,Barbados,Anglo-America,141,70.0,11KJSRSgaDxqydKYiD2Jew
651767,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.0,0.392,Artist 1,Drake,Nationality 1,Canada,Anglo-America,141,70.0,11KJSRSgaDxqydKYiD2Jew


## Prepare for classification
Ideas:
* first split into "popular" vs "unpopular"
* then try "hit", "high", "mid", "low"
* then see whether other features such as cumulative days charting, release date, genre etc. can be used as well...?

In [78]:
# want one row per song.
dropped_artist_split = spotify_data.drop_duplicates(subset=['Title', 'Artists', 'Date'], keep="last") 
# TODO: should really keep the first listed artist.

grouped_by_title = dropped_artist_split.groupby(["Title", "Artists"])
grouped_by_title.head()

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Num_of_artist,Artist_ind,Num_of_nationality,Nationality,Continent,Points_total,Points_ind,id
651769,61,"How Far I'll Go - From ""Moa""",Alessia Cara,2017-01-01,0.314,0.555,0.066372,0.370,0.157,0.0,0.159,Artist 1,Alessia Cara,Nationality 1,Canada,Anglo-America,140,140.0,5hYTyyh2odQKphUbMqc5gN
651767,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.0,0.392,Artist 1,Drake,Nationality 1,Canada,Anglo-America,141,70.0,11KJSRSgaDxqydKYiD2Jew
651766,59,Panda,Desiigner,2017-01-01,0.576,0.766,0.061567,0.449,0.028,0.0,0.236,Artist 1,Desiigner,Nationality 1,United States,Anglo-America,142,142.0,5OOkp4U9P9oL23maHFHL1h
651764,58,Perfect Strangers,"Jos Blue, JP Cooper",2017-01-01,0.739,0.833,0.061667,0.046,0.350,0.0,0.699,Artist 1,Jos Blue,Nationality 1,United States,Anglo-America,143,72.0,5bZtRlMBU76vHuDOb1GM5u
651762,57,La Bicicleta,"Carlos Vives, Shakira",2017-01-01,0.736,0.964,0.055531,0.129,0.198,0.0,0.953,Artist 1,Carlos Vives,Nationality 1,Colombia,Latin-America,144,72.0,0sXvAOmXgjR2QUqLK1MltU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,85,Bye,Peso Pluma,2023-05-29,0.782,0.812,0.063673,0.047,0.567,0.0,0.700,Artist 1,Peso Pluma,Nationality 1,Mexico,Latin-America,116,116.0,6n2P81rPk2RTzwnNNgFOdb
57,37,Dance The Night (From Barbie The Album),Dua Lipa,2023-05-29,0.671,0.845,0.011547,0.048,0.021,0.0,0.775,Artist 1,Dua Lipa,Nationality 1,United Kingdom,Europe,164,164.0,1vYXt7VSjH9JIM5oRRo7vA
52,33,Hits Different,Taylor Swift,2023-05-29,0.672,0.782,0.063547,0.042,0.148,0.0,0.239,Artist 1,Taylor Swift,Nationality 1,United States,Anglo-America,168,168.0,3xYJScVfxByb61dYHTwiby
284,194,Car's Outside,James Arthur,2023-05-29,0.338,0.562,0.064270,0.029,0.039,0.0,0.259,Artist 1,James Arthur,Nationality 1,United Kingdom,Europe,7,7.0,0otRX6Z89qKkHkQ9OqJpKt


In [79]:
summed_points = grouped_by_title["Points_total"].sum()

In [80]:
grouped_by_title.get_group(("Queencard", "(G)I-DLE"))

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Num_of_artist,Artist_ind,Num_of_nationality,Nationality,Continent,Points_total,Points_ind,id
3991,132,Queencard,(G)I-DLE,2023-05-16,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,69,69.0,4uOBL4DDWWVx4RhYKlPbPC
3666,105,Queencard,(G)I-DLE,2023-05-17,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,96,96.0,4uOBL4DDWWVx4RhYKlPbPC
3390,112,Queencard,(G)I-DLE,2023-05-18,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,89,89.0,4uOBL4DDWWVx4RhYKlPbPC
3111,123,Queencard,(G)I-DLE,2023-05-19,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,78,78.0,4uOBL4DDWWVx4RhYKlPbPC
2844,143,Queencard,(G)I-DLE,2023-05-20,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,58,58.0,4uOBL4DDWWVx4RhYKlPbPC
2513,113,Queencard,(G)I-DLE,2023-05-21,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,88,88.0,4uOBL4DDWWVx4RhYKlPbPC
2215,105,Queencard,(G)I-DLE,2023-05-22,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,96,96.0,4uOBL4DDWWVx4RhYKlPbPC
1929,111,Queencard,(G)I-DLE,2023-05-23,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,90,90.0,4uOBL4DDWWVx4RhYKlPbPC
1644,113,Queencard,(G)I-DLE,2023-05-24,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,88,88.0,4uOBL4DDWWVx4RhYKlPbPC
1363,117,Queencard,(G)I-DLE,2023-05-25,0.815,0.834,0.057525,0.048,0.033,0.0,0.692,Artist 1,(G)I-DLE,Nationality 1,South Korea,Asia,84,84.0,4uOBL4DDWWVx4RhYKlPbPC


In [81]:
# add on the total points to the right
grouped_df = grouped_by_title.first()

In [82]:
one_row_one_song = grouped_df.merge(summed_points.rename("Points_sum"), left_index=True, right_index=True)
one_row_one_song.drop(['Num_of_nationality', "Nationality", "Continent", "Points_total", "Points_ind", "Num_of_artist", "Artist_ind"], axis=1, inplace=True, errors='ignore')
one_row_one_song

Unnamed: 0_level_0,Unnamed: 1_level_0,Rank,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,id,Points_sum
Title,Artists,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
'98 Braves,Morgan Wallen,72,2023-03-03,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,3oZ6dlSfCE9gZ55MGPJctc,287
!,Trippie Redd,56,2019-08-09,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,5a1ofLoPiAn5xxf5UE6472,194
#PROUDCATOWNERREMIX,"XXXTENTACION, Rico Nasty",189,2019-08-23,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,5dkZ2DrcPJrqwlRQe6Q35e,12
$$$,XXXTENTACION,67,2018-03-16,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,65u1dHQyQyE4y4aN2eDmMF,1018
$€ Freestyle,Sfera Ebbasta,76,2020-11-20,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,2j9wr1cxfYcMJoANfmoJA9,125
...,...,...,...,...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,Damso,166,2021-04-29,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,5UNFWkGaEJgQFqoJxZWMzm,35
ÎŸ. OG,Damso,84,2021-04-29,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,63ALxy05IGeKrfOjvGG7lO,117
İmdat,cakal,153,2022-04-18,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,123hB20IVD2yw1NMIgrqb3,1161
アイドル,YOASOBI,148,2023-04-16,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,7ovUcF5uHTBRzUpB6ZOmvt,3983


In [83]:
# sort by points sum
one_row_one_song.sort_values(by="Points_sum", ascending=False, inplace=True)

# create popularity column and top half is "popular", bottom half is "unpopular"
one_row_one_song["Popularity"] = "n/a"
l = len(one_row_one_song.index) // 2 
one_row_one_song.iloc[:l, -1] = 'popular'
one_row_one_song.iloc[l:, -1] = 'unpopular'

# create binary popularity column and top half is "popular", bottom half is "unpopular"
one_row_one_song["Binary_popularity"] = -2
l = len(one_row_one_song.index) // 2 
one_row_one_song.iloc[:l, -1] = 1
one_row_one_song.iloc[l:, -1] = 0
one_row_one_song

Unnamed: 0_level_0,Unnamed: 1_level_0,Rank,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,id,Points_sum,Popularity,Binary_popularity
Title,Artists,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Believer,Imagine Dragons,165,2017-02-02,0.779,0.787,0.060566,0.108,0.052,0.0,0.708,0CcQNd8CINkwQfe1RDtGV6,244737,popular,1
Shape of You,Ed Sheeran,1,2017-01-06,0.825,0.652,0.058381,0.080,0.581,0.0,0.931,7qiZfU4dY1lWllzX7mPBI3,227794,popular,1
Blinding Lights,The Weeknd,8,2019-11-29,0.513,0.796,0.060169,0.063,0.001,0.0,0.345,0sf12qNH5qcw8qpgymFOqD,220508,popular,1
Someone You Loved,Lewis Capaldi,198,2019-01-07,0.501,0.405,0.062571,0.032,0.751,0.0,0.446,2TIlqbIneP0ZY1O0EzYLlc,220157,popular,1
Perfect,Ed Sheeran,10,2017-03-03,0.599,0.448,0.063336,0.023,0.163,0.0,0.168,0tgVpDi06FyKpA1z0VMD4v,200148,popular,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Maps,Maroon 5,200,2023-04-13,0.742,0.713,0.062368,0.030,0.021,0.0,0.879,4gbVRS8gloEluzf0GzDOFc,1,unpopular,0
Hood,"AK AUSSERKONTROLLE, Shindy",200,2020-03-27,0.874,0.435,0.015650,0.402,0.027,0.0,0.609,6xMBPfgaEQKshwp7oYrtP4,1,unpopular,0
DrIP DrIp Drip (feat. Meek Mill),Tory Lanez,200,2018-10-19,0.710,0.814,0.061007,0.056,0.162,0.0,0.714,39Bq9cOTO0H8M6P9iKLVdq,1,unpopular,0
Under Pressure,"Shawn Mendes, Teddy Geiger",200,2018-10-12,0.667,0.478,0.062132,0.032,0.200,0.0,0.337,2fLyikLZmHxW4XdToFpDdN,1,unpopular,0


## Naive Bayes

In [84]:
# only features and labels...
nb_df = one_row_one_song.copy(deep=True)

# Student needs to provide code similar to below
X = nb_df.drop(['Date', 'Rank', 'id', 'Points_sum', 'Popularity', 'Binary_popularity'], axis=1)
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence
Title,Artists,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Believer,Imagine Dragons,0.779,0.787,0.060566,0.108,0.052,0.0,0.708
Shape of You,Ed Sheeran,0.825,0.652,0.058381,0.080,0.581,0.0,0.931
Blinding Lights,The Weeknd,0.513,0.796,0.060169,0.063,0.001,0.0,0.345
Someone You Loved,Lewis Capaldi,0.501,0.405,0.062571,0.032,0.751,0.0,0.446
Perfect,Ed Sheeran,0.599,0.448,0.063336,0.023,0.163,0.0,0.168
...,...,...,...,...,...,...,...,...
Maps,Maroon 5,0.742,0.713,0.062368,0.030,0.021,0.0,0.879
Hood,"AK AUSSERKONTROLLE, Shindy",0.874,0.435,0.015650,0.402,0.027,0.0,0.609
DrIP DrIp Drip (feat. Meek Mill),Tory Lanez,0.710,0.814,0.061007,0.056,0.162,0.0,0.714
Under Pressure,"Shawn Mendes, Teddy Geiger",0.667,0.478,0.062132,0.032,0.200,0.0,0.337


In [85]:
# Student needs to provide code similar to below
y = nb_df['Binary_popularity']
y

Title                             Artists                     
Believer                          Imagine Dragons                 1
Shape of You                      Ed Sheeran                      1
Blinding Lights                   The Weeknd                      1
Someone You Loved                 Lewis Capaldi                   1
Perfect                           Ed Sheeran                      1
                                                                 ..
Maps                              Maroon 5                        0
Hood                              AK AUSSERKONTROLLE, Shindy      0
DrIP DrIp Drip (feat. Meek Mill)  Tory Lanez                      0
Under Pressure                    Shawn Mendes, Teddy Geiger      0
Sogra - Ao Vivo                   Dilsinho, Henrique & Juliano    0
Name: Binary_popularity, Length: 7801, dtype: int64

In [86]:
# use Gaussian, not Multinomial?
gnb = GaussianNB()
gnb.fit(X=X, y=y)

Training set accuracy check: 53%.

Model application is fine, either it's the wrong model or the features aren't indicative enough.

In [87]:
ca = gnb.score(X,y)
print('Training set accuracy: {:.2f}%'.format(ca*100))

Training set accuracy: 53.01%


In [88]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=8, random_state=0)

In [89]:
gnb2 = GaussianNB()
gnb2.fit(X=x_train, y=y_train)

In [90]:
ca = gnb.score(x_test,y_test)
print('Training set accuracy: {:.2f}%'.format(ca*100))

Training set accuracy: 37.50%


## Logistic Regression

In [94]:
lr = LogisticRegression(solver='lbfgs')
lr.fit(x_train, y_train)
print('Classification accuracy on training set: {:.2f}%'.format(lr.score(x_train, y_train)*100))
print('Classification accuracy on test set: {:.2f}%'.format(lr.score(x_test, y_test)*100))

Classification accuracy on training set: 55.06%
Classification accuracy on test set: 37.50%
