In [29]:
# Import packages
import os
import datetime
from datetime import date

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline
sns.set()  # Setting seaborn as default style even if use only matplotlib

# Names of the audio-feature columns; reused further down when those columns
# have to be dropped as a group.
audio_features_col_names = [
    "Danceability",
    "Energy",
    "Loudness",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Valence",
]

# The CSV is looked up relative to the current working directory.
spotify_filepath = os.path.join(os.getcwd(), "spotify", "spotify.csv")
print(spotify_filepath)

# Load the semicolon-separated file and discard the URL column when present.
spotify_data = pd.read_csv(spotify_filepath, delimiter=";")
spotify_data = spotify_data.drop(columns=["Song URL"], errors="ignore")

# Parse "Date" as day-first datetimes, then order the rows oldest to newest.
spotify_data["Date"] = pd.to_datetime(spotify_data["Date"], dayfirst=True)
spotify_data = spotify_data.sort_values(by="Date")

# Rescale Loudness as log10(|x|) / 60, computed on the whole column at once.
# Note: any |x| < 1 maps to a negative number and x == 0 maps to -inf.
spotify_data["Loudness"] = np.log10(spotify_data["Loudness"].abs()) / 60
spotify_data

/home/jovyan/spotify/spotify.csv


Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id
651935,200,Ni**as In Paris,"JAY-Z, Kanye West",2017-01-01,0.757,0.882,0.063118,0.248,0.076,0.00,0.684,Artist 2,Kanye West,Nationality 2,United States,Anglo-America,1,1.000000,2KpCpk6HjXXLb7nnXoXA5O
651770,62,Sexual,"NEIKED, Dyo",2017-01-01,0.803,0.569,0.064479,0.074,0.062,0.00,0.809,Artist 1,NEIKED,Nationality 1,Sweden,Europe,139,70.000000,3AsAuGTaDQzavZZThyYlop
651769,61,"How Far I'll Go - From ""Moa""",Alessia Cara,2017-01-01,0.314,0.555,0.066372,0.370,0.157,0.00,0.159,Artist 1,Alessia Cara,Nationality 1,Canada,Anglo-America,140,140.000000,5hYTyyh2odQKphUbMqc5gN
651768,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.00,0.392,Artist 2,Rihanna,Nationality 2,Barbados,Anglo-America,141,70.000000,11KJSRSgaDxqydKYiD2Jew
651767,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.00,0.392,Artist 1,Drake,Nationality 1,Canada,Anglo-America,141,70.000000,11KJSRSgaDxqydKYiD2Jew
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,135,Can't Hold Us (feat. Ray Dalton),"Macklemore & Ryan Lewis, Macklemore, Ryan Lewi...",2023-05-29,0.633,0.927,0.060835,0.084,0.027,0.00,0.880,Artist 4,Ray Dalton,Nationality 4,United States,Anglo-America,66,16.500000,22skzmqfdWrjJylampe0kt
199,136,Cartão Black,"MC Caverinha, KayBlack, Wall Hein",2023-05-29,0.842,0.468,0.065551,0.637,0.255,0.00,0.547,Artist 1,MC Caverinha,Nationality 1,Brazil,Latin-America,65,21.666667,12YCtLHGk6tP6RbHDHflCs
200,136,Cartão Black,"MC Caverinha, KayBlack, Wall Hein",2023-05-29,0.842,0.468,0.065551,0.637,0.255,0.00,0.547,Artist 2,KayBlack,Nationality 2,Brazil,Latin-America,65,21.666667,12YCtLHGk6tP6RbHDHflCs
202,137,Apocalypse,Cigarettes After Sex,2023-05-29,0.369,0.467,0.065919,0.027,0.019,0.46,0.174,Artist 1,Cigarettes After Sex,Nationality 1,United States,Anglo-America,64,64.000000,3AVrVz5rK8Hrqo9YGiVGN5


In [2]:
# Work on a separate frame without the per-artist / per-nationality bookkeeping
# columns and the rank; spotify_data itself is left untouched.
spotify_data_copy = spotify_data.drop(
    columns=[
        "Artists",
        '# of Artist',
        'Artist (Ind.)',
        '# of Nationality',
        'Continent',
        'Points (Ind for each Artist/Nat)',
        'Rank',
    ]
)

# One row per song title holding the average of every remaining numeric column.
songs_grouped_by_title = spotify_data_copy.groupby("Title")
data_by_song_mean = songs_grouped_by_title.mean(numeric_only=True)
data_by_song_mean


Unnamed: 0_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'98 Braves,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,71.750000
!,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,97.000000
#PROUDCATOWNERREMIX,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,12.000000
$$$,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,84.833333
$€ Freestyle,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,125.000000
...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,35.000000
ÎŸ. OG,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,117.000000
İmdat,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,72.562500
アイドル,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,90.522727


In [3]:
# Seed for the per-day random row pick below, so that re-running the notebook
# reproduces the same points table (and therefore the same labels).
SAMPLE_SEED = 42

# Keep only the chart entries dated in 2022 (independent copy of spotify_data).
data_copy_2022 = spotify_data.loc[spotify_data["Date"].dt.year == 2022].copy()

# One row per song title with the averaged numeric features; the points and
# rank columns are not audio features, so they are removed.
songs_2022 = (
    data_copy_2022.groupby("Title")
    .mean(numeric_only=True)
    .drop(columns=['Points (Total)', 'Rank', 'Points (Ind for each Artist/Nat)'])
)

# One row per chart entry, stripped of the audio features and the
# artist / nationality / rank / id columns.
points_per_day = data_copy_2022.drop(
    columns=audio_features_col_names
    + [
        'Artists',
        '# of Artist',
        'Artist (Ind.)',
        'Nationality',
        '# of Nationality',
        'Continent',
        'Points (Ind for each Artist/Nat)',
        'id',
        'Rank',
    ]
)

# Dictionary keyed by date. A title can occupy several rows on the same date,
# so one row per title is drawn (seeded) to leave each title at most once per
# day, which the pivot below requires. The loop names deliberately avoid
# `date`, which would otherwise rebind the `date` imported from datetime.
points_by_date = {
    chart_date: (
        day_points.groupby('Title', as_index=False)
        .sample(n=1, random_state=SAMPLE_SEED)
        .reset_index(drop=True)
    )
    for chart_date, day_points in points_per_day.groupby('Date')
}

# Example lookup of a single day's frame.
points_1 = points_by_date[pd.to_datetime('2022-01-01')]

# Stack the per-day frames and reshape to: rows = song title, columns = date,
# cells = "Points (Total)" of that song on that date.
points_concatenated = pd.concat(points_by_date.values(), ignore_index=True)
points_per_date_and_title = points_concatenated.pivot(
    index='Title', columns='Date', values='Points (Total)'
)

# A song with no row on a given day gets 0 points for that day.
points_per_date_and_title = points_per_date_and_title.fillna(0)
points_per_date_and_title


Date,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,2022-01-10,...,2022-12-22,2022-12-23,2022-12-24,2022-12-25,2022-12-26,2022-12-27,2022-12-28,2022-12-29,2022-12-30,2022-12-31
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Till I Collapse,0.0,0.0,7.0,31.0,29.0,17.0,0.0,0.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(There's No Place Like) Home for the Holidays - 1959 Version,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,57.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
0440972222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"1, 2, 3 (feat. Jason Derulo & De La Ghetto)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
we fell in love in october,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
what would you do?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you & me,0.0,0.0,0.0,0.0,0.0,0.0,89.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
you broke me first,0.0,46.0,56.0,57.0,54.0,54.0,10.0,0.0,28.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Collapse the per-day points into two numbers per song: the mean over
# Monday-Friday columns and the mean over Saturday/Sunday columns.

# Make sure the column labels are datetimes so that .weekday can be used.
points_per_date_and_title.columns = pd.to_datetime(points_per_date_and_title.columns)

# .weekday runs from 0 (Monday) to 6 (Sunday); the weekend is the complement
# of the Monday-Friday selection.
weekdays_mask = points_per_date_and_title.columns.weekday < 5
weekends_mask = ~weekdays_mask

# Row-wise means over each group of day columns. Days that were filled with 0
# upstream are part of the average.
weekday_avg = points_per_date_and_title.loc[:, weekdays_mask].mean(axis=1)
weekend_avg = points_per_date_and_title.loc[:, weekends_mask].mean(axis=1)

# Put both averages side by side (indexed by song title) and repeat the title
# as the first regular column.
weekday_weekend = pd.concat({'Weekday': weekday_avg, 'Weekend': weekend_avg}, axis=1)
weekday_weekend.insert(0, 'Title', points_per_date_and_title.index)

weekday_weekend


Unnamed: 0_level_0,Title,Weekday,Weekend
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
'Till I Collapse,'Till I Collapse,28.119231,10.095238
(There's No Place Like) Home for the Holidays - 1959 Version,(There's No Place Like) Home for the Holidays ...,0.000000,1.495238
0440972222,0440972222,2.488462,0.790476
"1, 2, 3 (feat. Jason Derulo & De La Ghetto)","1, 2, 3 (feat. Jason Derulo & De La Ghetto)",0.438462,0.314286
10 Things I Hate About You,10 Things I Hate About You,6.869231,4.142857
...,...,...,...
we fell in love in october,we fell in love in october,7.038462,8.352381
what would you do?,what would you do?,0.311538,0.000000
you & me,you & me,0.342308,0.000000
you broke me first,you broke me first,5.738462,2.361905


In [5]:
# Binary label per song: 1 when the weekday average is strictly greater than
# the weekend average, otherwise 0 (equal averages therefore become 0).
binary = weekday_weekend['Weekday'].gt(weekday_weekend['Weekend']).astype(int)

# Single-column frame holding the label, indexed by the song titles.
weekday_binary = binary.to_frame(name='Weekday > Weekend')
weekday_binary.index = weekday_weekend['Title']
weekday_binary

Unnamed: 0_level_0,Weekday > Weekend
Title,Unnamed: 1_level_1
'Till I Collapse,1
(There's No Place Like) Home for the Holidays - 1959 Version,0
0440972222,1
"1, 2, 3 (feat. Jason Derulo & De La Ghetto)",1
10 Things I Hate About You,1
...,...
we fell in love in october,0
what would you do?,1
you & me,1
you broke me first,1


In [6]:
# Show the per-song mean feature table again; nothing is computed in this cell.
data_by_song_mean

Unnamed: 0_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'98 Braves,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,71.750000
!,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,97.000000
#PROUDCATOWNERREMIX,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,12.000000
$$$,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,84.833333
$€ Freestyle,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,125.000000
...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,35.000000
ÎŸ. OG,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,117.000000
İmdat,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,72.562500
アイドル,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,90.522727


In [16]:
# Attach the weekday/weekend label to the per-song audio features. Both frames
# are indexed by song title; the inner join keeps only titles present in both.
# "Points (Total)" is not an audio feature, so it is removed before joining.
# NOTE(review): data_by_song_mean averages over every date in spotify_data,
# while the labels are built from 2022 rows only; songs_2022 holds the
# 2022-only averages and is not used here - confirm which one is intended.
audio_features_by_song = data_by_song_mean.drop(columns=['Points (Total)'])
songs_features_weekday = audio_features_by_song.join(weekday_binary, how='inner')
songs_features_weekday

Unnamed: 0_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Weekday > Weekend
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'Till I Collapse,0.549341,0.848788,0.058542,0.190916,0.06781,0.000,0.100000,1
(There's No Place Like) Home for the Holidays - 1959 Version,0.478000,0.341000,0.068314,0.051000,0.89700,0.000,0.474000,0
0440972222,0.696000,0.793000,0.062640,0.097000,0.06800,0.000,0.698000,1
"1, 2, 3 (feat. Jason Derulo & De La Ghetto)",0.792000,0.895000,0.058217,0.059000,0.16500,0.000,0.793862,1
10 Things I Hate About You,0.544000,0.786000,0.007879,0.050000,0.01000,0.000,0.446000,1
...,...,...,...,...,...,...,...,...
we fell in love in october,0.566000,0.366000,0.068458,0.028000,0.11300,0.181,0.237000,0
what would you do?,0.795000,0.620000,0.062849,0.066500,0.29200,0.000,0.711000,1
you & me,0.744000,0.535000,0.066295,0.074000,0.27900,0.000,0.106000,1
you broke me first,0.667000,0.373000,0.066210,0.050000,0.78500,0.000,0.082000,1


In [24]:
# Some feature values (Loudness) can be negative; locate them once and reuse
# the same boolean mask for both reporting and fixing.
is_negative = songs_features_weekday < 0

# Rows containing at least one negative entry, printed for inspection.
negative_rows = songs_features_weekday.loc[is_negative.any(axis=1)]
print(negative_rows)

# Overwrite every negative entry with 0 (modifies songs_features_weekday in place).
songs_features_weekday[is_negative] = 0

Empty DataFrame
Columns: [Danceability, Energy, Loudness, Speechiness, Acousticness, Instrumentalness, Valence, Weekday > Weekend]
Index: []


In [25]:
# Prepare the inputs for the Naive Bayes classifier: y is the binary label
# column, X is every other column of songs_features_weekday.
y = songs_features_weekday['Weekday > Weekend']
X = songs_features_weekday.drop(columns=['Weekday > Weekend'])

# Class balance. Label 1 means the weekday average was strictly higher; label 0
# covers everything else and is reported here as "weekends".
n_weekday_songs = y.eq(1).sum()
n_weekend_songs = y.eq(0).sum()
print('Number of songs more popular on weekdays: {}'.format(n_weekday_songs))
print('Number of songs more popular on weekends: {}'.format(n_weekend_songs))


Number of songs more popular on weekdays: 796
Number of songs more popular on weekends: 637


In [31]:
# Fit a Gaussian Naive Bayes classifier on all rows of X / y (no held-out
# split is made in this cell). fit() returns the estimator itself, so the
# fitted model can be bound directly and then shown as the cell result.
gnb = GaussianNB().fit(X, y)
gnb

In [32]:
# Predict once on the same rows the model was fitted on and reuse those
# predictions for the score, instead of calling predict() a second time.
tr_pred = gnb.predict(X)

# Share of training rows classified correctly (equivalent to gnb.score(X, y)).
# NOTE(review): this is in-sample accuracy; train_test_split is imported at the
# top of the notebook but no held-out set is evaluated here.
ca = accuracy_score(y, tr_pred)
print('Training set accuracy: {:.2f}%'.format(ca*100))

Training set accuracy: 58.83%
