In [1]:
# Import packages
import os
import datetime
from datetime import date

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline
sns.set()  # Setting seaborn as default style even if use only matplotlib

# constants
audio_features_col_names = [
    "Danceability",
    "Energy",
    "Loudness",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Valence",
]

# reading data
spotify_filepath = os.path.join(os.getcwd(), "spotify", "spotify.csv")
print(spotify_filepath)

# creating dataframe
spotify_data = pd.read_csv(spotify_filepath, delimiter=";")

spotify_data.drop(["Song URL"], axis=1, inplace=True, errors="ignore")  # drop url
# spotify_data.info()

# make "Date" column datetime type
spotify_data["Date"] = pd.to_datetime(spotify_data["Date"], dayfirst=True)
spotify_data.loc[0]["Date"]

# reverse dates
spotify_data = spotify_data.sort_values(by="Date")
# spotify_data.head()
# ranks are also reversed now...

# Scale the audio features.
spotify_data["Loudness"] = spotify_data["Loudness"].apply(lambda x: np.log10(np.abs(x)) / 60)
spotify_data

/home/jovyan/spotify/spotify.csv


Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id
651935,200,Ni**as In Paris,"JAY-Z, Kanye West",2017-01-01,0.757,0.882,0.063118,0.248,0.076,0.00,0.684,Artist 2,Kanye West,Nationality 2,United States,Anglo-America,1,1.000000,2KpCpk6HjXXLb7nnXoXA5O
651770,62,Sexual,"NEIKED, Dyo",2017-01-01,0.803,0.569,0.064479,0.074,0.062,0.00,0.809,Artist 1,NEIKED,Nationality 1,Sweden,Europe,139,70.000000,3AsAuGTaDQzavZZThyYlop
651769,61,"How Far I'll Go - From ""Moa""",Alessia Cara,2017-01-01,0.314,0.555,0.066372,0.370,0.157,0.00,0.159,Artist 1,Alessia Cara,Nationality 1,Canada,Anglo-America,140,140.000000,5hYTyyh2odQKphUbMqc5gN
651768,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.00,0.392,Artist 2,Rihanna,Nationality 2,Barbados,Anglo-America,141,70.000000,11KJSRSgaDxqydKYiD2Jew
651767,60,Too Good,"Drake, Rihanna",2017-01-01,0.804,0.648,0.064873,0.117,0.057,0.00,0.392,Artist 1,Drake,Nationality 1,Canada,Anglo-America,141,70.000000,11KJSRSgaDxqydKYiD2Jew
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,135,Can't Hold Us (feat. Ray Dalton),"Macklemore & Ryan Lewis, Macklemore, Ryan Lewi...",2023-05-29,0.633,0.927,0.060835,0.084,0.027,0.00,0.880,Artist 4,Ray Dalton,Nationality 4,United States,Anglo-America,66,16.500000,22skzmqfdWrjJylampe0kt
199,136,Cartão Black,"MC Caverinha, KayBlack, Wall Hein",2023-05-29,0.842,0.468,0.065551,0.637,0.255,0.00,0.547,Artist 1,MC Caverinha,Nationality 1,Brazil,Latin-America,65,21.666667,12YCtLHGk6tP6RbHDHflCs
200,136,Cartão Black,"MC Caverinha, KayBlack, Wall Hein",2023-05-29,0.842,0.468,0.065551,0.637,0.255,0.00,0.547,Artist 2,KayBlack,Nationality 2,Brazil,Latin-America,65,21.666667,12YCtLHGk6tP6RbHDHflCs
202,137,Apocalypse,Cigarettes After Sex,2023-05-29,0.369,0.467,0.065919,0.027,0.019,0.46,0.174,Artist 1,Cigarettes After Sex,Nationality 1,United States,Anglo-America,64,64.000000,3AVrVz5rK8Hrqo9YGiVGN5


In [2]:
# First make copy and drop irrelevant features
data_copy = spotify_data.copy(deep=True)
data_copy.drop(["Artists", '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Continent',
                        'Points (Ind for each Artist/Nat)', 'Rank'], axis=1, inplace=True)

# group by song title and take average vlues for features but one value of points
data_by_title = data_copy.groupby('Title').agg(
    {'Danceability': 'mean',
        'Energy': 'mean',
        'Loudness': 'mean',
        'Speechiness': 'mean',
        'Acousticness': 'mean',
        'Instrumentalness': 'mean',
        'Valence': 'mean',   # Average the feature columns
        'Points (Total)': 'sum'  },     # sum the first popularity values
).reset_index()

data_by_title


Unnamed: 0,Title,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
0,'98 Braves,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,287
1,!,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,194
2,#PROUDCATOWNERREMIX,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,24
3,$$$,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,1018
4,$€ Freestyle,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,125
...,...,...,...,...,...,...,...,...,...
7452,Î©. VIVRE UN PEU,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,35
7453,ÎŸ. OG,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,117
7454,İmdat,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,1161
7455,アイドル,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,3983


In [6]:
# dataset with one song per day per row and total points
points_per_day = spotify_data.drop(audio_features_col_names + ['Artists', '# of Artist', 'Artist (Ind.)', 'Nationality', '# of Nationality', 'Continent', 'Points (Ind for each Artist/Nat)', 'id', 'Rank'], axis=1)

# split dataframe by date
points_by_date = {date: data for date, data in points_per_day.groupby('Date')}
# Now, points_by_date is a dictionary where the keys are dates and the values are the DataFrames for each date.
# Modify each date DataFrame to ensure no duplicate songs and random points for each song
for date, data in points_by_date.items():
    # Group by Song Title and select one random row for each song
    points_by_date[date] = data.groupby('Title', as_index=False).sample(n=1)
    
    # Reset the index of each grouped DataFrame
    points_by_date[date].reset_index(drop=True, inplace=True)
# To access the DataFrame for a specific date (for example, "2022-01-01"):
points_1 = points_by_date[pd.to_datetime('2022-01-01')]

# Now, we want to create a new DataFrame where:
# - Each row corresponds to a Song Title,
# - Each column corresponds to a Date,
# - Each entry represents the "Points" for that song on that date.

# We first concatenate all the DataFrames by date into a single DataFrame
points_concatenated = pd.concat(points_by_date.values(), ignore_index=True)

# Now we pivot the DataFrame to the required format
points_per_date_and_title = points_concatenated.pivot(index='Title', columns='Date', values='Points (Total)')
# convert NaN values to 0
points_per_date_and_title = points_per_date_and_title.fillna(0)
#points_per_date_and_title['Total Points'] = points_per_date_and_title.sum(axis=1)

# Create a complete range of dates from the min to the max date
date_range = pd.date_range(start=points_per_date_and_title.columns.min(), end=points_per_date_and_title.columns.max())

# Reindex the DataFrame with the complete date range and fill missing columns with NaN
points_per_date_and_title = points_per_date_and_title.reindex(columns=date_range)

# Replace all 0's with NaN
points_per_date_and_title.replace(0, np.nan, inplace=True)

points_per_date_and_title

Unnamed: 0_level_0,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,2017-01-10,...,2023-05-20,2023-05-21,2023-05-22,2023-05-23,2023-05-24,2023-05-25,2023-05-26,2023-05-27,2023-05-28,2023-05-29
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'98 Braves,,,,,,,,,,,...,,,,,,,,,,
!,,,,,,,,,,,...,,,,,,,,,,
#PROUDCATOWNERREMIX,,,,,,,,,,,...,,,,,,,,,,
$$$,,,,,,,,,,,...,,,,,,,,,,
$€ Freestyle,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,,,,,,,,,,,...,,,,,,,,,,
ÎŸ. OG,,,,,,,,,,,...,,,,,,,,,,
İmdat,,,,,,,,,,,...,,,,,,,,,,
アイドル,,,,,,,,,,,...,81.0,104.0,85.0,79.0,75.0,87.0,77.0,81.0,111.0,94.0


In [9]:
# Ensure columns are in DatetimeIndex format for proper resampling
if not isinstance(points_per_date_and_title.columns, pd.DatetimeIndex):
    points_per_date_and_title.columns = pd.to_datetime(points_per_date_and_title.columns)

# Step 2: Initialize dictionary to store weekly labels for each song
weekly_data = {}

# Loop through each song's data
for song_id in points_per_date_and_title.index:
    song_data = points_per_date_and_title.loc[song_id]
    
    # Resample data by week, starting each week on Monday
    song_weekly = song_data.resample('W-MON', axis=0)
    
    # Step 3: Calculate weekday (Mon-Thurs) and weekend (Fri-Sun) averages
    week_labels = []
    for _, week_data in song_weekly:
        # Weekday average (Monday to Thursday)
        weekday_data = week_data[week_data.index.weekday < 4]
        # Weekend average (Friday to Sunday)
        weekend_data = week_data[week_data.index.weekday >= 4]
        
        # Calculate averages if there are non-NaN values
        weekday_avg = weekday_data.mean() if not weekday_data.isna().all() else np.nan
        weekend_avg = weekend_data.mean() if not weekend_data.isna().all() else np.nan
        
        # Step 4: Assign label for the week
        if pd.isna(weekday_avg) and pd.isna(weekend_avg):
            week_labels.append(np.nan)  # No data for the week
        elif pd.isna(weekday_avg) or (not pd.isna(weekend_avg) and weekend_avg > weekday_avg):
            week_labels.append("weekend")  # Weekend average is higher
        else:
            week_labels.append("weekday")  # Weekday average is higher
    
    # Store results for this song
    weekly_data[song_id] = week_labels

# Step 5: Create DataFrame with columns labeled "Week 1", "Week 2", etc.
weekly_labels_df = pd.DataFrame.from_dict(weekly_data, orient='index')
weekly_labels_df.columns = [f"Week {i + 1}" for i in range(weekly_labels_df.shape[1])]
weekly_labels_df.index.name = "SongID"

weekly_labels_df.head()  # Display first few rows

  song_weekly = song_data.resample('W-MON', axis=0)


Unnamed: 0_level_0,Week 1,Week 2,Week 3,Week 4,Week 5,Week 6,Week 7,Week 8,Week 9,Week 10,...,Week 326,Week 327,Week 328,Week 329,Week 330,Week 331,Week 332,Week 333,Week 334,Week 335
SongID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'98 Braves,,,,,,,,,,,...,,,,,,,,,,
!,,,,,,,,,,,...,,,,,,,,,,
#PROUDCATOWNERREMIX,,,,,,,,,,,...,,,,,,,,,,
$$$,,,,,,,,,,,...,,,,,,,,,,
$€ Freestyle,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Create a new dataframe with columns 'weekday_count' and 'weekend_count'
count_df = pd.DataFrame(index=weekly_labels_df.index)

# Count how many times "weekday" and "weekend" appear for each song (row)
count_df['weekday_count'] = (weekly_labels_df == 'weekday').sum(axis=1)
count_df['weekend_count'] = (weekly_labels_df == 'weekend').sum(axis=1)

count_df

Unnamed: 0_level_0,weekday_count,weekend_count
SongID,Unnamed: 1_level_1,Unnamed: 2_level_1
'98 Braves,0,1
!,0,1
#PROUDCATOWNERREMIX,0,1
$$$,2,1
$€ Freestyle,0,1
...,...,...
Î©. VIVRE UN PEU,1,0
ÎŸ. OG,1,0
İmdat,3,1
アイドル,5,2


In [15]:

# Step 1: Create a new column 'weekday' based on the condition
count_df['weekday'] = (count_df['weekday_count'] > count_df['weekend_count']).astype(int)

# drop first two columns
count_df = count_df.drop(['weekday_count', 'weekend_count'], axis=1)
# Step 2: Display the updated DataFrame
count_df.head()

Unnamed: 0_level_0,weekday
SongID,Unnamed: 1_level_1
'98 Braves,0
!,0
#PROUDCATOWNERREMIX,0
$$$,1
$€ Freestyle,0


In [16]:
# now we can combine like we did before
# group by song title and take average vlues
data_by_song_mean = data_copy.groupby("Title").mean(numeric_only=True)
data_by_song_mean


Unnamed: 0_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,Points (Total)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'98 Braves,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,71.750000
!,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,97.000000
#PROUDCATOWNERREMIX,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,12.000000
$$$,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,84.833333
$€ Freestyle,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,125.000000
...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,35.000000
ÎŸ. OG,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,117.000000
İmdat,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,72.562500
アイドル,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,90.522727


In [18]:
# Merge the two DataFrames on the song title (index)
songs_features_weekday = data_by_song_mean.join(count_df, how='inner')
songs_features_weekday = songs_features_weekday.drop(['Points (Total)',], axis=1)
songs_features_weekday

Unnamed: 0_level_0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Valence,weekday
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
'98 Braves,0.488,0.670,0.062672,0.027,0.097,0.0,0.484,0
!,0.725,0.543,0.063335,0.084,0.030,0.0,0.693,0
#PROUDCATOWNERREMIX,0.783,0.522,0.063240,0.390,0.029,0.0,0.235,0
$$$,0.774,0.507,0.064035,0.065,0.064,0.0,0.508,1
$€ Freestyle,0.813,0.670,0.065626,0.335,0.200,0.0,0.380,0
...,...,...,...,...,...,...,...,...
Î©. VIVRE UN PEU,0.416,0.528,0.066914,0.291,0.178,0.0,0.203,1
ÎŸ. OG,0.576,0.446,0.066741,0.044,0.773,0.4,0.060,1
İmdat,0.786,0.748,0.064650,0.162,0.267,0.0,0.562,1
アイドル,0.574,0.935,0.057409,0.093,0.112,0.0,0.836,1


In [19]:
# Problem: some values of loudness (3) are negative
# Filter rows with any negative values
negative_rows = songs_features_weekday[(songs_features_weekday < 0).any(axis=1)]

# Print the filtered rows
print(negative_rows)

# Solution: set them to 0
songs_features_weekday[songs_features_weekday < 0] = 0

                                                    Danceability    Energy  \
Title                                                                        
Agora Vai Sentar                                        0.814000  0.653000   
Desde Esa Noche (feat. Maluma)                          0.780000  0.929000   
Girls                                                   0.570000  0.974000   
Go                                                      0.755000  0.898000   
Good Drank                                              0.836000  0.776000   
It's A Raid (feat. Post Malone)                         0.472000  0.996000   
Live It Up - Official Song 2018 FIFA World Cup ...      0.582000  0.944000   
Murder In My Mind                                       0.712000  0.972000   
PUNTO 40                                                0.861459  0.828164   
Puta Mexica                                             0.853000  0.913000   
Sal y Perrea                                            0.786000

Naive Bayes

In [21]:
# now try Naive Bayes on this dataset
X = songs_features_weekday.drop('weekday', axis=1)
y = songs_features_weekday['weekday']

# 'string {}'.format(x) prints the string + x in the {} position
print('Number of songs more popular on weekdays: {}'.format((y==1).sum())) #summing all entries of y which is equal to 1
print('Number of songs more popular on weekends: {}'.format((y==0).sum()))


Number of songs more popular on weekdays: 3196
Number of songs more popular on weekends: 4261


In [22]:
gnb = GaussianNB()
gnb.fit(X=X, y=y)

In [23]:
tr_pred = gnb.predict(X=X)

ca = accuracy_score(y, gnb.predict(X)) # or ca = gnb.score(X,y)
print('Training set accuracy: {:.2f}%'.format(ca*100))

Training set accuracy: 56.43%
