In [8]:
import spotipy
import csv
import os
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv

In [9]:
load_dotenv()

client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')

client_credentials_manager = SpotifyClientCredentials()
spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

In [96]:
tracks = []
for i in range(2018, 2023):
    search_result = spotify.search(q=str(i), limit=50) #the result only consists of tracks

    tracks.extend(search_result['tracks']['items'])

    for i in range(19):
        search_result = spotify.next(search_result['tracks'])
        tracks.extend(search_result['tracks']['items'])

In [97]:
tracks_id = []
for i in range(len(tracks)):
    tracks_id.append(tracks[i]['id'])

tracks_id = list(dict.fromkeys(tracks_id))

In [99]:
track_features_list = []

for i in range(len(tracks_id)):
    track_features_list.append(spotify.audio_features(tracks_id[i]))

In [100]:
#there are some tracks without their audio features so we need to remove them from the list
track_features_list = list(filter(lambda x: x != [None], track_features_list))

In [101]:
for i in range(len(track_features_list)):
    track_features_list[i] = dict(list(filter(lambda x: x[0] == 'id' or type(x[1]) != str, list(track_features_list[i][0].items()))))

In [103]:
artists_id = []
for i in range(len(tracks)):
    for j in range(len(tracks[i]['artists'])):
        artists_id.append(tracks[i]['artists'][j]['id'])

artists_id = list(dict.fromkeys(artists_id))

In [104]:
artists = []
for i in range(len(artists_id)):
    artist_info = spotify.artist(artist_id=artists_id[i])
    cols = ['id', 'genres']
    artist = dict.fromkeys(cols)
    for key in artist:
        artist[key] = artist_info[key]
    artists.append(artist)

In [105]:
artists = list(filter(lambda x: x['genres'] != [] ,artists))

In [106]:
genres_list = []

for i in range(len(artists)):
    keys = ['id', 'genre']
    genre = dict.fromkeys(keys)
    for j in range(len(tracks)):
        if artists[i]['id'] == tracks[j]['artists'][0]['id']:
            genre['id'] = tracks[j]['id']
            genre['genre'] = artists[i]['genres'][0]
            genres_list.append(genre)

In [107]:
for i in range(len(track_features_list)):
    for j in range(len(genres_list)):
        if track_features_list[i]['id'] == genres_list[j]['id']:
            track_features_list[i]['genre'] = genres_list[j]['genre']

In [108]:
track_features_list = list(filter(lambda x: len(x) == 15, track_features_list))

In [148]:
with open('audio_features.tsv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=track_features_list[0].keys(), delimiter='\t')
    writer.writeheader()
    for data in track_features_list:
        writer.writerow(data)

---------------------------------------------------------------------------------------------------------------

In [109]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [110]:
df = pd.DataFrame(track_features_list)

In [111]:
df.insert(0, 'id', df.pop('id'))

In [112]:
df.head(10)

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,29d0nY7TzCoi22XBqDQkiP,0.625,0.533,10,-11.903,0,0.0596,0.659,0.00266,0.0546,0.139,108.296,300840,4,art pop
1,4DnHUy01jEA4b5ydC3HgsT,0.582,0.944,2,-0.928,0,0.185,0.00573,1e-06,0.547,0.8,143.97,207977,4,latin hip hop
2,1xzBco0xcoJEDXktl7Jxrr,0.729,0.625,4,-5.266,1,0.0315,0.194,0.00986,0.248,0.261,146.034,183907,4,rap
3,09IStsImFySgyp0pIQdqAc,0.753,0.657,7,-3.061,1,0.0449,0.171,0.0,0.112,0.437,107.01,184732,4,complextro
4,7sO5G9EABYOXQKNPNiE9NR,0.88,0.428,9,-8.28,1,0.206,0.149,5.1e-05,0.114,0.333,100.007,172800,4,atl hip hop
5,1M6SuEJZJkZf2S419vIPkB,0.931,0.387,1,-9.127,1,0.412,0.088,0.0,0.136,0.375,125.978,179405,4,emo rap
6,2xLMifQCjDGFmkHkpNLD9h,0.834,0.73,8,-3.714,1,0.222,0.00513,0.0,0.124,0.446,155.008,312820,4,rap
7,5TueD7dEIlQzQIGxH2mjvp,0.708,0.878,8,-5.099,1,0.0468,0.00753,0.646,0.0536,0.596,150.053,145621,4,hard bass
8,4qKcDkK6siZ7Jp1Jb4m0aL,0.922,0.581,10,-7.495,1,0.27,0.00104,5.9e-05,0.105,0.595,140.022,181263,4,memphis hip hop
9,6UrgPc2f78pgDrdodMJA42,0.298,0.831,8,-7.256,1,0.117,0.0058,0.288,0.637,0.724,157.17,267253,3,supergroup


In [113]:
data = df.drop(['id'], axis=1)

In [114]:
label = data.iloc[:, -1]
label.head(10)

0            art pop
1      latin hip hop
2                rap
3         complextro
4        atl hip hop
5            emo rap
6                rap
7          hard bass
8    memphis hip hop
9         supergroup
Name: genre, dtype: object

In [133]:
encoder = LabelEncoder()
y = encoder.fit_transform(label)
y

array([ 41, 385, 489, ...,  71,  68, 418])

In [134]:
scaler = StandardScaler()
x = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))
x.shape

(1307, 13)

In [136]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train.shape

(1045, 13)

-------

In [137]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier

In [138]:
model = BaggingClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')



In [143]:
print(n_scores)
print(n_scores.mean())

[0.08571429 0.08571429 0.08571429 0.0952381  0.08571429 0.05769231
 0.08653846 0.10576923 0.06730769 0.07692308 0.03809524 0.05714286
 0.04761905 0.11428571 0.08571429 0.13461538 0.06730769 0.08653846
 0.09615385 0.09615385 0.0952381  0.04761905 0.11428571 0.07619048
 0.05714286 0.06730769 0.05769231 0.07692308 0.04807692 0.08653846]
0.07943223443223443


In [144]:
model.fit(x_train, y_train)
pred = model.predict(x_test)
pred

array([150, 135,  50, 407, 182, 398,  20, 224, 274,  13,  21, 271, 548,
       135, 196,  29,  15,  42, 135,  20,  39, 432, 196, 403, 274,  20,
       115, 115,  13, 135,   5, 195,  20, 113, 135, 195,  21,  46, 135,
         1, 362,  13,  29, 408, 198,  78,  16,  56,  41, 276, 245,   7,
        13, 476, 135,  13, 475,  25, 195, 195,  13,  21, 195,  52, 135,
       111,   7, 399, 195, 416,  91, 449,  43, 135,  15, 224, 253,  63,
         2,  28,  41,  43,  29,  41,  69, 135, 351, 135, 100, 178,  42,
        52,  13,  13, 106,  28,  78, 105, 156, 371,  29, 147, 206, 250,
       195,  13, 297, 150, 453, 175,  13, 135, 198, 195, 113, 162,  13,
       106,  19, 114, 234,   6, 162, 176, 177, 135,  59,  85, 121, 106,
       473,  13, 385, 545,  73, 135, 326, 135,  43,   7, 221,  15,   8,
         5, 135, 171, 167, 135,  41, 295, 135, 250, 111, 195, 195,  13,
        52, 379, 179,  52,  20, 496, 195,   5,  52,  15, 198, 542,  70,
        20,  17,   7, 178, 434,  20,  43, 106,  13, 371, 162,  2

In [145]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
accuracy_score(y_test, pred)

0.07633587786259542

In [146]:
f1_score(y_test, pred, average='weighted')

0.05031490638722805