In [6]:
import spotipy
import csv
import os
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
from dotenv import load_dotenv

In [7]:
load_dotenv()

client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret = os.getenv('SPOTIPY_CLIENT_SECRET')
redirect_uri = os.getenv('SPOTIPY_REDIRECT_URI')

client_credentials_manager = SpotifyClientCredentials()
spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

In [93]:
tracks = []
for i in range(2013, 2023):
    search_result = spotify.search(q=str(i), limit=50) #the result only consists of tracks

    tracks.extend(search_result['tracks']['items'])

    for i in range(19):
        search_result = spotify.next(search_result['tracks'])
        tracks.extend(search_result['tracks']['items'])

In [96]:
tracks_id = []
for i in range(len(tracks)):
    tracks_id.append(tracks[i]['id'])

tracks_id = list(dict.fromkeys(tracks_id))

In [97]:
track_features_list = []

for i in range(len(tracks_id)):
    track_features_list.append(spotify.audio_features(tracks_id[i]))

In [98]:
#there are some tracks without their audio features so we need to remove them from the list
track_features_list = list(filter(lambda x: x != [None], track_features_list))

In [99]:
for i in range(len(track_features_list)):
    track_features_list[i] = dict(list(filter(lambda x: x[0] == 'id' or type(x[1]) != str, list(track_features_list[i][0].items()))))

In [102]:
artists_id = []
for i in range(len(tracks)):
    for j in range(len(tracks[i]['artists'])):
        artists_id.append(tracks[i]['artists'][j]['id'])

artists_id = list(dict.fromkeys(artists_id))

In [103]:
artists = []
for i in range(len(artists_id)):
    artist_info = spotify.artist(artist_id=artists_id[i])
    cols = ['id', 'genres']
    artist = dict.fromkeys(cols)
    for key in artist:
        artist[key] = artist_info[key]
    artists.append(artist)

In [104]:
artists = list(filter(lambda x: x['genres'] != [] ,artists))

In [105]:
genres_list = []

for i in range(len(artists)):
    keys = ['id', 'genre']
    genre = dict.fromkeys(keys)
    for j in range(len(tracks)):
        if artists[i]['id'] == tracks[j]['artists'][0]['id']:
            genre['id'] = tracks[j]['id']
            genre['genre'] = artists[i]['genres'][0]
            genres_list.append(genre)

In [106]:
for i in range(len(track_features_list)):
    for j in range(len(genres_list)):
        if track_features_list[i]['id'] == genres_list[j]['id']:
            track_features_list[i]['genre'] = genres_list[j]['genre']

In [107]:
track_features_list = list(filter(lambda x: len(x) == 15, track_features_list))

In [252]:
with open('audio_features.tsv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=track_features_list[0].keys(), delimiter='\t')
    writer.writeheader()
    for data in track_features_list:
        writer.writerow(data)

---------------------------------------------------------------------------------------------------------------

In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('audio_features.tsv', delimiter='\t')

In [4]:
df.insert(0, 'id', df.pop('id'))

In [5]:
df.head(10)

Unnamed: 0,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genre
0,2dLLR6qlu5UJ5gk0dKz0h3,0.674,0.428,7,-9.504,1,0.122,0.121,0.0,0.132,0.337,84.878,190185,4,art pop
1,5PUvinSo4MNqW7vmomGRS7,0.861,0.504,7,-7.707,1,0.0489,0.00412,1.8e-05,0.0783,0.881,120.0,263053,4,contemporary r&b
2,6CjtS2JZH9RkDz5UVInsa9,0.781,0.526,6,-6.986,0,0.293,0.0633,0.0,0.0457,0.665,94.993,235613,4,dance pop
3,2Foc5Q5nqNiosCNqttzHof,0.794,0.811,6,-8.966,0,0.038,0.0426,1e-06,0.101,0.862,116.047,248413,4,electro
4,2bYZxKw8wv9lGQjcskPRl3,0.537,0.569,5,-8.046,1,0.3,0.855,0.0,0.0909,0.524,78.907,135385,4,sad rap
5,1HFfMOxCAT4GAwaPfCdmUs,0.829,0.51,5,-9.334,0,0.0369,0.00821,0.00143,0.0829,0.45,119.994,208133,4,dutch hip hop
6,1q9bLSeIlGf2xBvbOkp2Wr,0.676,0.534,2,-6.901,0,0.0831,0.0594,9e-06,0.256,0.156,145.082,338413,4,east coast hip hop
7,3PJIKoSgXBohYJ4qTwRzyd,0.833,0.333,8,-14.611,1,0.0606,0.469,0.938,0.217,0.0658,114.995,105500,5,emo rap
8,6RtPijgfPKROxEzTHNRiDp,0.773,0.758,1,-4.993,1,0.0381,0.0422,0.0,0.305,0.925,144.033,224840,4,pop
9,2XHzzp1j4IfTNp1FTn7YFg,0.669,0.634,11,-6.476,1,0.0327,0.0125,0.0,0.0946,0.496,124.906,255053,4,hip hop


In [6]:
data = df.drop(['id'], axis=1)

In [7]:
label = data.iloc[:, -1]
label = label.sort_values()

In [8]:
words_list = []
for genre in label:
    words_list.extend(genre.split(' ')[-1:])
words_dict = dict.fromkeys(words_list)

for key in words_dict:
    words_dict[key] = words_list.count(key)

words_dict = dict(sorted(words_dict.items(), key=lambda words_dict: words_dict[1], reverse=True))

len(words_dict)

315

In [9]:
words_dict = dict(filter(lambda x: x[1] > 10, list(words_dict.items())))

In [10]:
words_dict

{'pop': 335,
 'rock': 206,
 'hop': 148,
 'rap': 104,
 'chillhop': 95,
 'metal': 66,
 'indie': 45,
 'house': 43,
 'country': 41,
 'edm': 34,
 'k-pop': 29,
 'r&b': 24,
 'standards': 23,
 'trap': 22,
 'punk': 20,
 'beats': 19,
 'soul': 16,
 'broadway': 16,
 'americana': 15,
 'z': 14,
 'dance': 14,
 'folk': 14,
 'anime': 14,
 'dancehall': 13,
 'hardstyle': 13,
 'band': 12,
 'jazz': 12,
 'emo': 11,
 'classical': 11,
 'room': 11,
 'corrido': 11,
 'youth': 11}

In [11]:
flag = 0
for i in range(len(label)):
    for key in words_dict:
        if key in label[i]:
            label[i] = key
            flag = 1
    if flag == 0:
        label[i] = 'None'
    flag = 0

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(label)
len(y)

2230

In [13]:
scaler = StandardScaler()
x = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))
x.shape

(2230, 13)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)
x_train.shape

(1784, 13)

-------

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import plotly.express as px

In [18]:
knn_model = KNeighborsClassifier()
dec_model = DecisionTreeClassifier()
mlp_model = MLPClassifier()
svm_model = SVC()
bagging_model = BaggingClassifier()
lgs_model = LogisticRegression()

In [19]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(knn_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
n_scores.mean()



0.21151005377356516

In [24]:
lgs_model.fit(x_train, y_train)
knn_model.fit(x_train, y_train)
dec_model.fit(x_train, y_train)
mlp_model.fit(x_train, y_train)
svm_model.fit(x_train, y_train)
bagging_model.fit(x_train, y_train)


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



In [25]:
lgs_acc = lgs_model.score(x_test, y_test)
knn_acc = knn_model.score(x_test, y_test)
dec_acc = dec_model.score(x_test, y_test)
mlp_acc = mlp_model.score(x_test, y_test)
svm_acc = svm_model.score(x_test, y_test)
bagging_acc = bagging_model.score(x_test, y_test)

print("Logistic Regression Accuracy", lgs_acc)
print("K-Nearest-Neighbors Accuracy:", knn_acc)
print("Decision Tree Accuracy:", dec_acc)
print("Neural Network Accuracy:", mlp_acc)
print("Bagging Algorithms Accuracy:", bagging_acc)
print("Support Vector Machine Accuracy:", svm_acc)

Logistic Regression Accuracy 0.2802690582959641
K-Nearest-Neighbors Accuracy: 0.21076233183856502
Decision Tree Accuracy: 0.16591928251121077
Neural Network Accuracy: 0.21748878923766815
Bagging Algorithms Accuracy: 0.21973094170403587
Support Vector Machine Accuracy: 0.273542600896861


In [28]:
accuracy_score(y_test, knn_model.predict(x_test))

0.21076233183856502

In [29]:
f1_score(y_test, knn_model.predict(x_test), average='weighted')

0.1705825005482088

In [30]:
fig = px.bar(
    x=["Logistic Regression", "K-Nearest-Neighbors", "Decision Tree", "Neural Network", "Bagging Algorithms", "Support Vector Machine"],
    y=[lgs_acc, knn_acc, dec_acc, mlp_acc, bagging_acc, svm_acc],
    color=["Logistic Regression", "K-Nearest-Neighbors", "Decision Tree", "Neural Network", "Bagging Algorithms", "Support Vector Machine"],
    labels={'x': "Model", 'y': "Accuracy"},
    title="Model Accuracy Comparison"
)

fig.show()