In [1]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [6]:
artists = [
    # Pop
    "Taylor Swift", "Ariana Grande", "Ed Sheeran", "Lady Gaga", "Bruno Mars", "Beyoncé", "Adele", "Billie Eilish",
    # Rap
    "Kendrick Lamar", "Eminem", "Jay-Z", "Kanye West", "Drake", "Nas", "J. Cole", "Nicki Minaj",
    # MPB (Brazilian Popular Music)
    "Caetano Veloso", "Gilberto Gil", "Elis Regina", "Chico Buarque", "Maria Bethânia", "Jorge Ben Jor", "Marisa Monte", "Gal Costa",
    # Samba
    "Cartola", "Beth Carvalho", "Zeca Pagodinho", "Paulinho da Viola", "Martinho da Vila", "Alcione", "Clara Nunes", "Elza Soares",
    # Electronic
    "Daft Punk", "Deadmau5", "The Chemical Brothers", "Avicii", "Calvin Harris", "Skrillex", "Diplo", "Kygo",
    # Country
    "Johnny Cash", "Dolly Parton", "Willie Nelson", "Garth Brooks", "Shania Twain", "Kenny Rogers", "Carrie Underwood", "Luke Bryan",
    # Folk
    "Bob Dylan", "Simon & Garfunkel", "Joan Baez", "Nick Drake", "Joni Mitchell", "Leonard Cohen", "Cat Stevens", "Neil Young",
    # Reggae
    "Bob Marley", "Peter Tosh", "Burning Spear", "Jimmy Cliff", "Gregory Isaacs", "Bunny Wailer", "Toots and the Maytals", "Sean Paul",
    # Latin
    "Shakira", "Ricky Martin", "Gloria Estefan", "Enrique Iglesias", "Jennifer Lopez", "Marc Anthony", "Daddy Yankee", "Bad Bunny"
]



In [3]:
client_id = '058b5d690f824dfab89c2fb147b1f16d'
client_secret = '54082440a513434d843c83d58073436e'

# Configurar credenciais (use suas próprias credenciais do Spotify aqui)
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
def get_artist_top_tracks(artist_name):

    # Encontrar o artista pelo nome
    results = sp.search(q='artist:' + artist_name, type='artist')
    items = results['artists']['items']
    if len(items) > 0:
        artist = items[0]
    else:
        return "Artista não encontrado"

    # Obter as 10 faixas mais populares do artista
    tracks = sp.artist_top_tracks(artist['id'])['tracks']
    
    # Criar um DataFrame para armazenar as informações das faixas
    track_data = []
    for track in tracks:
        # Obter características de áudio da faixa
        track_features = sp.audio_features(track['id'])[0]
        album = track['album']

        track_info = {
            'track_name': track['name'],
            'album_name': album['name'],
            'release_date': album['release_date'],
            'acousticness': track_features['acousticness'],
            'danceability': track_features['danceability'],
            'duration_ms': track_features['duration_ms'],
            'energy': track_features['energy'],
            'id': track_features['id'],
            'instrumentalness': track_features['instrumentalness'],
            'key': track_features['key'],
            'liveness': track_features['liveness'],
            'loudness': track_features['loudness'],
            'mode': track_features['mode'],
            'speechiness': track_features['speechiness'],
            'tempo': track_features['tempo'],
            'time_signature': track_features['time_signature'],
            'track_href': track_features['track_href'],
            'type': track_features['type'],
            'uri': track_features['uri'],
            'valence': track_features['valence']
        }
        track_data.append(track_info)

    df = pd.DataFrame(track_data)
    return df

In [7]:
dataframes = {}

for artist in artists:
    print(artist)
    df = get_artist_top_tracks(artist)
    dataframes[artist] = df

Taylor Swift
Ariana Grande
Ed Sheeran
Lady Gaga
Bruno Mars
Beyoncé
Adele
Billie Eilish
Kendrick Lamar
Eminem
Jay-Z
Kanye West
Drake
Nas
J. Cole
Nicki Minaj
Caetano Veloso
Gilberto Gil
Elis Regina
Chico Buarque
Maria Bethânia
Jorge Ben Jor
Marisa Monte
Gal Costa
Cartola
Beth Carvalho
Zeca Pagodinho
Paulinho da Viola
Martinho da Vila
Alcione
Clara Nunes
Elza Soares
Daft Punk
Deadmau5
The Chemical Brothers
Avicii
Calvin Harris
Skrillex
Diplo
Kygo
Johnny Cash
Dolly Parton
Willie Nelson
Garth Brooks
Shania Twain
Kenny Rogers
Carrie Underwood
Luke Bryan
Bob Dylan
Simon & Garfunkel
Joan Baez
Nick Drake
Joni Mitchell
Leonard Cohen
Cat Stevens
Neil Young
Bob Marley
Peter Tosh
Burning Spear
Jimmy Cliff
Gregory Isaacs
Bunny Wailer
Toots and the Maytals
Sean Paul
Shakira
Ricky Martin
Gloria Estefan
Enrique Iglesias
Jennifer Lopez
Marc Anthony
Daddy Yankee
Bad Bunny


In [52]:
lista_dataframes = []
for artist in dataframes:
    lista_dataframes += [dataframes[artist]]

In [53]:
df_artists = pd.concat(lista_dataframes)

In [54]:
df_artists.set_index('id',inplace=True)

In [55]:
df_artists['odd_time_signature'] = [time%2 for time in df_artists['time_signature']]
df_artists.drop(['key','time_signature'],axis=1,inplace=True)
df_artists['cluster'] = 0

df_artists['year'] = [date.split('-')[0] for date in df_artists['release_date']]
df_artists['decade'] = [year[2]+'0'for year in df_artists['year']]

In [56]:
df_pinkfloyd = pd.read_csv('Pink Floyd Clusters.csv')
df_pinkfloyd['cluster'] = [cluster + 1 for cluster in df_pinkfloyd['cluster']]
df_pinkfloyd.drop(['period','0','1','2'],axis=1,inplace = True)
df_pinkfloyd.set_index('id',inplace=True)

In [57]:
colunas_pf = set(df_pinkfloyd.columns)
colunas_ar = set(df_artists.columns)

mesmas = colunas_pf.intersection(colunas_ar)
so_pf = colunas_pf.difference(colunas_ar)
so_ar = colunas_ar.difference(colunas_pf)

mesmas , so_ar, so_pf

({'acousticness',
  'album_name',
  'cluster',
  'danceability',
  'decade',
  'duration_ms',
  'energy',
  'instrumentalness',
  'liveness',
  'loudness',
  'mode',
  'odd_time_signature',
  'release_date',
  'speechiness',
  'tempo',
  'track_href',
  'track_name',
  'type',
  'uri',
  'valence',
  'year'},
 set(),
 set())

In [58]:
df_artists = pd.concat([df_artists,df_pinkfloyd])

In [59]:
df_infos = df_artists[['track_name', 'album_name', 'release_date','track_href', 'type', 'uri']].copy()


In [60]:
metrics_list = ['danceability', 'duration_ms', 'energy',  'instrumentalness',
                 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 
                 'odd_time_signature', 'valence','cluster']
df_metrics = df_artists[metrics_list].copy()

In [61]:
df_metrics['cluster'].value_counts()

cluster
0    720
2     66
3     42
1     31
4     10
Name: count, dtype: int64

In [62]:
from sklearn.utils import resample

max_class_size = df_metrics['cluster'].value_counts().max()

resampled_data = []

for class_ in range(5):

    class_data = df_metrics[df_metrics['cluster'] == class_]

    class_data_resampled = resample(class_data,
                                    replace=True, 
                                    n_samples=max_class_size,
                                    random_state=123)

    resampled_data.append(class_data_resampled)

df_metrics_resamp = pd.concat(resampled_data)
df_metrics_resamp

Unnamed: 0_level_0,danceability,duration_ms,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,odd_time_signature,valence,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4KROoGIaPaR1pBHPnR3bwC,0.642,124001,0.427,0.090400,0.1240,-8.471,1,0.0402,82.344,0,0.4040,0
5eTaQYBE1yrActixMAeLcZ,0.636,186496,0.869,0.044600,0.0808,-5.289,1,0.0412,143.011,0,0.3060,0
60eA3QITW5G2D7Woc11uKc,0.670,204325,0.589,0.000000,0.0942,-5.693,1,0.0873,146.978,0,0.6280,0
2cGxRwrMyEAp8dEbuZaVv6,0.775,337560,0.585,0.619000,0.0770,-9.516,0,0.0271,109.942,0,0.5180,0
3UmaczJpikHgJFyBTAJVoz,0.780,404107,0.768,0.000002,0.5180,-4.325,0,0.2380,80.063,0,0.5070,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2jvuMDqBK04WvCYYz5qjvG,0.321,1025280,0.479,0.004750,0.1510,-11.395,0,0.0500,109.289,0,0.1400,4
55e99bv2wlgTYtNEYHWtN1,0.280,619079,0.716,0.520000,0.5120,-12.095,0,0.0983,135.378,0,0.0618,4
21j1PsCiTaO8ZW88UZrh3A,0.288,747325,0.485,0.900000,0.3220,-13.162,0,0.0491,135.870,1,0.1240,4
0OtkNB4TnTzrlid4zipYDV,0.217,716693,0.779,0.900000,0.1060,-12.711,1,0.1770,117.938,0,0.0352,4


In [34]:
df_metrics_resamp['cluster'].value_counts()

cluster
0    720
1    720
2    720
3    720
4    720
Name: count, dtype: int64

In [35]:
df_metrics_resamp.to_csv('Metrics Resample.csv')

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_metrics_resamp.drop(['cluster'], axis=1)
y = df_metrics_resamp['cluster']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

columns_to_scale = ['duration_ms', 'loudness', 'tempo']

scaler = StandardScaler()

scaler.fit(X_train[columns_to_scale])

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2880, 11), (720, 11), (2880,), (720,))

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np


logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()
svm = SVC(probability=True)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()


cv_scores = {}

for clf, name in zip([logreg, rf, svm, knn, dt, gb], 
                     ['Logistic Regression', 'Random Forest', 'SVM', 'K-Nearest Neighbors', 'Decision Tree', 'Gradient Boosting']):
    scores = cross_val_score(clf, X_train_scaled, y_train, cv=5, scoring='accuracy')
    cv_scores[name] = np.mean(scores)

cv_scores

{'Logistic Regression': 0.9274305555555555,
 'Random Forest': 0.9940972222222222,
 'SVM': 0.960763888888889,
 'K-Nearest Neighbors': 0.9715277777777777,
 'Decision Tree': 0.9909722222222224,
 'Gradient Boosting': 0.9913194444444444}

In [67]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [20,50, 100, 200, 300],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [None, 5 , 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15]
}

rf = RandomForestClassifier()

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

(best_params, best_score)

({'max_depth': 20,
  'max_features': 'log2',
  'min_samples_split': 5,
  'n_estimators': 20},
 0.9951388888888889)

In [71]:
from sklearn.metrics import recall_score

model = RandomForestClassifier(
  min_samples_split=5,  
  n_estimators=20,      
  max_depth=20,           
  max_features='log2',   
  random_state=0         
)
model.fit(X_train_scaled, y_train)

model.score(X_test_scaled,y_test)


0.9916666666666667

In [78]:
df_artists_predicted = df_artists.copy()
df_artists_predicted[columns_to_scale] = scaler.transform(df_artists[columns_to_scale])

In [79]:
df_artists_predicted['predicted_cluster'] = model.predict(df_artists_predicted[metrics_list].drop('cluster',axis=1))

In [80]:
predicted_probas = model.predict_proba(df_artists_predicted[metrics_list].drop('cluster', axis=1))
for i, class_name in enumerate(model.classes_):
    df_artists_predicted[f'proba_class_{class_name}'] = predicted_probas[:, i]

In [81]:
df_artists_predicted.columns

Index(['track_name', 'album_name', 'release_date', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'track_href', 'type', 'uri',
       'valence', 'odd_time_signature', 'cluster', 'year', 'decade',
       'predicted_cluster', 'proba_class_0', 'proba_class_1', 'proba_class_2',
       'proba_class_3', 'proba_class_4'],
      dtype='object')

In [82]:
df_artists_predicted

Unnamed: 0_level_0,track_name,album_name,release_date,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,...,odd_time_signature,cluster,year,decade,predicted_cluster,proba_class_0,proba_class_1,proba_class_2,proba_class_3,proba_class_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1BxfuPKGuaTgP7aM0Bbdwr,Cruel Summer,Lover,2019-08-23,0.1170,0.552,-0.622952,0.702,0.000021,0.1050,1.516326,...,0,0,2019,10,0,1.000000,0.0,0.000000,0.0,0.0
1Iq8oo9XkmmvCQiGOfORiz,Is It Over Now? (Taylor's Version) (From The V...,1989 (Taylor's Version),2023-10-26,0.0504,0.596,-0.457173,0.658,0.000000,0.1270,1.236750,...,0,0,2023,20,0,1.000000,0.0,0.000000,0.0,0.0
0V3wPSX9ygBnCm8psDIegu,Anti-Hero,Midnights,2022-10-21,0.1300,0.637,-0.550657,0.643,0.000002,0.1420,1.368947,...,0,0,2022,20,0,1.000000,0.0,0.000000,0.0,0.0
1dGr1c8CrMLDpV6mPbImSI,Lover,Lover,2019-08-23,0.4920,0.359,-0.483707,0.543,0.000016,0.1180,1.196494,...,0,0,2019,10,0,1.000000,0.0,0.000000,0.0,0.0
3CWq0pAKKTWb0K4yiglDc4,You’re Losing Me (From The Vault),You're Losing Me (From The Vault),2023-11-29,0.5290,0.747,-0.300152,0.391,0.003000,0.1110,0.141476,...,0,0,2023,20,0,0.900000,0.0,0.000000,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3xyTufSSGLP3oZnomceAVW,Interstellar Overdrive,The Piper at the Gates of Dawn,1967-08-05,0.7670,0.230,0.683412,0.704,0.869000,0.1120,0.617727,...,0,3,1967,60,3,0.000000,0.0,0.000000,1.0,0.0
1GqRmuG89ILDLcuAdcli6r,The Gnome,The Piper at the Gates of Dawn,1967-08-05,0.6550,0.533,-0.767510,0.353,0.000000,0.0975,0.398365,...,0,2,1967,60,2,0.016667,0.0,0.983333,0.0,0.0
0j9iv7tRpEpTGSGpkCMJve,Chapter 24,The Piper at the Gates of Dawn,1967-08-05,0.7040,0.231,-0.480070,0.376,0.000572,0.0606,0.630861,...,0,2,1967,60,2,0.000000,0.0,1.000000,0.0,0.0
1aV8FeJiQpZXFY35rLNNu0,The Scarecrow,The Piper at the Gates of Dawn,1967-08-05,0.3850,0.558,-0.776096,0.339,0.784000,0.0261,0.173203,...,1,2,1967,60,2,0.033333,0.0,0.966667,0.0,0.0


In [45]:
def get_spotify_recommendations(artist_name):
    
    results = sp.search(q='artist:' + artist_name, type='artist')
    artist_id = results['artists']['items'][0]['id']

    
    #top_tracks = sp.artist_top_tracks(artist_id)
    #track_ids = [track['id'] for track in top_tracks['tracks']][:5]
    
    try:
        recommendations = sp.recommendations(
            seed_artists=[artist_id],
            #seed_tracks=track_ids, 
            limit=100
        )
    except spotipy.exceptions.SpotifyException as e:
        print(f"Error getting recommendations: {e}")
        return pd.DataFrame() 

    
    tracks_info = []
    for track in recommendations['tracks']:
        artist_name = track['artists'][0]['name']
        album = track['album']
        track_features = sp.audio_features(track['id'])[0]
        track_info = {
            'artist_name': artist_name,
            'track_name': track['name'],
            'album_name': album['name'],
            'release_date': album['release_date'],
            'acousticness': track_features['acousticness'],
            'danceability': track_features['danceability'],
            'duration_ms': track_features['duration_ms'],
            'energy': track_features['energy'],
            'id': track_features['id'],
            'instrumentalness': track_features['instrumentalness'],
            'key': track_features['key'],
            'liveness': track_features['liveness'],
            'loudness': track_features['loudness'],
            'mode': track_features['mode'],
            'speechiness': track_features['speechiness'],
            'tempo': track_features['tempo'],
            'time_signature': track_features['time_signature'],
            'track_href': track_features['track_href'],
            'type': track_features['type'],
            'uri': track_features['uri'],
            'valence': track_features['valence']
        }
        tracks_info.append(track_info)

    df_recommendations = pd.DataFrame(tracks_info)
    return df_recommendations

In [46]:
df_recomend = get_spotify_recommendations('Pink Floyd')

In [47]:
df_recomend['odd_time_signature'] = [time%2 for time in df_recomend['time_signature']]
df_recomend.drop('time_signature',axis=1,inplace=True)

In [83]:
df_recomend_predict = df_recomend.copy()
df_recomend_predict[columns_to_scale] = scaler.transform(df_recomend[columns_to_scale])

In [84]:
metrics_list2 = [item for item in metrics_list if item != 'cluster' ]
df_recomend_predict['predicted_cluster'] = model.predict(df_recomend_predict[metrics_list2])

predicted_probas = model.predict_proba(df_recomend_predict[metrics_list2])
for i, class_name in enumerate(model.classes_):
    df_recomend_predict[f'proba_class_{class_name}'] = predicted_probas[:, i]

In [93]:
df_recomend_predict[
    [
    'artist_name','track_name','album_name','predicted_cluster','proba_class_0','proba_class_1','proba_class_2','proba_class_3','proba_class_4'
    ]
][
    (df_recomend_predict['proba_class_3']> 0.75)&
    (df_recomend_predict['artist_name']!='Pink Floyd')
]

Unnamed: 0,artist_name,track_name,album_name,predicted_cluster,proba_class_0,proba_class_1,proba_class_2,proba_class_3,proba_class_4
2,Genesis,Firth Of Fifth - Remastered 2008,Selling England By The Pound,3,0.15,0.0,0.05,0.8,0.0
42,Van Halen,Eruption - 2015 Remaster,Van Halen (Remastered),3,0.15,0.0,0.0,0.85,0.0
79,The Velvet Underground,Oh! Sweet Nuthin' - 2015 Remaster,Loaded: Re-Loaded 45th Anniversary Edition,3,0.15,0.0,0.0,0.85,0.0
81,Yes,"Starship Trooper: a. Life Seeker, b. Disillusi...",The Yes Album (Deluxe Edition),3,0.2,0.0,0.0,0.8,0.0
83,King Crimson,I Talk To The Wind,In The Court Of The Crimson King (Expanded & R...,3,0.05,0.0,0.0,0.95,0.0


In [88]:
df_recomendations = df_recomend[
    (df_recomend['predicted_cluster']>0)&
    (df_recomend['artist_name']!='Pink Floyd')
]

df_recomendations_old = pd.read_csv('Recomendações.csv')
df_recomendations = pd.concat([df_recomendations_old,df_recomendations])
df_recomendations.drop_duplicates(inplace= True)

df_recomendations.to_csv('Recomendações.csv')