In [1]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [2]:
sample_p = pd.read_csv('./data/sample_music')
sample_p

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,uri,track_href,analysis_url,duration_ms,time_signature,song_name,artist_name,artist_genres,artist_pop,song_popularity
0,0,0.482,0.721,0,-6.839,1,0.0321,0.731000,0.000000,0.1890,...,spotify:track:3AhXZa8sUQht0UEdBJgpGc,https://api.spotify.com/v1/tracks/3AhXZa8sUQht...,https://api.spotify.com/v1/audio-analysis/3AhX...,369600,4,Like a Rolling Stone,Bob Dylan,"['classic rock', 'country rock', 'folk', 'folk...",72,68
1,1,0.485,0.863,1,-9.027,1,0.0495,0.000012,0.016200,0.1380,...,spotify:track:3oTlkzk1OtrhH8wBAduVEi,https://api.spotify.com/v1/tracks/3oTlkzk1Otrh...,https://api.spotify.com/v1/audio-analysis/3oTl...,300977,4,Smells Like Teen Spirit,Nirvana,"['grunge', 'permanent wave', 'rock']",82,0
2,2,0.364,0.457,4,-14.162,0,0.0675,0.290000,0.000106,0.9220,...,spotify:track:3ZFBeIyP41HhnALjxWy1pR,https://api.spotify.com/v1/tracks/3ZFBeIyP41Hh...,https://api.spotify.com/v1/audio-analysis/3ZFB...,337413,4,A Day In The Life - Remastered,The Beatles,"['beatlesque', 'british invasion', 'classic ro...",84,0
3,3,0.398,0.413,1,-10.934,1,0.0388,0.082200,0.000025,0.0891,...,spotify:track:5Qt4Cc66g24QWwGP3YYV9y,https://api.spotify.com/v1/tracks/5Qt4Cc66g24Q...,https://api.spotify.com/v1/audio-analysis/5Qt4...,219147,4,Good Vibrations (Mono),The Beach Boys,"['baroque pop', 'classic rock', 'psychedelic r...",78,20
4,4,0.518,0.756,10,-10.851,1,0.0915,0.735000,0.000062,0.3170,...,spotify:track:7MH2ZclofPlTrZOkPzZKhK,https://api.spotify.com/v1/tracks/7MH2ZclofPlT...,https://api.spotify.com/v1/audio-analysis/7MH2...,160893,4,Johnny B Goode,Chuck Berry,"['classic rock', 'rock', 'rock-and-roll', 'roc...",75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1094,1094,0.509,0.680,5,-8.354,1,0.0372,0.360000,0.000000,0.0824,...,spotify:track:7tJQ4Ekp2vN3NlI3vJJW3v,https://api.spotify.com/v1/tracks/7tJQ4Ekp2vN3...,https://api.spotify.com/v1/audio-analysis/7tJQ...,185200,4,I Want You,Bob Dylan,"['classic rock', 'country rock', 'folk', 'folk...",72,56
1095,1095,0.612,0.668,6,-6.143,0,0.0302,0.057900,0.044500,0.3410,...,spotify:track:6EaHDsTWNNJRkCMtptSE6n,https://api.spotify.com/v1/tracks/6EaHDsTWNNJR...,https://api.spotify.com/v1/audio-analysis/6EaH...,315200,4,The Suburbs,Arcade Fire,"['baroque pop', 'canadian indie', 'indie rock'...",66,0
1096,1096,0.651,0.944,2,-5.480,1,0.0381,0.085500,0.145000,0.1160,...,spotify:track:5mWhSQfWEeBZD9YTd4yyFc,https://api.spotify.com/v1/tracks/5mWhSQfWEeBZ...,https://api.spotify.com/v1/audio-analysis/5mWh...,263440,4,Dum Surfer,King Krule,['uk alternative pop'],60,3
1097,1097,0.492,0.194,6,-14.073,1,0.0577,0.984000,0.104000,0.1820,...,spotify:track:5iltwljBW1H5ScLeeiB9ZB,https://api.spotify.com/v1/tracks/5iltwljBW1H5...,https://api.spotify.com/v1/audio-analysis/5ilt...,203373,4,Black and Tan Fantasy,Duke Ellington & His Washingtonians,['hot jazz'],18,11


In [3]:
clean_data= sample_p.drop(columns=['Unnamed: 0','uri','track_href','analysis_url','type', 'id'])
clean_data
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1099 entries, 0 to 1098
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      1099 non-null   float64
 1   energy            1099 non-null   float64
 2   key               1099 non-null   int64  
 3   loudness          1099 non-null   float64
 4   mode              1099 non-null   int64  
 5   speechiness       1099 non-null   float64
 6   acousticness      1099 non-null   float64
 7   instrumentalness  1099 non-null   float64
 8   liveness          1099 non-null   float64
 9   valence           1099 non-null   float64
 10  tempo             1099 non-null   float64
 11  duration_ms       1099 non-null   int64  
 12  time_signature    1099 non-null   int64  
 13  song_name         1099 non-null   object 
 14  artist_name       1099 non-null   object 
 15  artist_genres     1099 non-null   object 
 16  artist_pop        1099 non-null   int64  


In [4]:
#Create genre list
def genre_list(self):
    split = self.strip().split(',')
    clean = [x.strip(" [']") for x in split]
    return clean

In [5]:
# convert the df to list
clean_data['artist_genres'] = clean_data['artist_genres'].apply(lambda x: genre_list(x))

In [6]:
#finding all the genres in the data set
all_genres = []
for row in clean_data.artist_genres:
    for x in row:
        if x not in all_genres:
            all_genres.append(x)
        
print(all_genres)
    

['classic rock', 'country rock', 'folk', 'folk rock', 'rock', 'roots rock', 'singer-songwriter', 'grunge', 'permanent wave', 'beatlesque', 'british invasion', 'merseybeat', 'psychedelic rock', 'baroque pop', 'sunshine pop', 'rock-and-roll', 'rockabilly', 'classic soul', 'jazz blues', 'memphis soul', 'soul', 'southern soul', 'adult standards', 'brill building pop', 'classic girl group', 'funk', 'indie r&b', 'motown', 'northern soul', 'quiet storm', 'pop', 'r&b', 'heartland rock', 'mellow gold', 'album rock', 'art rock', 'blues rock', 'hard rock', 'alternative rock', 'madchester', 'new wave', 'post-punk', 'uk post-punk', 'soul blues', 'atl hip hop', 'dirty south rap', 'hip hop', 'old school atlanta hip hop', 'pop rap', 'rap', 'southern hip hop', 'punk', 'dance pop', 'indietronica', 'new rave', 'bronx hip hop', 'electro', 'old school hip hop', 'piano blues', 'vocal jazz', 'glam rock', 'funk rock', 'minneapolis sound', 'synth funk', 'britpop', 'indie rock', 'pop rock', 'sheffield indie', '

In [35]:
clean_data.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_name,artist_name,artist_genres,artist_pop,song_popularity,cluster_label
0,0.482,0.721,0,-6.839,1,0.0321,0.731,0.0,0.189,0.557,95.263,369600,4,Like a Rolling Stone,Bob Dylan,"[classic rock, country rock, folk, folk rock, ...",72,68,6
1,0.485,0.863,1,-9.027,1,0.0495,1.2e-05,0.0162,0.138,0.767,116.835,300977,4,Smells Like Teen Spirit,Nirvana,"[grunge, permanent wave, rock]",82,0,3
2,0.364,0.457,4,-14.162,0,0.0675,0.29,0.000106,0.922,0.175,163.219,337413,4,A Day In The Life - Remastered,The Beatles,"[beatlesque, british invasion, classic rock, m...",84,0,5
3,0.398,0.413,1,-10.934,1,0.0388,0.0822,2.5e-05,0.0891,0.331,133.574,219147,4,Good Vibrations (Mono),The Beach Boys,"[baroque pop, classic rock, psychedelic rock, ...",78,20,2
4,0.518,0.756,10,-10.851,1,0.0915,0.735,6.2e-05,0.317,0.968,166.429,160893,4,Johnny B Goode,Chuck Berry,"[classic rock, rock, rock-and-roll, rockabilly]",75,0,9


In [37]:
#combining genres with music feature data average into a new df
genre_features_input = {}
genre_features = {}
#list of features were looking for
features_input = ['danceability','energy','loudness','speechiness','acousticness','liveness','valence']


#loop through df
for i, r in clean_data.iterrows():

    #loop through each genre
    for genre in r['artist_genres']:

        #create a dict for the genre if it isnt made
        if genre not in genre_features_input.keys():
            genre_features_input[genre] = []

            #start adding the features
            for feature in features_input:

                #create dict for features
                if feature not in genre_features.keys():
                    genre_features[feature] = []
                    genre_features[feature].append(r[feature])
                
            genre_features_input[genre].append(genre_features)
        #else add onto the generated genre key
        else:
            for genre_info in genre_features_input.values():
                for dicts in genre_info:
                    for key,v in dicts.items():
                        for feature in features_input:
                            if key == feature:
                                dicts[key].append(r[feature])
                    
                    
                
        genre_features = {}

In [7]:
genre_df =clean_data[['artist_genres', 'danceability','energy','acousticness','valence','song_popularity']].copy()
genre_df

Unnamed: 0,artist_genres,danceability,energy,acousticness,valence,song_popularity
0,"[classic rock, country rock, folk, folk rock, ...",0.482,0.721,0.731000,0.557,68
1,"[grunge, permanent wave, rock]",0.485,0.863,0.000012,0.767,0
2,"[beatlesque, british invasion, classic rock, m...",0.364,0.457,0.290000,0.175,0
3,"[baroque pop, classic rock, psychedelic rock, ...",0.398,0.413,0.082200,0.331,20
4,"[classic rock, rock, rock-and-roll, rockabilly]",0.518,0.756,0.735000,0.968,0
...,...,...,...,...,...,...
1094,"[classic rock, country rock, folk, folk rock, ...",0.509,0.680,0.360000,0.802,56
1095,"[baroque pop, canadian indie, indie rock, mode...",0.612,0.668,0.057900,0.524,0
1096,[uk alternative pop],0.651,0.944,0.085500,0.676,3
1097,[hot jazz],0.492,0.194,0.984000,0.368,11


In [65]:
genre_df['artist_genres']  = (genre_df['artist_genres'].astype('string'))
newframe=genre_df.groupby('artist_genres')['danceability','energy','acousticness','valence','song_popularity'].mean()
newframe.info


<bound method DataFrame.info of                                                    danceability    energy  \
artist_genres                                                               
['']                                                   0.572286  0.384957   
['acid house', 'detroit techno', 'techno']             0.714000  0.873000   
['acid rock', 'album rock', 'blues rock', 'cla...      0.438000  0.710000   
['acid rock', 'album rock', 'blues rock', 'cla...      0.421333  0.813667   
['acid rock', 'album rock', 'classic rock', 'p...      0.416750  0.620000   
...                                                         ...       ...   
['ska']                                                0.805000  0.472000   
['soul', 'southern soul']                              0.516000  0.150000   
['space age pop']                                      0.699000  0.503000   
['surf music']                                         0.461000  0.955000   
['uk alternative pop']                      

In [73]:
df = pd.DataFrame(newframe)
new_genre= df.nlargest(10, 'song_popularity',keep='last')
#new_genre.drop_duplicates('artist_genres')
new_genre.info()
new_genre.reset_index(inplace=True)


<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, ['permanent wave', 'pop'] to ['dance pop', 'europop', 'girl group']
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   danceability     10 non-null     float64
 1   energy           10 non-null     float64
 2   acousticness     10 non-null     float64
 3   valence          10 non-null     float64
 4   song_popularity  10 non-null     float64
dtypes: float64(5)
memory usage: 480.0 bytes


In [74]:
new_genre

Unnamed: 0,artist_genres,danceability,energy,acousticness,valence,song_popularity
0,"['permanent wave', 'pop']",0.51225,0.61725,0.356947,0.2925,84.0
1,"['new romantic', 'new wave', 'new wave pop', '...",0.573,0.902,0.018,0.876,84.0
2,"['east coast hip hop', 'gangster rap', 'hip ho...",0.899,0.713,0.255,0.777,82.0
3,"['art pop', 'electropop', 'pop']",0.701,0.425,0.328,0.562,82.0
4,"['rap', 'slap house']",0.834,0.73,0.00513,0.446,81.0
5,"['art rock', 'dance rock', 'europop', 'mellow ...",0.692,0.711,0.225,0.875,81.0
6,"['modern rock', 'rock']",0.542,0.905,0.00172,0.374,80.0
7,"['album rock', 'classic rock', 'hard rock', 'm...",0.4885,0.6095,0.16,0.4685,79.5
8,"['funk', 'quiet storm', 'soul']",0.527,0.415,0.457,0.515,79.0
9,"['dance pop', 'europop', 'girl group']",0.768,0.864,0.101,0.891,79.0


In [75]:
#top10_genres= genre_df.nlargest(10, 'song_popularity')

fig = px.bar(new_genre, x='artist_genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')


fig.update_layout(
    #yaxis = dict(
        #tickmode = 'linear',
        #tick0 = 0,
        #dtick = 0.2
    )
 )
fig.show()
        

In [80]:
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=9, random_state=0))])
X = genre_df.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_df['cluster'] = cluster_pipeline.predict(X)

In [81]:
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_df['artist_genres']
projection['cluster'] = genre_df['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1099 samples in 0.003s...
[t-SNE] Computed neighbors for 1099 samples in 0.061s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1099
[t-SNE] Computed conditional probabilities for sample 1099 / 1099
[t-SNE] Mean sigma: 0.624392
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.980080
[t-SNE] KL divergence after 1000 iterations: 0.589347


In [78]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False, random_state=0))
                                 ], verbose=False)

X = clean_data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
clean_data['cluster_label'] = song_cluster_labels

In [79]:
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = clean_data['song_name']
projection['cluster'] = clean_data['cluster_label']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()