In [2]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('./data/sample_music')
data.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,uri,track_href,analysis_url,duration_ms,time_signature,song_name,artist_name,artist_genres,artist_pop,song_popularity
0,0,0.482,0.721,0,-6.839,1,0.0321,0.731,0.0,0.189,...,spotify:track:3AhXZa8sUQht0UEdBJgpGc,https://api.spotify.com/v1/tracks/3AhXZa8sUQht...,https://api.spotify.com/v1/audio-analysis/3AhX...,369600,4,Like a Rolling Stone,Bob Dylan,"['classic rock', 'country rock', 'folk', 'folk...",72,68
1,1,0.485,0.863,1,-9.027,1,0.0495,1.2e-05,0.0162,0.138,...,spotify:track:3oTlkzk1OtrhH8wBAduVEi,https://api.spotify.com/v1/tracks/3oTlkzk1Otrh...,https://api.spotify.com/v1/audio-analysis/3oTl...,300977,4,Smells Like Teen Spirit,Nirvana,"['grunge', 'permanent wave', 'rock']",82,0
2,2,0.364,0.457,4,-14.162,0,0.0675,0.29,0.000106,0.922,...,spotify:track:3ZFBeIyP41HhnALjxWy1pR,https://api.spotify.com/v1/tracks/3ZFBeIyP41Hh...,https://api.spotify.com/v1/audio-analysis/3ZFB...,337413,4,A Day In The Life - Remastered,The Beatles,"['beatlesque', 'british invasion', 'classic ro...",84,0
3,3,0.398,0.413,1,-10.934,1,0.0388,0.0822,2.5e-05,0.0891,...,spotify:track:5Qt4Cc66g24QWwGP3YYV9y,https://api.spotify.com/v1/tracks/5Qt4Cc66g24Q...,https://api.spotify.com/v1/audio-analysis/5Qt4...,219147,4,Good Vibrations (Mono),The Beach Boys,"['baroque pop', 'classic rock', 'psychedelic r...",78,20
4,4,0.518,0.756,10,-10.851,1,0.0915,0.735,6.2e-05,0.317,...,spotify:track:7MH2ZclofPlTrZOkPzZKhK,https://api.spotify.com/v1/tracks/7MH2ZclofPlT...,https://api.spotify.com/v1/audio-analysis/7MH2...,160893,4,Johnny B Goode,Chuck Berry,"['classic rock', 'rock', 'rock-and-roll', 'roc...",75,0


In [4]:
data.drop(columns='Unnamed: 0',inplace=True)

In [5]:
#function to turn artist genres string to list
def genre_list(self):
    split = self.strip().split(',')
    clean = [x.strip(" [']") for x in split]
    return clean

In [6]:
# convert the df to list
data['artist_genres'] = data['artist_genres'].apply(lambda x: genre_list(x))

In [7]:
#finding all the genres in the data set
all_genres = []
for row in data.artist_genres:
    for x in row:
        if x not in all_genres:
            all_genres.append(x)
        
print(all_genres)
    

['classic rock', 'country rock', 'folk', 'folk rock', 'rock', 'roots rock', 'singer-songwriter', 'grunge', 'permanent wave', 'beatlesque', 'british invasion', 'merseybeat', 'psychedelic rock', 'baroque pop', 'sunshine pop', 'rock-and-roll', 'rockabilly', 'classic soul', 'jazz blues', 'memphis soul', 'soul', 'southern soul', 'adult standards', 'brill building pop', 'classic girl group', 'funk', 'indie r&b', 'motown', 'northern soul', 'quiet storm', 'pop', 'r&b', 'heartland rock', 'mellow gold', 'album rock', 'art rock', 'blues rock', 'hard rock', 'alternative rock', 'madchester', 'new wave', 'post-punk', 'uk post-punk', 'soul blues', 'atl hip hop', 'dirty south rap', 'hip hop', 'old school atlanta hip hop', 'pop rap', 'rap', 'southern hip hop', 'punk', 'dance pop', 'indietronica', 'new rave', 'bronx hip hop', 'electro', 'old school hip hop', 'piano blues', 'vocal jazz', 'glam rock', 'funk rock', 'minneapolis sound', 'synth funk', 'britpop', 'indie rock', 'pop rock', 'sheffield indie', '

In [8]:
data.head()


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,uri,track_href,analysis_url,duration_ms,time_signature,song_name,artist_name,artist_genres,artist_pop,song_popularity
0,0.482,0.721,0,-6.839,1,0.0321,0.731,0.0,0.189,0.557,...,spotify:track:3AhXZa8sUQht0UEdBJgpGc,https://api.spotify.com/v1/tracks/3AhXZa8sUQht...,https://api.spotify.com/v1/audio-analysis/3AhX...,369600,4,Like a Rolling Stone,Bob Dylan,"[classic rock, country rock, folk, folk rock, ...",72,68
1,0.485,0.863,1,-9.027,1,0.0495,1.2e-05,0.0162,0.138,0.767,...,spotify:track:3oTlkzk1OtrhH8wBAduVEi,https://api.spotify.com/v1/tracks/3oTlkzk1Otrh...,https://api.spotify.com/v1/audio-analysis/3oTl...,300977,4,Smells Like Teen Spirit,Nirvana,"[grunge, permanent wave, rock]",82,0
2,0.364,0.457,4,-14.162,0,0.0675,0.29,0.000106,0.922,0.175,...,spotify:track:3ZFBeIyP41HhnALjxWy1pR,https://api.spotify.com/v1/tracks/3ZFBeIyP41Hh...,https://api.spotify.com/v1/audio-analysis/3ZFB...,337413,4,A Day In The Life - Remastered,The Beatles,"[beatlesque, british invasion, classic rock, m...",84,0
3,0.398,0.413,1,-10.934,1,0.0388,0.0822,2.5e-05,0.0891,0.331,...,spotify:track:5Qt4Cc66g24QWwGP3YYV9y,https://api.spotify.com/v1/tracks/5Qt4Cc66g24Q...,https://api.spotify.com/v1/audio-analysis/5Qt4...,219147,4,Good Vibrations (Mono),The Beach Boys,"[baroque pop, classic rock, psychedelic rock, ...",78,20
4,0.518,0.756,10,-10.851,1,0.0915,0.735,6.2e-05,0.317,0.968,...,spotify:track:7MH2ZclofPlTrZOkPzZKhK,https://api.spotify.com/v1/tracks/7MH2ZclofPlT...,https://api.spotify.com/v1/audio-analysis/7MH2...,160893,4,Johnny B Goode,Chuck Berry,"[classic rock, rock, rock-and-roll, rockabilly]",75,0


In [16]:
#combining genres with music feature data average into a new df
genre_features_input = {}
genre_features = {}
#list of features were looking for
features_input = ['danceability','energy','loudness','speechiness','acousticness','liveness','valence']


#loop through df
for i, r in data.iterrows():

    #loop through each genre
    for genre in r['artist_genres']:

        #create a dict for the genre if it isnt made
        if genre not in genre_features_input.keys():
            genre_features_input[genre] = []

            #start adding the features
            for feature in features_input:

                #create dict for features
                if feature not in genre_features.keys():
                    genre_features[feature] = []
                    genre_features[feature].append(r[feature])
                
            genre_features_input[genre].append(genre_features)
        #else add onto the generated genre key
        else:
            for genre_info in genre_features_input.values():
                for dicts in genre_info:
                    for key,v in dicts.items():
                        for feature in features_input:
                            if key == feature:
                                dicts[key].append(r[feature])
                    
                    
                
        genre_features = {}
        


In [None]:
#get the average of all the variables that was gathered from each artists songs associated with the genre
for genre_key in genre_features_input.values():
    for item in genre_key:
        for k,v in item.items():
            # print(v[:10])
            new = np.mean(v)
            item[k] = new
            
            


In [15]:
print(genre_features_input['cyberpunk'])

[{'danceability': 0.5605952532653711, 'energy': 0.6146990825103535, 'loudness': -9.368419560369544, 'speechiness': 0.06508690665817139, 'acousticness': 0.28670849553679517, 'liveness': 0.1946907613889774, 'valence': 0.5889797387703091}]
