In [1]:
# imports
import pandas as pd

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Lambda, Softmax
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import top_k_categorical_accuracy

from sklearn.preprocessing import StandardScaler

import joblib

In [0]:
# dataframe the data
df = pd.read_csv('/content/SpotifyTracks_doubleforloop_genre_year.csv',index_col=[0])

In [3]:
# check data
df.head(10)

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Gorillaz,On Melancholy Hill,0q6LuUqGLUiCPP1cbdwFs3,75,2010,alternative,0,1.5e-05,0.689,233867,0.739,0.509,2,0.064,-5.81,1,0.026,120.423,4,0.578
1,Avenged Sevenfold,Nightmare,4UEo1b0wWrtHMC8bVqPiH8,70,2010,alternative,1,0.000318,0.554,374453,0.949,0.0001,2,0.2,-4.928,1,0.0787,129.984,4,0.233
2,The Black Keys,Howlin' for You,0grFc6klR3hxoHLcgCYsF4,66,2010,alternative,2,0.028,0.705,191800,0.735,0.0783,11,0.112,-6.646,1,0.0931,132.627,4,0.448
3,My Darkest Days,Porn Star Dancing,3Q8zopc4ABXhysDb1sgLVW,65,2010,alternative,3,0.0139,0.477,199013,0.917,0.0,2,0.0756,-3.399,1,0.0837,160.044,4,0.271
4,Volbeat,A Warrior's Call,0hTiTU0yqthnByyZDD3bcc,62,2010,alternative,4,0.00075,0.374,263080,0.903,6e-06,10,0.244,-4.49,1,0.0825,109.118,3,0.429
5,Sick Puppies,You're Going Down,5omWAB5iNMHvbAfBSzkdu8,63,2010,alternative,5,0.0116,0.488,187347,0.842,0.00127,6,0.146,-5.926,1,0.0469,90.003,4,0.41
6,Godsmack,Cryin' Like A Bitch!!,4a9i7rCLfPjbS1sNamZeQN,65,2010,alternative,6,7.2e-05,0.543,201667,0.808,0.00336,10,0.114,-5.484,1,0.0407,95.19,4,0.472
7,Avenged Sevenfold,Welcome to the Family,0jqblvsI9LBY4irmLVqqEO,65,2010,alternative,7,0.00267,0.567,245573,0.946,0.0,2,0.0567,-3.973,0,0.0935,94.985,4,0.699
8,Disturbed,Down with the Sickness,4ImIJRZNJhNQLLdUFSYJoS,66,2010,alternative,8,0.000319,0.649,278707,0.896,1.2e-05,10,0.102,-2.704,0,0.056,90.009,4,0.924
9,LCD Soundsystem,Dance Yrself Clean,2cmRpmO04TLaKPzmAzySYZ,64,2010,alternative,9,0.00557,0.739,536471,0.611,0.725,11,0.04,-9.829,1,0.0622,98.004,4,0.794


In [0]:
# create onehots for the genre column, add them on
genreframe = pd.concat([df,pd.get_dummies(df['genre'], prefix='genre')],axis=1)

In [5]:
# check
genreframe.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre_alternative,genre_country,genre_dance,genre_folk,genre_grunge,genre_indie,genre_jazz,genre_metal,genre_pop,genre_punk,genre_rap,genre_rock
0,Gorillaz,On Melancholy Hill,0q6LuUqGLUiCPP1cbdwFs3,75,2010,alternative,0,1.5e-05,0.689,233867,0.739,0.509,2,0.064,-5.81,1,0.026,120.423,4,0.578,1,0,0,0,0,0,0,0,0,0,0,0
1,Avenged Sevenfold,Nightmare,4UEo1b0wWrtHMC8bVqPiH8,70,2010,alternative,1,0.000318,0.554,374453,0.949,0.0001,2,0.2,-4.928,1,0.0787,129.984,4,0.233,1,0,0,0,0,0,0,0,0,0,0,0
2,The Black Keys,Howlin' for You,0grFc6klR3hxoHLcgCYsF4,66,2010,alternative,2,0.028,0.705,191800,0.735,0.0783,11,0.112,-6.646,1,0.0931,132.627,4,0.448,1,0,0,0,0,0,0,0,0,0,0,0
3,My Darkest Days,Porn Star Dancing,3Q8zopc4ABXhysDb1sgLVW,65,2010,alternative,3,0.0139,0.477,199013,0.917,0.0,2,0.0756,-3.399,1,0.0837,160.044,4,0.271,1,0,0,0,0,0,0,0,0,0,0,0
4,Volbeat,A Warrior's Call,0hTiTU0yqthnByyZDD3bcc,62,2010,alternative,4,0.00075,0.374,263080,0.903,6e-06,10,0.244,-4.49,1,0.0825,109.118,3,0.429,1,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# features to use for genre inference
genreframe.iloc[0,7:20]

acousticness        1.51e-05
danceability           0.689
duration_ms           233867
energy                 0.739
instrumentalness       0.509
key                        2
liveness               0.064
loudness               -5.81
mode                       1
speechiness            0.026
tempo                120.423
time_signature             4
valence                0.578
Name: 0, dtype: object

In [7]:
# get the features for genre inference, scale them.
X_train = genreframe.iloc[:,7:20].to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_scaled

array([[-0.81353437,  0.86273319,  0.01422737, ..., -0.07457545,
         0.21919423,  0.52020022],
       [-0.81253793,  0.02764277,  1.61395879, ...,  0.24680209,
         0.21919423, -0.95936143],
       [-0.72147348,  0.96170687, -0.46445403, ...,  0.33564226,
         0.21919423, -0.03731576],
       ...,
       [ 0.35424623,  1.02975127,  0.17080273, ..., -0.0196848 ,
         0.21919423,  1.42080297],
       [-0.80624809, -0.62187201, -1.57989421, ..., -1.07440547,
         0.21919423,  1.22781667],
       [-0.42211418,  1.23388449, -0.24267685, ..., -0.99696024,
         0.21919423, -0.50048289]])

In [8]:
# pickle the scaler
joblib.dump(scaler, 'genre_NN_scaler')

['genre_NN_scaler']

In [9]:
# genre list
genres = genreframe.genre.unique().tolist()
genres

['alternative',
 'country',
 'dance',
 'folk',
 'grunge',
 'indie',
 'jazz',
 'metal',
 'pop',
 'punk',
 'rap',
 'rock']

In [10]:
# get the onehots for targets
onehots = genreframe.iloc[:,20:]
y_train = onehots.to_numpy()
y_train.shape

(164449, 12)

In [141]:
# define the model
model = Sequential()

# input
model.add(Dense(26, input_dim=13, activation='relu'))
# hidden
model.add(Dropout(0.2))
# hidden
model.add(Dense(26, activation='relu'))
# hidden
model.add(Dropout(0.2))
# hidden
model.add(Dense(26, activation='relu'))
# hidden
model.add(Dropout(0.2))
# map down to number of classes
model.add(Dense(12,activation='relu'))
# set the softmax temperature 
model.add(Lambda(lambda x: x / 10.0))
# softmax layer
model.add(Softmax())

def top_3_accuracy(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=3)

#Compile
model.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 26)                364       
_________________________________________________________________
dropout_9 (Dropout)          (None, 26)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 26)                702       
_________________________________________________________________
dropout_10 (Dropout)         (None, 26)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 26)                702       
_________________________________________________________________
dropout_11 (Dropout)         (None, 26)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 12)               

In [142]:
# Fit the model
history = model.fit(X_scaled, y_train, validation_split=0.2, epochs=1000, batch_size=2048)

Train on 131559 samples, validate on 32890 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 

In [166]:
# take a dataframe row, make it an observation
obs = scaler.transform(genreframe.iloc[62069:62070,7:20])
obs

array([[-0.31026564, -0.14556117, -0.02462062, -0.85104841, -0.41424087,
         0.49260707, -0.60230363, -0.07112041, -1.39063474, -0.65527578,
         1.65594226,  0.21919423,  1.25783676]])

In [167]:
# predict from the observation, look at the output
pred = model.predict(obs)
pred

array([[0.1134963 , 0.18399121, 0.03622173, 0.28088233, 0.0631346 ,
        0.09686016, 0.08357186, 0.02284853, 0.03707801, 0.05090391,
        0.00469987, 0.02631152]], dtype=float32)

In [173]:
# look at the actual content of the row
genreframe.iloc[62069:62070,:].to_numpy()

array([['Sharon Van Etten', 'Taking Chances', '1zqFc9MSJQhFmhCe7xyT5D',
        39, 2014, 'folk', 102563, 0.153, 0.526, 230453, 0.462, 0.0139, 7,
        0.1, -7.722, 0, 0.0262, 171.90599999999998, 4, 0.75, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0]], dtype=object)

In [0]:
genre_list = ['alternative', 'country', 'dance', 'folk', 'grunge', 'indie', 'jazz', 'metal', 'pop', 'punk', 'rap', 'rock']

def best_3(genre_vector):
    """takes a genre vector and returns the 3 most-fit genres as strings."""
    vector_list = genre_vector.tolist()[0]

    best3_tuples =  sorted(zip(vector_list, genre_list), reverse=True)[:3]
    best3_genres = [x[1] for x in best3_tuples]

    return best3_genres

In [170]:
# best 3 genres
best_3(pred)

['folk', 'country', 'alternative']

In [0]:
# save model
model.save('genre_NN')