# Dissecting Spotify Valence

---

>Stogiannidis Ilias Marios  <br />
>Department of Informatics  <br />
>Athens University of Economics and Business  <br />
>stoyianel@gmail.com

### Extracting data from a spotify playlist 

In [364]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

### Setting up the Spotify API*

For storing our credentials, we'll create a file spotify_config.py with the following contents:

In [None]:
config = {
    'client_id' : 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    'client_secret' :'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
}

In [365]:
from spotify_config import config

In [366]:
#Authentication - without user
client_credentials_manager = SpotifyClientCredentials(client_id=config["cid"], client_secret=config["csecret"])
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [367]:
playlist_creator = "Susanna Ketola"
playlist_id = "4rnleEAOdmFAbRcNCgZMpY"

* We are using the function below to extract the data from the spotify api and storing it to a csv

In [368]:
playlist_features_list = ["artist", "album", "track_name", "track_id","acousticness",
                             "danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]

In [369]:
def analyze_playlist(creator, playlist_id):
    
    # Create empty dataframe
    playlist_features_list = ["artist", "album", "track_name", "track_id","acousticness",
                             "danceability", "energy", "key", "loudness", "mode", "speechiness",
                             "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature"]
    playlist_df = pd.DataFrame(columns = playlist_features_list)
    
    # Create empty dict
    playlist_features = {}
    
    # Loop through every track in the playlist, extract features and append the features to the playlist df
    results = sp.user_playlist_tracks(creator,playlist_id)
    tracks = results['items']
    while results['next']:
        results = sp.next(results)
        tracks.extend(results['items'])
    for track in tracks:
        # Get metadata
        playlist_features["artist"] = track["track"]["album"]["artists"][0]["name"]
        playlist_features["album"] = track["track"]["album"]["name"]
        playlist_features["track_name"] = track["track"]["name"]
        playlist_features["track_id"] = track["track"]["id"]
        # Get audio features
        audio_features = sp.audio_features(playlist_features["track_id"])[0]
        for feature in playlist_features_list[4:]:
            playlist_features[feature] = audio_features[feature]
        
        # Concat the dfs
        track_df = pd.DataFrame(playlist_features, index = [0])
        playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
        
    return playlist_df

* the code in comments below is used to store the dataframe to a csv 

In [370]:
#playlist_df = analyze_playlist(playlist_creator,playlist_id)
#playlist_df.to_csv('playlist_features.csv')

In [371]:
playlist_df = pd.read_csv("playlist_features.csv")
playlist_df.drop(columns = ["Unnamed: 0"], inplace = True)

In [372]:
playlist_df = playlist_df.astype({'acousticness': 'float64','danceability': 'float64', 'energy': 'float64', 'key': 'float64', 'loudness': 'float64', 'mode': 'float64', 'speechiness': 'float64', 'instrumentalness': 'float64', 'liveness': 'float64', 'valence': 'float64', 'tempo': 'float64', 'duration_ms': 'float64', 'time_signature': 'float64'})

### Q1: Expore which Features Influence Valence

In [373]:
playlist_df.corr(method="pearson").style.background_gradient(cmap='coolwarm')

Unnamed: 0,acousticness,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
acousticness,1.0,-0.128429,-0.525334,-0.00016,-0.409016,0.03157,0.021099,-0.017128,-0.099146,-0.152342,-0.120919,-0.053462,-0.090135
danceability,-0.128429,1.0,-0.017655,-0.005442,0.009181,-0.032772,0.188258,-0.004625,-0.076984,0.374874,-0.137144,-0.193455,0.114482
energy,-0.525334,-0.017655,1.0,0.006415,0.689456,-0.03581,-0.081145,0.081076,0.16609,0.354886,0.141599,0.058517,0.082322
key,-0.00016,-0.005442,0.006415,1.0,-0.006232,-0.153581,0.019253,0.016174,0.015445,0.024043,0.012406,-0.001163,-0.004152
loudness,-0.409016,0.009181,0.689456,-0.006232,1.0,-0.014824,-0.112025,-0.05191,0.096353,0.208928,0.066475,0.027564,0.044661
mode,0.03157,-0.032772,-0.03581,-0.153581,-0.014824,1.0,-0.007684,-0.014467,-0.026455,-0.03352,0.004752,-0.014321,-0.027352
speechiness,0.021099,0.188258,-0.081145,0.019253,-0.112025,-0.007684,1.0,-0.057842,0.067565,0.021094,0.098401,-0.134268,0.032836
instrumentalness,-0.017128,-0.004625,0.081076,0.016174,-0.05191,-0.014467,-0.057842,1.0,0.004481,-0.053718,0.037712,0.003018,-0.049221
liveness,-0.099146,-0.076984,0.16609,0.015445,0.096353,-0.026455,0.067565,0.004481,1.0,0.02819,0.024986,0.025322,0.017061
valence,-0.152342,0.374874,0.354886,0.024043,0.208928,-0.03352,0.021094,-0.053718,0.02819,1.0,0.026833,-0.110459,0.067634


In [374]:
model = ols("valence ~ acousticness + danceability + energy + key + loudness + mode + speechiness + instrumentalness + liveness + tempo + duration_ms", data = playlist_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,valence,R-squared:,0.3
Model:,OLS,Adj. R-squared:,0.299
Method:,Least Squares,F-statistic:,205.9
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.0
Time:,17:01:19,Log-Likelihood:,1260.1
No. Observations:,5295,AIC:,-2496.0
Df Residuals:,5283,BIC:,-2417.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4479,0.036,-12.518,0.000,-0.518,-0.378
acousticness,0.1362,0.016,8.700,0.000,0.106,0.167
danceability,0.6750,0.021,32.655,0.000,0.635,0.716
energy,0.6730,0.024,28.088,0.000,0.626,0.720
key,0.0015,0.001,1.998,0.046,2.77e-05,0.003
loudness,-0.0095,0.002,-6.016,0.000,-0.013,-0.006
mode,-0.0032,0.005,-0.607,0.544,-0.014,0.007
speechiness,-0.1090,0.028,-3.838,0.000,-0.165,-0.053
instrumentalness,-0.1844,0.022,-8.485,0.000,-0.227,-0.142

0,1,2,3
Omnibus:,49.008,Durbin-Watson:,1.764
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.618
Skew:,-0.117,Prob(JB):,6.78e-09
Kurtosis:,2.66,Cond. No.,3450000.0


In [375]:
aov_table = sm.stats.anova_lm(model, typ=2)
aov_table.sort_values(by = "F", ascending = False)

Unnamed: 0,sum_sq,df,F,PR(>F)
danceability,38.877882,1.0,1066.339138,3.230166e-213
energy,28.764622,1.0,788.953525,6.458495e-162
acousticness,2.759543,1.0,75.688509,4.3752900000000004e-18
instrumentalness,2.624916,1.0,71.995978,2.7682920000000002e-17
loudness,1.319615,1.0,36.194285,1.906041e-09
duration_ms,0.874203,1.0,23.977563,1.003682e-06
speechiness,0.537033,1.0,14.7297,0.0001255252
tempo,0.423393,1.0,11.612793,0.0006598904
key,0.145584,1.0,3.993054,0.04573934
mode,0.013424,1.0,0.368191,0.544018


Function `process_subset()` takes the dependent variable, the observations, and the set of columns, fits a model, and returns the fitted model.

In [376]:
def process_subset(y, data, feature_set):
    X = data.loc[:, feature_set].values
    X = sm.add_constant(X)
    names = ['intercept']
    names.extend(feature_set)
    model = sm.OLS(y, X)
    model.data.xnames = names
    regr = model.fit()
    return regr

Function `forward_add_variable()` finds the best variable to add at each step.

In [377]:
def forward_add_variable(data, exog, selected, to_select):
    best_rsquared = 0
    best_model = None
    best_column = None
    y = data.loc[:, exog]
    
    for column in to_select:
        new_selected = selected + [column]
        regr = process_subset(y, data, new_selected)
        if regr.rsquared > best_rsquared:
            best_rsquared = regr.rsquared
            best_model = regr
            best_column = column
    
    return best_model, best_column

Function `forward_stepwise_selection()` that just does a loop adding a variable at each iteration.

In [378]:
def forward_stepwise_selection(data, exog):

    best_models = []
    best_model = None
    selected = []
    to_select = [ x for x in data.columns if x != exog ]

    p = len(to_select) + 1

    for i in range(1, p):
        print(f'Finding the best model for {i} variable{"s" if i > 1 else ""}')
        model, best_column = forward_add_variable(data, exog, selected, to_select)
        selected.append(best_column)
        to_select.remove(best_column)
        if not best_model or model.rsquared_adj > best_model.rsquared_adj:
            best_model = model
        print(selected)
        best_models.append(model)
        
    print(f'Fitted {1 + p*(p+1)//2} models')
    return best_model, best_models

In [379]:
best_model, _ = forward_stepwise_selection(playlist_df.drop(columns=["track_id", "track_name", "album", "artist"]), 'valence')
print('Best overall model:', len(best_model.model.exog_names), best_model.model.exog_names)

Finding the best model for 1 variable
['danceability']
Finding the best model for 2 variables
['danceability', 'energy']
Finding the best model for 3 variables
['danceability', 'energy', 'acousticness']
Finding the best model for 4 variables
['danceability', 'energy', 'acousticness', 'instrumentalness']
Finding the best model for 5 variables
['danceability', 'energy', 'acousticness', 'instrumentalness', 'loudness']
Finding the best model for 6 variables
['danceability', 'energy', 'acousticness', 'instrumentalness', 'loudness', 'duration_ms']
Finding the best model for 7 variables
['danceability', 'energy', 'acousticness', 'instrumentalness', 'loudness', 'duration_ms', 'speechiness']
Finding the best model for 8 variables
['danceability', 'energy', 'acousticness', 'instrumentalness', 'loudness', 'duration_ms', 'speechiness', 'tempo']
Finding the best model for 9 variables
['danceability', 'energy', 'acousticness', 'instrumentalness', 'loudness', 'duration_ms', 'speechiness', 'tempo', 'k

In [380]:
best_model.summary()

0,1,2,3
Dep. Variable:,valence,R-squared:,0.3
Model:,OLS,Adj. R-squared:,0.299
Method:,Least Squares,F-statistic:,251.7
Date:,"Thu, 17 Mar 2022",Prob (F-statistic):,0.0
Time:,17:01:20,Log-Likelihood:,1259.9
No. Observations:,5295,AIC:,-2500.0
Df Residuals:,5285,BIC:,-2434.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-0.4503,0.035,-12.734,0.000,-0.520,-0.381
danceability,0.6751,0.021,32.831,0.000,0.635,0.715
energy,0.6738,0.024,28.332,0.000,0.627,0.720
acousticness,0.1361,0.016,8.694,0.000,0.105,0.167
instrumentalness,-0.1843,0.022,-8.483,0.000,-0.227,-0.142
loudness,-0.0095,0.002,-6.029,0.000,-0.013,-0.006
duration_ms,-3.07e-07,6.28e-08,-4.886,0.000,-4.3e-07,-1.84e-07
speechiness,-0.1085,0.028,-3.838,0.000,-0.164,-0.053
tempo,0.0003,0.000,3.400,0.001,0.000,0.001

0,1,2,3
Omnibus:,48.802,Durbin-Watson:,1.764
Prob(Omnibus):,0.0,Jarque-Bera (JB):,37.505
Skew:,-0.117,Prob(JB):,7.18e-09
Kurtosis:,2.661,Cond. No.,3430000.0


* As we see from the models above the features that mostly influence the valence are danceability and energy \newline
* Although other features seem to have some small influence too.

### Q2: Predict Valence

Using Machine Learning techniques to predict valence based on track features:

In [381]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lars
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import SCORERS
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

Scaling the data

In [382]:
playlist_df.drop(columns=["track_id", "track_name", "album", "artist"], inplace=True)
X = playlist_df.drop(columns = ["valence"])
y = playlist_df["valence"]
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Using the Lars regression model

In [383]:
lars = Lars()
random_grid = {'n_nonzero_coefs': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'fit_intercept': [True, False],
            'verbose': [True, False],
            'normalize': [True, False],
            'precompute': [True, False],
            'copy_X': [True, False],
            'fit_path': [True, False]}
random_search = RandomizedSearchCV(estimator = lars, param_distributions = random_grid, n_iter = 30, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring = 'r2')
random_search.fit(xtrain, ytrain)
print(random_search.best_params_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
{'verbose': False, 'precompute': False, 'normalize': True, 'n_nonzero_coefs': 10, 'fit_path': False, 'fit_intercept': True, 'copy_X': True}


In [384]:
best_lars = random_search.best_estimator_
best_lars.fit(xtrain, ytrain)
lars_pred = best_lars.predict(xtest)
print("R2: {:.3f}".format(r2_score(ytest, lars_pred)))
print("MAE: {:.3f} " .format(mean_absolute_error(ytest, lars_pred)))

R2: 0.293
MAE: 0.154 


### Using the random forest regression model

In [385]:
grid = {'bootstrap': [True, False],
 'max_depth': [80, 90, 100, 110],
 'max_features': [2, 3],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 400, 600, 800,1000]}

In [386]:
rfr = RandomForestRegressor()
random = RandomizedSearchCV(estimator = rfr, param_distributions = grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1,scoring = 'r2')
searchResults = random.fit(xtrain, ytrain)
print(searchResults.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 3, 'max_depth': 100, 'bootstrap': False}


In [387]:
bestRFR = searchResults.best_estimator_
y_pred = bestRFR.predict(xtest)
print("R2: {:.3f}".format(r2_score(ytest, y_pred)))
print("MAE: {:.3f} ".format(mean_absolute_error(ytest, y_pred)))

R2: 0.389
MAE: 0.141 


### Using the XGB reggresion model

In [388]:
grid_xgb = {'booster': ['gbtree', 'gblinear', 'dart'],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'gamma': [0.0, 0.1, 0.2],
    'learning_rate': [0.01, 0.02, 0.03, 0.04],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3, 4]}

In [389]:
xgb_model = xgb.XGBRegressor()
random = RandomizedSearchCV(estimator = xgb_model, param_distributions = grid_xgb, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)
searchResults = random.fit(xtrain, ytrain)
print(searchResults.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.04, 'gamma': 0.0, 'colsample_bytree': 0.6, 'booster': 'gbtree'}


In [390]:
bestXGB = searchResults.best_estimator_
y_pred = bestXGB.predict(xtest)
print("R2: {:.3f}".format(r2_score(ytest, y_pred)))
print("MAE: {:.3f} ".format(mean_absolute_error(ytest, y_pred)))

R2: 0.346
MAE: 0.149 


In [391]:
random_grid = {'bootstrap': [True, False],
  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
  'max_features': ['auto', 'sqrt'],
  'min_samples_leaf': [1, 2, 4],
  'min_samples_split': [2, 5, 10],
  'n_estimators': [200, 400, 500, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

### Using the Extra Trees reggresion model

In [393]:
#import extratrees 
from sklearn.ensemble import ExtraTreesRegressor
et = ExtraTreesRegressor()
et_random = RandomizedSearchCV(estimator = et, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)
et_random.fit(xtrain, ytrain)
print(searchResults.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
{'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.04, 'gamma': 0.0, 'colsample_bytree': 0.6, 'booster': 'gbtree'}


In [394]:
bestET = et_random.best_estimator_
y_pred = bestET.predict(xtest)
print("R2: {:.3f}".format(r2_score(ytest, y_pred)))
print("MAE: {:.3f} ".format(mean_absolute_error(ytest, y_pred)))

R2: 0.382
MAE: 0.140 


### Creating a Neural Network for Reggresion to predict valence

In [395]:
import tensorflow as tf
import keras
import numpy as np
from keras import layers
from keras import models
from keras import utils
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Activation
from keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from keras.callbacks import LearningRateScheduler
from keras.callbacks import History
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from keras.callbacks import ModelCheckpoint
import scikeras
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

In [396]:
epochs=1000
es = tf.keras.callbacks.EarlyStopping(monitor='mean_absolute_error', mode='min', verbose=1, patience=20)

In [397]:
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(xtrain))

In [409]:
def build_and_compile_model(norm):
    model = keras.Sequential([
        norm,
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),
        layers.Dense(1, activation='linear')
    ])
    model.compile(loss='mean_absolute_error', optimizer=SGD(0.1), metrics=['mean_absolute_error'])
    return model

In [410]:
with tf.device('/device:GPU:0'):
    dnn = build_and_compile_model(normalizer)
    dnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_2 (Normalizat  (None, 12)               25        
 ion)                                                            
                                                                 
 dense_18 (Dense)            (None, 32)                416       
                                                                 
 dense_19 (Dense)            (None, 16)                528       
                                                                 
 dense_20 (Dense)            (None, 8)                 136       
                                                                 
 dense_21 (Dense)            (None, 1)                 9         
                                                                 
Total params: 1,114
Trainable params: 1,089
Non-trainable params: 25
___________________________________________________

In [None]:
callbacks_list = [es]

In [411]:
# Fit the model
input_dim = xtrain.shape[1]
batch_size = int(input_dim/100)
with tf.device('/device:GPU:0'):
    lr_model_history = dnn.fit(xtrain, ytrain,
                        batch_size = batch_size,
                        epochs=1000,
                        verbose=1,
                        validation_data=(xtest, ytest),
                        callbacks=callbacks_list)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [412]:
predictions = dnn.predict(xtest)
print("R2: {:.3f}".format(r2_score(ytest, predictions)))
print("MAE: {:.3f} ".format(mean_absolute_error(ytest, predictions)))


R2: 0.070
MAE: 0.168 


### Predicting valence from the data given 


#### Reading the song ids

In [None]:
test_song = pd.read_csv('spotify_ids.txt', header=None)
test_song = test_song.iloc[:,0]
test_data = pd.DataFrame()
for song in test_song:
    test_data = test_data.append(sp.audio_features(song))
test_data.head()
    

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.585,0.436,10,-8.761,1,0.0601,0.721,1.3e-05,0.105,0.132,143.874,audio_features,7lPN2DXiMsVn7XUKtOW1CS,spotify:track:7lPN2DXiMsVn7XUKtOW1CS,https://api.spotify.com/v1/tracks/7lPN2DXiMsVn...,https://api.spotify.com/v1/audio-analysis/7lPN...,242014,4
0,0.68,0.826,0,-5.487,1,0.0309,0.0212,1.2e-05,0.543,0.644,118.051,audio_features,5QO79kh1waicV47BqGRL3g,spotify:track:5QO79kh1waicV47BqGRL3g,https://api.spotify.com/v1/tracks/5QO79kh1waic...,https://api.spotify.com/v1/audio-analysis/5QO7...,215627,4
0,0.514,0.73,1,-5.934,1,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,audio_features,0VjIjW4GlUZAMYd2vXMi3b,spotify:track:0VjIjW4GlUZAMYd2vXMi3b,https://api.spotify.com/v1/tracks/0VjIjW4GlUZA...,https://api.spotify.com/v1/audio-analysis/0VjI...,200040,4
0,0.731,0.573,4,-10.059,0,0.0544,0.401,5.2e-05,0.113,0.145,109.928,audio_features,4MzXwWMhyBbmu6hOcLVD49,spotify:track:4MzXwWMhyBbmu6hOcLVD49,https://api.spotify.com/v1/tracks/4MzXwWMhyBbm...,https://api.spotify.com/v1/audio-analysis/4MzX...,205090,4
0,0.907,0.393,4,-7.636,0,0.0539,0.451,1e-06,0.135,0.202,104.949,audio_features,5Kskr9LcNYa0tpt5f0ZEJx,spotify:track:5Kskr9LcNYa0tpt5f0ZEJx,https://api.spotify.com/v1/tracks/5Kskr9LcNYa0...,https://api.spotify.com/v1/audio-analysis/5Ksk...,205458,4


#### Keeping the useful variables and scaling the data

In [413]:
col = ['acousticness','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms','time_signature']
X, y = test_data[col], test_data['valence']

#### Making prredictions

In [420]:
predictions = dnn.predict(X)
eT_predictions = bestET.predict(X)
test_data['pred_valence_nn'] = predictions
test_data['pred_valence_et'] = eT_predictions
print("MAE for neural network: {:.3f} ".format(mean_absolute_error(test_data['valence'], predictions)))
print("MAE for Extra Tree : {:.3f} ".format(mean_absolute_error(test_data['valence'], rdf_predictions)))

MAE for neural network: 0.174 
MAE for Extra Tree : 0.140 


In [421]:
test_data[['valence','pred_valence_nn','pred_valence_et']]

Unnamed: 0,valence,pred_valence_nn,pred_valence_et
0,0.132,0.270897,0.132000
0,0.644,0.607005,0.624109
0,0.334,0.375980,0.412086
0,0.145,0.364640,0.145000
0,0.202,0.493048,0.526583
...,...,...,...
0,0.188,0.329584,0.422451
0,0.768,0.579179,0.486167
0,0.316,0.469767,0.576002
0,0.484,0.452780,0.520245


* Above we can see the mean absolute error for the two methods used 