# Spotify Song Classifier
*Predict genre using audio features.*

Author: Sam Spoerl<br>
Created: January 2021

In [None]:
# Standard library
import os

# Spotify API authorization
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Custom functions 
from utils import songs_and_features

# Plotting and manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# Scikit-learn
import sklearn.feature_selection as sk
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, LabelEncoder
from sklearn.metrics import adjusted_rand_score

## Data Collection

In [None]:
# Setup Spotify authorization object
# Credentials are read from the environment so real secrets never have to be
# typed into (and saved with) the notebook. SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET are the variable names spotipy documents for this.
# The placeholders are kept as a fallback when the variables are not set.
client_id = os.environ.get('SPOTIPY_CLIENT_ID', 'YOUR_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'YOUR_CLIENT_SECRET')
auth = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [None]:
# Playlist IDs to pass to helper function
# Only the bare base-62 playlist ID is kept: the "?si=..." suffix that the
# Spotify share dialog appends is a tracking token, not part of the ID.
playlist_ids = [
    '37i9dQZF1DWWEJlAGA9gs0', # Classical
    '37i9dQZF1DXbYM3nMM0oPk', # Pop
    '37i9dQZF1DXcF6B6QPhFDv' # Rock
]

In [None]:
# Spotify limits audio features to 100 tracks
# Call helper function from utils.py
# Call each playlist separately and collect the per-playlist frames
genre_frames = [songs_and_features(auth, playlist_id) for playlist_id in playlist_ids]

# Master dataframe
# One concat over the collected frames (instead of re-copying a growing frame
# on every iteration); ignore_index=True gives the fresh 0..n-1 row index.
# An empty playlist list still yields an empty frame, as before.
df_all_genres = pd.concat(genre_frames, ignore_index=True) if genre_frames else pd.DataFrame()

## Feature Selection

In [None]:
# Working copy of the master frame for feature selection
# "loudness" is replaced by its absolute value so the feature is positive
# Needs to be global: the helper functions below read df_fs directly
df_fs = df_all_genres.assign(loudness=lambda frame: frame.loudness.abs())

def selkb(k):
    ''' Helper function to test different values for K.

    Runs chi-squared SelectKBest on the global df_fs and returns the values of
    the k selected features as a DataFrame with default (integer) column labels.
    '''

    # Every column from position 4 onward is used as an input feature
    features = df_fs.iloc[:, 4:]

    # The playlist each track came from is the prediction variable
    target = df_fs['playlist_name']

    # Keep the k best-scoring features and wrap the resulting array in a frame
    selector = sk.SelectKBest(sk.chi2, k=k)
    return pd.DataFrame(selector.fit_transform(features, target))

In [None]:
def name_selected_features(k):
    """ Helper function to select the k best features, convert them to float,
    and name them after the columns they come from in the original dataset.

    Reads the global df_fs. The features are every column from position 4
    onward and the target is 'playlist_name' (the same split selkb() uses).

    Parameters
    ----------
    k : int or 'all'
        Number of top-scoring (chi-squared) features to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected feature columns, in their df_fs order and
        under their df_fs names, passed through pd.to_numeric(downcast='float'),
        followed by a 'playlist_name' column.
    """

    # Input: numeric features / prediction variable
    X = df_fs.iloc[:, 4:]
    y = df_fs['playlist_name']

    # Ask the fitted selector which input columns it kept, rather than
    # recovering the names afterwards by comparing column values
    selector = sk.SelectKBest(sk.chi2, k=k).fit(X, y)
    mask = selector.get_support()

    # Positional boolean mask: exactly one named column per selected feature,
    # even when two source columns happen to hold identical values.
    # Cast to float64 first so every column goes through the same float downcast.
    df_sel_float = X.loc[:, mask].astype('float64')
    df_sel_float = df_sel_float.apply(pd.to_numeric, downcast='float')

    # Add playlist name (same index as the features, so rows line up)
    df_sel_float['playlist_name'] = y

    return df_sel_float


## Cluster Model

In [None]:
def cluster(data, k, scaler=None, init='k-means++', random_state=42):
    ''' Helper function to scale and test different k values.

    Scales the numeric columns of `data`, fits KMeans on them and stores the
    predicted cluster label of every row in a 'cluster' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples to cluster. Only numeric columns are passed to the model.
        Modified in place: 'cluster' is inserted as the first column.
    k : int
        Number of clusters.
    scaler : transformer, optional
        Step placed in front of KMeans in the pipeline. Defaults to a new
        StandardScaler() per call, so no estimator instance is shared
        between calls.
    init : str
        Passed to KMeans as `init`.
    random_state : int
        Passed to KMeans as `random_state`.

    Returns
    -------
    (data, kmeans)
        The same `data` object with the 'cluster' column added, and the
        fitted KMeans instance.
    '''

    if scaler is None:
        scaler = StandardScaler()

    # A 'cluster' column left over from an earlier call is an output, not a
    # feature: drop it so the function can be re-run on the same frame
    if 'cluster' in data.columns:
        data.drop(columns='cluster', inplace=True)

    # Get only numeric fields
    dataNum = data.select_dtypes(include='number')

    # Create a KMeans instance with k clusters
    kmeans = KMeans(n_clusters=k, init=init, random_state=random_state)

    # Scale the data and pass kmeans instance
    pipeline = make_pipeline(scaler, kmeans)

    # Fit pipeline to samples, then predict cluster labels for the same samples
    pipeline.fit(dataNum)
    df_clusters = pipeline.predict(dataNum)

    # Add column to identify the cluster
    data.insert(loc=0, column='cluster', value=df_clusters)

    return data, kmeans

In [None]:
# Spotify has 15 audio features available
n_features = 15

# Encode the playlist names as integers; these are the true labels for evaluation
le = LabelEncoder()
labels_true = le.fit_transform(df_all_genres.playlist_name)

# One cluster per distinct class found by the encoder
n_clusters = le.classes_.shape[0]

In [None]:
# Result columns, filled in lockstep: one entry per (scaler, n features) pair
ars_list = []
scaler_list = []
features_list = []

# Dict of scaler names and objects for parameter tuning
# (a real dict, so the scalers are always tried in the order written here)
scaler_types = {'standard': StandardScaler(),
                'minmax': MinMaxScaler(),
                'normalizer': Normalizer()}

# Tune parameters by varying scaler type and number of features
for scaler_name, scaler in scaler_types.items():
    # k runs from 1 to n_features - 1 (1..14 with n_features = 15)
    for i in range(1, n_features):

        # Vary number of features (select k best)
        # A fresh frame is needed each time because cluster() adds a column to it
        df_sel = name_selected_features(i)

        # KMeans output
        df_cluster_results, kmeans = cluster(df_sel, n_clusters, scaler=scaler)

        # Adjusted rand score since true labels are known
        # 1 is perfect match, 0 or negative is poor
        ars = adjusted_rand_score(labels_true, kmeans.labels_)
        ars_list.append(round(ars, 3))

        scaler_list.append(scaler_name)
        features_list.append(i)


In [None]:
# Gather the tuning results into one frame for plotting and analysis
df_ars = pd.DataFrame({
    'n_features': features_list,
    'scaler': scaler_list,
    'adjusted_rand_score': ars_list,
})

# Pivot: one row per feature count, one column per scaler type
df_ars_pivot = df_ars.pivot(
    index='n_features',
    columns='scaler',
    values='adjusted_rand_score',
)

# Look for the max: line plot with a tick at every integer below n_features
_ = df_ars_pivot.plot(xticks=list(range(n_features)))
plt.show()
print(df_ars_pivot)

## Conclusion

In this scenario, the optimal scaler is the StandardScaler and the optimal number of features is 13. Because the optimal number of features is greater than three, the clusters cannot be plotted directly in three dimensions. As an example, we can instead generate the following 3D plot using only the best three features.

In [None]:
def plot(n_features, scaler):
    """ Cluster the k-best features and draw the first three in a 3D scatter plot.

    Parameters
    ----------
    n_features : int
        Number of best features to select; at least 3 are needed, because the
        first three selected features become the x, y and z axes.
    scaler : transformer
        Scaler passed on to cluster().

    Returns
    -------
    (data, plt)
        The clustered frame with an added 'pred_genre' column, and the
        matplotlib.pyplot module so the caller can call plt.show().
    """

    # KMeans input with k-best features
    df_sel = name_selected_features(n_features)

    # Call cluster function on the requested parameters
    data, kmeans = cluster(df_sel, k=n_clusters, scaler=scaler)

    # Create figure, make axes 3D
    # add_subplot attaches the axes to the figure; constructing Axes3D(fig)
    # directly no longer does so on current Matplotlib and leaves it blank
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Genre classify clusters
    # NOTE(review): KMeans cluster numbers carry no inherent meaning, so this
    # fixed 0/1/2 -> genre mapping is an assumption - confirm against the data
    data['pred_genre'] = 'Pop'
    data.loc[data['cluster'] == 1, ['pred_genre']] = 'Classical'
    data.loc[data['cluster'] == 2, ['pred_genre']] = 'Rock'

    # Column 0 is 'cluster'; columns 1-3 are the first three selected features
    x_col, y_col, z_col = data.columns[1:4]

    # Plot each genre separately so they each get their own label
    for genre in data['pred_genre'].drop_duplicates():

        # Filter on genre
        genre_rows = data.loc[data['pred_genre'] == genre]

        # Plot 3D scatter
        ax.scatter3D(genre_rows[x_col], genre_rows[y_col], genre_rows[z_col],
                     label=genre, alpha=0.5)

    # Label axes (taken from data, so this also works when no rows were plotted)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_zlabel(z_col)

    # Title and legend
    ax.legend()
    ax.set_title('Genre Classification using Audio Features')

    return data, plt

# Example figure: three best features, standard scaling
df, plt = plot(n_features=3, scaler=StandardScaler())
plt.show()