In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
from tensorflow import keras
from sklearn.neighbors import kneighbors_graph, NearestNeighbors
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.models import Model, Sequential, save_model, load_model
from tensorflow.config import list_logical_devices
from tensorflow.keras.callbacks import TensorBoard

### Data Preparation

#### We removed null values and entries with duplicate song name / artist pairs, and normalized the numerical features for use in a neural network.
#### We also case-normalized the text in the data to make things a bit easier on the queries.

### Model Architecture

#### I tried 4 different models built with 2 different major architectures. Both architectures are autoencoders, but have some slight differences.

#### The first model architecture is deeper and wider than the second, with a larger latent vector.

#### After trying a few different optimizers and loss functions, I settled on mean absolute error as the loss function for every model, as it gave the best-looking results.

#### The second model architecture uses LeakyReLU activation functions, is smaller, and has a smaller latent vector. The second model built from this architecture,
#### given the very plain name ae4 (autoencoder 4), uses RMSprop as the optimizer, and each LeakyReLU has an alpha of 0.3.

#### After looking at the output of each model for various songs, it seemed that ae4 gave the most consistently understandable recommendations, so I saved that model, its encoded vectors for the entire dataset, and the K-NearestNeighbors model used to relate the encoded vectors for recommendation. These are what the application uses.

In [None]:
# One-off export of the cleaned dataset; kept commented out.
# df_full.to_csv('model_ready_data_no_dupes.csv')

In [2]:
# Load the fully cleaned dataset produced by the data-preparation step.
# NOTE(review): this is an absolute, machine-specific path — the notebook will only
# run where this exact file exists; consider a path relative to the project.
cleaned_csv_path = r'C:\Users\Logan\Desktop\model_ready_data_no_dupes.csv'
df_full = pd.read_csv(cleaned_csv_path)

In [3]:
# Keep only the numeric columns; these are the features fed to the models.
df = df_full.select_dtypes(include='number')

In [4]:
# Convert the numeric feature frame to a NumPy array. This array is used as both
# the input and the reconstruction target in every autoencoder fit below.
data = df.to_numpy()

In [5]:
# Sanity check of the array dimensions: (rows, feature columns).
data.shape

(524211, 15)

In [8]:
# Names of the numeric feature columns that make up `data`.
df.columns

Index(['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'popularity'],
      dtype='object')

In [6]:
# All columns of the cleaned dataset, including the non-numeric ones left out of `df`.
df_full.columns

Index(['name', 'duration_ms', 'explicit', 'artists', 'release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'popularity'],
      dtype='object')

In [None]:
# Tensorflow subclass API

class AutoEncoder(Model):
    """Fully connected autoencoder with an 8-unit latent vector.

    The encoder narrows the input through Dense layers of 64, 32, 16 and 8
    units. The decoder mirrors it (16, 32, 64) and ends in a layer exactly as
    wide as the input, so the reconstruction can be compared with the input.

    Args:
        input_dim: Number of features per sample. When omitted, the column
            count of the notebook-level ``data`` array is used, so
            ``AutoEncoder()`` behaves exactly as before.
    """

    def __init__(self, input_dim=None):
        # Inherit init from Model base class
        super().__init__()

        if input_dim is None:
            # Default to the width of the notebook's feature array.
            input_dim = data.shape[1]

        # Encoder portion utilizing Keras Sequential
        self.encoder = Sequential([
            Dense(64, input_shape=(input_dim,), activation='relu'),
            Dense(32, activation='relu'),
            Dense(16, activation='relu'),
            Dense(8, activation='gelu')])

        # Decoder portion utilizing Keras Sequential
        self.decoder = Sequential([
            Dense(16, activation='gelu'),
            Dense(32, activation='relu'),
            Dense(64, activation='relu'),
            # The output width follows the input width instead of a hard-coded 15,
            # so encoder input and decoder output can never drift apart.
            # NOTE(review): a ReLU output cannot be negative — assumes every
            # normalized feature is >= 0; confirm against the preprocessing.
            Dense(input_dim, activation='relu')])

    # This function is used by fit to pass data through both the encoder and decoder
    def call(self, x):
        """Encode ``x`` to the latent vector and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
# Model 1: first architecture, Adam optimizer, mean absolute error loss.
ae = AutoEncoder()
ae.compile(loss='mae', optimizer='adam')

In [None]:
# Train the autoencoder to reconstruct its own input (input == target).
# `workers` / `use_multiprocessing` were dropped: they only apply to generator or
# Sequence input (this is a NumPy array) and newer Keras versions reject them.
ae.fit(data, data, shuffle=True, epochs=10, validation_split=.1)

In [None]:
# Pass the whole dataset through the encoder half only; the result is the latent
# representation that the nearest-neighbour model below is fitted on.
# NOTE(review): this calls the encoder on all rows in one call — presumably fine
# for this dataset size; confirm memory use is acceptable.
encoded_data = ae.encoder(data)

In [None]:
# Inspect the encoded vectors.
encoded_data

In [None]:
# Nearest-neighbour index over model 1's latent vectors (5 neighbours per query,
# all CPU cores).
knn = NearestNeighbors(n_jobs=-1, radius=1, n_neighbors=5)
knn.fit(X=encoded_data)

In [None]:
# Model 2: same architecture as model 1, but trained with the Nadam optimizer.
# The loss is still mean absolute error, given here as a loss object.
loss = tf.keras.losses.MeanAbsoluteError()
model_2 = AutoEncoder()
model_2.compile(loss=loss, optimizer='nadam')

In [None]:
# Train model 2 to reconstruct its own input; 10% of the rows are held out for validation.
model_2.fit(x=data, y=data, validation_split=.1, shuffle=True, epochs=10)

In [None]:
# Latent vectors of the whole dataset according to model 2 (encoder half only).
encoded_data_2 = model_2.encoder(data)

In [None]:
# Nearest-neighbour index over model 2's latent vectors.
knn_2 = NearestNeighbors(n_jobs=-1, n_neighbors=5)
knn_2.fit(X=encoded_data_2)

In [None]:
# Same as above, just a different architecture
class AutoEncoder2(Model):
    """Smaller autoencoder with LeakyReLU activations and a 5-unit latent vector.

    The encoder uses Dense layers of 32, 16 and 5 units, each followed by a
    LeakyReLU. The decoder mirrors it (16, 32) and ends in a layer exactly as
    wide as the input, also followed by a LeakyReLU.

    Args:
        alpha: Slope value passed to every LeakyReLU layer.
        input_dim: Number of features per sample. When omitted, the column
            count of the notebook-level ``data`` array is used, so
            ``AutoEncoder2(alpha)`` behaves exactly as before.
    """

    def __init__(self, alpha, input_dim=None):
        super().__init__()

        if input_dim is None:
            # Default to the width of the notebook's feature array.
            input_dim = data.shape[1]

        self.encoder = Sequential([
            Dense(32, input_shape=(input_dim,)),
            LeakyReLU(alpha),
            Dense(16),
            LeakyReLU(alpha),
            Dense(5),
            LeakyReLU(alpha)])

        self.decoder = Sequential([
            Dense(16),
            LeakyReLU(alpha),
            Dense(32),
            LeakyReLU(alpha),
            # The output width follows the input width instead of a hard-coded 15,
            # so encoder input and decoder output can never drift apart.
            Dense(input_dim),
            LeakyReLU(alpha)])

    def call(self, x):
        """Encode ``x`` to the latent vector and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [None]:
# Model 3: second architecture with a LeakyReLU alpha of 0.1, Adam, MAE loss.
ae2 = AutoEncoder2(.1)
ae2.compile(loss='mae', optimizer='adam')

In [None]:
# Train model 3 to reconstruct its own input (input == target).
# `workers` / `use_multiprocessing` were dropped: they only apply to generator or
# Sequence input (this is a NumPy array) and newer Keras versions reject them.
ae2.fit(data, data, epochs=10, validation_split=.1, shuffle=True)

In [None]:
# Latent vectors of the whole dataset according to model 3 (encoder half only).
encoded_data_3 = ae2.encoder(data)

In [None]:
# Nearest-neighbour index over model 3's latent vectors.
knn_3 = NearestNeighbors(n_jobs=-1, n_neighbors=5)
knn_3.fit(X=encoded_data_3)

In [None]:
# Model 4 (the one finally chosen): second architecture with a LeakyReLU alpha
# of 0.3, RMSprop optimizer, MAE loss.
ae4 = AutoEncoder2(.3)
ae4.compile(loss='mae', optimizer='rmsprop')

In [None]:
# Train model 4 to reconstruct its own input (input == target).
# `workers` / `use_multiprocessing` were dropped: they only apply to generator or
# Sequence input (this is a NumPy array) and newer Keras versions reject them.
ae4.fit(data, data, epochs=10, validation_split=.1, shuffle=True)

In [None]:
# One-off export of the trained ae4 model; kept commented out.
# ae4.save('ae4')

In [10]:
# Load the previously saved ae4 model from disk. This rebinds `ae4`, replacing
# any model of that name trained earlier in the session.
# NOTE(review): the commented-out save call writes to 'ae4' while this reads
# from 'assets/ae4' — presumably the saved model was moved afterwards; confirm.
ae4 = load_model('assets/ae4')

In [11]:
# Encode the whole dataset with the loaded ae4 encoder, then build the
# nearest-neighbour index on those latent vectors (library defaults, all cores).
encoded_data_4 = ae4.encoder(data)
knn4 = NearestNeighbors(n_jobs=-1)
knn4.fit(X=encoded_data_4)

NearestNeighbors(n_jobs=-1)

In [None]:
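# One-off export of the ae4 encoded vectors; kept commented out.
# Note: joblib is not imported in the import cell at the top of this notebook.
# joblib.dump(encoded_data_4, 'encoded_data.joblib')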

In [13]:
# Positional row number of the song to find recommendations for.
query = 72837

# The lookups for the other three models are disabled, so this cell does not
# define `ind`, `ind2` or `ind3`; only the ae4 lookup below is active.
# _, ind = knn.kneighbors([encoded_data[query]])
# _, ind2 = knn_2.kneighbors([encoded_data_2[query]])
# _, ind3 = knn_3.kneighbors([encoded_data_3[query]])
# kneighbors returns (distances, indices); only the indices are kept.
_, ind4 = knn4.kneighbors([encoded_data_4[query]])

In [None]:
# Songs nearest to the query according to model 1 (ae / knn).
# The lookup is done here because the line that would define `ind` in the query
# cell is commented out, which made this cell fail with a NameError.
_, ind = knn.kneighbors([encoded_data[query]])
df_full.iloc[ind[0]]

In [None]:
# Songs nearest to the query according to model 2 (model_2 / knn_2).
# The lookup is done here because the line that would define `ind2` in the query
# cell is commented out, which made this cell fail with a NameError.
_, ind2 = knn_2.kneighbors([encoded_data_2[query]])
df_full.iloc[ind2[0]]

In [None]:
# Songs nearest to the query according to model 3 (ae2 / knn_3).
# The lookup is done here because the line that would define `ind3` in the query
# cell is commented out, which made this cell fail with a NameError.
_, ind3 = knn_3.kneighbors([encoded_data_3[query]])
df_full.iloc[ind3[0]]

In [14]:
# Show the full rows of the songs nearest to the query according to ae4.
# `ind4` holds one row of neighbour positions per query point; [0] picks the row
# for the single query passed above.
df_full.iloc[ind4[0]]

Unnamed: 0,name,duration_ms,explicit,artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity
72837,duality,0.044939,0,"[""'slipknot'""]",2004,0.354,0.982,0.363636,0.871482,0,0.168,0.000237,0.000294,0.204,0.194,0.583535,0.8,0.78
71911,bring me to life,0.041965,0,"[""'evanescence'""]",2003-03-04,0.331,0.943,0.363636,0.869004,0,0.0698,0.00721,2e-06,0.242,0.296,0.384007,0.8,0.81
77786,eyeless,0.042048,1,"[""'slipknot'""]",2009-09-09,0.293,0.997,0.363636,0.851016,0,0.217,0.000463,3.9e-05,0.415,0.13,0.407288,0.8,0.61
50622,ace of spades,0.029695,0,"[""'motörhead'""]",1980-11-08,0.329,0.974,0.272727,0.783621,0,0.135,0.000852,0.000118,0.0904,0.234,0.571724,0.8,0.75
73246,"give 'em hell, kid",0.024699,0,"[""'my chemical romance'""]",2004-06-08,0.252,0.993,0.363636,0.870671,0,0.16,0.0234,0.0,0.228,0.118,0.745889,0.8,0.62


In [None]:
# Browse up to 50 tracks whose artists string contains 'slipkn'. Rows with a
# missing artists value count as non-matching.
df_full.loc[df_full['artists'].str.contains('slipkn', na=False)].head(50)