In [42]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [43]:
# Read the raw track table from genres_v2.csv and display it.
# No rows or columns are removed here; the cleaning happens in wrangle().
df = pd.read_csv('genres_v2.csv')
df

Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name,Unnamed: 0,title
0,0.831,0.814,2,-7.364,1,0.4200,0.059800,0.013400,0.0556,0.3890,...,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde,,
1,0.719,0.493,8,-7.230,1,0.0794,0.401000,0.000000,0.1180,0.1240,...,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology,,
2,0.850,0.893,5,-4.783,1,0.0623,0.013800,0.000004,0.3720,0.0391,...,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote,,
3,0.476,0.781,0,-4.710,1,0.1030,0.023700,0.000000,0.1140,0.1750,...,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote),,
4,0.798,0.624,2,-7.668,1,0.2930,0.217000,0.000000,0.1660,0.5910,...,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42300,0.528,0.693,4,-5.148,1,0.0304,0.031500,0.000345,0.1210,0.3940,...,46bXU7Sgj7104ZoXxzz9tM,spotify:track:46bXU7Sgj7104ZoXxzz9tM,https://api.spotify.com/v1/tracks/46bXU7Sgj710...,https://api.spotify.com/v1/audio-analysis/46bX...,269208,4,hardstyle,,20995.0,Euphoric Hardstyle
42301,0.517,0.768,0,-7.922,0,0.0479,0.022500,0.000018,0.2050,0.3830,...,0he2ViGMUO3ajKTxLOfWVT,spotify:track:0he2ViGMUO3ajKTxLOfWVT,https://api.spotify.com/v1/tracks/0he2ViGMUO3a...,https://api.spotify.com/v1/audio-analysis/0he2...,210112,4,hardstyle,,20996.0,Greatest Hardstyle Playlist
42302,0.361,0.821,8,-3.102,1,0.0505,0.026000,0.000242,0.3850,0.1240,...,72DAt9Lbpy9EUS29OzQLob,spotify:track:72DAt9Lbpy9EUS29OzQLob,https://api.spotify.com/v1/tracks/72DAt9Lbpy9E...,https://api.spotify.com/v1/audio-analysis/72DA...,234823,4,hardstyle,,20997.0,Best of Hardstyle 2020
42303,0.477,0.921,6,-4.777,0,0.0392,0.000551,0.029600,0.0575,0.4880,...,6HXgExFVuE1c3cq9QjFCcU,spotify:track:6HXgExFVuE1c3cq9QjFCcU,https://api.spotify.com/v1/tracks/6HXgExFVuE1c...,https://api.spotify.com/v1/audio-analysis/6HXg...,323200,4,hardstyle,,20998.0,Euphoric Hardstyle


In [44]:
def wrangle(df):
    """Prepare the raw track table for nearest-neighbors fitting.

    The input frame is never modified: every step returns a new frame, so the
    cell that calls this function can be re-run without a KeyError caused by
    columns that were already dropped on a previous run.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw track table. Must contain 'song_name' and every column listed in
        the drop lists below.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that have a ``song_name``, without the metadata,
        unused audio-feature, ``genre`` and ``song_name`` columns. The
        original row index is kept.

    Raises
    ------
    KeyError
        If 'song_name' or any column to drop is missing from ``df``.
    """
    # columns not in use by nearest-neighbors
    unused_metadata = ['type', 'id', 'track_href', 'analysis_url', 'title', 'Unnamed: 0']
    # audio features considered not useful
    unused_audio_features = ['instrumentalness', 'time_signature']
    # genre might not work well with nearest-neighbors and does not appear in spotify api request
    unused_labels = ['genre']

    # keep only rows with a song_name; the unnamed rows could be kept later
    # if the api calls get implemented
    named_rows = df[df['song_name'].notna()]

    # song_name itself is not used by nearest-neighbors, so it is dropped last
    columns_to_drop = unused_metadata + unused_audio_features + unused_labels + ['song_name']

    # DataFrame.drop without inplace returns a new frame, leaving `df` intact
    return named_rows.drop(columns=columns_to_drop)

In [45]:
# Clean the raw frame; q_df still carries the `uri` column at this point
q_df = wrangle(df)


In [47]:
# Feature-only frame for the estimator: same rows as q_df without the `uri` column
uq_df = q_df.drop(columns='uri')

In [54]:
# Display the feature frame that is fed to the scaler.
# NOTE(review): the saved output below has no `key` or `mode` column, yet no visible
# cell drops them, and the execution counts are out of order — the output is
# presumably stale or a cell was deleted. Confirm with Restart Kernel -> Run All.
uq_df

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,liveness,valence,tempo,duration_ms
0,0.831,0.814,-7.364,0.4200,0.05980,0.0556,0.3890,156.985,124539
1,0.719,0.493,-7.230,0.0794,0.40100,0.1180,0.1240,115.080,224427
2,0.850,0.893,-4.783,0.0623,0.01380,0.3720,0.0391,218.050,98821
3,0.476,0.781,-4.710,0.1030,0.02370,0.1140,0.1750,186.948,123661
4,0.798,0.624,-7.668,0.2930,0.21700,0.1660,0.5910,147.988,123298
...,...,...,...,...,...,...,...,...,...
21520,0.538,0.819,-6.780,0.4670,0.00854,0.1310,0.2520,160.299,209582
21521,0.415,0.548,-8.563,0.2850,0.22400,0.1220,0.5280,154.272,213898
21522,0.785,0.648,-6.907,0.2790,0.01570,0.1000,0.5060,184.160,271867
21523,0.747,0.832,-5.737,0.1850,0.23300,0.1300,0.7330,188.760,182040


In [49]:
# NOTE: this is sklearn's Normalizer, not a StandardScaler. It rescales each row
# (one song) to unit norm; it does not remove the mean or scale each feature
# column to unit variance.
# NOTE(review): duration_ms is several orders of magnitude larger than the other
# columns, so it presumably dominates every row's norm — confirm this is intended.
scaler = Normalizer()
scaler.fit(uq_df)
scaled_df = scaler.transform(uq_df)

In [50]:
# Instantiate the nearest-neighbors estimator; n_neighbors is the number of
# neighbors returned by default for kneighbors queries.
nn = NearestNeighbors(n_neighbors=5)
# Fit on the normalized audio features, so queries must be transformed with the
# same fitted `scaler` before calling kneighbors.
nn.fit(scaled_df)

NearestNeighbors()

In [52]:
# Persist the fitted nearest-neighbors model. The context manager closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open('nn.pkl', 'wb') as nn_file:
    pickle.dump(nn, nn_file)

In [53]:
# Persist the fitted Normalizer next to the model so queries can be transformed
# the same way. The context manager closes the file handle after the dump.
with open('norm.pkl', 'wb') as norm_file:
    pickle.dump(scaler, norm_file)

In [None]:
# Separate the label column (y) from the feature columns (X).
# NOTE(review): `df` must contain a 'target' column here. No visible cell loads such
# a frame — presumably it comes from data.csv, read in a cell that is not shown.
# Confirm, and add the load step so the notebook runs from a fresh kernel.
y = df['target']
X = df.drop(columns='target')

In [None]:
# Shell escape: list the working directory with human-readable file sizes
!ls -lh

total 224K
-rw-r--r-- 1 root root 218K Oct 20 23:02 data.csv
drwxr-xr-x 1 root root 4.0K Oct  8 13:45 sample_data


In [None]:
# Inspect the feature matrix: column names, non-null counts and dtypes
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017 entries, 0 to 2016
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      2017 non-null   float64
 1   danceability      2017 non-null   float64
 2   duration_ms       2017 non-null   int64  
 3   energy            2017 non-null   float64
 4   instrumentalness  2017 non-null   float64
 5   key               2017 non-null   int64  
 6   liveness          2017 non-null   float64
 7   loudness          2017 non-null   float64
 8   mode              2017 non-null   int64  
 9   speechiness       2017 non-null   float64
 10  tempo             2017 non-null   float64
 11  time_signature    2017 non-null   float64
 12  valence           2017 non-null   float64
dtypes: float64(10), int64(3)
memory usage: 205.0 KB


In [None]:
# We are going to scale our data to improve the model's ability to see relationships

# Instantiate the scaler and fit_transform the feature matrix with it.
# NOTE(review): MinMaxScaler is not imported in the visible import cell —
# presumably `from sklearn.preprocessing import MinMaxScaler`; confirm.
# NOTE(review): the scaler is fitted on all rows before the train/validation split,
# so validation rows influence the scaling. Consider fitting on the training split only.
# NOTE: this rebinds `scaler`, which held the Normalizer earlier in the notebook.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Count the rows per class to check balance; the two classes are close in size,
# so accuracy is a reasonable metric here
y.value_counts()

1    1020
0     997
Name: target, dtype: int64

In [None]:
# Split the scaled features and labels into train and validation sets:
# 20% of the rows are held out, and random_state fixes the shuffle for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=.2, random_state=42)

In [None]:
# Peek at the first training row to confirm the features are scaled
X_train[0]

array([2.92623951e-04, 8.17865429e-01, 3.40182180e-01, 4.54841334e-01,
       5.56352459e-01, 7.27272727e-01, 7.34582193e-02, 6.83867033e-01,
       1.00000000e+00, 4.94387691e-02, 4.44008351e-01, 7.50000000e-01,
       3.19891350e-01])

In [None]:
# (rows, features) of the training split
X_train.shape

(1613, 13)

In [None]:
# Check the container type of the training split (no column labels if it is an array)
type(X_train)

numpy.ndarray

In [None]:
## Build a feed-forward binary classifier with keras
# NOTE: this is not a single perceptron. It stacks five ReLU hidden layers before
# a sigmoid output unit, so it is not restricted to linear class boundaries.
# NOTE(review): Sequential and Dense are not imported in the visible import cell —
# presumably from tensorflow.keras; confirm.

# instantiate a sequential model
model = Sequential()


# add the first dense layer
# with some layer-specific hyperparameters

model.add(Dense(13,                     # 13 units in the first hidden layer
                input_dim=13,          # number of input features; the only place the input layer is described
                activation='relu')) # selecting our activation function

model.add(Dense(93, activation='relu'))

model.add(Dense(54, activation='relu'))

model.add(Dense(130, activation='relu'))

model.add(Dense(13, activation='relu'))

# single sigmoid unit: outputs a value in (0, 1) for the binary target
model.add(Dense(1, activation='sigmoid'))

# compile the model
# locks the model architecture.
# indicate network-level hyperparameters
model.compile(loss='binary_crossentropy', # We're doing binary classification
             optimizer='nadam',
             metrics=['accuracy'])

In [None]:
# Fit the model on the training split: 200 epochs in mini-batches of 16.
# NOTE(review): no validation data is passed, so over-fitting cannot be seen in
# the training log, and no random seed is set for weight initialisation.
model.fit(X_train, y_train, batch_size=16, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7fbe7f345ed0>

In [None]:
# Since we just ran a Neural Network Model, let's try a XGBClassifier
# with its default hyperparameters and see what we get.
# NOTE(review): `xgb` is not imported in the visible import cell —
# presumably `import xgboost as xgb`; confirm.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Score the classifier on the data it was trained on and on the held-out
# validation split; a large gap between the two points to over-fitting.
train_score = xgb_model.score(X_train, y_train)
val_score = xgb_model.score(X_val, y_val)
print(f"Training score: {train_score}")
print(f"Validation score: {val_score}")

Training score: 0.8574085554866708
Validation score: 0.7376237623762376


In [None]:
# Predict the target for the validation rows with the fitted XGBoost model
y_pred = xgb_model.predict(X_val)

In [None]:
# Accuracy of the validation predictions, i.e. how well the model handles unseen data.
# NOTE(review): predict() presumably already returns class labels, which would make
# the rounding a no-op; the result equals the validation score printed earlier.
# NOTE(review): accuracy_score is not imported in the visible import cell —
# presumably `from sklearn.metrics import accuracy_score`; confirm.
y_pred = np.round(y_pred, 0)
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.7376237623762376

In [None]:
# NearestNeighbors is the model that is going to give us a list of the most
# similar songs to the searched song.
# NOTE: this rebinds `nn`, which held the estimator fitted earlier in the notebook.
nn = NearestNeighbors(n_neighbors=6)
nn.fit(X_train)

# Pick one training row as the query song. kneighbors expects a 2-D input,
# hence the single row is wrapped in a list.
doc_index = 5
doc = [X_train[doc_index]]

# Query using kneighbors: returns the distances to the nearest rows and their indices.
# The indices are row positions in X_train, not labels of the original frame.
neigh_dist, neigh_index = nn.kneighbors(doc)


In [None]:
# The first entry (distance 0) is the query song itself, since it is part of the
# fitted data; the remaining entries are its nearest neighbors, where a smaller
# distance means a more similar song.
print(f"How similar are the songs? {neigh_dist}")
print(f"What song is similar? {neigh_index}")

How similar are the songs? [[0.         0.3899814  0.40295771 0.42029116 0.42386568 0.43956592]]
What song is similar? [[   5   89 1456  733  540  601]]


In [None]:
# TensorFlow's way of saving and loading a model: write the keras model to the
# nn_model_v1 directory, then load it back as a round-trip check. The loaded
# model is only displayed, not bound to a name.
# NOTE(review): `tf` is not imported in the visible import cell —
# presumably `import tensorflow as tf`; confirm.
model.save('nn_model_v1')
tf.keras.models.load_model('nn_model_v1')

INFO:tensorflow:Assets written to: nn_model_v1/assets


<keras.engine.sequential.Sequential at 0x7fbe858c7050>

In [None]:
# Pickle the Nearest Neighbor model, then reload it as a round-trip check.
# Context managers close both file handles; the original left them open.
with open('kneighbor_model.pkl', 'wb') as knn_file:
    pickle.dump(nn, knn_file)

# Only load pickles you created yourself: unpickling can execute arbitrary code.
with open('kneighbor_model.pkl', 'rb') as knn_file:
    reloaded_nn = pickle.load(knn_file)

# Display the reloaded estimator to confirm its parameters survived the round trip
reloaded_nn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                 radius=1.0)

In [27]:
# Persist the fitted scaler so new songs can be scaled the same way as the
# training data. The context manager closes the file handle after the dump.
with open('mms.pkl', 'wb') as mms_file:
    pickle.dump(scaler, mms_file)


In [28]:

# Round-trip check: reload the scaler pickle and display it. The context manager
# closes the file handle. Only load pickles you created yourself, because
# unpickling can execute arbitrary code.
with open('mms.pkl', 'rb') as mms_file:
    reloaded_scaler = pickle.load(mms_file)
reloaded_scaler

MinMaxScaler(copy=True, feature_range=(0, 1))