## Imports and Initial Settings ##

In [1]:
# standard python imports

import numpy as np 
import pandas as pd 


In [2]:
# imports for plotting and visualizations

import seaborn as sns
sns.set_style('whitegrid') # set global seaborn style for readability

from matplotlib import pyplot as plt 
%matplotlib inline 
# set matplotlib backend

In [3]:
# imports for predictions and models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier as knn

from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
import warnings
warnings.simplefilter('ignore')

## Load and View Dataset ##

In [4]:
# Dataframe path
PATH = '../data/SpotifyFeatures.csv'

# load from PATH
df = pd.read_csv(PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [6]:
# let's get a better look

df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,116362.0,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,67182.065036,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,58181.0,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,116362.0,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,174543.0,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,232724.0,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [5]:
# df keys

print(df.keys())

Index(['Unnamed: 0', 'genre', 'artist_name', 'track_name', 'track_id',
       'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [8]:
# getting rid of that dumb "Unnamed: 0" column

# df = df.drop("Unnamed: 0", axis=1)

In [9]:
# keys

print(df.keys())

Index(['Unnamed: 0', 'genre', 'artist_name', 'track_name', 'track_id',
       'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [10]:
# finger's crossed for no NaN values

pd.isnull(df).sum()

Unnamed: 0          0
genre               0
artist_name         0
track_name          0
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [11]:
# gonna split train/test here, instead of down below:

train, test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
print(train.shape)
print(test.shape)

(186180, 19)
(46545, 19)


## Exploratory Plotting ##

## Feature Engineering ##

In [13]:
# select features to ignore/keep

# Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
#        'acousticness', 'danceability', 'duration_ms', 'energy',
#        'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
#        'speechiness', 'tempo', 'time_signature', 'valence'],
#       dtype='object')

ignore = ([
    'genre', 'popularity', 'duration_ms', 
    'key', 'liveness', 'mode', 'tempo', 'time_signature', 
    'energy', 'acousticness'
])

features = ([
    'danceability', 'instrumentalness', 
    'loudness', 'speechiness', 'valence',
    'energy', 'acousticness'
])

# took out: 'energy', 'acousticness'

clean_train = train.drop(ignore, axis=1)
clean_test = test.drop(ignore, axis=1)

In [6]:
df.duplicated(subset=None, keep='first')

0         False
1         False
2         False
3         False
4         False
          ...  
232720    False
232721    False
232722    False
232723    False
232724    False
Length: 232725, dtype: bool

In [15]:
clean_test.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,danceability,instrumentalness,loudness,speechiness,valence
788,788,A Thousand Horses,My Time's Comin',16zol4GvHyTER5irYODUk0,0.327,0.00015,-4.952,0.0609,0.385
207109,207109,Mark Mothersbaugh,House Tour,6ac5gUfGTckpdGQCyWsdh2,0.253,0.568,-18.512,0.0439,0.0487
138644,138644,Unified Highway,We Can't Fall (Remix) [feat. J. Patz],09Yz6koF1Y15n1012t1UX6,0.821,0.0134,-6.295,0.212,0.787
37164,37164,Stooki Sound,Endz - Original Mix,3dzEZARDL4ZwICMKVta7Xn,0.745,0.114,-3.949,0.0904,0.17
174351,174351,Bill Hicks,I Love My Job (Live),39Z1G5384UgGa5vmW6WyxC,0.502,9.6e-05,-9.935,0.807,0.185


## Model Exploration ##

In [27]:
df = df.drop(columns=['key', 'mode', 'time_signature'])

In [28]:
def predicto(track_id):

    # Instantiate and fit knn to the correct columns
    knn = NearestNeighbors(n_neighbors=20)
    knn.fit(df[df.columns[5:]])

    obs = df.index[df['track_id'] == track_id]
    series = df.iloc[obs, 5:].to_numpy()

    neighbors = knn.kneighbors(series)
    new_obs = neighbors[1][0][6:20]
    return list(df.loc[new_obs, 'track_id'])

In [13]:
track_id = '4yTDJt8yfOpRKvRNQ8uV5X'

obs = df.index[df['track_id'] == track_id]
obs

Int64Index([126166, 200510], dtype='int64')

In [29]:
predicto('4yTDJt8yfOpRKvRNQ8uV5X')

['6ScgNyiMGRJcuQl6fHE32t',
 '4hpQCCUn1D2KJ0hRIvviNz',
 '1oXRum87ShmIRW8GgETPjd',
 '12pBDYcRz2KJXTnhFste8v',
 '7cGpisHH8TCwcPI1Pxd0IM',
 '3Kb4dMQn8cAXthan2osI0l',
 '2ggqfj97qyiORmXoVFzP5j',
 '2ggqfj97qyiORmXoVFzP5j',
 '3Mt3L75pk83KGc0c4VJzLM',
 '3oDk8PFjkiqwEn1m03pnkm',
 '27fUxjCxoOG7u2kxKAjCJA',
 '63L3A0z2A5DRix83DnHCDX',
 '7hlljw8YiOutMUrFekNIA0',
 '3uZIfWx5ridUBQevmgHDUt']

## Save Model and Pickle

In [1]:
import pickle

In [None]:
pickling_on = open("KNN.pickle","wb")
pickle.dump(emp, pickling_on)
pickling_on.close()