In [1]:
import pandas as pd

df = pd.read_csv('spotify_kaggle/data.csv')
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [2]:
# No missing data, beautiful.

df.isnull().sum()

acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
dtype: int64

In [3]:
df.columns

Index(['acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
       'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo',
       'valence', 'year'],
      dtype='object')

In [4]:
# Breakdown of feature separation

features_to_drop = ['explicit',
                    'mode',
                    'name',
                    'release_date']


normalized_numeric_features = ['acousticness',
                               'danceability',
                               'energy',
                               'instrumentalness',
                               'liveness',
                               'loudness',
                               'valence',
                               'speechiness']

numeric_features_to_normalize = ['duration_ms',
                                 'tempo',
                                 'popularity']

features_to_encode = ['key',
                      'year',
                      'artists']

labels = df.columns.drop(features_to_drop)

target = 'id'

In [9]:
# Drop excess features, check work

df = df.drop(columns = features_to_drop)
len(df.columns)

15

In [14]:
# Create cleaning pipeline

!pip install category_encoders

from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn import preprocessing
#from google.colab import files

def pipeline(df):
    '''
    Takes our Spotify dataset from kaggle, applies categorical encoding,
    normalizes numeric data, and returns the result as a pandas df ready
    for machine learning
    '''
    # Removes brackets from our 'artists' column string values
    for x in df['artists']:
        x = x.replace('[', '')
        x = x.replace(']', '')

    normalizer = preprocessing.MinMaxScaler(feature_range=(0,1))
    encoder = ce.OrdinalEncoder()
    df_encoded = encoder.fit_transform(df)
    df_clean = normalizer.fit_transform(df_encoded)

    # We could drop the label headers from this step, as we are
    # using unsupervised techniques
    new_df = pd.DataFrame(df_clean, columns=labels)

    new_df.to_csv('spotify2.csv', index=False)
    #files.download('spotify2.csv')

    return new_df



In [15]:
# Seems right

pipeline(df)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,popularity,speechiness,tempo,valence,year
0,0.998996,0.000000,0.716599,0.028442,0.1950,0.000000,0.563000,0.909091,0.1510,0.745000,0.00,0.052219,0.485348,0.7790,0.070707
1,0.997992,0.000030,0.383603,0.051316,0.0135,0.000006,0.901000,0.727273,0.0763,0.494026,0.00,0.047678,0.344019,0.0767,0.070707
2,0.606426,0.000060,0.758097,0.018374,0.2200,0.000012,0.000000,0.454545,0.1190,0.627609,0.00,0.958720,0.439086,0.8800,0.070707
3,0.998996,0.000090,0.790486,0.032538,0.1300,0.000018,0.887000,0.090909,0.1110,0.708887,0.00,0.095562,0.442470,0.7200,0.070707
4,0.993976,0.000120,0.212551,0.126450,0.2040,0.000024,0.908000,1.000000,0.0980,0.676079,0.01,0.043756,0.254614,0.0693,0.070707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169904,0.173695,0.999910,0.885628,0.029396,0.4430,0.999976,0.000032,0.090909,0.0891,0.822786,0.75,0.147575,0.409732,0.3060,1.000000
169905,0.016767,0.999940,0.727733,0.030076,0.3850,0.999982,0.031300,0.727273,0.1110,0.768820,0.64,0.041589,0.524395,0.2700,1.000000
169906,0.540161,0.999970,0.520243,0.032527,0.5390,0.999988,0.002330,0.636364,0.1080,0.793485,0.70,0.108359,0.506778,0.1530,1.000000
169907,0.071687,1.000000,0.653846,0.030046,0.7610,0.999994,0.000000,0.090909,0.2220,0.899585,0.70,0.039732,0.532244,0.4720,1.000000


In [8]:
# Imports back in well.

new_df = pd.read_csv('spotify2.csv')
new_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,popularity,speechiness,tempo,valence,year
0,0.998996,0.0,0.716599,0.028442,0.195,0.0,0.563,0.909091,0.151,0.745,0.0,0.052219,0.485348,0.779,0.070707
1,0.997992,3e-05,0.383603,0.051316,0.0135,6e-06,0.901,0.727273,0.0763,0.494026,0.0,0.047678,0.344019,0.0767,0.070707
2,0.606426,6e-05,0.758097,0.018374,0.22,1.2e-05,0.0,0.454545,0.119,0.627609,0.0,0.95872,0.439086,0.88,0.070707
3,0.998996,9e-05,0.790486,0.032538,0.13,1.8e-05,0.887,0.090909,0.111,0.708887,0.0,0.095562,0.44247,0.72,0.070707
4,0.993976,0.00012,0.212551,0.12645,0.204,2.4e-05,0.908,1.0,0.098,0.676079,0.01,0.043756,0.254614,0.0693,0.070707


## Visualizations

In [24]:

new_df.columns.tolist()

['acousticness',
 'artists',
 'danceability',
 'duration_ms',
 'energy',
 'id',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'popularity',
 'speechiness',
 'tempo',
 'valence',
 'year']

In [2]:
def heigher_order_features(input_y):
    """A helper function for compare_this function, it creates
    a list with a specific row input"""
    state = []
    for i, x in enumerate(new_df.columns.tolist()):
        a = new_df[str(x)][input_y]
        state.append(a)
        
    return state

In [3]:
print(heigher_order_features(2))

NameError: name 'new_df' is not defined

In [1]:
import plotly.graph_objects as go
import plotly.offline as py

def heigher_order_features(input_y):
    """A helper function for compare_this function, it creates
    a list with a specific row input"""
    state = []
    for i, x in enumerate(new_df.columns.tolist()):
        a = new_df[str(x)][input_y]
        state.append(a)
        
    return state

def compare_this(a,b):

    categories = new_df.columns.tolist()

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=heigher_order_features(a),
        theta=categories,
        fill='toself',
        name='Product A'
    ))
    fig.add_trace(go.Scatterpolar(
        r=heigher_order_features(b),
        theta=categories,
        fill='toself',
        name='Product B'
    ))

    fig.update_layout(
    polar=dict(
      radialaxis=dict(
        visible=True,
        range=[0, 1]
      )),
    showlegend=False
    )
    
    pyo.iplot(fig, filename = 'basic-line')

compare_this(100,200)

NameError: name 'new_df' is not defined

In [53]:
new_df.values.tolist()

[[0.9989959839357432,
  0.0,
  0.7165991902834008,
  0.028441802670128438,
  0.195,
  0.0,
  0.563,
  0.9090909090909092,
  0.151,
  0.7450003915120195,
  0.0,
  0.052218782249742,
  0.4853476777103621,
  0.779,
  0.07070707070707273],
 [0.997991967871486,
  2.9963444597590942e-05,
  0.3836032388663968,
  0.0513162067519365,
  0.0135,
  5.885538055889069e-06,
  0.901,
  0.7272727272727273,
  0.0763,
  0.4940255265836662,
  0.0,
  0.04767801857585138,
  0.3440192387265406,
  0.0767,
  0.07070707070707273],
 [0.606425702811245,
  5.9926889195181884e-05,
  0.7580971659919029,
  0.01837436036508649,
  0.22,
  1.1771076111778136e-05,
  0.0,
  0.4545454545454546,
  0.119,
  0.6276094276094277,
  0.0,
  0.9587203302373579,
  0.4390862424259805,
  0.88,
  0.07070707070707273],
 [0.9989959839357432,
  8.989033379277281e-05,
  0.7904858299595141,
  0.032537837193001184,
  0.13,
  1.7656614167667205e-05,
  0.887,
  0.09090909090909093,
  0.111,
  0.7088873228408112,
  0.0,
  0.09556243550051598,


In [78]:
samples = new_df.values.tolist()
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(samples)



NearestNeighbors(n_neighbors=20)

In [79]:
def value_monad(a):
    return new_df.values.tolist()[a]

value_monad(1)

[0.997991967871486,
 2.9963444597590942e-05,
 0.3836032388663968,
 0.0513162067519365,
 0.0135,
 5.885538055889069e-06,
 0.901,
 0.7272727272727273,
 0.0763,
 0.4940255265836662,
 0.0,
 0.04767801857585138,
 0.3440192387265406,
 0.0767,
 0.07070707070707273]

In [80]:

print(neigh.kneighbors([value_monad(10000)]))

(array([[0.        , 0.26437765, 0.26623907, 0.28723417, 0.30341281,
        0.31390304, 0.32246313, 0.32313638, 0.32415728, 0.32679204,
        0.32883792, 0.33545995, 0.33711348, 0.33724571, 0.34088624,
        0.34282455, 0.34399587, 0.34404901, 0.34761091, 0.3498265 ]]), array([[10000, 18201,  2611, 35170, 26469, 26655, 18360,  8941, 49579,
         3038, 11263, 11720, 11185, 19080, 26064, 18835,  1449, 19030,
        10249, 19130]]))


In [81]:
compare_this(10000,18201)

In [88]:
df = pd.read_csv('spotify_kaggle/data.csv')

df.values[10000]

array([0.488, "['Martin Denny']", 0.301, 186000, 0.249, 0,
       '6UMAEsJEaKEN7Y2GXtid5a', 0.0012, 1, 0.11199999999999999, -16.408,
       1, 'Forbidden Island', 10, '1958-01-01', 0.0315, 89.51799999999999,
       0.0494, 1958], dtype=object)

In [89]:
df.values[18201]

array([0.54, "['Eddie Cochran']", 0.33, 139533, 0.185, 0,
       '2Bpf44xO9EcGEVewLuOLmN', 0.0288, 2, 0.122, -14.31, 1,
       'Tell Me Why', 17, '1960-05-01', 0.0343, 140.356, 0.0795, 1960],
      dtype=object)

In [91]:
neigh

NearestNeighbors(n_neighbors=20)

In [93]:
import pickle
filename = 'neighbors'
outfile = open(filename,'wb')

pickle.dump(neigh, outfile)
outfile.close()

In [5]:
import pickle
filename = 'neighbors'

infile = open(filename,'rb')
model = pickle.load(infile)
infile.close()

In [6]:
model

NearestNeighbors(n_neighbors=20)

In [10]:
print(new_dict.kneighbors([value_monad(10000)]))

NameError: name 'new_dict' is not defined