## Imports and Initial Settings ##

In [1]:
# standard python imports

import numpy as np 
import pandas as pd 


In [2]:
# imports for plotting and visualizations

import seaborn as sns
sns.set_style('whitegrid') # set global seaborn style for readability

from matplotlib import pyplot as plt 
%matplotlib inline 
# set matplotlib backend

In [16]:
# imports for predictions and models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier as knn

from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
import warnings
warnings.simplefilter('ignore')

## Load and View Dataset ##

In [5]:
# Location of the dataset CSV, relative to the notebook
PATH = '../data/SpotifyFeatures.csv'

# Read the CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer=PATH)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

df.describe()

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
count,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0,232725.0
mean,116362.0,41.127502,0.36856,0.554364,235122.3,0.570958,0.148301,0.215009,-9.569885,0.120765,117.666585,0.454917
std,67182.065036,18.189948,0.354768,0.185608,118935.9,0.263456,0.302768,0.198273,5.998204,0.185518,30.898907,0.260065
min,0.0,0.0,0.0,0.0569,15387.0,2e-05,0.0,0.00967,-52.457,0.0222,30.379,0.0
25%,58181.0,29.0,0.0376,0.435,182857.0,0.385,0.0,0.0974,-11.771,0.0367,92.959,0.237
50%,116362.0,43.0,0.232,0.571,220427.0,0.605,4.4e-05,0.128,-7.762,0.0501,115.778,0.444
75%,174543.0,55.0,0.722,0.692,265768.0,0.787,0.0358,0.264,-5.501,0.105,139.054,0.66
max,232724.0,100.0,0.996,0.989,5552917.0,0.999,0.999,1.0,3.744,0.967,242.903,1.0


In [7]:
# Column labels as loaded from the CSV

print(df.columns)

Index(['Unnamed: 0', 'genre', 'artist_name', 'track_name', 'track_id',
       'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [8]:
# Drop the leftover "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell safe to re-run: `df` is reassigned here,
# so on a second run the column is already gone and a plain drop would
# raise KeyError.

df = df.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# Column labels again, to confirm "Unnamed: 0" is gone

print(df.columns)

Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')


In [10]:
# Fingers crossed for no NaN values: count missing entries per column

df.isna().sum()

genre               0
artist_name         0
track_name          0
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [11]:
# Split into train/test up front (80/20), before any feature work.
# The fixed random_state makes the split reproducible across runs.

train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [12]:
# (rows, columns) of the train and test partitions, one per line
print(train.shape, test.shape, sep='\n')

(186180, 18)
(46545, 18)


## Exploratory Plotting ##

## Feature Engineering ##

In [13]:
# Select which columns to drop and which to keep as model inputs.
# Between them, the two lists below cover every column of `train`/`test`
# at this point in the notebook.

# Columns removed from the frames before modelling.
# 'energy' and 'acousticness' were taken out of `features` and moved here.
ignore = [
    'genre', 'artist_name', 'track_name', 'track_id',
    'popularity', 'duration_ms', 'key', 'liveness', 'mode',
    'tempo', 'time_signature', 'energy', 'acousticness',
]

# Columns kept as model inputs.
features = [
    'danceability', 'instrumentalness',
    'loudness', 'speechiness', 'valence',
]

clean_train = train.drop(columns=ignore)
clean_test = test.drop(columns=ignore)

# `ignore` and `features` are maintained by hand; fail loudly here if they
# drift apart instead of silently training on an unexpected column set.
assert set(clean_train.columns) == set(features), \
    "columns left after dropping `ignore` do not match `features`"
assert set(clean_test.columns) == set(features), \
    "columns left after dropping `ignore` do not match `features`"

In [14]:
# Preview the first rows of the training frame after dropping the `ignore` columns
clean_train.head()

Unnamed: 0,danceability,instrumentalness,loudness,speechiness,valence
200510,0.128,0.967,-17.0,0.0333,0.152
19161,0.4,0.0,-3.899,0.0348,0.231
130619,0.406,0.00887,-5.142,0.0734,0.638
215414,0.805,1.5e-05,-8.215,0.0272,0.76
226822,0.619,0.0105,-9.357,0.0934,0.486


In [15]:
# Preview the first rows of the test frame after dropping the `ignore` columns
clean_test.head()

Unnamed: 0,danceability,instrumentalness,loudness,speechiness,valence
788,0.327,0.00015,-4.952,0.0609,0.385
207109,0.253,0.568,-18.512,0.0439,0.0487
138644,0.821,0.0134,-6.295,0.212,0.787
37164,0.745,0.114,-3.949,0.0904,0.17
174351,0.502,9.6e-05,-9.935,0.807,0.185


## Model Exploration ##