## Data Cleaning ##

In [9]:
import pandas as pd 
import numpy as np 

# preprocessing for data normalization
from sklearn import preprocessing

In [1]:
# importing sample outputs from predicto() in NN.ipynb

samples = ['6ScgNyiMGRJcuQl6fHE32t',
           '4hpQCCUn1D2KJ0hRIvviNz',
           '1oXRum87ShmIRW8GgETPjd',
           '12pBDYcRz2KJXTnhFste8v',
           '7cGpisHH8TCwcPI1Pxd0IM',
           '3Kb4dMQn8cAXthan2osI0l',
           '2ggqfj97qyiORmXoVFzP5j',
           '2ggqfj97qyiORmXoVFzP5j',
           '3Mt3L75pk83KGc0c4VJzLM',
           '3oDk8PFjkiqwEn1m03pnkm',
           '27fUxjCxoOG7u2kxKAjCJA',
           '63L3A0z2A5DRix83DnHCDX',
           '7hlljw8YiOutMUrFekNIA0',
           '3uZIfWx5ridUBQevmgHDUt']

In [18]:
# path of our song dataset
PATH = '../data/SpotifyFeatures.csv'

df = pd.read_csv(PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [19]:
# wanna keep Danceability, Instrumentalness, Loudness, Speechiness, Valence
# drop all other columns (maybe preserve artist name/track name etc.)

cols = [
    'Unnamed: 0', 'genre', 'popularity', 'duration_ms',
    'key', 'mode', 'tempo', 'time_signature', 'acousticness',
    'energy', 'liveness'
]

df = df.drop(cols, axis=1)

In [20]:
df.head()

Unnamed: 0,artist_name,track_name,track_id,danceability,instrumentalness,loudness,speechiness,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0.389,0.0,-1.828,0.0525,0.814
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,0.59,0.0,-5.559,0.0868,0.816
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,0.663,0.0,-13.879,0.0362,0.368
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0.24,0.0,-12.178,0.0395,0.227
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,0.331,0.123,-21.15,0.0456,0.39


## Data Normalization ##

In [37]:
# our aim here is to make all the data normalized (between 0 and 1)

# let's make a dataframe with just the data to be normalized
slice_df = df.loc[:, 'danceability':'valence']
slice_df.head()

Unnamed: 0,danceability,instrumentalness,loudness,speechiness,valence
0,0.389,0.0,-1.828,0.0525,0.814
1,0.59,0.0,-5.559,0.0868,0.816
2,0.663,0.0,-13.879,0.0362,0.368
3,0.24,0.0,-12.178,0.0395,0.227
4,0.331,0.123,-21.15,0.0456,0.39


In [38]:
import pandas as pd
from sklearn import preprocessing

x = slice_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
norm_df = pd.DataFrame(x_scaled)

In [40]:
norm_df.head()

Unnamed: 0,0,1,2,3,4
0,0.356292,0.0,0.900856,0.03207,0.814
1,0.571934,0.0,0.834469,0.068374,0.816
2,0.650252,0.0,0.686429,0.014818,0.368
3,0.196438,0.0,0.716695,0.018311,0.227
4,0.294067,0.123123,0.557054,0.024767,0.39


In [43]:
# columns got renamed, let's fix that
# Danceability, Instrumentalness, Loudness, Speechiness, Valence
norm_df.columns = ['danceability', 'instrumentalness', 'loudness', 'speechiness', 'valence']
norm_df.head()

Unnamed: 0,danceability,instrumentalness,loudness,speechiness,valence
0,0.356292,0.0,0.900856,0.03207,0.814
1,0.571934,0.0,0.834469,0.068374,0.816
2,0.650252,0.0,0.686429,0.014818,0.368
3,0.196438,0.0,0.716695,0.018311,0.227
4,0.294067,0.123123,0.557054,0.024767,0.39


In [44]:
# speechiness looks really low, let's see if it's actually between 0 and 1

norm_df['speechiness'].describe() 

# guess so... but it's definitely skewed. oh well.

count    232725.000000
mean          0.104324
std           0.196357
min           0.000000
25%           0.015347
50%           0.029530
75%           0.087638
max           1.000000
Name: speechiness, dtype: float64

In [51]:
df = df.loc[:, 'artist_name':'track_id']
df.join(norm_df)

Unnamed: 0,artist_name,track_name,track_id,danceability,instrumentalness,loudness,speechiness,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0.356292,0.000000,0.900856,0.032070,0.814
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,0.571934,0.000000,0.834469,0.068374,0.816
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,0.650252,0.000000,0.686429,0.014818,0.368
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0.196438,0.000000,0.716695,0.018311,0.227
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,0.294067,0.123123,0.557054,0.024767,0.390
...,...,...,...,...,...,...,...,...
232720,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,0.676000,0.544545,0.744311,0.009949,0.962
232721,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,0.781139,0.000881,0.809825,0.012172,0.969
232722,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,0.493617,0.000000,0.786018,0.133150,0.813
232723,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,0.738226,0.000000,0.806391,0.131033,0.489


## Radar Plot Experimentation ##

In [6]:
# imports required for plotting

import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D