In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

In [2]:
music_df = pd.read_csv('music_clean.csv')
music_df.head()

Unnamed: 0.1,Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,36506,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,37591,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,37658,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,36060,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,35710,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


In [5]:
music_df[['duration_ms','loudness','speechiness']].describe()

Unnamed: 0,duration_ms,loudness,speechiness
count,1000.0,1000.0,1000.0
mean,217220.4,-8.253305,0.077879
std,117558.2,5.158523,0.089451
min,-1.0,-38.718,0.0234
25%,180656.2,-9.7755,0.0331
50%,216300.0,-6.855,0.0436
75%,260502.5,-4.97775,0.07495
max,1617333.0,-0.883,0.71


In [7]:
#distance can misinform machine learning models (like KNeighbors), so let's normalize and standardize
#1. subtract mean and divide by variance (standardization), centered around 0 with variance of 1
#2. subtract minimum and divide by range, ranges from 0 to 1
#3. data ranges from -1 to +1 (normalization)

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
X = music_df.drop('genre', axis=1).values
y = music_df['genre'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(np.mean(X), np.std(X))
print(np.mean(X_train_scaled), np.std(X_train_scaled))


20666.582585618085 68890.98734103922
3.5971225997855074e-16 0.9999999999999996


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=6))]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)
print(knn_scaled.score(X_test, y_test))

0.91


In [13]:
#as opposed to
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
knn_unscaled = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
knn_unscaled.score(X_test, y_test)

0.885

In [12]:
from sklearn.model_selection import GridSearchCV

In [14]:
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
parameters = {'knn__n_neighbors': np.arange(1, 50)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)
cv = GridSearchCV(pipeline, param_grid = parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
cv.best_score_

0.9099999999999999

In [15]:
cv.best_params_

{'knn__n_neighbors': 6}