You now know two kinds of regression and two kinds of classifier. So let's use that to compare models!

Comparing models is something data scientists do all the time. There is rarely only one model that could be run in a given situation, so learning to choose the best one is very important.

Here let's work on regression. Find a data set and build a KNN Regression and an OLS regression. Compare the two. How similar are they? Do they miss in different ways?

Create a Jupyter notebook with your models. At the end in a markdown cell write a few paragraphs to describe the models' behaviors and why you favor one model or the other. Try to determine whether there is a situation where you would change your mind, or whether one is unambiguously better than the other. Lastly, try to note what it is about the data that causes the better model to outperform the weaker model. 

In [40]:
import os
os.chdir('C:\\Users\\M246047\\Documents\\Python')
import numpy as np
import pandas as pd
import datetime as dt
import pylab
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import scipy as sc
from scipy.stats import ttest_ind
import re
import seaborn as sns
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Read the Spotify track table from the current working directory.
spotifytop = pd.read_csv('spotifytop2018.csv')

# `music` is the frame the remaining cells work on.
music = pd.DataFrame(spotifytop)

# Print the column index, then the same blank-line padding the cell has always
# emitted before the summary table.
print(music.columns, end='\n\n\n')

# Last expression of the cell: the notebook renders these summary statistics.
music.describe()

Index(['id', 'name', 'artists', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature'],
      dtype='object')




Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.71646,0.65906,5.33,-5.67764,0.59,0.115569,0.195701,0.001584,0.158302,0.484443,119.90418,205206.78,3.98
std,0.13107,0.145067,3.676447,1.777577,0.494311,0.104527,0.220946,0.013449,0.111662,0.206145,28.795984,40007.893404,0.2
min,0.258,0.296,0.0,-10.109,0.0,0.0232,0.000282,0.0,0.0215,0.0796,64.934,95467.0,3.0
25%,0.6355,0.562,1.75,-6.6505,0.0,0.04535,0.040225,0.0,0.094675,0.341,95.73075,184680.0,4.0
50%,0.733,0.678,5.0,-5.5665,1.0,0.07495,0.109,0.0,0.1185,0.4705,120.116,205047.5,4.0
75%,0.79825,0.77225,8.25,-4.36375,1.0,0.137,0.24775,3.1e-05,0.17075,0.6415,140.02275,221493.25,4.0
max,0.964,0.909,11.0,-2.384,1.0,0.53,0.934,0.134,0.636,0.931,198.075,417920.0,5.0


In [41]:
# Rescale loudness and tempo with sklearn's `normalize`. Each column is reshaped
# into a single row, so the whole column is treated as one sample and every
# value is divided by the same vector norm. The result is therefore the raw
# column times a constant, which is why the normalized_* columns correlate
# with everything exactly as the raw columns do.
# `.ravel()` stores a plain 1-D array in the frame instead of an (n, 1) array.
music['normalized_loudness'] = normalize(np.array(music["loudness"]).reshape(1, -1)).ravel()
music['normalized_tempo'] = normalize(np.array(music["tempo"]).reshape(1, -1)).ravel()

# Correlate numeric columns only. The frame also carries text columns
# (id, name, artists); recent pandas versions raise on them instead of
# silently dropping them, so select the numeric columns explicitly.
music.select_dtypes(include='number').corr()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,normalized_loudness,normalized_tempo
danceability,1.0,-0.072582,-0.051759,0.015517,-0.058019,0.227075,-0.134374,-0.066592,-0.038761,0.413855,-0.195012,-0.068368,0.119421,0.015517,-0.195012
energy,-0.072582,1.0,-0.136345,0.732719,-0.117555,-0.073591,-0.421209,0.093684,0.050542,0.382434,0.062272,0.073017,0.255235,0.732719,0.062272
key,-0.051759,-0.136345,1.0,-0.105309,-0.141568,0.019583,0.14159,-0.136607,-0.125443,-0.032622,0.003737,0.046144,-0.087096,-0.105309,0.003737
loudness,0.015517,0.732719,-0.105309,1.0,-0.110178,-0.252037,-0.269742,0.036248,6e-06,0.40776,-0.035156,0.26531,0.072301,1.0,-0.035156
mode,-0.058019,-0.117555,-0.141568,-0.110178,1.0,-0.150076,-0.030028,0.089667,0.024428,-0.210599,-0.011911,0.055411,-0.083782,-0.110178,-0.011911
speechiness,0.227075,-0.073591,0.019583,-0.252037,-0.150076,1.0,-0.081536,-0.069543,-0.099379,-0.051054,0.102999,-0.009856,0.235615,-0.252037,0.102999
acousticness,-0.134374,-0.421209,0.14159,-0.269742,-0.030028,-0.081536,1.0,-0.089583,-0.150177,-0.0208,-0.158013,-0.069627,-0.158935,-0.269742,-0.158013
instrumentalness,-0.066592,0.093684,-0.136607,0.036248,0.089667,-0.069543,-0.089583,1.0,-0.016249,-0.095123,0.178142,-0.045873,0.011894,0.036248,0.178142
liveness,-0.038761,0.050542,-0.125443,6e-06,0.024428,-0.099379,-0.150177,-0.016249,1.0,-0.042612,-0.107652,-0.042942,-0.079558,6e-06,-0.107652
valence,0.413855,0.382434,-0.032622,0.40776,-0.210599,-0.051054,-0.0208,-0.095123,-0.042612,1.0,-0.148423,-0.131901,0.22341,0.40776,-0.148423


## Predicting Energy with Loudness and Valence

Five neighbors

In [42]:
# Target and the two predictors shared by both KNN variants in this cell.
Y = music.energy
X = music[['normalized_loudness', 'valence']]

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 5 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.32 (+/- 0.48)
Weighted Accuracy: 0.33 (+/- 0.44)


Ten Neighbors

In [43]:
# Target and the two predictors shared by both KNN variants in this cell.
Y = music.energy
X = music[['normalized_loudness', 'valence']]

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 10 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.28 (+/- 0.46)
Weighted Accuracy: 0.30 (+/- 0.39)


OLS with training sets

In [46]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# NOTE(review): X and Y are whatever the most recently executed cell left in
# the notebook namespace -- confirm they are the intended features and target.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state = 465)
print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

# statsmodels' OLS does not add an intercept by itself, so add a constant column.
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

# We fit an OLS model using statsmodels, on the training rows only.
results_train = sm.OLS(Y_train, X_train_const).fit()

# A second, independent OLS fit on the held-out rows. This is NOT an evaluation
# of the training model; it only shows what the coefficients look like when
# they are estimated from the test rows instead.
results_test = sm.OLS(Y_test, X_test_const).fit()

# We print the summary results
print(results_train.summary())
print(results_test.summary())

# Out-of-sample check: score the *training* model on rows it has never seen.
# Work on plain arrays so the arithmetic is positional, not index-aligned.
Y_test_actual = np.asarray(Y_test, dtype=float)
Y_test_pred = np.asarray(results_train.predict(X_test_const), dtype=float)
test_residuals = Y_test_actual - Y_test_pred
test_r2 = 1 - np.sum(test_residuals ** 2) / np.sum((Y_test_actual - Y_test_actual.mean()) ** 2)
test_rmse = np.sqrt(np.mean(test_residuals ** 2))
print("Held-out R-squared of the training model: %0.3f" % test_r2)
print("Held-out RMSE of the training model: %0.3f" % test_rmse)

The number of observations in training set is 80
The number of observations in test set is 20
                            OLS Regression Results                            
Dep. Variable:                 energy   R-squared:                       0.598
Model:                            OLS   Adj. R-squared:                  0.587
Method:                 Least Squares   F-statistic:                     57.20
Date:                Thu, 07 Nov 2019   Prob (F-statistic):           5.96e-16
Time:                        21:21:13   Log-Likelihood:                 75.372
No. Observations:                  80   AIC:                            -144.7
Df Residuals:                      77   BIC:                            -137.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------

These models are both pretty terrible, but I think I prefer the OLS to the KNN. While the OLS without training sets only accounts for 38.5% of the variance, the summary at least gives us information on how to improve the model. We can see that the p-value for valence is well above the 0.05 threshold, and since the adjusted R-squared value is lower than the R-squared, we can likely remove valence as a feature to obtain a better model.

I also like that OLS gives an actual formula which gives me a more solid understanding of the data and how unobserved values might be predicted. 

I think I would prefer KNN if I knew my data had an unconventional pattern to it - for example, peaks and troughs that a correlation matrix wouldn't be able to capture, but that matching against nearby data points would.

## Predicting Energy with Loudness

Five neighbors

In [28]:
# Target and the single predictor (as a one-column frame) used by both models.
Y = music.energy
X = music.normalized_loudness.to_frame()

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 5 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.26 (+/- 0.29)
Weighted Accuracy: 0.08 (+/- 0.29)


Ten neighbors

In [29]:
# Target and the single predictor (as a one-column frame) used by both models.
Y = music.energy
X = music.normalized_loudness.to_frame()

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 10 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: 0.40 (+/- 0.38)
Weighted Accuracy: 0.18 (+/- 0.29)


## Predicting Energy with Acousticness

Five Neighbors

In [31]:
# Target and the single predictor (as a one-column frame) used by both models.
Y = music.energy
X = music.acousticness.to_frame()

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 5 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=5)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below;
# that is why the reported values can be negative.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: -0.23 (+/- 0.69)
Weighted Accuracy: -0.64 (+/- 0.86)


In [32]:
# Target and the single predictor (as a one-column frame) used by both models.
Y = music.energy
X = music.acousticness.to_frame()

# Column vector of values from 0 up to (not including) 50 in steps of 0.1.
# Nothing else in this cell uses it.
T = np.arange(0, 50, 0.1)[:, np.newaxis]

# Uniformly weighted regressor on the 10 nearest neighbours.
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X, Y)
# Trailing underscores are a common convention for a prediction.
Y_ = knn.predict(X)

# Same model, but neighbours are weighted by distance.
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
knn_w.fit(X, Y)
# Overwrites the unweighted in-sample predictions stored just above.
Y_ = knn_w.predict(X)

# 5-fold cross-validation of both variants. For a regressor the default score
# is R^2, not a classification accuracy, despite the label printed below;
# that is why the reported values can be negative.
score = cross_val_score(knn, X, Y, cv=5)
score_w = cross_val_score(knn_w, X, Y, cv=5)

# Report the mean score with a +/- two-standard-deviation band across folds.
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
print("Weighted Accuracy: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

Unweighted Accuracy: -0.03 (+/- 0.53)
Weighted Accuracy: -0.51 (+/- 0.74)
