# k Nearest Neighbors

## Imports

In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

import pprint
p = pprint.PrettyPrinter(indent=4)

import config as config

%matplotlib inline


## Importing utilities object
+ imports dataset as lists (a training and test set)


In [2]:
%run utilities.py
util = Utilities({}, use_json=False)
training_list, cv_list, testing_list = util.get_datasets()

# turning the datasets into pandas DataFrames 
training_DF, cv_DF, testing_DF = util.create_dataframes(training_list, cv_list, testing_list)

# generate the energy and dance measures 
training_DF, cv_DF, testing_DF = util.generate_energy_measure(training_DF, cv_DF, testing_DF)
training_DF, cv_DF, testing_DF = util.generate_dance_measure(training_DF, cv_DF, testing_DF)


In [3]:
from models.linear_regression_model import LinearRegressionModel

In [4]:
X = training_DF['artist_familiarity artist_hotttnesss artist_familiarity'.split()] 
y = training_DF['song_hotttnesss']

In [13]:
mod = LinearRegressionModel()
mod.train(X, y)
mod.predict([X.loc[3]])

array([0.45881342])

In [12]:
df

In [13]:
df_norm = (df - df.mean()) / (df.max() - df.min())

In [17]:
training_DF.dtypes

track_id               object
title                  object
song_id                object
release                object
artist_id              object
artist_mbid            object
artist_name            object
duration              float64
artist_familiarity    float64
artist_hotttnesss     float64
year                    int64
track_7digitalid        int64
shs_perf                int64
shs_work                int64
song_hotttnesss       float64
danceability          float64
energy                float64
key                     int64
tempo                 float64
loudness              float64
time_signature          int64
segments_avg          float64
tatums_avg            float64
beats_avg             float64
bars_avg              float64
sections_avg          float64
energy1               float64
energy2               float64
energy3               float64
energy4               float64
dance1                float64
dance2                float64
dance3                float64
dance4    

In [32]:
numeric_cols = training_DF._get_numeric_data().columns 
for col in numeric_cols: 
    training_DF[numeric_cols] 

In [36]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(X)


In [38]:
X.head()

Unnamed: 0,artist_familiarity,artist_hotttnesss,artist_familiarity.1
0,0.731066,0.509879,0.731066
1,0.581475,0.298377,0.581475
2,0.483075,0.339457,0.483075
3,0.583443,0.385788,0.583443
4,0.594833,0.37865,0.594833


In [39]:
x_scaled

array([[0.73106576, 0.43732872, 0.73106576],
       [0.58147522, 0.20461207, 0.58147522],
       [0.48307464, 0.24981297, 0.48307464],
       ...,
       [0.59160201, 0.28946763, 0.59160201],
       [0.62112105, 0.35569189, 0.62112105],
       [0.66621616, 0.48121172, 0.66621616]])

# Makin Linear Reg Learners!

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### using metadata (familiarity and artist_hott)

In [None]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)
#     print results
    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


#### Conclusions?
can see that a combination of both features perform better

### using acoustic features 

In [None]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)

for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)

for item in raw_acous:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<60}".format(item), "training_error:\t", mserr_training
#     print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
print results[3]

In [None]:
0.02/0.167923247404*100

## Finding the best energy measure

In [None]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




In [None]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<70}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------





## lets play with the dance measure now

In [None]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




In [None]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
#     print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




In [None]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
    print mserr_testing/results[3]*100
    # -------------------------------------------------



## THROW IT ALL IN THERE 


In [None]:
all_features = [dancelist+energylist+'tempo loudness tatums_avg beats_avg'+" artist_familiarity artist_hotttnesss"]

for item in all_features:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------



---


# KNN STUFF

### using metadata (familiarity and artist_hott)

In [None]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF, 7)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------

### using acoustic features 

In [None]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)

for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)

for item in raw_acous:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<60}".format(item), "training_error:\t\t", mserr_training
    print "{:<60}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<60}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<60}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
print results[3]

### Finding the best energy measure

In [None]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
#     print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




In [None]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
#     print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




### lets play with the dance measure now

In [None]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------



In [None]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




In [None]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item='alleng alldance allmeta'

    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
#     print mserr_testing/results[3]*100
    # -------------------------------------------------



### THROW IT ALL IN THERE 

In [None]:
all_features = [dancelist+energylist+'tempo loudness tatums_avg beats_avg'+" artist_familiarity artist_hotttnesss"]

for item in all_features:
    X_cols = item
#     results = KNN(X_cols, training_DF, testing_DF)
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------
