# Basic Linear Regression Model

## Imports

In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

# Pretty-printer with a 4-space indent, kept around for dumping nested
# structures while exploring. `p` is not referenced by any cell visible in
# this part of the notebook.
import pprint
p = pprint.PrettyPrinter(indent=4)

import config as config

%matplotlib inline


## Importing utilities object
+ imports the dataset as lists (a training set and a test set)


In [3]:
# Execute utilities.py inside this kernel's namespace.
# NOTE(review): the `Utilities` class used below is presumably defined by that
# script (it is not defined anywhere in this notebook) -- confirm.
%run utilities.py
util = Utilities({}, use_json=False)
# Fetch the dataset as a (training, testing) pair of lists.
training_list, testing_list = util.get_datasets()

# Turn the two lists into pandas DataFrames.
training_DF, testing_DF = util.create_dataframes(training_list, testing_list)

# Generate the energy and dance measures. Each helper receives the
# (training, testing) pair and the names are rebound to whatever it returns,
# so the order of these two calls and of the cell as a whole matters.
# NOTE(review): presumably this is what adds the energy1..4 / dance1..4
# columns that later cells select -- confirm in utilities.py.
training_DF, testing_DF = util.generate_energy_measure(training_DF, testing_DF)
training_DF, testing_DF = util.generate_dance_measure(training_DF, testing_DF)


The minimum supported version is 2.4.6



# Making Linear Regression Learners!

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### Using metadata (artist_familiarity and artist_hotttnesss)

In [5]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)
#     print results
    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


artist_familiarity                       training_error:	0.0209722605506
artist_familiarity                       testing_error:			0.0215771284529
artist_hotttnesss                        training_error:	0.0213953658299
artist_hotttnesss                        testing_error:			0.0222403846493
artist_hotttnesss artist_familiarity     training_error:	0.0203608697767
artist_hotttnesss artist_familiarity     testing_error:			0.0210909167516


#### Conclusions?
We can see that a combination of both features performs better than either feature alone: it has the lowest training and testing error.

### using acoustic features 

In [6]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)

for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)

for item in raw_acous:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<60}".format(item), "training_error:\t", mserr_training
#     print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
print results[3]

segments_avg                                                 training_error:	0.0281400841003
tatums_avg                                                   training_error:	0.0278723578916
beats_avg                                                    training_error:	0.0279674090187
bars_avg                                                     training_error:	0.0281332010789
sections_avg                                                 training_error:	0.0280646021003
segments_avg tatums_avg beats_avg bars_avg sections_avg      training_error:	0.027649427754
key                                                          training_error:	0.0281365941353
loudness                                                     training_error:	0.0269978016227
duration                                                     training_error:	0.028140511597
tempo                                                        training_error:	0.0279922208325
time_signature                                               training_er

In [7]:
# Express 0.02 as a percentage of 0.167923247404.
# NOTE(review): 0.02 is presumably the rounded MSE from the cells above and
# the divisor the "std of hot" value printed by the previous cell
# (results[3]) -- confirm. If so, this divides a squared error by a standard
# deviation, so the units do not match.
0.02/0.167923247404*100

11.910203208423416

## Finding the best energy measure

In [8]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




energy1                                  training_error:	0.0212063840075
energy2                                  training_error:	0.0220835666
energy3                                  training_error:	0.0215252606148
energy4                                  training_error:	0.023340289915
energy1 energy2 energy3 energy4          training_error:	0.0211614262626


In [9]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<70}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------





energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:	0.0211130907734
energy1 energy2 energy3 energy4 tempo loudness                         training_error:	0.0205244765226
energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg    training_error:	0.0205207925284


## Let's play with the dance measure now

In [10]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1                                   training_error:	0.0229539347096
dance2                                   training_error:	0.0222344508512
dance3                                   training_error:	0.0240572218225
dance4                                   training_error:	0.0241017306605
dance1 dance2 dance3 dance4              training_error:	0.0219152188339


In [11]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
#     print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1 dance2 dance3 dance4                                            training_error:	0.0219152188339
energy1 energy2 energy3 energy4                                        training_error:	0.0211614262626
dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4            training_error:	0.0203509789523
artist_familiarity artist_hotttnesss                                   training_error:	0.0203608697767
alleng alldance allmeta                                                training_error:	0.0200099104971


In [12]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
    print mserr_testing/results[3]*100
    # -------------------------------------------------



alleng alldance allmeta                                                training_error:	0.0200099104971
alleng alldance allmeta                                                training_error:		0.0204623449656
std of hot is:		0.167892339146
12.1877776376


## THROW IT ALL IN THERE 


In [13]:
all_features = [dancelist+energylist+'tempo loudness tatums_avg beats_avg'+" artist_familiarity artist_hotttnesss"]

for item in all_features:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


allfeatures                                                            training_error:	0.0199577504479
allfeatures                                                            training_error:	0.0204399432671



---


# KNN STUFF

### Using metadata (artist_familiarity and artist_hotttnesss)

In [24]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF, 7)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------

Score for Training: 	0.451219512195
Score for Testing: 	0.26493256262
artist_familiarity                       training_error:		0.032535096815
artist_familiarity                       testing_error:			0.0391009597448
artist_familiarity                       discrete training_error:	0.0261328626444
artist_familiarity                       discrete testing_error:	0.0331695568401

Score for Training: 	0.451861360719
Score for Testing: 	0.288053949904
artist_hotttnesss                        training_error:		0.0344070905994
artist_hotttnesss                        testing_error:			0.0423473824439
artist_hotttnesss                        discrete training_error:	0.0277310654685
artist_hotttnesss                        discrete testing_error:	0.035366088632

Score for Training: 	0.44351732991
Score for Testing: 	0.274566473988
artist_hotttnesss artist_familiarity     training_error:		0.0331897379804
artist_hotttnesss artist_familiarity     testing_error:			0.0387646084964
artist_hotttnesss a

### using acoustic features 

In [15]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)

for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)

for item in raw_acous:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<60}".format(item), "training_error:\t\t", mserr_training
    print "{:<60}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<60}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<60}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
print results[3]

Score for Training: 	0.490051347882
Score for Testing: 	0.209055876686
segments_avg                                                 training_error:		0.053716573654
segments_avg                                                 testing_error:		0.0659548363973
segments_avg                                                 discrete training_error:	0.0415982028241
segments_avg                                                 discrete testing_error:	0.0542003853565

Score for Training: 	0.481707317073
Score for Testing: 	0.214836223507
tatums_avg                                                   training_error:		0.0524520835091
tatums_avg                                                   testing_error:		0.0673719584393
tatums_avg                                                   discrete training_error:	0.0400770218228
tatums_avg                                                   discrete testing_error:	0.0550674373796

Score for Training: 	0.486842105263
Score for Testing: 	0.208092485549
beats_

### Finding the best energy measure

In [16]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
#     print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




Score for Training: 	0.534659820282
Score for Testing: 	0.215799614644
energy1                                  training_error:		0.0383636590119
energy1                                  testing_error:			0.0548165902725
energy1                                  discrete training_error:	0.0285911424904
energy1                                  discrete testing_error:	0.0440269749518

Score for Training: 	0.509627727856
Score for Testing: 	0.215799614644
energy2                                  training_error:		0.0413729864433
energy2                                  testing_error:			0.055978689906
energy2                                  discrete training_error:	0.0307349165597
energy2                                  discrete testing_error:	0.0451252408478

Score for Training: 	0.51026957638
Score for Testing: 	0.19267822736
energy3                                  training_error:		0.0415272816446
energy3                                  testing_error:			0.0538887723157
energy3           

In [17]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
#     print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




Score for Training: 	0.54107830552
Score for Testing: 	0.220616570328
energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:		0.038252880177
energy1 energy2 energy3 energy4 tatums_avg beats_avg                   testing_error:		0.0545856462781
energy1 energy2 energy3 energy4 tatums_avg beats_avg                   discrete training_error:	0.0282862644416
energy1 energy2 energy3 energy4 tatums_avg beats_avg                   discrete testing_error:	0.0441136801541

Score for Training: 	0.508664955071
Score for Testing: 	0.208092485549
energy1 energy2 energy3 energy4 tempo loudness                         training_error:		0.0473793536626
energy1 energy2 energy3 energy4 tempo loudness                         testing_error:		0.0606222535455
energy1 energy2 energy3 energy4 tempo loudness                         discrete training_error:	0.0361071887035
energy1 energy2 energy3 energy4 tempo loudness                         discrete testing_error:	0.0497398843931

### Let's play with the dance measure now

In [18]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    print "{:<40}".format(item), "training_error:\t\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    print "{:<40}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<40}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------



Score for Training: 	0.527278562259
Score for Testing: 	0.234104046243
dance1                                   training_error:		0.0422484436616
dance1                                   testing_error:			0.0596307554549
dance1                                   discrete training_error:	0.0317297817715
dance1                                   discrete testing_error:	0.0487668593449

Score for Training: 	0.515725288832
Score for Testing: 	0.227360308285
dance2                                   training_error:		0.0424797707184
dance2                                   testing_error:			0.0574139028251
dance2                                   discrete training_error:	0.0318934531451
dance2                                   discrete testing_error:	0.0461078998073

Score for Training: 	0.5
Score for Testing: 	0.221579961464
dance3                                   training_error:		0.0454673563543
dance3                                   testing_error:			0.0603035476369
dance3                    

In [19]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




Score for Training: 	0.521822849807
Score for Testing: 	0.23506743738
dance1 dance2 dance3 dance4                                            training_error:		0.0406380832174
dance1 dance2 dance3 dance4                                            testing_error:		0.0542938640762
dance1 dance2 dance3 dance4                                            discrete training_error:	0.0305231065469
dance1 dance2 dance3 dance4                                            discrete testing_error:	0.0439884393064

Score for Training: 	0.534980744544
Score for Testing: 	0.215799614644
energy1 energy2 energy3 energy4                                        training_error:		0.0389512650171
energy1 energy2 energy3 energy4                                        testing_error:		0.0527535024261
energy1 energy2 energy3 energy4                                        discrete training_error:	0.0289345314506
energy1 energy2 energy3 energy4                                        discrete testing_error:	0.042109826589

In [20]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item='alleng alldance allmeta'

    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    print ""
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
#     print mserr_testing/results[3]*100
    # -------------------------------------------------



Score for Training: 	0.538831835687
Score for Testing: 	0.249518304432
alleng alldance allmeta                                                training_error:		0.0379710864099
alleng alldance allmeta                                                testing_error:		0.0518358022649
alleng alldance allmeta                                                discrete training_error:	0.0282798459564
alleng alldance allmeta                                                discrete testing_error:	0.0410404624277

std of hot is:		0.167892339146


### THROW IT ALL IN THERE 

In [21]:
all_features = [dancelist+energylist+'tempo loudness tatums_avg beats_avg'+" artist_familiarity artist_hotttnesss"]

for item in all_features:
    X_cols = item
#     results = KNN(X_cols, training_DF, testing_DF)
    results = util.RunAndTestKNNModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]
    discrete_mserr_training = results[4][1]
    discrete_mserr_testing = results[5][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    print "{:<70}".format(item), "discrete training_error:\t", discrete_mserr_training
    print "{:<70}".format(item), "discrete testing_error:\t", discrete_mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


Score for Training: 	0.51026957638
Score for Testing: 	0.207129094412
allfeatures                                                            training_error:		0.042592338119
allfeatures                                                            testing_error:		0.0588998550877
allfeatures                                                            discrete training_error:	0.0316816431322
allfeatures                                                            discrete testing_error:	0.0476396917148
