# Basic Linear Regression Model

In [1]:
import sys
# Record which interpreter produced the outputs shown in this notebook.
ver_info = sys.version
banner = "This jupyter notebook was executed with:\n{0}".format(ver_info)
print(banner)


This jupyter notebook was executed with:
2.7.14 |Anaconda, Inc.| (default, Nov 20 2017, 18:04:19) 
[GCC 7.2.0]


## Imports

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

import pprint
p = pprint.PrettyPrinter(indent=4)

import config as config

%matplotlib inline


## Importing utilities object
+ imports dataset as lists (a training and test set)


In [3]:
# Execute utilities.py inside this kernel. The Utilities class used on the next
# line is not imported anywhere in this notebook, so it is presumably defined by
# that script -- TODO confirm.
%run utilities.py
# NOTE(review): the meaning of the empty dict and of use_json=False is defined in
# utilities.py and is not visible from this notebook.
util = Utilities({}, use_json=False)
# Fetch the dataset as two lists: a training set and a testing set.
training_list, testing_list = util.get_datasets()

# Turn both lists into pandas DataFrames.
training_DF, testing_DF = util.create_dataframes(training_list, testing_list)

# Generate the energy and dance measures. Both frames are rebound to whatever the
# helpers return, so every later cell sees the augmented versions.
# NOTE(review): the energy1..energy4 / dance1..dance4 columns used further down
# presumably come from these two calls -- verify against utilities.py.
training_DF, testing_DF = util.generate_energy_measure(training_DF, testing_DF)
training_DF, testing_DF = util.generate_dance_measure(training_DF, testing_DF)


The minimum supported version is 2.4.6



# Making Linear Regression Learners!

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### Using metadata (artist_familiarity and artist_hotttnesss)

In [5]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


artist_familiarity                       training_error:	0.0210615568338
artist_familiarity                       testing_error:			0.0213577107761
artist_hotttnesss                        training_error:	0.0213931859342
artist_hotttnesss                        testing_error:			0.022303188341
artist_hotttnesss artist_familiarity     training_error:	0.020440738042
artist_hotttnesss artist_familiarity     testing_error:			0.0209070104041


#### Conclusions?
We can see that combining both features performs better than using either feature alone.

### using acoustic features 

In [6]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)


for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)
raw_acous.append((allrawacous+ ' ' + allsegs))


for item in raw_acous:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

#     print "{:<60}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
# print results[3]

segments_avg                             testing_error:			0.028947303262
tatums_avg                               testing_error:			0.0288234858365
beats_avg                                testing_error:			0.0289073349998
bars_avg                                 testing_error:			0.0289791249152
sections_avg                             testing_error:			0.0288374165962
segments_avg tatums_avg beats_avg bars_avg sections_avg  testing_error:			0.0286068674162
key                                      testing_error:			0.028953069294
loudness                                 testing_error:			0.0274338005675
duration                                 testing_error:			0.0290466638605
tempo                                    testing_error:			0.0288841183369
time_signature                           testing_error:			0.0288774122332
key loudness duration tempo time_signature testing_error:			0.0275424240233
key loudness duration tempo time_signature segments_avg tatums_avg beats_avg bars_avg sections_a

## Finding the best energy measure

In [7]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

#     print "{:<40}".format(item), "training_error:\t", mserr_training
    print item
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




energy1
energy2
energy3
energy4
energy1 energy2 energy3 energy4


In [8]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<70}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------





energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:	0.0211592256926
energy1 energy2 energy3 energy4 tempo loudness                         training_error:	0.0206005377593
energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg    training_error:	0.0205903925348


## Let's play with the dance measure now

In [14]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
#     print item
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1                                   training_error:	0.0229094894437
dance2                                   training_error:	0.0221730934354
dance3                                   training_error:	0.0239712685124
dance4                                   training_error:	0.023969093992
dance1 dance2 dance3 dance4              training_error:	0.0218188639278


In [10]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
#     print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1 dance2 dance3 dance4                                            testing_error:		0.0228423383647
energy1 energy2 energy3 energy4                                        testing_error:		0.0212909912842
dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4            testing_error:		0.0208636930648
artist_familiarity artist_hotttnesss                                   testing_error:		0.0209070104041
alleng alldance allmeta                                                testing_error:		0.0204036653068


In [11]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
    print mserr_testing/results[3]*100
    # -------------------------------------------------



alleng alldance allmeta                                                training_error:	0.0200564508292
alleng alldance allmeta                                                training_error:		0.0204036653068
std of hot is:		0.167935342113
12.1497149141


## THROW IT ALL IN THERE 


In [16]:
rawacoustic = " key loudness duration tempo time_signature segments_avg tatums_avg beats_avg bars_avg sections_avg  "
above = ' dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4 artist_familiarity artist_hotttnesss  '
actual_all = [rawacoustic+above]

for item in actual_all:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


allfeatures                                                            training_error:	0.0199210003694
allfeatures                                                            training_error:	0.0204565606284
