# Basic Linear Regression Model

## Imports

In [3]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

import pprint
# Pretty-printer configured with a 4-space indent. It is not referenced by any
# cell visible in this part of the notebook.
p = pprint.PrettyPrinter(indent=4)

import config as config

%matplotlib inline


## Importing utilities object
+ imports the dataset as lists (a training set and a test set)


In [4]:
%run utilities.py
util = Utilities({}, use_json=False)
training_list, testing_list = util.get_datasets()

# turning the datasets into pandas DataFrames 
training_DF, testing_DF = util.create_dataframes(training_list, testing_list)

# generate the energy and dance measures 
training_DF, testing_DF = util.generate_energy_measure(training_DF, testing_DF)
training_DF, testing_DF = util.generate_dance_measure(training_DF, testing_DF)


# Making Linear Regression Learners!

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### Using metadata (artist_familiarity and artist_hotttnesss)

In [7]:
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


artist_familiarity                       training_error:	0.021030005378670887
artist_familiarity                       testing_error:			0.02142993874774311
artist_hotttnesss                        training_error:	0.021238893507643818
artist_hotttnesss                        testing_error:			0.022749268586318712
artist_hotttnesss artist_familiarity     training_error:	0.020295593815473315
artist_hotttnesss artist_familiarity     testing_error:			0.021339939044369476


#### Conclusions?
We can see that the combination of both features performs better (lower training and testing error) than either feature on its own.

### using acoustic features 

In [19]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

raw_acous = []
for each in allsegs.split():
    raw_acous.append(each)

raw_acous.append(allsegs)


for each in allrawacous.split():
    raw_acous.append(each)   
raw_acous.append(allrawacous)
raw_acous.append((allrawacous+ ' ' + allsegs))


for item in raw_acous:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

#     print "{:<60}".format(item), "training_error:\t", mserr_training
    print "{:<40}".format(item), "testing_error:\t\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
#     print "std of hot is:", results[3]
    # -------------------------------------------------
# print results[3]

segments_avg                             testing_error:			0.028457186422077094
tatums_avg                               testing_error:			0.028180480371497084
beats_avg                                testing_error:			0.02811980102065815
bars_avg                                 testing_error:			0.028434217848738584
sections_avg                             testing_error:			0.02839814755945918
segments_avg tatums_avg beats_avg bars_avg sections_avg  testing_error:			0.02793461769772043
key                                      testing_error:			0.02845269047601641
loudness                                 testing_error:			0.02707379200876735
duration                                 testing_error:			0.02845381699716093
tempo                                    testing_error:			0.028159897678227058
time_signature                           testing_error:			0.028414960868019215
key loudness duration tempo time_signature testing_error:			0.026982405063956997
key loudness duration tempo time_signatu

## Finding the best energy measure

In [20]:
energylsit = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energylsit:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

#     print "{:<40}".format(item), "training_error:\t", mserr_training
    print item
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




energy1
energy2
energy3
energy4
energy1 energy2 energy3 energy4


In [21]:
# Comparing acoustic results
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg", 
                 "energy1 energy2 energy3 energy4 tempo loudness", 
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<70}".format(item), "training_error:\t", mserr_training
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------





energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:	0.021089367630256366
energy1 energy2 energy3 energy4 tempo loudness                         training_error:	0.020521568693150513
energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg    training_error:	0.02052106554974991


## Let's play with the dance measure now

In [22]:
dancelist = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dancelist:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    print "{:<40}".format(item), "training_error:\t", mserr_training
#     print item
    # print "d1 testing_error\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1                                   training_error:	0.023006615151422
dance2                                   training_error:	0.022262816360753654
dance3                                   training_error:	0.024228863127340797
dance4                                   training_error:	0.024297983871834624
dance1 dance2 dance3 dance4              training_error:	0.021928343537768374


In [23]:
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

acousitc =[]
acousitc.append(dancelist)
acousitc.append(energylist)
acousitc.append(dancelist+energylist)
acousitc.append("artist_familiarity artist_hotttnesss")
acousitc.append(dancelist+energylist+"artist_familiarity artist_hotttnesss")

for item in acousitc:
    # dance1
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
#     print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "testing_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------




dance1 dance2 dance3 dance4                                            testing_error:		0.022512001244532513
energy1 energy2 energy3 energy4                                        testing_error:		0.021322151342341906
dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4            testing_error:		0.020914384178874878
artist_familiarity artist_hotttnesss                                   testing_error:		0.021339939044369476
alleng alldance allmeta                                                testing_error:		0.020902199348699035


In [24]:
finalset = [dancelist+energylist+"artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item='alleng alldance allmeta'
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    print "std of hot is:\t\t", results[3]
    print mserr_testing/results[3]*100
    # -------------------------------------------------



alleng alldance allmeta                                                training_error:	0.01988942736648771
alleng alldance allmeta                                                training_error:		0.020902199348699035
std of hot is:		0.1679281094688551
12.447111692504151


## THROW IT ALL IN THERE 


In [25]:
rawacoustic = " key loudness duration tempo time_signature segments_avg tatums_avg beats_avg bars_avg sections_avg  "
above = ' dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4 artist_familiarity artist_hotttnesss  '
actual_all = [rawacoustic+above]

for item in actual_all:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    #  [x][1] = Mean squared error regression loss (bc its the smallest value lol)
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    if(len(item)>70):
        item="allfeatures"
    
    print "{:<70}".format(item), "training_error:\t", mserr_training
    print "{:<70}".format(item), "training_error:\t", mserr_testing
    # print "The min error values were:", min(results[1]), min(results[2])
    # print "std of hot is:", results[3]
    # -------------------------------------------------


allfeatures                                                            training_error:	0.019739023142623494
allfeatures                                                            training_error:	0.020922138808010943
