# Basic Linear Regression Model

In [1]:
import sys

# Record which interpreter produced the outputs saved in this notebook.
ver_info = sys.version
banner = "This jupyter notebook was executed with:\n" + ver_info
print(banner)


This jupyter notebook was executed with:
2.7.14 |Anaconda, Inc.| (default, Nov  8 2017, 13:40:45) [MSC v.1500 64 bit (AMD64)]


## Imports

In [2]:
# Numerical / plotting / data-frame stack used throughout the notebook.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

# Pretty-printer instance (4-space indent) for inspecting nested structures.
import pprint
p = pprint.PrettyPrinter(indent=4)

# NOTE(review): `config` is not a standard-library module -- presumably a
# project-local settings module that sits next to this notebook; confirm.
import config as config

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


## Importing utilities object
+ imports the dataset as two lists (a training set and a test set)


In [3]:
%run utilities.py
util = Utilities({}, use_json=False)
training_list, testing_list = util.get_datasets()

# turning the datasets into pandas DataFrames 
training_DF, testing_DF = util.create_dataframes(training_list, testing_list)

# generate the energy and dance measures 
training_DF, testing_DF = util.generate_energy_measure(training_DF, testing_DF)
training_DF, testing_DF = util.generate_dance_measure(training_DF, testing_DF)


# Making Linear Regression Learners!

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### Using metadata (artist_familiarity and artist_hotttnesss)

In [5]:
# Feature sets to evaluate. Each entry is a space-separated string of column
# names that is handed to util.RunAndTestLinearRegModel unchanged.
metadata = ['artist_familiarity', 'artist_hotttnesss', 'artist_hotttnesss artist_familiarity']

for item in metadata:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    # A single-argument print(...) call is valid on both Python 2 and 3 and
    # matches the print(...) form used in the notebook's first cell.
    # `!s` applies str() to the error, which is what the print statement did,
    # so the text produced on Python 2 is unchanged.
    print("{:<40} training_error:\t{!s}".format(item, mserr_training))
    print("{:<40} testing_error:\t\t\t{!s}".format(item, mserr_testing))


artist_familiarity                       training_error:	0.0209843415002
artist_familiarity                       testing_error:			0.021592219545
artist_hotttnesss                        training_error:	0.0217360664561
artist_hotttnesss                        testing_error:			0.021263418241
artist_hotttnesss artist_familiarity     training_error:	0.020487585518
artist_hotttnesss artist_familiarity     testing_error:			0.0207775077472


#### Conclusions?
We can see that combining both features performs better than using either one alone.

### using acoustic features 

In [6]:
# Space-separated column-name groups. The trailing space in `allsegs` is kept
# as-is: the string is passed to the model runner and printed verbatim.
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

# Evaluate every column on its own, followed by its whole group:
# each segment average, all segment averages, each raw feature, all raw features.
raw_acous = allsegs.split() + [allsegs] + allrawacous.split() + [allrawacous]

for item in raw_acous:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{:<60} training_error:\t{!s}".format(item, mserr_training))


segments_avg                                                 training_error:	0.0282166219448
tatums_avg                                                   training_error:	0.0280030894988
beats_avg                                                    training_error:	0.0280759323732
bars_avg                                                     training_error:	0.0282210187759
sections_avg                                                 training_error:	0.0281696869489
segments_avg tatums_avg beats_avg bars_avg sections_avg      training_error:	0.0277502230671
key                                                          training_error:	0.0282298750632
loudness                                                     training_error:	0.0269271029015
duration                                                     training_error:	0.028232161741
tempo                                                        training_error:	0.0280985572194
time_signature                                               training_e

## Finding the best energy measure

In [7]:
# Each energy measure on its own, then all four together. Every entry is a
# space-separated string of column names passed to the model runner unchanged.
energy_feature_sets = ['energy1', 'energy2', 'energy3', 'energy4', 'energy1 energy2 energy3 energy4']

for item in energy_feature_sets:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{:<40} training_error:\t{!s}".format(item, mserr_training))




energy1                                  training_error:	0.0274097520628
energy2                                  training_error:	0.0275345639369
energy3                                  training_error:	0.0274511321071
energy4                                  training_error:	0.0276497069657
energy1 energy2 energy3 energy4          training_error:	0.0272843996838


In [8]:
# Comparing acoustic results: the four energy measures combined with
# (a) tatum/beat averages, (b) tempo/loudness, and (c) both groups together.
acous_compare = ["energy1 energy2 energy3 energy4 tatums_avg beats_avg",
                 "energy1 energy2 energy3 energy4 tempo loudness",
                 "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg"]

for item in acous_compare:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{:<70} training_error:\t{!s}".format(item, mserr_training))





energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:	0.027280884532
energy1 energy2 energy3 energy4 tempo loudness                         training_error:	0.0268382799936
energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg    training_error:	0.0268195645813


## Let's play with the dance measure now

In [9]:
# Each dance measure on its own, then all four together. Named differently
# from `dancelist`, which a later cell defines as a single string.
dance_feature_sets = ['dance1', 'dance2', 'dance3', 'dance4', 'dance1 dance2 dance3 dance4']

for item in dance_feature_sets:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{:<40} training_error:\t{!s}".format(item, mserr_training))




dance1                                   training_error:	0.0278800835649
dance2                                   training_error:	0.0280045367418
dance3                                   training_error:	0.0275409529732
dance4                                   training_error:	0.0280974624394
dance1 dance2 dance3 dance4              training_error:	0.0266515508999


In [10]:
# Space-separated column-name groups. The trailing spaces let the groups be
# concatenated directly; `dancelist` and `energylist` are also read by the
# "all features" cell further down, so their names and values are unchanged.
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '
metadata_cols = "artist_familiarity artist_hotttnesss"

# Width of the label column; labels longer than this are replaced by an alias.
LABEL_WIDTH = 70

feature_sets = [
    dancelist,
    energylist,
    dancelist + energylist,
    metadata_cols,
    dancelist + energylist + metadata_cols,
]

for item in feature_sets:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # Only the dance + energy + metadata combination exceeds the column width;
    # show it under a short alias instead of rebinding the loop variable.
    label = item if len(item) <= LABEL_WIDTH else 'the above + metadata'

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{} training_error:\t{!s}".format(label.ljust(LABEL_WIDTH), mserr_training))




dance1 dance2 dance3 dance4                                            training_error:	0.0266515508999
energy1 energy2 energy3 energy4                                        training_error:	0.0272843996838
dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4            training_error:	0.0266037282015
artist_familiarity artist_hotttnesss                                   training_error:	0.020487585518
the above + metadata                                                   training_error:	0.0201583671257


## THROW IT ALL IN THERE 


In [11]:
# One feature set containing everything: dance + energy measures, the selected
# raw acoustic columns, and the artist metadata. `dancelist` and `energylist`
# are the space-terminated strings defined in the previous cell.
all_features = [dancelist+energylist+'tempo loudness tatums_avg beats_avg'+" artist_familiarity artist_hotttnesss"]

# Width of the label column; labels longer than this are replaced by an alias.
LABEL_WIDTH = 70

for item in all_features:
    results = util.RunAndTestLinearRegModel(item, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics, and
    # index 1 of each is taken to be the mean squared error (per the original
    # author's note -- confirm against RunAndTestLinearRegModel in utilities.py).
    mserr_training = results[1][1]
    mserr_testing = results[2][1]  # extracted but not reported in this cell

    # The full column list is far wider than the label column, so it is shown
    # under a short alias instead of rebinding the loop variable.
    label = item if len(item) <= LABEL_WIDTH else "allfeatures"

    # Single-argument print(...) works on Python 2 and 3 alike; `!s` applies
    # str() to the error exactly as the print statement did.
    print("{} training_error:\t{!s}".format(label.ljust(LABEL_WIDTH), mserr_training))


allfeatures                                                            training_error:	0.0201530905353
