# Basic Linear Regression Model

In [1]:
import sys

# Record the interpreter build string next to the results so a reader knows
# which Python produced the outputs stored in this notebook.
ver_info = sys.version
print("This jupyter notebook was executed with:\n{0}".format(ver_info))


This jupyter notebook was executed with:
2.7.14 |Anaconda, Inc.| (default, Nov  8 2017, 13:40:45) [MSC v.1500 64 bit (AMD64)]


## Imports

In [2]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats, integrate
import seaborn as sns

import pprint
# Shared pretty-printer configured with a 4-space indent. It is not referenced
# by any cell visible in this part of the notebook.
p = pprint.PrettyPrinter(indent=4)

import config as config

%matplotlib inline


## Importing utilities object
+ imports the dataset as two lists (a training set and a test set)


In [3]:
# Execute utilities.py inside the notebook namespace. The Utilities class used
# on the next line is not imported anywhere above, so this %run is what makes
# it available.
%run utilities.py
# Helper object that every later cell calls through `util`; it is constructed
# with an empty dict as its first argument and use_json=False.
util = Utilities({}, use_json=False)
# Fetch the training and testing datasets as lists.
training_list, testing_list = util.get_datasets()

# Turn both datasets into pandas DataFrames.
training_DF, testing_DF = util.create_dataframes(training_list, testing_list)

# Generate the energy and dance measures on both frames. Presumably these are
# the energy1..energy4 and dance1..dance4 columns used as features further
# down -- TODO confirm against utilities.py.
training_DF, testing_DF = util.generate_energy_measure(training_DF, testing_DF)
training_DF, testing_DF = util.generate_dance_measure(training_DF, testing_DF)


# Making Linear Regression Learners!

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
# Look at utilities.py for RunAndTestLinearRegModel() 

### Using metadata (artist_familiarity and artist_hotttnesss)

In [5]:
# Feature sets to compare: each artist-level metadata column on its own, then
# both together (column names are passed as one space-separated string).
metadata = [
    'artist_familiarity',
    'artist_hotttnesss',
    'artist_hotttnesss artist_familiarity',
]

for item in metadata:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # One row per split, feature-set name left-aligned to 40 columns.
    print("{:<40} training_error:\t{}".format(item, mserr_training))
    print("{:<40} testing_error:\t\t\t{}".format(item, mserr_testing))


artist_familiarity                       training_error:	0.0207233163318
artist_familiarity                       testing_error:			0.0223786721313
artist_hotttnesss                        training_error:	0.0215358097962
artist_hotttnesss                        testing_error:			0.0218632687975
artist_hotttnesss artist_familiarity     training_error:	0.0203070402092
artist_hotttnesss artist_familiarity     testing_error:			0.021343016484


#### Conclusions?
We can see that the combination of both features performs better than either feature alone: it has the lowest training and testing error.

### using acoustic features 

In [6]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg '
allrawacous = 'key loudness duration tempo time_signature'

# Order of the comparison: every segment-level average alone, then all of them
# together, then every raw acoustic column alone, then all of those together.
raw_acous = allsegs.split() + [allsegs] + allrawacous.split() + [allrawacous]

for item in raw_acous:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # Only the training error is reported in this comparison.
    print("{:<60} training_error:\t{}".format(item, mserr_training))


segments_avg                                                 training_error:	0.0278762401239
tatums_avg                                                   training_error:	0.0276862254331
beats_avg                                                    training_error:	0.0277258500288
bars_avg                                                     training_error:	0.0279077504593
sections_avg                                                 training_error:	0.0278630536529
segments_avg tatums_avg beats_avg bars_avg sections_avg      training_error:	0.0273478819796
key                                                          training_error:	0.0279029386285
loudness                                                     training_error:	0.0267259279304
duration                                                     training_error:	0.0279112686753
tempo                                                        training_error:	0.0277489531524
time_signature                                               training_

## Finding the best energy measure

In [7]:
# The four generated energy measures individually, then all four combined
# (column names are passed as one space-separated string).
energylsit = [
    'energy1',
    'energy2',
    'energy3',
    'energy4',
    'energy1 energy2 energy3 energy4',
]

for item in energylsit:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # Only the training error is reported in this comparison.
    print("{:<40} training_error:\t{}".format(item, mserr_training))




energy1                                  training_error:	0.0271552513266
energy2                                  training_error:	0.0271891703294
energy3                                  training_error:	0.0272016993581
energy4                                  training_error:	0.0273131282322
energy1 energy2 energy3 energy4          training_error:	0.0270181658648


In [8]:
# Compare the energy measures combined with different acoustic columns:
# rhythm averages, tempo/loudness, and finally both groups together.
acous_compare = [
    "energy1 energy2 energy3 energy4 tatums_avg beats_avg",
    "energy1 energy2 energy3 energy4 tempo loudness",
    "energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg",
]

for item in acous_compare:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # Only the training error is reported in this comparison.
    print("{:<70} training_error:\t{}".format(item, mserr_training))





energy1 energy2 energy3 energy4 tatums_avg beats_avg                   training_error:	0.0270155557022
energy1 energy2 energy3 energy4 tempo loudness                         training_error:	0.0266043882085
energy1 energy2 energy3 energy4 tempo loudness tatums_avg beats_avg    training_error:	0.026600948299


## Let's play with the dance measure now

In [9]:
# The four generated dance measures individually, then all four combined
# (column names are passed as one space-separated string).
dancelist = [
    'dance1',
    'dance2',
    'dance3',
    'dance4',
    'dance1 dance2 dance3 dance4',
]

for item in dancelist:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # Only the training error is reported in this comparison.
    print("{:<40} training_error:\t{}".format(item, mserr_training))




dance1                                   training_error:	0.0275279091283
dance2                                   training_error:	0.0276391953274
dance3                                   training_error:	0.0271946666171
dance4                                   training_error:	0.0277554546377
dance1 dance2 dance3 dance4              training_error:	0.026395921643


In [10]:
# From here on `dancelist` / `energylist` are space-separated column strings
# (note the trailing space, which lets them be concatenated directly).
dancelist = 'dance1 dance2 dance3 dance4 '
energylist = 'energy1 energy2 energy3 energy4 '

# Dance only, energy only, dance + energy, artist metadata only, everything.
acousitc = [
    dancelist,
    energylist,
    dancelist + energylist,
    "artist_familiarity artist_hotttnesss",
    dancelist + energylist + "artist_familiarity artist_hotttnesss",
]

for item in acousitc:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training, mserr_testing = results[1][1], results[2][1]

    # Feature strings longer than the 70-column label field (here only the
    # final, all-inclusive one) are replaced by a short alias for display.
    item = 'alleng alldance allmeta' if len(item) > 70 else item

    print("{:<70} training_error:\t{}".format(item, mserr_training))




dance1 dance2 dance3 dance4                                            training_error:	0.026395921643
energy1 energy2 energy3 energy4                                        training_error:	0.0270181658648
dance1 dance2 dance3 dance4 energy1 energy2 energy3 energy4            training_error:	0.0263670467153
artist_familiarity artist_hotttnesss                                   training_error:	0.0203070402092
alleng alldance allmeta                                                training_error:	0.0199976224545


In [11]:
# Evaluate the full dance + energy + artist-metadata feature set and report
# both training and testing error. `dancelist` and `energylist` are the
# space-separated column strings defined in the preceding cell.
finalset = [dancelist + energylist + "artist_familiarity artist_hotttnesss"]

for item in finalset:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    # Feature strings longer than the 70-column label field are replaced by a
    # short alias for display.
    if len(item) > 70:
        item = 'alleng alldance allmeta'

    print("{:<70} training_error:\t{}".format(item, mserr_training))
    # This row reports the *testing* error; it was previously printed under a
    # "training_error:" label, producing two identically labelled rows.
    print("{:<70} testing_error:\t\t{}".format(item, mserr_testing))
    print("std of hot is:\t\t{}".format(results[3]))
    # Testing error expressed as a percentage of results[3].
    # NOTE(review): if results[1..2][1] really is a *squared* error and
    # results[3] a standard deviation, this ratio mixes units; comparing
    # sqrt(MSE) with the std (or MSE with the variance) may be what was
    # intended -- confirm before relying on this number.
    print(mserr_testing/results[3]*100)



alleng alldance allmeta                                                training_error:	0.0199976224545
alleng alldance allmeta                                                training_error:		0.0210237506945
std of hot is:		0.167909525665
12.5208802843


## THROW IT ALL IN THERE 


In [14]:
# Evaluate every feature group at once: dance and energy measures, the
# tempo/loudness/rhythm columns and the artist metadata. `dancelist` and
# `energylist` are the space-separated column strings defined in an earlier
# cell (each ends with a space, so they concatenate cleanly).
all_features = [
    dancelist
    + energylist
    + 'tempo loudness tatums_avg beats_avg'
    + " artist_familiarity artist_hotttnesss"
]

for item in all_features:
    X_cols = item
    results = util.RunAndTestLinearRegModel(X_cols, training_DF, testing_DF)

    # results[1] / results[2] are read as the training / testing metrics and
    # index 1 as the mean squared error -- TODO confirm against utilities.py.
    mserr_training = results[1][1]
    mserr_testing = results[2][1]

    # Feature strings longer than the 70-column label field are replaced by a
    # short alias for display.
    if len(item) > 70:
        item = "allfeatures"

    print("{:<70} training_error:\t{}".format(item, mserr_training))
    # This row reports the *testing* error; it was previously printed under a
    # "training_error:" label, producing two identically labelled rows.
    print("{:<70} testing_error:\t{}".format(item, mserr_testing))


allfeatures                                                            training_error:	0.0199918647999
allfeatures                                                            training_error:	0.0210441811375


In [13]:
from sklearn.linear_model import LogisticRegression
