# Basic Linear Regression

## Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import config as config
from models.linear_regression import LinearRegressionManager 
from model_evaluator import ModelEvaluator

import pprint
p = pprint.PrettyPrinter(indent=4)


%matplotlib inline


## Importing utilities object & building dataset 


In [2]:
%run utilities.py
util = Utilities({}, use_json=False)
master_DF = util.get_master_dataframe()

# add constructed features  
master_DF = util.generate_energy_measure(master_DF)
master_DF = util.generate_dance_measure(master_DF)

# normalize numeric values 
master_norm_DF = util.normalize_numeric_columns(master_DF) 


# partition into train, crossvalidation, and test sets 
train, cv, test = util.split_master_df(master_norm_DF)


## Cross validation to choose model features optimal for prediction 
+ acoustic features 
+ metadata features 
+ combination 

In [3]:
linreg_manager = LinearRegressionManager()

### Acoustic features 

In [4]:
acoustic_df, score = linreg_manager.find_optimal_acoustic(train, cv)
acoustic_df.columns, score 

(Index([u'loudness', u'tatums_avg', u'segments_avg'], dtype='object'),
 0.08004308398102489)

### Metadata features

In [5]:
metadata_df, score = linreg_manager.find_optimal_metadata(train, cv)
metadata_df.columns, score 

(Index([u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.2953257739560675)

### Constructed features (energy and dance) + metadata 

In [6]:
constructed_df, score = linreg_manager.find_optimal_constructed(train, cv)
constructed_df.columns, score 

(Index([u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.2953257739560675)

### All features for maximum overfitting 

In [7]:
# super long 
all_feature_list = config.acoustic_features + config.metadata_feaures + config.constructed_features
all_model = LinearRegressionManager()
X = train[all_feature_list]
y = train['song_hotttnesss']

X_cv = cv[all_feature_list]
y_cv = cv['song_hotttnesss']

all_model.train(X, y)
all_model.R2_score(X_cv, y_cv)


0.3078703874898726

### Error on test set using best model 

In [8]:
linreg_manager.errors(train, test, all_feature_list)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False),
 {'MSE': 0.030393725902695585,
  'mean_abs': 0.14300881350188488,
  'mean_err': 0.17433796460523332},
 {'MSE': 0.032108972300629,
  'mean_abs': 0.14659137455924526,
  'mean_err': 0.17918976617158971},
 0.20953139938705823)

### Example of how to use LinearRegressionModel

In [None]:
X = train['artist_hotttnesss artist_familiarity'.split()]
y = train['song_hotttnesss']

X_cv = cv['artist_hotttnesss artist_familiarity'.split()]
y_cv = cv['song_hotttnesss']

X_test = test['artist_hotttnesss artist_familiarity'.split()]
y_test = test['song_hotttnesss']


In [None]:
meval = ModelEvaluator()
linreg = LinearRegressionModel()
meval.cross_validation_score(linreg, X_cv, y_cv, 5)


### using acoustic features 

In [16]:
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg'.split()

linreg_manager.errors(train, test, allsegs)



(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False),
 {'MSE': 0.041235484634284114,
  'mean_abs': 0.17093583117288189,
  'mean_err': 0.20306522261156418},
 {'MSE': 0.04356677206941965,
  'mean_abs': 0.17437613651456246,
  'mean_err': 0.20872654854957873},
 0.20953139938705823)

In [18]:
allrawacous = 'key loudness duration tempo time_signature'.split()

linreg_manager.errors(train, test, allrawacous)


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False),
 {'MSE': 0.04014705799714451,
  'mean_abs': 0.16834453640710054,
  'mean_err': 0.200367307705485},
 {'MSE': 0.042689291907287846,
  'mean_abs': 0.17253130372040182,
  'mean_err': 0.20661387152678748},
 0.20953139938705823)

## Finding the best energy measure

In [22]:
meval = ModelEvaluator()
energy_names = ['energy1', 'energy2', 'energy3', 'energy4', 'energy']
energy_df, score = linreg_manager.find_optimal_featureset(train, cv, energy_names) 

energy_df.columns, score

(Index([u'energy1', u'energy2'], dtype='object'), 0.28649674937880687)

## lets play with the dance measure now

In [23]:
dance_names = ['dance1', 'dance2', 'dance3', 'dance4']
dance_df, score = linreg_manager.find_optimal_featureset(train, cv, dance_names) 

dance_df.columns, score

(Index([u'dance1', u'dance2', u'dance4'], dtype='object'), 0.23675632079995534)

In [25]:
dance_energy_list = dance_names + energy_names 
df, score = linreg_manager.find_optimal_featureset(train, cv, dance_energy_list) 

df.columns, score

(Index([u'dance2', u'dance3', u'dance4', u'energy1'], dtype='object'),
 0.3071753272395724)

## Highest performing predictor uses featureset: dance2, dance3, dance4, energy1

In [28]:
model, training_err, testing_err, hot_std = linreg_manager.errors(train, test, dance_energy_list)
testing_err

{'MSE': 0.03243810957579446,
 'mean_abs': 0.14829774215451902,
 'mean_err': 0.1801058288223745}