# Basic Linear Regression

## Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import config as config
from models.linear_regression import LinearRegressionManager 
from model_evaluator import ModelEvaluator

import pprint
# Pretty-printer for nested structures (4-space indent).
# NOTE(review): `p` is not referenced by any later cell visible in this notebook.
p = pprint.PrettyPrinter(indent=4)


# Render matplotlib figures inline in the cell output.
%matplotlib inline


## Loading the utilities object and building the dataset


In [2]:
# Execute utilities.py inside the notebook namespace; presumably this is what
# defines `Utilities`, since it is not imported anywhere else in view.
%run utilities.py
util = Utilities({}, use_json=False)
master_DF = util.get_master_dataframe()

# Add the constructed features (energy and dance measures) to the master frame.
master_DF = util.generate_energy_measure(master_DF)
master_DF = util.generate_dance_measure(master_DF)

# Normalize numeric values.
# NOTE(review): normalization runs BEFORE the train/cv/test split. If it uses
# statistics of the whole frame, cv/test information leaks into training --
# confirm against Utilities.normalize_numeric_columns.
master_norm_DF = util.normalize_numeric_columns(master_DF) 


# Partition into train, cross-validation, and test sets.
# NOTE(review): no random seed is set anywhere in view; confirm that
# split_master_df is deterministic so the recorded scores reproduce on
# Restart & Run All.
train, cv, test = util.split_master_df(master_norm_DF)


## Cross-validation to choose the feature set that predicts best
+ acoustic features 
+ metadata features 
+ combination 

In [3]:
# Single manager instance shared by the feature-search and error cells below.
linreg_manager = LinearRegressionManager()

### Acoustic features 

In [4]:
# Search the acoustic features for the best-scoring subset, using the train
# and cv partitions (presumably fit on train and scored on cv -- confirm in
# LinearRegressionManager.find_optimal_acoustic).
# Note: `score` is rebound by each of the following feature-search cells.
acoustic_df, score = linreg_manager.find_optimal_acoustic(train, cv)
# Display the selected columns together with their score.
acoustic_df.columns, score 

(Index([u'tempo', u'loudness', u'tatums_avg', u'segments_avg', u'beats_avg',
        u'sections_avg'],
       dtype='object'), 0.04640727173639492)

### Metadata features

In [5]:
# Same search as the acoustic cell, restricted to the metadata features.
# Rebinds `score`, replacing the value left by the acoustic search.
metadata_df, score = linreg_manager.find_optimal_metadata(train, cv)
# Display the selected columns together with their score.
metadata_df.columns, score 

(Index([u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.20064171274608644)

### Constructed features (energy and dance) + metadata 

In [6]:
# Search over the constructed (energy/dance) features together with metadata.
# In the recorded output the score is essentially identical to the
# metadata-only search (0.2006...), i.e. adding `energy` gave no measurable
# gain on the cv set.
constructed_df, score = linreg_manager.find_optimal_constructed(train, cv)
# Display the selected columns together with their score.
constructed_df.columns, score 

(Index([u'energy', u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.20064171274608655)

### All features for maximum overfitting 

In [7]:
# Fit a separate model on every configured feature (acoustic + metadata +
# constructed) and score it on the cross-validation partition.
# (The original note here read "super long" -- presumably the feature list or
# the run time.)
# NOTE(review): `metadata_feaures` looks like a misspelling of
# `metadata_features`; it must match the attribute name defined in config, so
# it cannot be corrected from this notebook alone -- confirm in config.py.
all_feature_list = config.acoustic_features + config.metadata_feaures + config.constructed_features
all_model = LinearRegressionManager()
# Training inputs and target.
X = train[all_feature_list]
y = train['song_hotttnesss']

# Cross-validation inputs and target.
X_cv = cv[all_feature_list]
y_cv = cv['song_hotttnesss']

all_model.train(X, y)
# Last expression: the score on the cv partition is the cell's output.
all_model.R2_score(X_cv, y_cv)


0.21716837694880586

### Error on test set using best model 

In [8]:
# Train/test errors for the all-features list. This goes through the shared
# `linreg_manager`, not the `all_model` instance fitted in the previous cell.
# The result is a 4-tuple; the final section unpacks it as
# (model, training_err, testing_err, hot_std).
linreg_manager.errors(train, test, all_feature_list)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.029787661194220556,
  'mean_abs': 0.14163152854532676,
  'mean_err': 0.17259102292477602},
 {'MSE': 0.031597548320455665,
  'mean_abs': 0.1456913738343797,
  'mean_err': 0.1777569923250719},
 0.21336283906958015)

In [9]:
# Slice each partition down to the two artist-level metadata columns, paired
# with the song_hotttnesss target.
# NOTE(review): none of these six names is read by a later cell in view before
# being rebound (X_test / y_test are reassigned in the final cell).
X, y = train[['artist_hotttnesss', 'artist_familiarity']], train['song_hotttnesss']

X_cv, y_cv = cv[['artist_hotttnesss', 'artist_familiarity']], cv['song_hotttnesss']

X_test, y_test = test[['artist_hotttnesss', 'artist_familiarity']], test['song_hotttnesss']


### Using acoustic features

In [10]:
# The five `*_avg` columns, used as the only predictors.
allsegs = [
    'segments_avg',
    'tatums_avg',
    'beats_avg',
    'bars_avg',
    'sections_avg',
]

# Last expression: the (model, train errors, test errors, hot_std) tuple is displayed.
linreg_manager.errors(train, test, allsegs)



(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.041378565491573295,
  'mean_abs': 0.17095619326406314,
  'mean_err': 0.20341722024345257},
 {'MSE': 0.044982198972326304,
  'mean_abs': 0.1795937322532547,
  'mean_err': 0.21209007278118017},
 0.21336283906958015)

In [11]:
# The raw (non-averaged) acoustic columns, used as the only predictors.
allrawacous = [
    'key',
    'loudness',
    'duration',
    'tempo',
    'time_signature',
]

# Last expression: the (model, train errors, test errors, hot_std) tuple is displayed.
linreg_manager.errors(train, test, allrawacous)


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.040223003736958966,
  'mean_abs': 0.1681829667409854,
  'mean_err': 0.20055673445925212},
 {'MSE': 0.043387147113216194,
  'mean_abs': 0.17489961709359192,
  'mean_err': 0.2082958163603297},
 0.21336283906958015)

## Finding the best energy measure

In [12]:
# NOTE(review): `meval` is created here but not used by any cell in view.
meval = ModelEvaluator()
# Candidate energy measures: four numbered variants plus the plain `energy` column.
energy_names = ['energy1', 'energy2', 'energy3', 'energy4', 'energy']
# Search the candidates for the best-scoring subset using the train and cv partitions.
energy_df, score = linreg_manager.find_optimal_featureset(train, cv, energy_names) 

# Display the selected columns together with their score.
energy_df.columns, score

(Index([u'energy2', u'energy3', u'energy4'], dtype='object'),
 0.19664635783183815)

## Finding the best dance measure

In [13]:
# Candidate dance measures (four numbered variants).
dance_names = ['dance1', 'dance2', 'dance3', 'dance4']
# Same subset search as the energy cell, over the dance candidates.
dance_df, score = linreg_manager.find_optimal_featureset(train, cv, dance_names) 

# Display the selected columns together with their score.
dance_df.columns, score

(Index([u'dance1', u'dance2', u'dance4'], dtype='object'), 0.15530004079837956)

## Combine energy and dance

In [14]:
# Pool the dance and energy candidates defined in the two preceding cells.
dance_energy_list = dance_names + energy_names 
# Subset search over the pooled candidates. `df` is read by the next cell
# (`features = df.columns`), so its name must stay in sync with that cell.
df, score = linreg_manager.find_optimal_featureset(train, cv, dance_energy_list) 

# Display the selected columns together with their score.
df.columns, score

(Index([u'dance1', u'dance3', u'dance4', u'energy2', u'energy3', u'energy4'], dtype='object'),
 0.2145037495787977)

## Highest-performing predictor evaluated on the test set

In [15]:
# Columns chosen by the combined dance + energy search in the previous cell.
# NOTE(review): in the recorded outputs the all-features model scored slightly
# higher on cv (0.2172 vs 0.2145), so "highest performing" here appears to
# refer to the energy/dance search only -- confirm intent.
features = df.columns
# Fit on train and evaluate on test; unpack the fitted model, both error
# dictionaries, and `hot_std` (not used again in the cells in view).
model, training_err, testing_err, hot_std = linreg_manager.errors(train, test, features)
# Display only the test-set errors.
testing_err

{'MSE': 0.032043770456529376,
 'mean_abs': 0.14715002012726816,
 'mean_err': 0.17900773853811286}

In [16]:
# Test-set inputs and target for the selected features; these rebind the
# X_test / y_test defined in the earlier metadata-only cell.
X_test = test[features]
y_test = test['song_hotttnesss']

# Score the model returned by linreg_manager.errors on the test partition.
# The recorded repr of that model is a LinearRegression, so this is presumably
# scikit-learn's R^2 -- confirm.
model.score(X_test, y_test)

0.29525990557423476