# Basic Linear Regression

## Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import config as config
from models.linear_regression import LinearRegressionManager 
from model_evaluator import ModelEvaluator

import pprint
# Pretty-printer with a 4-space indent for inspecting nested results.
# NOTE(review): `p` is not referenced by any cell visible in this notebook.
p = pprint.PrettyPrinter(indent=4)


%matplotlib inline


## Importing utilities object & building dataset 


In [2]:
%run utilities.py
util = Utilities({}, use_json=False)
master_DF = util.get_master_dataframe()

# add constructed features  
master_DF = util.generate_energy_measure(master_DF)
master_DF = util.generate_dance_measure(master_DF)

# normalize numeric values 
master_norm_DF = util.normalize_numeric_columns(master_DF) 


# partition into train, crossvalidation, and test sets 
train, cv, test = util.split_master_df(master_norm_DF)


## Cross-validation to choose the optimal model features for prediction
+ acoustic features 
+ metadata features 
+ combination 

In [3]:
# Shared manager instance: the feature-search cells and the errors() cells
# below all call methods on this one object.
linreg_manager = LinearRegressionManager()

### Acoustic features 

In [4]:
# Unpack the (frame, score) pair returned by the acoustic feature search.
# NOTE(review): presumably the frame holds the selected acoustic columns and the
# score is measured on `cv` after fitting on `train` — confirm in LinearRegressionManager.
acoustic_df, score = linreg_manager.find_optimal_acoustic(train, cv)
# Last expression: display the selected column names together with the score.
acoustic_df.columns, score 

(Index([u'loudness', u'time_signature', u'tatums_avg', u'segments_avg',
        u'bars_avg', u'sections_avg'],
       dtype='object'), 0.059799364615435786)

### Metadata features

In [5]:
# Unpack the (frame, score) pair returned by the metadata feature search.
# NOTE: rebinds the global `score` that the acoustic cell also assigns.
metadata_df, score = linreg_manager.find_optimal_metadata(train, cv)
# Last expression: display the selected column names together with the score.
metadata_df.columns, score 

(Index([u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.31661902336894177)

### Constructed features (energy and dance) + metadata 

In [6]:
# Unpack the (frame, score) pair returned by the constructed-feature search.
# NOTE: rebinds the global `score` again; only the most recent value survives.
constructed_df, score = linreg_manager.find_optimal_constructed(train, cv)
# Last expression: display the selected column names together with the score.
constructed_df.columns, score 

(Index([u'duration', u'artist_hotttnesss', u'artist_familiarity'], dtype='object'),
 0.3166811436034833)

### All features for maximum overfitting 

In [7]:
# Every candidate feature at once (a long list): acoustic + metadata + constructed.
# NOTE(review): `metadata_feaures` (sic) must stay spelled exactly as the
# attribute is named in config.
all_feature_list = (
    config.acoustic_features
    + config.metadata_feaures
    + config.constructed_features
)

# Inputs and targets for the training and cross-validation splits.
X, X_cv = train[all_feature_list], cv[all_feature_list]
y, y_cv = train['song_hotttnesss'], cv['song_hotttnesss']

# Fit a separate manager on the training split, then display its R2_score
# on the cross-validation split as the cell output.
all_model = LinearRegressionManager()
all_model.train(X, y)
all_model.R2_score(X_cv, y_cv)


0.3223642617663598

### Error on test set using best model 

In [8]:
# Error report for the full feature list, given the train and test splits.
# The result is a 4-tuple; a later cell unpacks the same call as
# (model, training_err, testing_err, hot_std).
# NOTE: relies on `all_feature_list` from the all-features cell having been run.
linreg_manager.errors(train, test, all_feature_list)

(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.030747403974795928,
  'mean_abs': 0.14352546785001283,
  'mean_err': 0.17534937688738997},
 {'MSE': 0.030779303639932275,
  'mean_abs': 0.14356469700605598,
  'mean_err': 0.17544031361101778},
 0.20375084648767097)

In [9]:
# Metadata-only inputs: the two artist-level columns, selected once and reused
# for every split.
artist_cols = 'artist_hotttnesss artist_familiarity'.split()

# One (inputs, target) pair per split.
# NOTE(review): this rebinds X, y, X_cv and y_cv from the all-features cell, and
# no later visible cell reads them before X_test / y_test are reassigned —
# confirm this cell is still needed.
X, y = train[artist_cols], train['song_hotttnesss']
X_cv, y_cv = cv[artist_cols], cv['song_hotttnesss']
X_test, y_test = test[artist_cols], test['song_hotttnesss']


### Using acoustic features

In [10]:
# Column names for the five "*_avg" features (segments, tatums, beats, bars, sections).
allsegs = 'segments_avg tatums_avg beats_avg bars_avg sections_avg'.split()

# Error report for a model restricted to these columns; the whole 4-tuple is
# displayed as the cell output.
linreg_manager.errors(train, test, allsegs)



(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.04158310474733874,
  'mean_abs': 0.17134140661451086,
  'mean_err': 0.20391935844185746},
 {'MSE': 0.04098019908307169,
  'mean_abs': 0.1698261130798567,
  'mean_err': 0.20243566652907705},
 0.20375084648767097)

In [11]:
# Column names for the raw acoustic features (key, loudness, duration, tempo, time signature).
allrawacous = 'key loudness duration tempo time_signature'.split()

# Error report for a model restricted to these columns; the whole 4-tuple is
# displayed as the cell output.
linreg_manager.errors(train, test, allrawacous)


(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
 {'MSE': 0.040195714412381775,
  'mean_abs': 0.1674827354958183,
  'mean_err': 0.2004886889886354},
 {'MSE': 0.04035046054027063,
  'mean_abs': 0.1674601980182983,
  'mean_err': 0.20087424060907022},
 0.20375084648767097)

## Finding the best energy measure

In [12]:
# NOTE(review): `meval` is not used by any cell visible in this notebook —
# confirm it is still needed.
meval = ModelEvaluator()
# Candidate energy columns; presumably added by util.generate_energy_measure — TODO confirm.
energy_names = ['energy1', 'energy2', 'energy3', 'energy4', 'energy']
# Search over the candidate list; unpacks a (frame, score) pair and rebinds the global `score`.
energy_df, score = linreg_manager.find_optimal_featureset(train, cv, energy_names) 

# Last expression: display the selected column names together with the score.
energy_df.columns, score

(Index([u'energy1', u'energy4'], dtype='object'), 0.2841501822259611)

## Finding the best dance measure

In [13]:
# Candidate dance columns; presumably added by util.generate_dance_measure — TODO confirm.
dance_names = ['dance1', 'dance2', 'dance3', 'dance4']
# Search over the candidate list; unpacks a (frame, score) pair and rebinds the global `score`.
dance_df, score = linreg_manager.find_optimal_featureset(train, cv, dance_names) 

# Last expression: display the selected column names together with the score.
dance_df.columns, score

(Index([u'dance1', u'dance2', u'dance3'], dtype='object'), 0.23637861251406134)

## Combine energy and dance

In [14]:
# Pool both candidate lists (dance names first, then energy names).
dance_energy_list = dance_names + energy_names 
# Search over the pooled list. The result is bound to the generic name `df`,
# which the test-set cells further down read via `df.columns`.
df, score = linreg_manager.find_optimal_featureset(train, cv, dance_energy_list) 

# Last expression: display the selected column names together with the score.
df.columns, score

(Index([u'dance1', u'dance3', u'dance4', u'energy1', u'energy3', u'energy4'], dtype='object'),
 0.3107035855112257)

## Best combined energy + dance predictor evaluated on the test set

In [17]:
# Column names of whatever frame `df` currently holds.
# NOTE(review): `df` is bound by the combined dance + energy search cell; this
# cell silently depends on that being the most recent cell to assign it.
features = df.columns
# errors() yields four values; all are kept as globals (the next cell reads `model`).
model, training_err, testing_err, hot_std = linreg_manager.errors(train, test, features)
# Last expression: display only the test-set error dict.
testing_err

{'MSE': 0.031063399073083284,
 'mean_abs': 0.14419936325310542,
 'mean_err': 0.17624811792777614}

In [20]:
# Test-split inputs restricted to the selected columns, plus the target column.
# NOTE: rebinds X_test / y_test, which the metadata-only cell also assigns.
X_test = test[features]
y_test = test['song_hotttnesss']

# `model` is the first value unpacked from errors() in the preceding cell.
# NOTE(review): presumably a fitted scikit-learn LinearRegression, in which case
# score() is R^2 on the given data — confirm.
model.score(X_test, y_test)

0.25084261658508844