In [26]:
#import all helpers
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn import metrics
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
import seaborn as sns # More snazzy plotting library
import itertools

#import regressors
#-----Ensemble---------------------
from sklearn.ensemble import       AdaBoostRegressor
from sklearn.ensemble import       BaggingRegressor
from sklearn.ensemble import       ExtraTreesRegressor
from sklearn.ensemble import       GradientBoostingRegressor
from sklearn.ensemble import       RandomForestRegressor

#----Generalized Linear models-----
from sklearn.linear_model import   ARDRegression
from sklearn.linear_model import   BayesianRidge
from sklearn.linear_model import   ElasticNet
from sklearn.linear_model import   HuberRegressor
from sklearn.linear_model import   Lars
from sklearn.linear_model import   Lasso
from sklearn.linear_model import   LassoLars
from sklearn.linear_model import   LinearRegression
from sklearn.linear_model import   PassiveAggressiveRegressor
from sklearn.linear_model import   Ridge
from sklearn.linear_model import   SGDRegressor

#---Nearest Neighbors----
from sklearn.neighbors import      KNeighborsRegressor
from sklearn.neighbors import      RadiusNeighborsRegressor


#----Neural Networks--------------- 
from sklearn.neural_network import MLPRegressor

#-----Support Vector Machines------
from sklearn.svm import            SVR
from sklearn.svm import            LinearSVR
from sklearn.svm import            NuSVR

#-----Decision Trees---------------
from sklearn.tree import           DecisionTreeRegressor
from sklearn.tree import           ExtraTreeRegressor

#----extras
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import         IsotonicRegression
from sklearn.kernel_ridge import     KernelRidge
from sklearn.linear_model import     OrthogonalMatchingPursuit
from sklearn.linear_model import     RANSACRegressor
from sklearn.linear_model import     TheilSenRegressor

# Regressor classes imported under "extras" above, benchmarked as their own group.
extra_mods = [
    GaussianProcessRegressor,
    IsotonicRegression,
    KernelRidge,
    OrthogonalMatchingPursuit,
    RANSACRegressor,
    TheilSenRegressor,
]

# Input dataset. The commented-out paths are alternative CSVs; swap the active
# line to run the benchmark on one of them instead.
#file_path =  "../dataset/movie_metadata_cleaned_tfidf_num_only_min.csv"
file_path =  "../dataset/movie_metadata_cleaned_categ_num_only.csv"
#file_path = "../dataset/movie_metadata_cleaned_no_vector_num_only.csv"

# Fixed seed so the row shuffle below (and therefore the CV folds built from the
# row order) is the same on every Restart & Run All.
SHUFFLE_SEED = 0

dta = pd.read_csv(file_path)

# FIXME (Rihards): naive imputation - every missing value is replaced with 0.
# After this fill no NaN remains, so no separate dropna() step is needed.
dta_clean = dta.fillna(0)
# 'Unnamed: 0' is dropped so it is not used as a feature
# (presumably the row index written out by an earlier to_csv - TODO confirm).
dta_clean = dta_clean.drop('Unnamed: 0', axis=1)
# Shuffle the rows once, keeping features and target aligned because the whole
# frame is permuted before it is split into X and y. This replaces the former
# train_test_split(..., test_size=0) trick, which was unseeded.
dta_clean = dta_clean.sample(frac=1, random_state=SHUFFLE_SEED)

# Target column and feature matrix.
y = dta_clean['worldwide_gross']
X = dta_clean.drop('worldwide_gross', axis=1)

In [27]:
# Regressor classes for the main benchmark sweep, one family per group.
# For a quick smoke test, narrow this list down, e.g. to [LinearRegression].
models = [
    # ensemble
    AdaBoostRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    # generalized linear models
    BayesianRidge,
    ElasticNet,
    HuberRegressor,
    Lars,
    Lasso,
    LassoLars,
    LinearRegression,
    PassiveAggressiveRegressor,
    Ridge,
    SGDRegressor,
    # nearest neighbors
    KNeighborsRegressor,
    RadiusNeighborsRegressor,
    # neural networks
    MLPRegressor,
    # support vector machines
    SVR,
    LinearSVR,
    NuSVR,
    # decision trees
    DecisionTreeRegressor,
    ExtraTreeRegressor,
]

# Accumulators handed to the solver functions, keyed by model class name.
results = dict()
errors = dict()

def run_grid_search(x, y, Model, res_dict, error, cv=5, n_jobs=4):
    """Cross-validate a default-constructed ``Model`` and record the outcome.

    The model is wrapped in a one-step ``Pipeline`` and fitted through
    ``GridSearchCV`` with an empty parameter grid, so exactly one candidate
    (the model's default parameters) is evaluated over ``cv`` folds.

    Parameters
    ----------
    x, y : feature matrix and target passed straight to ``GridSearchCV.fit``.
    Model : estimator class (not an instance); instantiated with no arguments.
    res_dict : dict updated in place with ``{Model.__name__: best_score_}``
        when the fit succeeds.
    error : dict updated in place with ``{Model.__name__: exception}`` when
        the fit raises ``ValueError``.
    cv : number of cross-validation folds handed to ``GridSearchCV``.
    n_jobs : number of parallel jobs handed to ``GridSearchCV``.

    Returns
    -------
    None. Results are reported through ``res_dict`` / ``error`` and through
    the ``GREP_ME***`` lines printed to stdout.

    Only ``ValueError`` (and its subclasses) is caught; any other exception
    propagates to the caller.
    """
    name = Model.__name__
    # Create the pipeline; GridSearchCV with an empty grid just scores the defaults.
    pipe = Pipeline(steps=[('model', Model())])

    print()
    print ("***Starting ["  + name + "] estimator run ")
    estimator = GridSearchCV(pipe, dict(), verbose=2, cv=cv, n_jobs=n_jobs)

    # Run the estimator; catch fit failures and save them instead of aborting the sweep.
    try:
        estimator.fit(x, y)
    except ValueError as err:
        print ("GREP_ME***Error caught for  ["  + name + "]")
        # Store in the dict supplied by the caller, not in a notebook global.
        error[name] = err
    else:
        print ("GREP_ME***Results of ["  + name + "] estimator run are")
        print (estimator.cv_results_)
        print ("GREP_ME***Best params of ["  + name + "] estimator run are")
        print (estimator.best_params_)
        print ("GREP_ME***Best score of ["  + name + "] estimator run are")
        print (estimator.best_score_)
        # Store in the dict supplied by the caller, not in a notebook global.
        res_dict[name] = estimator.best_score_
            
def run_solver(X,y,models_arr, res_dict, errors):
    """Run ``run_grid_search`` once per estimator class in ``models_arr``.

    ``X`` and ``y`` are forwarded unchanged to every run; ``res_dict`` and
    ``errors`` are the shared dicts passed on to each call. Returns None.
    """
    for estimator_cls in models_arr:
        run_grid_search(X, y, estimator_cls, res_dict, errors)

In [28]:
# Benchmark the regressors listed in extra_mods, passing the shared
# results / errors dicts as the accumulators.
run_solver(X, y, extra_mods, res_dict=results, errors=errors)


***Starting [GaussianProcessRegressor] estimator run 
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   5.6s
[CV] ................................................. , total=   5.8s
[CV] ................................................. , total=   5.8s
[CV] ................................................. , total=   6.0s
[CV]  ................................................................
[CV] ................................................. , total=   3.4s


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:   13.6s finished


GREP_ME***Results of [GaussianProcessRegressor] estimator run are
{'split4_train_score': array([ 1.]), 'mean_score_time': array([ 0.50541658]), 'rank_test_score': array([1], dtype=int32), 'params': ({},), 'split0_train_score': array([ 1.]), 'mean_train_score': array([ 1.]), 'split3_test_score': array([-0.27459981]), 'mean_fit_time': array([ 4.81412849]), 'mean_test_score': array([-0.29211234]), 'split2_train_score': array([ 1.]), 'split0_test_score': array([-0.30393826]), 'split3_train_score': array([ 1.]), 'split2_test_score': array([-0.31287388]), 'split4_test_score': array([-0.28172426]), 'std_score_time': array([ 0.06053125]), 'split1_test_score': array([-0.28741809]), 'std_fit_time': array([ 0.92762512]), 'split1_train_score': array([ 1.]), 'std_test_score': array([ 0.01419352]), 'std_train_score': array([ 0.])}
GREP_ME***Best params of [GaussianProcessRegressor] estimator run are
{}
GREP_ME***Best score of [GaussianProcessRegressor] estimator run are
-0.292112342841

***Starting 

[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    7.5s finished


GREP_ME***Results of [KernelRidge] estimator run are
{'split4_train_score': array([-2557126.82232622]), 'mean_score_time': array([ 0.06729708]), 'rank_test_score': array([1], dtype=int32), 'params': ({},), 'split0_train_score': array([ -2.36843953e+14]), 'mean_train_score': array([ -4.74283351e+13]), 'split3_test_score': array([ -2.91798829e+11]), 'mean_fit_time': array([ 4.63265262]), 'mean_test_score': array([ -4.06341004e+13]), 'split2_train_score': array([ -1.60346637e+10]), 'split0_test_score': array([ -2.02735226e+14]), 'split3_train_score': array([ -2.81287959e+11]), 'split2_test_score': array([ -1.67736006e+10]), 'split4_test_score': array([-2312733.14772845]), 'std_score_time': array([ 0.03102257]), 'split1_test_score': array([ -4.35286848e+08]), 'std_fit_time': array([ 1.42049165]), 'split1_train_score': array([ -3.97692645e+08]), 'std_test_score': array([  8.10822191e+13]), 'std_train_score': array([  9.47078692e+13])}
GREP_ME***Best params of [KernelRidge] estimator run are

[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    0.2s finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   1.9s
[CV]  ................................................................
[CV] ................................................. , total=   2.2s
[CV] ................................................. , total=   2.3s
[CV] ................................................. , total=   2.4s
[CV] ................................................. , total=   0.8s


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    2.8s finished


GREP_ME***Results of [RANSACRegressor] estimator run are
{'split4_train_score': array([ 0.55103313]), 'mean_score_time': array([ 0.00247955]), 'rank_test_score': array([1], dtype=int32), 'params': ({},), 'split0_train_score': array([ 0.50467272]), 'mean_train_score': array([ 0.54071398]), 'split3_test_score': array([ 0.55227181]), 'mean_fit_time': array([ 1.92293315]), 'mean_test_score': array([ 0.5332811]), 'split2_train_score': array([ 0.55205452]), 'split0_test_score': array([ 0.541972]), 'split3_train_score': array([ 0.57702799]), 'split2_test_score': array([ 0.43223151]), 'split4_test_score': array([ 0.56735692]), 'std_score_time': array([ 0.00083565]), 'split1_test_score': array([ 0.57252343]), 'std_fit_time': array([ 0.57840225]), 'split1_train_score': array([ 0.51878153]), 'std_test_score': array([ 0.0516613]), 'std_train_score': array([ 0.02582171])}
GREP_ME***Best params of [RANSACRegressor] estimator run are
{}
GREP_ME***Best score of [RANSACRegressor] estimator run are
0.53

[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  6.3min finished


GREP_ME***Results of [TheilSenRegressor] estimator run are
{'split4_train_score': array([ 0.6636878]), 'mean_score_time': array([ 0.01300163]), 'rank_test_score': array([1], dtype=int32), 'params': ({},), 'split0_train_score': array([ 0.65510519]), 'mean_train_score': array([ 0.66181897]), 'split3_test_score': array([ 0.65194785]), 'mean_fit_time': array([ 263.10558391]), 'mean_test_score': array([ 0.6524548]), 'split2_train_score': array([ 0.67004249]), 'split0_test_score': array([ 0.67359218]), 'split3_train_score': array([ 0.66509738]), 'split2_test_score': array([ 0.59767929]), 'split4_test_score': array([ 0.65504665]), 'std_score_time': array([ 0.00977112]), 'split1_test_score': array([ 0.68395331]), 'std_fit_time': array([ 86.89690375]), 'split1_train_score': array([ 0.65516201]), 'std_test_score': array([ 0.0298192]), 'std_train_score': array([ 0.00585246])}
GREP_ME***Best params of [TheilSenRegressor] estimator run are
{}
GREP_ME***Best score of [TheilSenRegressor] estimator ru

In [29]:
# Rank the recorded (model name, best score) pairs in ascending score order.
sorted(results.items(), key=lambda name_and_score: name_and_score[1])

[('KernelRidge', -40634100388046.898),
 ('GaussianProcessRegressor', -0.29211234284061061),
 ('RANSACRegressor', 0.53328109935233614),
 ('TheilSenRegressor', 0.65245479904349846),
 ('OrthogonalMatchingPursuit', 0.65561143821505341)]

In [30]:
# Display the exceptions recorded during the runs, keyed by model class name.
errors

{'IsotonicRegression': sklearn.externals.joblib.my_exceptions.JoblibValueError('Multiprocessing exception:\n...........................................................................\n/usr/lib/python3.4/runpy.py in _run_module_as_main(mod_name=\'ipykernel.__main__\', alter_argv=1)\n    165         sys.exit(msg)\n    166     main_globals = sys.modules["__main__"].__dict__\n    167     if alter_argv:\n    168         sys.argv[0] = mod_spec.origin\n    169     return _run_code(code, main_globals, None,\n--> 170                      "__main__", mod_spec)\n        mod_spec = ModuleSpec(name=\'ipykernel.__main__\', loader=<_f...b/python3.4/dist-packages/ipykernel/__main__.py\')\n    171 \n    172 def run_module(mod_name, init_globals=None,\n    173                run_name=None, alter_sys=False):\n    174     """Execute a module\'s code without importing it\n\n...........................................................................\n/usr/lib/python3.4/runpy.py in _run_code(code=<code obje