In [1]:
# import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# read the data into pandas data frames.
df_feat = pd.read_csv('train_features_2013-03-07.csv',sep=',')
df_target = pd.read_csv('train_salaries_2013-03-07.csv',sep=',')
# Join the features and the target data frames.
df = df_feat.join(df_target,lsuffix='_feat',rsuffix='_target')
# Drop redundant column
df.drop(['jobId_target'],axis=1,inplace=True)
# Rename the column
df.columns = [u'jobId', u'companyId', u'jobType', u'degree', u'major',
       u'industry', u'yearsExperience', u'milesFromMetropolis', u'salary']

In [3]:
df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163


In [8]:
# Creating a Category variable out of yearsExperience variable
def f(row):
    if row['yearsExperience'] < 3:
        return 0
    elif row['yearsExperience'] > 3 and row['yearsExperience'] < 10:
        return 1
    elif row['yearsExperience'] >10 and row['yearsExperience'] < 15:
        return 2
    else:
        return 3
    
df['yearsExperienceCategory'] = df.apply(f,axis=1)

In [9]:
df['yearsExperienceCategory'] = df['yearsExperienceCategory'].astype(object)

In [10]:
# Converting the columns to 'category' type.
df['jobType'] = df['jobType'].astype('category')
df['major'] = df['major'].astype('category')
df['degree'] = df['degree'].astype('category')
df['industry'] = df['industry'].astype('category')
df['yearsExperienceCategory'] = df['yearsExperienceCategory'].astype('category')

In [11]:
df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary,yearsExperienceCategory
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130,3
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101,3
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137,3
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142,1
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163,1


In [12]:
# Choosing 'jobType','degree','major','industry','yearsExperience','milesFromMetropolis' as features for prediction.
# salary is our target variable.
predictor_columns = ['jobType','degree','major','industry','yearsExperience','milesFromMetropolis']
target_column = ['salary']
train_x = df[predictor_columns]
train_y = df[target_column] 

In [16]:
# Encode the categorical variables
train_x = pd.get_dummies(train_x)

In [17]:
# Running Linear Regression (without regularization)
from sklearn.model_selection import cross_val_score
from sklearn import linear_model

regr = linear_model.LinearRegression()
scores = cross_val_score(regr,train_x,train_y,cv=5,scoring='r2')

In [22]:
print "CV Scores from Linear Regression: {0}".format(scores)
print "Mean Score: {0}".format(scores.mean())

CV Scores from Linear Regression: [ 0.74288958  0.74357977  0.74313149  0.74394332  0.74400963]
Mean Score: 0.743510758469


In [24]:
# Run Ridge Regression using GridSearch
from sklearn.grid_search import GridSearchCV

alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha':alphas}]
grid = GridSearchCV(linear_model.Ridge(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid.fit(train_x,train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.742890 -   1.5s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743580 -   1.6s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743131 -   1.4s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743943 -   1.4s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.0s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.744010 -   1.4s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.4s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.742890 -   1.4s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.8s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743580 -   1.5s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   10.3s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743131 -   1.5s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.8s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743943 -   1.6s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   13.4s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.744010 -   1.5s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.742890 -   1.4s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743580 -   1.4s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743131 -   1.5s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743943 -   1.8s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.744010 -   1.5s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.742890 -   1.6s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [27]:
print "Best Score: {0}".format(grid.best_score_)

0.7435107629393091

In [28]:
# Running Lasso regression
alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha': alphas}]
grid = GridSearchCV(linear_model.Lasso(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid.fit(train_x,train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.6min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.742891 - 1.6min
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.743578 - 1.6min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.1min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743129 - 1.5min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.7min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743944 - 1.6min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  6.2min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.744010 - 1.5min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.7min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.742893 -  33.8s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.3min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743573 -  34.2s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  8.9min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743123 -  35.0s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  9.5min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743945 -  35.0s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 10.0min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.744010 -  34.4s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.742882 -  11.1s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743534 -  11.0s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743080 -  11.2s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743928 -  11.6s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743992 -  11.1s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.742607 -   5.1s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 13.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [33]:
print "Best Estimator: {0}".format(grid.best_estimator_)
print "Best Score: {0}".format(grid.best_score_)

Best Estimator: Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
Best Score: 0.743510596432


In [41]:
# Running Random Forest
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
param_dict = {'n_estimators':[50],'max_depth':[None,25,20]}
grid = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=5,n_jobs=1)
grid.fit(train_x,train_y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_estimators=50, max_depth=None .................................
[CV] ........ n_estimators=50, max_depth=None, score=0.703429 - 4.0min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.0min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.703338 - 3.7min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.7min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.702800 - 3.6min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.4min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.703303 - 3.8min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 15.1min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.702812 - 3.7min
[CV] n_estimators=50, max_depth=25 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 18.8min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=25, score=0.717954 - 3.3min
[CV] n_estimators=50, max_depth=25 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 22.2min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=25, score=0.718076 - 3.3min
[CV] n_estimators=50, max_depth=25 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 25.5min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=25, score=0.717828 - 3.3min
[CV] n_estimators=50, max_depth=25 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 28.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=25, score=0.718461 -16.9min
[CV] n_estimators=50, max_depth=25 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 45.8min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=25, score=0.718046 - 3.6min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.742403 - 3.0min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.742522 - 3.1min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.742591 -11.3min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.743373 - 3.1min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.742732 -10.8min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 80.7min finished
  best_estimator.fit(X, y, **self.fit_params)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_depth': [None, 25, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [42]:
print "Best Estimator: {0}".format(grid.best_estimator_)
print "Best Score: {0}".format(grid.best_score_)

Best Estimator: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Best Score: 0.742724111796


In [43]:
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
# Trying different parameters
rf_reg = RandomForestRegressor()
param_dict = {'n_estimators':[50],'max_depth':[18,15,12,10]}
grid = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=5,n_jobs=1)
grid.fit(train_x,train_y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_estimators=50, max_depth=18 ...................................
[CV] .......... n_estimators=50, max_depth=18, score=0.748400 - 3.3min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.3min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.748208 - 3.3min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.7min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.748849 - 3.4min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 10.1min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.748974 - 3.3min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 13.4min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.748875 - 3.7min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 17.1min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.743707 - 3.2min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 20.3min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.743391 - 3.3min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 23.7min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.743622 - 3.2min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 26.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.744263 - 3.1min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 29.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.744929 - 3.1min
[CV] n_estimators=50, max_depth=12 ...................................
[CV] .......... n_estimators=50, max_depth=12, score=0.718545 - 2.7min
[CV] n_estimators=50, max_depth=12 ...................................
[CV] .......... n_estimators=50, max_depth=12, score=0.719342 - 2.7min
[CV] n_estimators=50, max_depth=12 ...................................
[CV] .......... n_estimators=50, max_depth=12, score=0.718681 - 2.7min
[CV] n_estimators=50, max_depth=12 ...................................
[CV] .......... n_estimators=50, max_depth=12, score=0.719733 - 2.8min
[CV] n_estimators=50, max_depth=12 ...................................
[CV] .......... n_estimators=50, max_depth=12, score=0.720770 - 2.8min
[CV] n_estimators=50, max_depth=10 ...................................
[CV] .......... n_estimators=50, max_depth=10, score=0.685414 - 2.4min
[CV] n_estimators=50, max_depth=10 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 58.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_depth': [18, 15, 12, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [45]:
print "Best Estimator: {0}".format(grid.best_estimator_)
print "Best Score: {0}".format(grid.best_score_)

Best Estimator: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Best Score: 0.748661132996


In [48]:
rf_reg = RandomForestRegressor()
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
param_dict = {'n_estimators':[100,150,200],'max_depth':[18]}
grid = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=3,n_jobs=1)
grid.fit(train_x,train_y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100, max_depth=18 ..................................
[CV] ......... n_estimators=100, max_depth=18, score=0.747843 - 5.5min
[CV] n_estimators=100, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.5min remaining:    0.0s


[CV] ......... n_estimators=100, max_depth=18, score=0.748084 - 5.7min
[CV] n_estimators=100, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 11.2min remaining:    0.0s


[CV] ......... n_estimators=100, max_depth=18, score=0.748589 - 5.9min
[CV] n_estimators=150, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 17.1min remaining:    0.0s


[CV] ......... n_estimators=150, max_depth=18, score=0.748037 - 8.7min
[CV] n_estimators=150, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 25.8min remaining:    0.0s


[CV] ......... n_estimators=150, max_depth=18, score=0.748394 - 8.2min
[CV] n_estimators=150, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 34.0min remaining:    0.0s


[CV] ......... n_estimators=150, max_depth=18, score=0.748759 - 8.2min
[CV] n_estimators=200, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 42.3min remaining:    0.0s


[CV] ......... n_estimators=200, max_depth=18, score=0.748128 - 9.9min
[CV] n_estimators=200, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 52.2min remaining:    0.0s


[CV] ......... n_estimators=200, max_depth=18, score=0.748447 - 9.5min
[CV] n_estimators=200, max_depth=18 ..................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 61.7min remaining:    0.0s


[CV] ......... n_estimators=200, max_depth=18, score=0.748775 - 9.4min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 71.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 71.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 150, 200], 'max_depth': [18]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [49]:
print "Best Estimator: {0}".format(grid.best_estimator_)
print "Best Score: {0}".format(grid.best_score_)

Best Estimator: RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Best Score: 0.748450057709


In [51]:
# Running Gradient Boosted Trees
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[30,50],'max_depth':[4,5,6]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=4)
grid_gbr.fit(train_x,train_y)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV] n_estimators=30, max_depth=4 ....................................
[CV] ........... n_estimators=30, max_depth=4, score=0.680571 - 1.3min
[CV] n_estimators=30, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=4, score=0.678929 - 1.3min
[CV] n_estimators=30, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=4, score=0.680843 - 1.1min
[CV] n_estimators=30, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=4, score=0.682024 - 1.1min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  4.9min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.733609 - 1.9min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.732998 - 1.8min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.6min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.732966 - 1.8min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 10.4min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.734294 - 2.1min
[CV] n_estimators=30, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 12.5min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=5, score=0.709154 - 1.8min
[CV] n_estimators=30, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 14.3min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=5, score=0.708291 - 1.7min
[CV] n_estimators=30, max_depth=5 ....................................
[CV] ........... n_estimators=30, max_depth=5, score=0.708511 - 1.7min
[CV] n_estimators=30, max_depth=5 ....................................
[CV] ........... n_estimators=30, max_depth=5, score=0.710014 - 1.7min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.746192 - 2.9min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.745736 - 2.9min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.745678 - 3.1min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.746121 - 3.0min
[CV] n_estimators=30, max_depth=6 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 61.3min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30, 50], 'max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [52]:
print "Best Estimator: {0}".format(grid_gbr.best_estimator_)
print "Best Score: {0}".format(grid_gbr.best_score_)

Best Estimator: GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
Best Score: 0.752571089964


In [53]:
# Running Gradient Boosted Regressor
# Trying different parameters
gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[100,150],'max_depth':[6,7,8]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_x,train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=100, max_depth=6 ...................................
[CV] .......... n_estimators=100, max_depth=6, score=0.760953 - 8.2min
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.2min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=6, score=0.760854 - 8.0min
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 16.2min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=6, score=0.761687 - 8.2min
[CV] n_estimators=150, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 24.3min remaining:    0.0s


[CV] .......... n_estimators=150, max_depth=6, score=0.761799 -11.3min
[CV] n_estimators=150, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 35.6min remaining:    0.0s


[CV] .......... n_estimators=150, max_depth=6, score=0.761780 -12.0min
[CV] n_estimators=150, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 47.6min remaining:    0.0s


KeyboardInterrupt: 

In [54]:
# Running Gradient Boosted Regressor
# Trying different parameters
gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[100],'max_depth':[7,8]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_x,train_y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] n_estimators=100, max_depth=7 ...................................
[CV] .......... n_estimators=100, max_depth=7, score=0.761379 -12.5min
[CV] n_estimators=100, max_depth=7 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 12.5min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=7, score=0.761487 -13.2min
[CV] n_estimators=100, max_depth=7 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 25.7min remaining:    0.0s


KeyboardInterrupt: 

In [57]:
# Fir GBR with the optimal paramters over the training data
gbr = GradientBoostingRegressor(n_estimators=150,max_depth=6,verbose=10)
gbr.fit(train_x,train_y)

      Iter       Train Loss   Remaining Time 
         1        1338.4885           16.74m
         2        1207.4652           16.42m
         3        1099.4195           16.03m
         4        1009.1919           15.81m
         5         933.2072           15.50m
         6         869.1195           15.45m
         7         813.1739           15.29m
         8         766.0781           15.20m
         9         724.7665           15.12m
        10         686.6425           15.11m
        11         653.1713           15.07m
        12         624.3714           15.01m
        13         596.9420           14.93m
        14         573.5535           14.92m
        15         554.0265           14.92m
        16         535.7504           14.88m
        17         519.5516           14.77m
        18         504.8930           14.80m
        19         491.5705           14.70m
        20         479.9661           14.61m
        21         469.7543           14.58m
        2

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=150,
             presort='auto', random_state=None, subsample=1.0, verbose=10,
             warm_start=False)

In [65]:
# Plotting the feature importance on a graph
# Get Feature Importance from the Regressor
feature_importance = gbr.feature_importances_
# Normalize The Features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 12))
plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
plt.yticks(pos, np.asanyarray(train_x.columns.tolist())[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.savefig('feature_importance_gbr.png')

In [69]:
# Running with Scaled Features
# Linear Regression (without regularization)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_scaled_x = scaler.fit_transform(train_x)
linear_regr = linear_model.LinearRegression()
scores = cross_val_score(linear_regr,train_scaled_x,train_y,cv=5,scoring='r2',verbose=10)

[CV]  ................................................................
[CV] ................................. , score=0.742888, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV] ................................. , score=0.743580, total=   2.2s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.4s remaining:    0.0s


[CV] ................................. , score=0.743134, total=   2.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.6s remaining:    0.0s


[CV] ................................. , score=0.743941, total=   2.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.7s remaining:    0.0s


[CV] ................................. , score=0.744009, total=   2.3s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.0s finished


In [75]:
scores.mean()

0.74351041066454726

In [76]:
# Running Ridge Regression with scaled features
alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha':alphas}]
grid_ridge = GridSearchCV(linear_model.Ridge(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid_ridge.fit(train_scaled_x,train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.742890 -   1.6s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743580 -   1.5s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.2s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743131 -   1.3s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743943 -   1.4s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.0s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.744010 -   1.4s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.4s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.742890 -   1.4s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.9s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743580 -   1.3s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   10.2s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743131 -   1.3s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.5s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743943 -   1.3s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   12.8s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.744010 -   1.3s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.742890 -   1.3s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743580 -   1.3s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743131 -   1.3s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743943 -   1.3s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.744010 -   1.2s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.742890 -   1.2s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [79]:
print grid_ridge.best_estimator_
print grid_ridge.best_score_

Ridge(alpha=27.825594022071257, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)
0.743510762604


In [101]:
# Weights of the features after fitting the model
print grid_ridge.best_estimator_.coef_

[[ 14.49722956 -11.53659414   9.14758076   5.89979141   5.91845474
  -11.46689918  -7.3031121   -0.72311798  -4.01515562   2.56036416
    0.26804401   4.08216173  -2.13185092   2.16808737  -3.69791263
   -0.09018973   1.71674444   0.16375179   0.8571961    2.40011968
   -0.94131471   1.11024872  -2.6649455    0.45531293  -2.29303459
   -5.78753619   5.10595247  -0.10639439   5.16292446  -4.03261364
    1.94720438]]


In [109]:
# Plot the weights on a graph
# Get Feature Importance from the Ridge Regressor
feature_importance = abs(grid_ridge.best_estimator_.coef_.ravel())
# Normalize The Features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 12))
plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
plt.yticks(pos, np.asanyarray(train_x.columns.tolist())[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.savefig('feature_importance_ridge.png')

In [98]:
train_x.columns

Index([u'yearsExperience', u'milesFromMetropolis', u'jobType_CEO',
       u'jobType_CFO', u'jobType_CTO', u'jobType_JANITOR', u'jobType_JUNIOR',
       u'jobType_MANAGER', u'jobType_SENIOR', u'jobType_VICE_PRESIDENT',
       u'degree_BACHELORS', u'degree_DOCTORAL', u'degree_HIGH_SCHOOL',
       u'degree_MASTERS', u'degree_NONE', u'major_BIOLOGY', u'major_BUSINESS',
       u'major_CHEMISTRY', u'major_COMPSCI', u'major_ENGINEERING',
       u'major_LITERATURE', u'major_MATH', u'major_NONE', u'major_PHYSICS',
       u'industry_AUTO', u'industry_EDUCATION', u'industry_FINANCE',
       u'industry_HEALTH', u'industry_OIL', u'industry_SERVICE',
       u'industry_WEB'],
      dtype='object')

In [78]:
# Running Lasso with scaled features
alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha': alphas}]
grid_lasso = GridSearchCV(linear_model.Lasso(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid_lasso.fit(train_scaled_x,train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.742890 - 2.4min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743579 - 2.2min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.6min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743131 - 2.0min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.6min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.743944 - 1.9min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  8.5min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.744010 - 1.9min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.3min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.742891 - 1.6min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.9min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743578 - 1.6min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 13.5min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743128 - 1.6min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 15.1min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.743945 - 1.6min
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.8min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.744011 - 1.7min
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.742893 -  33.7s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743570 -  33.7s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743119 -  32.7s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.743947 -  33.5s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.744012 -  35.3s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.742875 -  11.0s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 23.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [80]:
print grid_lasso.best_estimator_
print grid_lasso.best_score_

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
0.743510748426


In [81]:
# Running Random Forest with scaled features
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
rf_reg = RandomForestRegressor()
param_dict = {'n_estimators':[50],'max_depth':[16,18,None,19,20]}
grid_rf = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=3,n_jobs=1)
grid_rf.fit(train_scaled_x,train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] n_estimators=50, max_depth=16 ...................................
[CV] .......... n_estimators=50, max_depth=16, score=0.746984 - 2.9min
[CV] n_estimators=50, max_depth=16 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=16, score=0.747088 - 3.0min
[CV] n_estimators=50, max_depth=16 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=16, score=0.747596 - 2.9min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  8.8min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.747378 - 2.9min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 11.8min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.747554 - 3.0min
[CV] n_estimators=50, max_depth=18 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 14.7min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=18, score=0.747865 - 3.1min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 17.8min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.704705 - 3.6min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 21.4min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.704118 - 3.3min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 24.7min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.704542 - 3.2min
[CV] n_estimators=50, max_depth=19 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 27.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=19, score=0.744803 - 2.8min
[CV] n_estimators=50, max_depth=19 ...................................
[CV] .......... n_estimators=50, max_depth=19, score=0.745112 - 2.8min
[CV] n_estimators=50, max_depth=19 ...................................
[CV] .......... n_estimators=50, max_depth=19, score=0.745587 - 2.9min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.741194 - 2.9min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.741358 - 2.9min
[CV] n_estimators=50, max_depth=20 ...................................
[CV] .......... n_estimators=50, max_depth=20, score=0.741829 - 2.9min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 45.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_depth': [16, 18, None, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [84]:
print grid_rf.best_estimator_
print grid_rf.best_score_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=18,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
0.747598937753


In [89]:
rf_reg = RandomForestRegressor(n_estimators=100,max_depth=18,verbose=10)
scores = cross_val_score(rf_reg,train_scaled_x,train_y,cv=3,scoring='r2',verbose=10)

[CV]  ................................................................


  estimator.fit(X_train, y_train, **fit_params)


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.8s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.2s remaining:    0.0s


building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.6s remaining:    0.0s


building tree 4 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.1s remaining:    0.0s


building tree 5 of 100


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.3s remaining:    0.0s


building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   21.3s remaining:    0.0s


building tree 7 of 100


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   24.7s remaining:    0.0s


building tree 8 of 100


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   27.9s remaining:    0.0s


building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   31.4s remaining:    0.0s


building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  6.3min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   11.5s finished


[CV] ................................. , score=0.747744, total= 6.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.5min remaining:    0.0s


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.4s remaining:    0.0s


building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.6s remaining:    0.0s


building tree 4 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.7s remaining:    0.0s


building tree 5 of 100


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.9s remaining:    0.0s


building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   20.0s remaining:    0.0s


building tree 7 of 100


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   23.5s remaining:    0.0s


building tree 8 of 100


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   27.1s remaining:    0.0s


building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   30.5s remaining:    0.0s


building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  6.0min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    9.3s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.7min remaining:    0.0s


[CV] ................................. , score=0.748166, total= 6.2min
[CV]  ................................................................
building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.5s remaining:    0.0s


building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.4s remaining:    0.0s


building tree 4 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   13.8s remaining:    0.0s


building tree 5 of 100


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.0s remaining:    0.0s


building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   22.3s remaining:    0.0s


building tree 7 of 100


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   25.9s remaining:    0.0s


building tree 8 of 100


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   29.3s remaining:    0.0s


building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   32.5s remaining:    0.0s


building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  5.6min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.8s remaining:    0.0s


[CV] ................................. , score=0.748566, total= 5.7min


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    9.1s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 18.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 18.4min finished


In [91]:
scores.mean()

0.74815855283759747

In [82]:
# Running GBR on scaled features
gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[100],'max_depth':[5,6,7,8]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_scaled_x,train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_estimators=100, max_depth=5 ...................................
[CV] .......... n_estimators=100, max_depth=5, score=0.759762 - 6.2min
[CV] n_estimators=100, max_depth=5 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.2min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=5, score=0.759744 - 6.5min
[CV] n_estimators=100, max_depth=5 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.7min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=5, score=0.760225 - 6.5min
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 19.3min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=6, score=0.760953 - 9.0min
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 28.3min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=6, score=0.760855 -10.0min
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 38.3min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=6, score=0.761687 -10.0min
[CV] n_estimators=100, max_depth=7 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 48.3min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=7, score=0.761380 -13.8min
[CV] n_estimators=100, max_depth=7 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 62.1min remaining:    0.0s


[CV] ......... n_estimators=100, max_depth=7, score=0.761488 -426.5min
[CV] n_estimators=100, max_depth=7 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 488.6min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=7, score=0.762106 -13.7min
[CV] n_estimators=100, max_depth=8 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 502.3min remaining:    0.0s


[CV] .......... n_estimators=100, max_depth=8, score=0.761183 -22.3min
[CV] n_estimators=100, max_depth=8 ...................................
[CV] .......... n_estimators=100, max_depth=8, score=0.761324 -22.3min
[CV] n_estimators=100, max_depth=8 ...................................
[CV] .......... n_estimators=100, max_depth=8, score=0.761825 -21.1min


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 568.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'max_depth': [5, 6, 7, 8]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [83]:
print grid_gbr.best_estimator_
print grid_gbr.best_score_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
0.761657828536
