In [1]:
# import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
# read the data into pandas data frames.
df_feat = pd.read_csv('train_features_2013-03-07.csv',sep=',')
df_target = pd.read_csv('train_salaries_2013-03-07.csv',sep=',')
# Join the features and the target data frames.
df = df_feat.join(df_target,lsuffix='_feat',rsuffix='_target')
# Drop redundant column
df.drop(['jobId_target'],axis=1,inplace=True)
# Rename the column
df.columns = [u'jobId', u'companyId', u'jobType', u'degree', u'major',
       u'industry', u'yearsExperience', u'milesFromMetropolis', u'salary']

In [3]:
df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163


In [4]:
# Encoding the degree variable
def encode_degree(row):
    if row['degree'] == 'BACHELORS':
        return 0
    elif row['degree'] == 'DOCTORAL':
        return 1
    else:
        return 2

# Encoding the major variable
def encode_major(row):
    if row['major'] == 'NONE':
        return 0
    elif row['major'] == 'BUSINESS':
        return 1
    elif row['major'] == 'ENGINEERING':
        return 2
    else:
        return 3
    
# Encoding the industry variable
def encode_industry(row):
    if row['industry'] == 'HEALTH':
        return 0
    elif row['industry'] == 'WEB':
        return 1
    elif row['industry'] == 'EDUCATION':
        return 2
    elif row['industry'] == 'OIL':
        return 3
    elif row['industry'] == 'FINANCE':
        return 4
    else:
        return 5
    
    
# Encoding the jobType variable
def encode_jobType(row):
    if row['jobType'] == 'JANITOR':
        return 0
    elif row['jobType'] == 'CEO':
        return 1
    elif row['jobType'] == 'CFO':
        return 2
    elif row['jobType'] == 'CTO':
        return 3
    elif row['jobType'] == 'JUNIOR':
        return 4
    else:
        return 5
    

df['degreeEncoded'] = df.apply(encode_degree,axis=1)
df['majorEncoded'] = df.apply(encode_major,axis=1)
df['industryEncoded'] = df.apply(encode_industry,axis=1)
df['jobTypeEncoded'] = df.apply(encode_jobType,axis=1)

In [5]:
df.columns

Index([              u'jobId',           u'companyId',             u'jobType',
                    u'degree',               u'major',            u'industry',
           u'yearsExperience', u'milesFromMetropolis',              u'salary',
             u'degreeEncoded',        u'majorEncoded',     u'industryEncoded',
            u'jobTypeEncoded'],
      dtype='object')

In [6]:
df.dtypes

jobId                  object
companyId              object
jobType                object
degree                 object
major                  object
industry               object
yearsExperience         int64
milesFromMetropolis     int64
salary                  int64
degreeEncoded           int64
majorEncoded            int64
industryEncoded         int64
jobTypeEncoded          int64
dtype: object

In [7]:
# Change the data type of the columns
df['degreeEncoded'] = df['degreeEncoded'].astype('category')
df['majorEncoded'] = df['majorEncoded'].astype('category')
df['industryEncoded'] = df['industryEncoded'].astype('category')
df['jobTypeEncoded'] = df['jobTypeEncoded'].astype('category')

In [8]:
df.dtypes

jobId                    object
companyId                object
jobType                  object
degree                   object
major                    object
industry                 object
yearsExperience           int64
milesFromMetropolis       int64
salary                    int64
degreeEncoded          category
majorEncoded           category
industryEncoded        category
jobTypeEncoded         category
dtype: object

In [9]:
df.head()

Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary,degreeEncoded,majorEncoded,industryEncoded,jobTypeEncoded
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130,2,3,0,2
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101,2,0,1,1
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137,1,3,0,5
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142,1,3,5,5
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163,0,3,4,5


In [10]:
# convert the data frame into predictors and target
predictor_columns = ['yearsExperience','milesFromMetropolis','degreeEncoded','majorEncoded','industryEncoded',\
                     'jobTypeEncoded']
target_column = ['salary']
train_x = df[predictor_columns]
train_y = df[target_column] 

In [13]:
print train_x.head()
print train_y.head()

   yearsExperience  milesFromMetropolis degreeEncoded majorEncoded  \
0               10                   83             2            3   
1                3                   73             2            0   
2               10                   38             1            3   
3                8                   17             1            3   
4                8                   16             0            3   

  industryEncoded jobTypeEncoded  
0               0              2  
1               1              1  
2               0              5  
3               5              5  
4               4              5  
   salary
0     130
1     101
2     137
3     142
4     163


In [14]:
# Convert category columns into one-hot vector
train_x = pd.get_dummies(train_x)

In [15]:
train_x.head()

Unnamed: 0,yearsExperience,milesFromMetropolis,degreeEncoded_0,degreeEncoded_1,degreeEncoded_2,majorEncoded_0,majorEncoded_1,majorEncoded_2,majorEncoded_3,industryEncoded_0,...,industryEncoded_2,industryEncoded_3,industryEncoded_4,industryEncoded_5,jobTypeEncoded_0,jobTypeEncoded_1,jobTypeEncoded_2,jobTypeEncoded_3,jobTypeEncoded_4,jobTypeEncoded_5
0,10,83,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3,73,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,10,38,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,8,17,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,8,16,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
train_x.shape

(1000000, 21)

In [20]:
# Running Linear Regression (without regularization)
from sklearn.model_selection import cross_val_score
from sklearn import linear_model

linear_regr = linear_model.LinearRegression()
scores = cross_val_score(linear_regr,train_x,train_y,cv=5,scoring='r2',verbose=10)

[CV]  ................................................................
[CV] ................................. , score=0.717738, total=   1.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] ................................. , score=0.718116, total=   1.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] ................................. , score=0.718174, total=   1.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.5s remaining:    0.0s


[CV] ................................. , score=0.719070, total=   1.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.1s remaining:    0.0s


[CV] ................................. , score=0.718988, total=   1.6s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s finished


In [23]:
print "CV Scores from Linear Regression: {0}".format(scores)
print "Mean Score: {0}".format(scores.mean())

CV Scores from Linear Regression: [ 0.7177381   0.71811566  0.71817447  0.71906956  0.71898846]
Mean Score: 0.718417249755


In [24]:
# Running Ridge Regression
from sklearn.grid_search import GridSearchCV

alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha':alphas}]
grid_ridge = GridSearchCV(linear_model.Ridge(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid_ridge.fit(train_x,train_y)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.717738 -   1.5s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718116 -   1.2s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718174 -   1.1s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.9s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.719070 -   1.1s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.0s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718988 -   1.2s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.2s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.717738 -   1.1s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.4s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718116 -   1.1s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.5s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718174 -   1.2s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    9.7s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.719070 -   1.2s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   10.9s remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718988 -   1.2s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.717738 -   1.1s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718116 -   1.1s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718174 -   1.1s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.719070 -   1.1s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718988 -   1.1s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.717738 -   1.1s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   57.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [25]:
print grid_ridge.best_estimator_
print grid_ridge.best_score_

Ridge(alpha=2.1544346900318843, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)
0.71841725013


In [26]:
# Running Lasso regression
alphas = np.logspace(-3,2,10)
tuning_parameters = [{'alpha': alphas}]
grid_lasso = GridSearchCV(linear_model.Lasso(),tuning_parameters,cv=5,scoring='r2',verbose=10)
grid_lasso.fit(train_x,train_y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=0.001 .....................................................
[CV] ............................ alpha=0.001, score=0.717739 -  16.5s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.5s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718116 -  16.4s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   32.9s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718173 -  17.1s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   50.0s remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.719070 -  16.7s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s


[CV] ............................ alpha=0.001, score=0.718989 -  16.7s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.4min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.717741 -   5.8s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.5min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718115 -   6.0s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.6min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718167 -   6.3s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.719069 -   5.9s
[CV] alpha=0.0035938136638 ...........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.8min remaining:    0.0s


[CV] .................. alpha=0.0035938136638, score=0.718989 -   5.7s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.717738 -   2.8s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718103 -   2.8s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718138 -   2.8s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.719056 -   2.9s
[CV] alpha=0.0129154966501 ...........................................
[CV] .................. alpha=0.0129154966501, score=0.718978 -   2.8s
[CV] alpha=0.0464158883361 ...........................................
[CV] .................. alpha=0.0464158883361, score=0.717601 -   2.0s
[CV] alpha=0.0464158883361 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'alpha': array([  1.00000e-03,   3.59381e-03,   1.29155e-02,   4.64159e-02,
         1.66810e-01,   5.99484e-01,   2.15443e+00,   7.74264e+00,
         2.78256e+01,   1.00000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [27]:
print grid_lasso.best_estimator_
print grid_lasso.best_score_

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
0.71841716633


In [28]:
# Running Random Forest
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor()
param_dict = {'n_estimators':[50],'max_depth':[None,19,17,15,10]}
grid_rf = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=3,n_jobs=1)
grid_rf.fit(train_x,train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] n_estimators=50, max_depth=None .................................


  estimator.fit(X_train, y_train, **fit_params)


[CV] ........ n_estimators=50, max_depth=None, score=0.664368 - 2.1min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.664512 - 2.1min
[CV] n_estimators=50, max_depth=None .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.2min remaining:    0.0s


[CV] ........ n_estimators=50, max_depth=None, score=0.664846 - 2.1min
[CV] n_estimators=50, max_depth=19 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  6.3min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=19, score=0.711506 - 1.9min
[CV] n_estimators=50, max_depth=19 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  8.2min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=19, score=0.712173 - 1.9min
[CV] n_estimators=50, max_depth=19 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.1min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=19, score=0.712378 - 1.9min
[CV] n_estimators=50, max_depth=17 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.9min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=17, score=0.723371 - 2.1min
[CV] n_estimators=50, max_depth=17 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 14.0min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=17, score=0.723880 - 2.0min
[CV] n_estimators=50, max_depth=17 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 16.0min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=17, score=0.724174 - 1.7min
[CV] n_estimators=50, max_depth=15 ...................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 17.7min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=15, score=0.729084 - 1.7min
[CV] n_estimators=50, max_depth=15 ...................................
[CV] .......... n_estimators=50, max_depth=15, score=0.729576 - 1.8min
[CV] n_estimators=50, max_depth=15 ...................................
[CV] .......... n_estimators=50, max_depth=15, score=0.730283 - 2.0min
[CV] n_estimators=50, max_depth=10 ...................................
[CV] .......... n_estimators=50, max_depth=10, score=0.700761 - 1.4min
[CV] n_estimators=50, max_depth=10 ...................................
[CV] .......... n_estimators=50, max_depth=10, score=0.701047 - 1.4min
[CV] n_estimators=50, max_depth=10 ...................................
[CV] .......... n_estimators=50, max_depth=10, score=0.702202 - 1.4min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 27.4min finished
  best_estimator.fit(X, y, **self.fit_params)


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_depth': [None, 19, 17, 15, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [29]:
print grid_rf.best_estimator_
print grid_rf.best_score_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
0.729647805808


In [30]:
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
rf_reg = RandomForestRegressor()
param_dict = {'n_estimators':[50],'max_depth':[14,13,12]}
grid_rf = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=3,n_jobs=1)
grid_rf.fit(train_x,train_y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=50, max_depth=14 ...................................
[CV] .......... n_estimators=50, max_depth=14, score=0.729364 - 2.0min
[CV] n_estimators=50, max_depth=14 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=14, score=0.729781 - 1.9min
[CV] n_estimators=50, max_depth=14 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.0min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=14, score=0.730521 - 1.7min
[CV] n_estimators=50, max_depth=13 ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.7min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=13, score=0.727012 - 1.8min
[CV] n_estimators=50, max_depth=13 ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.5min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=13, score=0.727434 - 1.7min
[CV] n_estimators=50, max_depth=13 ...................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  9.2min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=13, score=0.728364 - 2.0min
[CV] n_estimators=50, max_depth=12 ...................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.2min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=12, score=0.721634 - 1.9min
[CV] n_estimators=50, max_depth=12 ...................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 13.1min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=12, score=0.721682 - 2.0min
[CV] n_estimators=50, max_depth=12 ...................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 15.1min remaining:    0.0s


[CV] .......... n_estimators=50, max_depth=12, score=0.722985 - 1.7min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_depth': [14, 13, 12]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [32]:
print grid_rf.best_estimator_
print grid_rf.best_score_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
0.729888692244


In [33]:
# Running Grid search with 'n_estimators' and 'max_depth' as parameters
rf_reg = RandomForestRegressor()
param_dict = {'n_estimators':[100],'max_depth':[14]}
grid_rf = GridSearchCV(rf_reg,param_grid=param_dict,verbose=10,scoring='r2',cv=3,n_jobs=1)
grid_rf.fit(train_x,train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=100, max_depth=14 ..................................
[CV] ......... n_estimators=100, max_depth=14, score=0.729474 - 3.5min
[CV] n_estimators=100, max_depth=14 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min remaining:    0.0s


[CV] ......... n_estimators=100, max_depth=14, score=0.729887 - 3.5min
[CV] n_estimators=100, max_depth=14 ..................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.0min remaining:    0.0s


[CV] ......... n_estimators=100, max_depth=14, score=0.730589 - 3.6min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 10.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 10.6min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'max_depth': [14]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [34]:
print grid_rf.best_estimator_
print grid_rf.best_score_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
0.729983116772


In [36]:
# Running Gradient Boosted Trees
# Search for optimal 'n_estimators' and 'max_depth'
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[30,50],'max_depth':[4,5,6]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_x,train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=30, max_depth=4 ....................................


  y = column_or_1d(y, warn=True)


[CV] ........... n_estimators=30, max_depth=4, score=0.676253 -  44.0s
[CV] n_estimators=30, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.1s remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=4, score=0.675435 -  47.2s
[CV] n_estimators=30, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=4, score=0.678116 -  44.3s
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.719175 - 1.2min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.5min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.717904 - 1.2min
[CV] n_estimators=50, max_depth=4 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=4, score=0.719893 - 1.3min
[CV] n_estimators=30, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  5.9min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=5, score=0.699753 - 1.1min
[CV] n_estimators=30, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.0min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=5, score=0.700164 - 1.1min
[CV] n_estimators=30, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.1min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=5, score=0.702400 - 1.2min
[CV] n_estimators=50, max_depth=5 ....................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.3min remaining:    0.0s


[CV] ........... n_estimators=50, max_depth=5, score=0.727832 - 1.8min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.727848 - 1.9min
[CV] n_estimators=50, max_depth=5 ....................................
[CV] ........... n_estimators=50, max_depth=5, score=0.728946 - 1.9min
[CV] n_estimators=30, max_depth=6 ....................................
[CV] ........... n_estimators=30, max_depth=6, score=0.714065 - 1.9min
[CV] n_estimators=30, max_depth=6 ....................................
[CV] ........... n_estimators=30, max_depth=6, score=0.713772 - 1.8min
[CV] n_estimators=30, max_depth=6 ....................................
[CV] ........... n_estimators=30, max_depth=6, score=0.715280 - 1.9min
[CV] n_estimators=50, max_depth=6 ....................................
[CV] ........... n_estimators=50, max_depth=6, score=0.731855 - 3.0min
[CV] n_estimators=50, max_depth=6 ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 29.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30, 50], 'max_depth': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [37]:
print grid_gbr.best_estimator_
print grid_gbr.best_score_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
0.732076303039


In [38]:
# Running Gradient Boosted Trees
# Search for optimal 'n_estimators' and 'max_depth'
gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[30],'max_depth':[7,8,9]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_x,train_y)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=30, max_depth=7 ....................................
[CV] ........... n_estimators=30, max_depth=7, score=0.721767 - 2.8min
[CV] n_estimators=30, max_depth=7 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=7, score=0.721407 - 2.7min
[CV] n_estimators=30, max_depth=7 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.6min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=7, score=0.722936 - 2.9min
[CV] n_estimators=30, max_depth=8 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  8.4min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=8, score=0.726692 - 4.3min
[CV] n_estimators=30, max_depth=8 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 12.7min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=8, score=0.726337 - 4.3min
[CV] n_estimators=30, max_depth=8 ....................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 17.0min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=8, score=0.727922 - 4.3min
[CV] n_estimators=30, max_depth=9 ....................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 21.3min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=9, score=0.729487 - 6.8min
[CV] n_estimators=30, max_depth=9 ....................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 28.1min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=9, score=0.729391 - 6.5min
[CV] n_estimators=30, max_depth=9 ....................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 34.6min remaining:    0.0s


[CV] ........... n_estimators=30, max_depth=9, score=0.730540 - 7.1min


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 41.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 41.7min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30], 'max_depth': [7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [39]:
print grid_gbr.best_estimator_
print grid_gbr.best_score_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=9, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=30, presort='auto',
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
0.729806221544


In [40]:
# Running Gradient Boosted Trees
# Search for optimal 'n_estimators' and 'max_depth'
gbr = GradientBoostingRegressor()
param_dict = {'n_estimators':[150],'max_depth':[9]}
grid_gbr = GridSearchCV(gbr,param_grid=param_dict,verbose=10,scoring='r2',cv=3)
grid_gbr.fit(train_x,train_y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] n_estimators=150, max_depth=9 ...................................
[CV] .......... n_estimators=150, max_depth=9, score=0.733582 -35.9min
[CV] n_estimators=150, max_depth=9 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 35.9min remaining:    0.0s


[CV] .......... n_estimators=150, max_depth=9, score=0.733572 -35.6min
[CV] n_estimators=150, max_depth=9 ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 71.5min remaining:    0.0s


[CV] .......... n_estimators=150, max_depth=9, score=0.734474 -34.8min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 106.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 106.3min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [150], 'max_depth': [9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=10)

In [41]:
print grid_gbr.best_estimator_
print grid_gbr.best_score_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=9, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=150,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)
0.733876099872
