In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from scipy.stats import skew
import seaborn as sns
import sklearn
import warnings
from sklearn.cross_validation import KFold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso
import xgboost as xgb



In [2]:
# Alternative (pre-cleaned) training input, kept disabled:
#train_clean = pd.read_csv('train_clean.csv', dtype={'MSSubClass': str})
# Load both splits with identical options. MSSubClass is read as a string so it
# becomes an object column, which prepData's pd.get_dummies call one-hot encodes.
train, test = (
    pd.read_csv(csv_name, dtype={'MSSubClass': str})
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Remove the training rows with Id 1299 and 524 (presumably outliers - TODO confirm),
# then stack train on top of test so both go through the same preprocessing.
# The original row labels are kept, so later splits must be positional.
train = train[~train.Id.isin([1299, 524])]
all_data = pd.concat([train, test])


In [4]:
# Columns whose missing values prepData replaces with 0.
fill_with_zero = [
    'LotFrontage', 'MasVnrArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFullBath',
    'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageArea',
]

# Columns whose missing values prepData replaces with the most frequent
# value observed in `train`.
fill_with_most_common = ['Electrical']

# Columns whose missing values prepData replaces with the literal string 'None'.
# NOTE(review): GarageYrBlt looks like a numeric year; filling it with a string
# would make it non-numeric and therefore one-hot encoded - confirm intended.
fill_with_none = [
    'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType', 'Alley',
]

In [5]:
from sklearn.preprocessing import StandardScaler

def prepData(df, unskew=False):
    """Impute missing values, log-transform the target and one-hot encode ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to prepare. Must contain every column named in the module-level
        lists ``fill_with_zero``, ``fill_with_none`` and
        ``fill_with_most_common``. The input frame is not modified; a copy is
        processed.
    unskew : bool, optional
        Accepted for backward compatibility only; it currently has no effect.
        The skew-correction / standard-scaling step it used to gate was
        disabled in this notebook, so the features are left on their raw scale.

    Returns
    -------
    pandas.DataFrame
        The imputed copy of ``df`` passed through ``pd.get_dummies``. When a
        ``SalePrice`` column is present it holds ``log1p(SalePrice)``; frames
        without that column (e.g. test-only data) are handled as well.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the columns listed for imputation.
    """
    dfForModel = df.copy()

    for col in fill_with_zero:
        dfForModel[col] = dfForModel[col].fillna(0)

    for col in fill_with_none:
        dfForModel[col] = dfForModel[col].fillna('None')

    # The replacement value comes from the module-level `train` frame, not from
    # `df`, so it is the same no matter which rows are being prepared.
    for col in fill_with_most_common:
        most_common = train[col].value_counts().index[0]
        dfForModel[col] = dfForModel[col].fillna(most_common)

    # Only transform the target when it exists, so unlabeled frames do not
    # raise KeyError.
    if "SalePrice" in dfForModel.columns:
        dfForModel["SalePrice"] = np.log1p(dfForModel["SalePrice"])

    return pd.get_dummies(dfForModel)

In [6]:
# Preprocess train and test together so both get the same dummy columns, then
# split them apart again by position: the first len(train) rows are training rows.
# NOTE(review): only 'SalePrice' is dropped, so 'Id' stays in the feature
# matrices - confirm that is intended.
all_data_prepped = prepData(all_data, unskew=True)
X_train = all_data_prepped.iloc[:len(train)].drop('SalePrice', axis=1)
X_test = all_data_prepped.iloc[len(train):].drop('SalePrice', axis=1)
y = all_data_prepped.iloc[:len(train)]['SalePrice']  # log1p-transformed by prepData
X_train.shape

(1458, 420)

In [7]:
# Wrap the training features and the log1p(SalePrice) target in a DMatrix,
# the input type used by the xgb.cv / xgb.train calls below.
dtr = xgb.DMatrix(X_train, label = y)

- .1 - 350, .1293
- .3 .14
- .05 = .126 - 471

In [70]:
# Cross-validate a booster with learning rate eta=0.05 (all other parameters
# left at xgboost's defaults): at most 1000 boosting rounds, with early
# stopping configured at 20 rounds. The returned table lists per-round
# train/test RMSE, which is on the log1p(SalePrice) scale of the DMatrix label.
params = {'eta':0.05}
num_rounds = 1000
early_stopping_rounds = 20
xgb.cv(params, dtr,num_rounds, early_stopping_rounds=early_stopping_rounds)

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
0,10.955692,0.008046,10.955695,0.003716
1,10.409218,0.008223,10.409220,0.003513
2,9.890078,0.008387,9.890080,0.003319
3,9.396910,0.008542,9.396912,0.003135
4,8.928418,0.008687,8.928420,0.002958
5,8.483372,0.008822,8.483374,0.002789
6,8.060522,0.008851,8.060597,0.002639
7,7.658879,0.008882,7.658954,0.002503
8,7.277305,0.008917,7.277379,0.002374
9,6.914800,0.008953,6.914873,0.002251


In [26]:
from xgboost import XGBRegressor 

In [27]:
from sklearn.grid_search import GridSearchCV

max_depth:3
min_child_weight:2

In [71]:
# Coarse 3-fold grid search over tree depth and minimum child weight, with the
# remaining XGBRegressor settings held fixed (learning_rate=0.05, 462 trees).
# No `scoring` is passed, so the scores are the estimator's default metric
# (presumably R^2 for a regressor - TODO confirm).
param_test1 = {
 'max_depth':list(range(2,10,2)),  # 2, 4, 6, 8
 'min_child_weight':list(range(1,6,2))  # 1, 3, 5
}
# max_depth / min_child_weight given to the estimator here are overridden by the grid.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=2, min_child_weight=1 .................................
[CV] ........ max_depth=2, min_child_weight=1, score=0.919168 -   1.4s
[CV] max_depth=2, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ........ max_depth=2, min_child_weight=1, score=0.896272 -   1.3s
[CV] max_depth=2, min_child_weight=1 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.6s remaining:    0.0s


[CV] ........ max_depth=2, min_child_weight=1, score=0.890563 -   1.3s
[CV] max_depth=2, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.9s remaining:    0.0s


[CV] ........ max_depth=2, min_child_weight=3, score=0.921830 -   1.2s
[CV] max_depth=2, min_child_weight=3 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.1s remaining:    0.0s


[CV] ........ max_depth=2, min_child_weight=3, score=0.896802 -   1.3s
[CV] max_depth=2, min_child_weight=3 .................................
[CV] ........ max_depth=2, min_child_weight=3, score=0.907761 -   1.3s
[CV] max_depth=2, min_child_weight=5 .................................
[CV] ........ max_depth=2, min_child_weight=5, score=0.921257 -   1.3s
[CV] max_depth=2, min_child_weight=5 .................................
[CV] ........ max_depth=2, min_child_weight=5, score=0.896601 -   1.3s
[CV] max_depth=2, min_child_weight=5 .................................
[CV] ........ max_depth=2, min_child_weight=5, score=0.910504 -   1.3s
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.915387 -   2.3s
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.893373 -   3.2s
[CV] max_depth=4, min_child_weight=1 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.9min finished


([mean: 0.90200, std: 0.01236, params: {'max_depth': 2, 'min_child_weight': 1},
  mean: 0.90880, std: 0.01024, params: {'max_depth': 2, 'min_child_weight': 3},
  mean: 0.90945, std: 0.01009, params: {'max_depth': 2, 'min_child_weight': 5},
  mean: 0.90472, std: 0.00900, params: {'max_depth': 4, 'min_child_weight': 1},
  mean: 0.90487, std: 0.00919, params: {'max_depth': 4, 'min_child_weight': 3},
  mean: 0.91015, std: 0.00809, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: 0.90036, std: 0.01155, params: {'max_depth': 6, 'min_child_weight': 1},
  mean: 0.90635, std: 0.00715, params: {'max_depth': 6, 'min_child_weight': 3},
  mean: 0.90762, std: 0.00896, params: {'max_depth': 6, 'min_child_weight': 5},
  mean: 0.89325, std: 0.00875, params: {'max_depth': 8, 'min_child_weight': 1},
  mean: 0.90202, std: 0.00934, params: {'max_depth': 8, 'min_child_weight': 3},
  mean: 0.90624, std: 0.01020, params: {'max_depth': 8, 'min_child_weight': 5}],
 {'max_depth': 4, 'min_child_weight': 5

In [72]:
# Finer 3-fold grid search around the coarse optimum found above
# (max_depth=4, min_child_weight=5): depths 3-5 and child weights 4-6.
param_test1 = {
 'max_depth':[3,4,5],
 'min_child_weight':[4,5,6]
}
# max_depth / min_child_weight given to the estimator here are overridden by the grid.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=3, min_child_weight=4 .................................
[CV] ........ max_depth=3, min_child_weight=4, score=0.920658 -   1.8s
[CV] max_depth=3, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV] ........ max_depth=3, min_child_weight=4, score=0.897351 -   1.8s
[CV] max_depth=3, min_child_weight=4 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV] ........ max_depth=3, min_child_weight=4, score=0.913027 -   1.7s
[CV] max_depth=3, min_child_weight=5 .................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.3s remaining:    0.0s


[CV] ........ max_depth=3, min_child_weight=5, score=0.918917 -   1.8s
[CV] max_depth=3, min_child_weight=5 .................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.1s remaining:    0.0s


[CV] ........ max_depth=3, min_child_weight=5, score=0.897429 -   1.7s
[CV] max_depth=3, min_child_weight=5 .................................
[CV] ........ max_depth=3, min_child_weight=5, score=0.911642 -   1.7s
[CV] max_depth=3, min_child_weight=6 .................................
[CV] ........ max_depth=3, min_child_weight=6, score=0.919662 -   1.8s
[CV] max_depth=3, min_child_weight=6 .................................
[CV] ........ max_depth=3, min_child_weight=6, score=0.893742 -   2.3s
[CV] max_depth=3, min_child_weight=6 .................................
[CV] ........ max_depth=3, min_child_weight=6, score=0.909900 -   2.3s
[CV] max_depth=4, min_child_weight=4 .................................
[CV] ........ max_depth=4, min_child_weight=4, score=0.918819 -   2.3s
[CV] max_depth=4, min_child_weight=4 .................................
[CV] ........ max_depth=4, min_child_weight=4, score=0.898974 -   2.7s
[CV] max_depth=4, min_child_weight=4 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  1.2min finished


([mean: 0.91034, std: 0.00970, params: {'max_depth': 3, 'min_child_weight': 4},
  mean: 0.90933, std: 0.00892, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.90777, std: 0.01069, params: {'max_depth': 3, 'min_child_weight': 6},
  mean: 0.91048, std: 0.00841, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.91015, std: 0.00809, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: 0.90955, std: 0.01001, params: {'max_depth': 4, 'min_child_weight': 6},
  mean: 0.90942, std: 0.00867, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.90921, std: 0.00892, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.90809, std: 0.01153, params: {'max_depth': 5, 'min_child_weight': 6}],
 {'max_depth': 4, 'min_child_weight': 4},
 0.9104828873328404)

In [73]:
# 3-fold grid search over gamma with max_depth=4 and min_child_weight=4 fixed
# (the best pair from the previous search).
param_test1 = {
    # 0.0 to 0.4 in steps of 0.1; np.arange yields floats such as
    # 0.30000000000000004, which is why the reported params look inexact.
    'gamma': list(np.arange(0,0.5, 0.1))
}
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] gamma=0.0 .......................................................
[CV] .............................. gamma=0.0, score=0.918819 -   2.2s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV] .............................. gamma=0.0, score=0.898974 -   2.2s
[CV] gamma=0.0 .......................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.5s remaining:    0.0s


[CV] .............................. gamma=0.0, score=0.913656 -   2.2s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.7s remaining:    0.0s


[CV] .............................. gamma=0.1, score=0.914284 -   2.7s
[CV] gamma=0.1 .......................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.4s remaining:    0.0s


[CV] .............................. gamma=0.1, score=0.890539 -   2.6s
[CV] gamma=0.1 .......................................................
[CV] .............................. gamma=0.1, score=0.907126 -   2.7s
[CV] gamma=0.2 .......................................................
[CV] .............................. gamma=0.2, score=0.904278 -   2.3s
[CV] gamma=0.2 .......................................................
[CV] .............................. gamma=0.2, score=0.881655 -   2.5s
[CV] gamma=0.2 .......................................................
[CV] .............................. gamma=0.2, score=0.896865 -   2.3s
[CV] gamma=0.3 .......................................................
[CV] .............................. gamma=0.3, score=0.901695 -   2.3s
[CV] gamma=0.3 .......................................................
[CV] .............................. gamma=0.3, score=0.871651 -   2.3s
[CV] gamma=0.3 .......................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   35.9s finished


([mean: 0.91048, std: 0.00841, params: {'gamma': 0.0},
  mean: 0.90398, std: 0.00995, params: {'gamma': 0.10000000000000001},
  mean: 0.89427, std: 0.00942, params: {'gamma': 0.20000000000000001},
  mean: 0.88680, std: 0.01227, params: {'gamma': 0.30000000000000004},
  mean: 0.88117, std: 0.01298, params: {'gamma': 0.40000000000000002}],
 {'gamma': 0.0},
 0.9104828873328404)

In [75]:
# 3-fold grid search over row subsampling and per-tree column subsampling:
# a 5 x 5 grid (25 candidates), each parameter running 0.5 to 0.9 in steps of 0.1.
param_test1 = {
    'subsample': list(np.arange(0.5,1, 0.1)),
    'colsample_bytree': list(np.arange(0.5,1, 0.1))
}
# subsample / colsample_bytree given to the estimator here are overridden by the grid.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] colsample_bytree=0.5, subsample=0.5 .............................
[CV] .... colsample_bytree=0.5, subsample=0.5, score=0.917555 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] .... colsample_bytree=0.5, subsample=0.5, score=0.896656 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.5 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV] .... colsample_bytree=0.5, subsample=0.5, score=0.913689 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.2s remaining:    0.0s


[CV] .... colsample_bytree=0.5, subsample=0.6, score=0.922177 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.6 .............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.6s remaining:    0.0s


[CV] .... colsample_bytree=0.5, subsample=0.6, score=0.896234 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.6 .............................
[CV] .... colsample_bytree=0.5, subsample=0.6, score=0.913313 -   1.4s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] .... colsample_bytree=0.5, subsample=0.7, score=0.920769 -   1.5s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] .... colsample_bytree=0.5, subsample=0.7, score=0.896633 -   1.5s
[CV] colsample_bytree=0.5, subsample=0.7 .............................
[CV] .... colsample_bytree=0.5, subsample=0.7, score=0.916130 -   1.5s
[CV] colsample_bytree=0.5, subsample=0.8 .............................
[CV] .... colsample_bytree=0.5, subsample=0.8, score=0.923602 -   1.5s
[CV] colsample_bytree=0.5, subsample=0.8 .............................
[CV] .... colsample_bytree=0.5, subsample=0.8, score=0.898536 -   1.5s
[CV] colsample_bytree=0.5, subsample=0.8 .............................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  2.8min finished


([mean: 0.90930, std: 0.00908, params: {'colsample_bytree': 0.5, 'subsample': 0.5},
  mean: 0.91057, std: 0.01077, params: {'colsample_bytree': 0.5, 'subsample': 0.59999999999999998},
  mean: 0.91118, std: 0.01046, params: {'colsample_bytree': 0.5, 'subsample': 0.69999999999999996},
  mean: 0.91151, std: 0.01025, params: {'colsample_bytree': 0.5, 'subsample': 0.79999999999999993},
  mean: 0.91026, std: 0.00921, params: {'colsample_bytree': 0.5, 'subsample': 0.89999999999999991},
  mean: 0.91006, std: 0.00902, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.5},
  mean: 0.91046, std: 0.00926, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: 0.90984, std: 0.00978, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: 0.90950, std: 0.00954, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.79999999999999993},
  mean: 0.90973, std: 0.00999, params: {'colsample_bytree': 0.599999

In [76]:
# 3-fold grid search over smaller subsample values (0.3-0.5) than the previous
# grid covered, with colsample_bytree held at 0.8.
param_test1 = {
    'subsample': [0.3, 0.4, 0.5],
 }
# The estimator's own subsample=0.8 is overridden by the grid.
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] subsample=0.3 ...................................................
[CV] .......................... subsample=0.3, score=0.920103 -   1.6s
[CV] subsample=0.3 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] .......................... subsample=0.3, score=0.896909 -   1.5s
[CV] subsample=0.3 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] .......................... subsample=0.3, score=0.904754 -   1.5s
[CV] subsample=0.4 ...................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.6s remaining:    0.0s


[CV] .......................... subsample=0.4, score=0.922256 -   1.8s
[CV] subsample=0.4 ...................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.4s remaining:    0.0s


[CV] .......................... subsample=0.4, score=0.901645 -   1.8s
[CV] subsample=0.4 ...................................................
[CV] .......................... subsample=0.4, score=0.916710 -   2.3s
[CV] subsample=0.5 ...................................................
[CV] .......................... subsample=0.5, score=0.922642 -   2.1s
[CV] subsample=0.5 ...................................................
[CV] .......................... subsample=0.5, score=0.899661 -   2.1s
[CV] subsample=0.5 ...................................................
[CV] .......................... subsample=0.5, score=0.910349 -   2.5s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   17.2s finished


([mean: 0.90726, std: 0.00963, params: {'subsample': 0.3},
  mean: 0.91354, std: 0.00871, params: {'subsample': 0.4},
  mean: 0.91088, std: 0.00939, params: {'subsample': 0.5}],
 {'subsample': 0.4},
 0.913537087276096)

In [79]:
# 3-fold grid search over the reg_alpha penalty, with subsample fixed at 0.4
# (the best value from the previous search).
# NOTE(review): the recorded output below shows only two candidates
# (reg_alpha 0 and 0.001), so it was produced with a different grid than the
# one written here - re-run the cell to refresh it.
param_test1 = {
    'reg_alpha': [0, 0.01, 0.1]}
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.4, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)

gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] reg_alpha=0 .....................................................
[CV] ............................ reg_alpha=0, score=0.922256 -   1.9s
[CV] reg_alpha=0 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV] ............................ reg_alpha=0, score=0.901645 -   1.9s
[CV] reg_alpha=0 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.8s remaining:    0.0s


[CV] ............................ reg_alpha=0, score=0.916710 -   1.9s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.7s remaining:    0.0s


[CV] ........................ reg_alpha=0.001, score=0.923460 -   2.6s
[CV] reg_alpha=0.001 .................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.3s remaining:    0.0s


[CV] ........................ reg_alpha=0.001, score=0.900612 -   1.9s
[CV] reg_alpha=0.001 .................................................
[CV] ........................ reg_alpha=0.001, score=0.912397 -   1.9s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.0s finished


([mean: 0.91354, std: 0.00871, params: {'reg_alpha': 0},
  mean: 0.91216, std: 0.00933, params: {'reg_alpha': 0.001}],
 {'reg_alpha': 0},
 0.913537087276096)

In [81]:
# 3-fold grid search over the reg_lambda penalty (0, 0.0001, 0.01) with the
# other settings unchanged from the reg_alpha search.
param_test1 = {
    'reg_lambda': [0, 0.0001, 0.01]}
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.4, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] reg_lambda=0 ....................................................
[CV] ........................... reg_lambda=0, score=0.918201 -   1.9s
[CV] reg_lambda=0 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV] ........................... reg_lambda=0, score=0.895597 -   1.9s
[CV] reg_lambda=0 ....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s


[CV] ........................... reg_lambda=0, score=0.905715 -   1.9s
[CV] reg_lambda=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.8s remaining:    0.0s


[CV] ...................... reg_lambda=0.0001, score=0.917305 -   2.5s
[CV] reg_lambda=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.3s remaining:    0.0s


[CV] ...................... reg_lambda=0.0001, score=0.896993 -   2.6s
[CV] reg_lambda=0.0001 ...............................................
[CV] ...................... reg_lambda=0.0001, score=0.904833 -   2.0s
[CV] reg_lambda=0.01 .................................................
[CV] ........................ reg_lambda=0.01, score=0.917057 -   1.9s
[CV] reg_lambda=0.01 .................................................
[CV] ........................ reg_lambda=0.01, score=0.895686 -   2.0s
[CV] reg_lambda=0.01 .................................................
[CV] ........................ reg_lambda=0.01, score=0.908541 -   1.9s


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.8s finished


([mean: 0.90650, std: 0.00925, params: {'reg_lambda': 0},
  mean: 0.90638, std: 0.00836, params: {'reg_lambda': 0.0001},
  mean: 0.90709, std: 0.00878, params: {'reg_lambda': 0.01}],
 {'reg_lambda': 0.01},
 0.907094645606301)

In [86]:
# 3-fold grid search over colsample_bylevel, 0.5 to 0.9 in steps of 0.1
# (np.arange float artifacts explain values like 0.89999999999999991 in the output).
param_test1 = {
    'colsample_bylevel': list(np.arange(0.5, 1.0, 0.1))}
gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.05, n_estimators=462, max_depth=4,
 min_child_weight=4, gamma=0, subsample=0.4, colsample_bytree=0.8, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, verbose=5, iid=False, cv=3)
gsearch1.fit(X_train ,y)
# Display per-candidate scores, the winning parameters and the best mean score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] colsample_bylevel=0.5 ...........................................
[CV] .................. colsample_bylevel=0.5, score=0.921404 -   1.1s
[CV] colsample_bylevel=0.5 ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .................. colsample_bylevel=0.5, score=0.898123 -   1.6s
[CV] colsample_bylevel=0.5 ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV] .................. colsample_bylevel=0.5, score=0.910604 -   1.9s
[CV] colsample_bylevel=0.6 ...........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.6s remaining:    0.0s


[CV] .................. colsample_bylevel=0.6, score=0.917243 -   2.0s
[CV] colsample_bylevel=0.6 ...........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.6s remaining:    0.0s


[CV] .................. colsample_bylevel=0.6, score=0.899631 -   1.3s
[CV] colsample_bylevel=0.6 ...........................................
[CV] .................. colsample_bylevel=0.6, score=0.911289 -   1.4s
[CV] colsample_bylevel=0.7 ...........................................
[CV] .................. colsample_bylevel=0.7, score=0.919125 -   1.5s
[CV] colsample_bylevel=0.7 ...........................................
[CV] .................. colsample_bylevel=0.7, score=0.902563 -   1.5s
[CV] colsample_bylevel=0.7 ...........................................
[CV] .................. colsample_bylevel=0.7, score=0.911312 -   1.4s
[CV] colsample_bylevel=0.8 ...........................................
[CV] .................. colsample_bylevel=0.8, score=0.917250 -   2.2s
[CV] colsample_bylevel=0.8 ...........................................
[CV] .................. colsample_bylevel=0.8, score=0.901100 -   2.0s
[CV] colsample_bylevel=0.8 ...........................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   25.8s finished


([mean: 0.91004, std: 0.00951, params: {'colsample_bylevel': 0.5},
  mean: 0.90939, std: 0.00731, params: {'colsample_bylevel': 0.59999999999999998},
  mean: 0.91100, std: 0.00676, params: {'colsample_bylevel': 0.69999999999999996},
  mean: 0.90881, std: 0.00661, params: {'colsample_bylevel': 0.79999999999999993},
  mean: 0.91102, std: 0.01096, params: {'colsample_bylevel': 0.89999999999999991}],
 {'colsample_bylevel': 0.89999999999999991},
 0.9110152921314851)

In [13]:
# Cross-validate the tuned settings (max_depth=4, min_child_weight=4,
# subsample=0.4, colsample_bytree=0.8) at a lower learning rate, eta=0.01:
# at most 1500 boosting rounds, with early stopping configured at 20 rounds.
params = {'eta':0.01, 'max_depth':4, 'min_child_weight':4, 'subsample':0.4, 'colsample_bytree':0.8, 'scale_pos_weight':1}
num_rounds = 1500
early_stopping_rounds = 20
xgb.cv(params, dtr,num_rounds, early_stopping_rounds=early_stopping_rounds)

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
0,11.416143,0.007816,11.416146,0.003969
1,11.302526,0.007835,11.302528,0.003945
2,11.189959,0.007974,11.189961,0.003801
3,11.078519,0.008041,11.078520,0.003727
4,10.968146,0.008021,10.968148,0.003749
5,10.858871,0.007923,10.858873,0.003837
6,10.750753,0.008026,10.750755,0.003729
7,10.643711,0.008134,10.643713,0.003622
8,10.537756,0.008308,10.537758,0.003448
9,10.432739,0.008280,10.432742,0.003467


In [8]:
# Train the final booster on the full training DMatrix with the tuned parameters.
# NOTE(review): 1334 rounds is presumably where the cross-validation above
# stopped - the truncated output does not show it, so confirm.
params = {'eta':0.01, 'max_depth':4, 'min_child_weight':4, 'subsample':0.4, 'colsample_bytree':0.8, 'scale_pos_weight':1}
num_rounds = 1334
bst = xgb.train(params, dtr,num_rounds)

In [9]:
# In-sample predictions for the training rows; these are on the log1p(SalePrice)
# scale because that is the label the booster was trained on.
train_preds = bst.predict(dtr)

In [11]:
# Invert the log1p transform to view the training predictions as prices.
np.expm1(train_preds)

array([ 200653.171875,  172396.484375,  209047.46875 , ...,  254594.078125,
        138676.28125 ,  142848.640625], dtype=float32)

In [15]:
import pickle
# Persist the training-set predictions (still on the log1p scale) to
# xgb_train_preds.pkl in the working directory.
with open('xgb_train_preds.pkl', 'wb') as pickle_file:
    pickle.dump(train_preds, pickle_file)


Fit our tuned model on all the data prior to submission.

Make predictions on the test set and undo our log transform so that the values will be on their original scale.

In [91]:
# Take the submission ids from the `test` frame loaded from test.csv at the top
# of the notebook. X_test was built from those same rows in the same order, so
# ids and predictions are guaranteed to line up. This replaces re-reading
# 'test_clean.csv', which rebound `test` to a different file that this notebook
# does not produce and made re-running earlier cells pick up the wrong frame.
test_ids = test.Id.values

In [18]:
# Predict the test rows; the values stay on the log1p(SalePrice) scale
# (no expm1 is applied here).
test_preds_to_save = bst.predict(xgb.DMatrix(X_test))


In [20]:
import pickle
# Persist the log-scale test predictions to xgb_test_preds.pkl in the working directory.
with open('xgb_test_preds.pkl', 'wb') as pickle_file:
    pickle.dump(test_preds_to_save, pickle_file)


In [15]:
import pickle
# NOTE(review): this repeats the earlier cell that writes xgb_train_preds.pkl;
# it rewrites the same file from the same variable and can likely be deleted.
with open('xgb_train_preds.pkl', 'wb') as pickle_file:
    pickle.dump(train_preds, pickle_file)


In [97]:
# Predict the test rows and invert the log1p transform so prices are back on
# their original scale.
preds = np.expm1(bst.predict(xgb.DMatrix(X_test)))
# Two-column submission frame; `columns=` fixes the column order to id, SalePrice.
solution = pd.DataFrame({"id":test_ids, "SalePrice":preds}, columns=['id', 'SalePrice'])

# Write the submission file without the DataFrame index column.
solution.to_csv("xgb.csv", index = False)

This submission (`xgb.csv`) scores 0.12046.

Let's save the model so we can use it later.

In [None]:
from sklearn.externals import joblib
# Persist the trained xgboost booster. The previous version dumped
# `bestLassoEst`, a name never defined in this notebook, so the cell raised
# NameError; its file name was also left over from a Lasso model.
joblib.dump(bst, 'xgb_model.pkl')