In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv('~/kaggle/input/melbourne-housing-snapshot/melb_data.csv')

# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data.Price

# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [3]:
# import XGBoost - extreme gradient boosting
from xgboost import XGBRegressor

my_model = XGBRegressor()
my_model.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [4]:
from sklearn.metrics import mean_absolute_error

predictions = my_model.predict(X_valid)
mae = str(mean_absolute_error(y_valid, predictions))
print("Mean Absolute Error: "+ mae)

Mean Absolute Error: 269567.0099318851


## Parameter Tuning

The various parameter that can be tuned to dramatically affect accuracy and training speed are

- `n_estimators`
    
- `early_stopping_rounds`

- `learning_rate`

- `n_jobs`

### n_estimators

specifies how mant times to go through the modelling cycle, thus equal to the number of models included in the ensemble

In [14]:
# n_estimators
my_model1 = XGBRegressor(n_estimators=500)
my_model1.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [15]:
predictions = my_model1.predict(X_valid)
mae = str(mean_absolute_error(y_valid, predictions))
print("Mean Absolute Error: "+ mae)

Mean Absolute Error: 247134.31409011415


### early_stopping_rounds

In [16]:
# early_stopping_rounds
my_model2 = XGBRegressor(n_estimators=500)
my_model2.fit(X_train, y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [17]:
predictions = my_model2.predict(X_valid)
mae = str(mean_absolute_error(y_valid, predictions))
print("Mean Absolute Error: "+ mae)

Mean Absolute Error: 251656.57691458028


### learning_rate

Step size of gradient descent, multiplied to predictions of each ensembled model before they are added(ensembled).

Allows us to set a higher value of `n_estimators` without overfitting.

In [24]:
# learning_rate
my_model3 = XGBRegressor(n_estimators=1000,
                       learning_rate=0.2)
my_model3.fit(X_train, y_train,
            early_stopping_rounds=5,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [25]:
predictions = my_model3.predict(X_valid)
mae = str(mean_absolute_error(y_valid, predictions))
print("Mean Absolute Error: "+ mae)

Mean Absolute Error: 249474.64732488035


### n_jobs

Implements parallelism to reduce runtime while fitting and training model. Usually set to the number of cores and isn't particularly helpful on smaller models.

In [30]:
my_model4 = XGBRegressor(n_estimators=1000,
                        learning_rate=0.2, n_jobs=6)
my_model4.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)],
             verbose=False)



  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.2, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=6, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [31]:
predictions = my_model4.predict(X_valid)
mae = str(mean_absolute_error(y_valid, predictions))
print("Mean Absolute Error: "+ mae)

Mean Absolute Error: 249474.64732488035


In [36]:
def get_score(n_estimators, model):

    predictions = model.predict(X_valid)
    mae = str(mean_absolute_error(y_valid, predictions))
    return mae

In [54]:
models = [ my_model1, my_model2, my_model3, my_model4 ]
model_results = {}
for model in models:
    results = {}
    for i in range(1,11):
        results[100*i] = get_score(100*i, model)
    print(results)
    model_results[model] = results
print('\n') 
print(model_results)

{100: '247134.31409011415', 200: '247134.31409011415', 300: '247134.31409011415', 400: '247134.31409011415', 500: '247134.31409011415', 600: '247134.31409011415', 700: '247134.31409011415', 800: '247134.31409011415', 900: '247134.31409011415', 1000: '247134.31409011415'}
{100: '251656.57691458028', 200: '251656.57691458028', 300: '251656.57691458028', 400: '251656.57691458028', 500: '251656.57691458028', 600: '251656.57691458028', 700: '251656.57691458028', 800: '251656.57691458028', 900: '251656.57691458028', 1000: '251656.57691458028'}
{100: '249474.64732488035', 200: '249474.64732488035', 300: '249474.64732488035', 400: '249474.64732488035', 500: '249474.64732488035', 600: '249474.64732488035', 700: '249474.64732488035', 800: '249474.64732488035', 900: '249474.64732488035', 1000: '249474.64732488035'}
{100: '249474.64732488035', 200: '249474.64732488035', 300: '249474.64732488035', 400: '249474.64732488035', 500: '249474.64732488035', 600: '249474.64732488035', 700: '249474.64732488