# XGBoost (Gradient Boosted Decision Trees) 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

train_data = pd.read_csv('train.csv')
submission_data = pd.read_csv('test.csv')

#removing rows where sale price not available
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

y = train_data.SalePrice
X = train_data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

numeric_cols = [cname for cname in submission_data.columns if 
                                submission_data[cname].dtype in ['int64', 'float64']]
submission_X = submission_data[numeric_cols]

#print(type(y))
#print(type(submission_X))

In [2]:
train_X, test_X, train_y, test_y = train_test_split(X, y.as_matrix(), test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)
submission_X = my_imputer.transform(submission_X)



#rint(type(X))
#print(type(submission_X))

In [3]:
from xgboost import XGBRegressor

my_model = XGBRegressor()
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(train_X, train_y, verbose=False)



XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [4]:
# make predictions
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

Mean Absolute Error : 16745.1397795


## Tuning XGBoost

In [5]:
#adding n_estimators and early_stopping_rounds
my_model = XGBRegressor(n_estimators=1000)
my_model.fit(train_X, train_y, early_stopping_rounds=5, 
             eval_set=[(test_X, test_y)])

[0]	validation_0-rmse:173945
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:158032
[2]	validation_0-rmse:143739
[3]	validation_0-rmse:131149
[4]	validation_0-rmse:119581
[5]	validation_0-rmse:109156
[6]	validation_0-rmse:99921.6
[7]	validation_0-rmse:91262.7
[8]	validation_0-rmse:84303.6
[9]	validation_0-rmse:77430.7
[10]	validation_0-rmse:71938.9
[11]	validation_0-rmse:66770.5
[12]	validation_0-rmse:62565.4
[13]	validation_0-rmse:58469.7
[14]	validation_0-rmse:54934.7
[15]	validation_0-rmse:51896.2
[16]	validation_0-rmse:49037.4
[17]	validation_0-rmse:46622.8
[18]	validation_0-rmse:44883.8
[19]	validation_0-rmse:42801.4
[20]	validation_0-rmse:41205
[21]	validation_0-rmse:39729.8
[22]	validation_0-rmse:38328.2
[23]	validation_0-rmse:37448.6
[24]	validation_0-rmse:36593.5
[25]	validation_0-rmse:35607
[26]	validation_0-rmse:34892.4
[27]	validation_0-rmse:34332.4
[28]	validation_0-rmse:33891.4
[29]	validation_0-rmse:33487.3
[30]	validation_0-rmse:332

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [6]:
# iteration 72 is best 
my_model = XGBRegressor(n_estimators=110)
# Add silent=True to avoid printing out updates with each cycle
#this is done to prevent leaky validation stratregy
X = my_imputer.fit_transform(X)
my_model.fit(X, y, verbose=False)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=110, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [7]:
print(X.shape)
print(submission_X.shape)

(1460, 37)
(1459, 37)


In [8]:
# Use the model to make predictions
predicted_prices = my_model.predict(submission_X)
# We will look at the predicted prices to ensure we have something sensible.
print(predicted_prices)

[ 125856.7421875  160742.078125   177234.4375    ...,  178432.8125
  119145.359375   237922.515625 ]


# Submission

In [9]:
my_submission = pd.DataFrame({'Id': submission_data.Id, 'SalePrice': predicted_prices})
# you could use any filename. We choose submission here
my_submission.to_csv('submission_XGBoost.csv', index=False)