In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [2]:
y = data.SalePrice

X = data.drop('SalePrice', axis=1).select_dtypes(exclude='object')

In [3]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.75, test_size = 0.25 )

In [4]:
my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)


In [5]:
from xgboost import XGBRegressor

my_model = XGBRegressor()

my_model.fit(train_X, train_y)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [6]:
predictions = my_model.predict(test_X)

from sklearn.metrics import mean_absolute_error

print('MEA : %d' % (mean_absolute_error(predictions, test_y)))

MEA : 18640


# Model tuning

In [7]:
my_model = XGBRegressor(n_estimators = 1000)
my_model.fit(train_X, train_y, early_stopping_rounds=5, eval_set=[(test_X, test_y)])

[0]	validation_0-rmse:141564.14062
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:102967.94531
[2]	validation_0-rmse:76456.06250
[3]	validation_0-rmse:58767.66406
[4]	validation_0-rmse:47577.07422
[5]	validation_0-rmse:41732.69141
[6]	validation_0-rmse:38925.49219
[7]	validation_0-rmse:37138.61328
[8]	validation_0-rmse:37030.48438
[9]	validation_0-rmse:36336.33984
[10]	validation_0-rmse:37010.48047
[11]	validation_0-rmse:36935.16797
[12]	validation_0-rmse:36931.66016
[13]	validation_0-rmse:37097.67188
[14]	validation_0-rmse:37154.01562
Stopping. Best iteration:
[9]	validation_0-rmse:36336.33984



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [8]:
predictions = my_model.predict(test_X)
print('MAE: %d' % (mean_absolute_error(predictions, test_y)))

MAE: 19249


In [9]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(train_X, train_y, early_stopping_rounds=5, eval_set=[(test_X, test_y)])

[0]	validation_0-rmse:188799.17188
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:179743.01562
[2]	validation_0-rmse:171202.31250
[3]	validation_0-rmse:163092.60938
[4]	validation_0-rmse:155460.42188
[5]	validation_0-rmse:148055.12500
[6]	validation_0-rmse:141183.34375
[7]	validation_0-rmse:134593.07812
[8]	validation_0-rmse:128480.26562
[9]	validation_0-rmse:122550.66406
[10]	validation_0-rmse:116937.07031
[11]	validation_0-rmse:111730.64844
[12]	validation_0-rmse:106783.67188
[13]	validation_0-rmse:102020.10156
[14]	validation_0-rmse:97498.25781
[15]	validation_0-rmse:93150.15625
[16]	validation_0-rmse:89262.11719
[17]	validation_0-rmse:85534.33594
[18]	validation_0-rmse:81952.39844
[19]	validation_0-rmse:78548.29688
[20]	validation_0-rmse:75321.82031
[21]	validation_0-rmse:72344.73438
[22]	validation_0-rmse:69589.84375
[23]	validation_0-rmse:67085.75781
[24]	validation_0-rmse:64623.55469
[25]	validation_0-rmse:62183.29297
[26]	validation_0-rmse

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
predictions = my_model.predict(test_X)
print('MAE: %d' % (mean_absolute_error(predictions, test_y)))

MAE: 19305


In [11]:
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

test_X = test.select_dtypes(exclude='object')
test_X = my_imputer.transform(test_X)
predicted_prices = my_model.predict(test_X)

print(predicted_prices)

[120783.73 140470.17 169104.88 ... 156126.53 107920.59 217637.5 ]


In [12]:
my_submission = pd.DataFrame({'Id':test.Id, 'SalePrice':predicted_prices})

my_submission.to_csv('submission.csv', index = False)