# All Imports

In [109]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from sklearn import ensemble
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Reading Data

In [110]:
train_2016 = pd.read_csv('./../preprocessed data/df_train_2016.csv')
predict_2016 = pd.read_csv('./../preprocessed data/df_predict_2016.csv')

train_2017 = pd.read_csv('./../preprocessed data/df_train_2017.csv')
predict_2017 = pd.read_csv('./../preprocessed data/df_predict_2017.csv')

sample = pd.read_csv("./../data/sample_submission.csv")

# Processing Data (2016)

In [111]:
x_train_2016 = train_2016.drop(['logerror'], axis=1)
num_columns = x_train_2016.columns
y_train_2016 = train_2016['logerror'].values.astype(np.float32)

In [112]:
#Dropping Outliers
train_2016=train_2016[train_2016.logerror > -0.4 ]
train_2016=train_2016[train_2016.logerror < 0.418 ]
x_train_2016 = train_2016.drop(['logerror'], axis=1)
y_train_2016 = train_2016['logerror'].values.astype(np.float32)

In [113]:
x_train, x_test, y_train, y_test = train_test_split(x_train_2016, y_train_2016, random_state=0)

# Creating DMatrix (2016)

In [114]:
train_matrix_2016 = xgb.DMatrix(x_train, y_train)
test_matrix_2016 = xgb.DMatrix(x_test, y_test)

In [115]:
mean = np.mean(y_train)
xgb_params = {
    'max_depth':9,
    'min_child_weight': 5,
    'eta':.01,
    'subsample': 0.6,
    'colsample_bytree': 0.7,
    'objective':'reg:linear',
    'silent': 1,
    'base_score': mean
}

In [116]:
xgb_params['eval_metric']:"mae"
num_boost_round = 999
early_stopping_rounds = 10
model = xgb.train(xgb_params, train_matrix_2016, 100000, [(train_matrix_2016, 'train'), (test_matrix_2016, 'test')], early_stopping_rounds=100, verbose_eval=10)
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Parameters: { "silent" } are not used.

[0]	train-rmse:0.08328	test-rmse:0.08379




[10]	train-rmse:0.08299	test-rmse:0.08370
[20]	train-rmse:0.08275	test-rmse:0.08362
[30]	train-rmse:0.08251	test-rmse:0.08355
[40]	train-rmse:0.08229	test-rmse:0.08349
[50]	train-rmse:0.08208	test-rmse:0.08344
[60]	train-rmse:0.08190	test-rmse:0.08340
[70]	train-rmse:0.08171	test-rmse:0.08336
[80]	train-rmse:0.08152	test-rmse:0.08332
[90]	train-rmse:0.08135	test-rmse:0.08328
[100]	train-rmse:0.08119	test-rmse:0.08325
[110]	train-rmse:0.08104	test-rmse:0.08323
[120]	train-rmse:0.08089	test-rmse:0.08321
[130]	train-rmse:0.08075	test-rmse:0.08318
[140]	train-rmse:0.08062	test-rmse:0.08317
[150]	train-rmse:0.08050	test-rmse:0.08316
[160]	train-rmse:0.08038	test-rmse:0.08314
[170]	train-rmse:0.08024	test-rmse:0.08311
[180]	train-rmse:0.08012	test-rmse:0.08310
[190]	train-rmse:0.08001	test-rmse:0.08308
[200]	train-rmse:0.07988	test-rmse:0.08307
[210]	train-rmse:0.07978	test-rmse:0.08306
[220]	train-rmse:0.07967	test-rmse:0.08305
[230]	train-rmse:0.07958	test-rmse:0.08304
[240]	train-rmse:0.0

# Finding Optimal Parameters (2016)

# max_depth, min_child_weight (2016)

In [117]:
'''
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        xgb_params,
        train_matrix_2016,
        num_boost_round=num_boost_round,
        nfold=5,
        seed=42,
        metrics={'mae'},
        early_stopping_rounds=early_stopping_rounds
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
'''

'\ngridsearch_params = [\n    (max_depth, min_child_weight)\n    for max_depth in range(9,12)\n    for min_child_weight in range(5,8)\n]\n\nmin_mae = float("Inf")\nbest_params = None\nfor max_depth, min_child_weight in gridsearch_params:\n    print("CV with max_depth={}, min_child_weight={}".format(\n                             max_depth,\n                             min_child_weight))\n    # Update our parameters\n    params[\'max_depth\'] = max_depth\n    params[\'min_child_weight\'] = min_child_weight\n    # Run CV\n    cv_results = xgb.cv(\n        xgb_params,\n        train_matrix_2016,\n        num_boost_round=num_boost_round,\n        nfold=5,\n        seed=42,\n        metrics={\'mae\'},\n        early_stopping_rounds=early_stopping_rounds\n    )\n    # Update best MAE\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))\n    if mean_mae < min_mae:\n  

# subsample, colsample_bytree (2016)

In [118]:
'''
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        xgb_params,
        train_matrix_2016,
        num_boost_round=num_boost_round,
        nfold=5,
        seed=42,
        metrics={'mae'},
        early_stopping_rounds=early_stopping_rounds
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
'''

'\ngridsearch_params = [\n    (subsample, colsample)\n    for subsample in [i/10. for i in range(7,11)]\n    for colsample in [i/10. for i in range(7,11)]\n]\n\nmin_mae = float("Inf")\nbest_params = None\n# We start by the largest values and go down to the smallest\nfor subsample, colsample in reversed(gridsearch_params):\n    print("CV with subsample={}, colsample={}".format(\n                             subsample,\n                             colsample))\n    # We update our parameters\n    params[\'subsample\'] = subsample\n    params[\'colsample_bytree\'] = colsample\n    # Run CV\n    cv_results = xgb.cv(\n        xgb_params,\n        train_matrix_2016,\n        num_boost_round=num_boost_round,\n        nfold=5,\n        seed=42,\n        metrics={\'mae\'},\n        early_stopping_rounds=early_stopping_rounds\n    )\n    # Update best score\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} r

# eta (2016)

In [119]:
'''
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(xgb_params,train_matrix_2016,num_boost_round=num_boost_round,nfold=5,seed=42,metrics={'mae'},early_stopping_rounds=early_stopping_rounds)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))
'''

'\nmin_mae = float("Inf")\nbest_params = None\nfor eta in [.3, .2, .1, .05, .01, .005]:\n    print("CV with eta={}".format(eta))\n    # We update our parameters\n    params[\'eta\'] = eta\n    # Run and time CV\n    %time cv_results = xgb.cv(xgb_params,train_matrix_2016,num_boost_round=num_boost_round,nfold=5,seed=42,metrics={\'mae\'},early_stopping_rounds=early_stopping_rounds)\n    # Update best score\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))\n    if mean_mae < min_mae:\n        min_mae = mean_mae\n        best_params = eta\nprint("Best params: {}, MAE: {}".format(best_params, min_mae))\n'

In [120]:
sample['parcelid'] = sample['ParcelId']

df_test_2016 = sample.merge(predict_2016, on='parcelid', how='left')

x_2016 = df_test_2016[num_columns]
    
matrix_2016 = xgb.DMatrix(x_2016)

In [121]:
prediction_data = model.predict(matrix_2016)
y_values = []

for num,predict in enumerate(prediction_data):
    y_values.append(str(round(predict,4)))
    
y_values=np.array(y_values)

In [122]:
sample.pop('parcelid')
sample['201610'] = y_values
sample['201611'] = y_values
sample['201612'] = y_values

# Processing Data (2017)

In [123]:
x_train_2017 = train_2017.drop(['logerror'], axis=1)
num_columns = x_train_2017.columns
y_train_2017 = train_2017['logerror'].values.astype(np.float32)
#print(len(y_train_2016))
#print(x_train_2016.shape, y_train_2016.shape)

In [124]:
#Dropping Outliers
train_2017=train_2017[train_2017.logerror > -0.4 ]
train_2017=train_2017[train_2017.logerror < 0.418 ]
x_train_2017 = train_2017.drop(['logerror'], axis=1)
y_train_2017 = train_2017['logerror'].values.astype(np.float32)


In [125]:
x_train, x_test, y_train, y_test = train_test_split(x_train_2017, y_train_2017, random_state=0)

In [126]:
mean = np.mean(y_train)
xgb_params = {
    'max_depth':9,
    'min_child_weight': 5,
    'eta':.01,
    'subsample': 0.6,
    'colsample_bytree': 0.7,
    'objective':'reg:linear',
    'silent': 1,
    'base_score': mean
}

# Creating DMatrix (2017)

In [127]:
train_matrix_2017 = xgb.DMatrix(x_train, y_train)
test_matrix_2017 = xgb.DMatrix(x_test, y_test)

In [128]:
xgb_params['eval_metric']:"mae"
num_boost_round = 999
early_stopping_rounds = 10
model = xgb.train(xgb_params, train_matrix_2017, 100000, [(train_matrix_2017, 'train'), (test_matrix_2017, 'test')], early_stopping_rounds=100, verbose_eval=10)
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Parameters: { "silent" } are not used.

[0]	train-rmse:0.08247	test-rmse:0.08283




[10]	train-rmse:0.08221	test-rmse:0.08276
[20]	train-rmse:0.08195	test-rmse:0.08269
[30]	train-rmse:0.08172	test-rmse:0.08263
[40]	train-rmse:0.08151	test-rmse:0.08258
[50]	train-rmse:0.08131	test-rmse:0.08254
[60]	train-rmse:0.08110	test-rmse:0.08249
[70]	train-rmse:0.08092	test-rmse:0.08246
[80]	train-rmse:0.08074	test-rmse:0.08243
[90]	train-rmse:0.08060	test-rmse:0.08241
[100]	train-rmse:0.08044	test-rmse:0.08238
[110]	train-rmse:0.08029	test-rmse:0.08236
[120]	train-rmse:0.08016	test-rmse:0.08234
[130]	train-rmse:0.08002	test-rmse:0.08233
[140]	train-rmse:0.07990	test-rmse:0.08232
[150]	train-rmse:0.07977	test-rmse:0.08230
[160]	train-rmse:0.07965	test-rmse:0.08230
[170]	train-rmse:0.07953	test-rmse:0.08229
[180]	train-rmse:0.07941	test-rmse:0.08228
[190]	train-rmse:0.07930	test-rmse:0.08227
[200]	train-rmse:0.07918	test-rmse:0.08226
[210]	train-rmse:0.07909	test-rmse:0.08226
[220]	train-rmse:0.07897	test-rmse:0.08226
[230]	train-rmse:0.07888	test-rmse:0.08226
[240]	train-rmse:0.0

# Finding Optimal Parameters (2017)

# max_depth, min_child_weight (2017)

In [129]:
'''
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        xgb_params,
        train_matrix_2017,
        num_boost_round=num_boost_round,
        nfold=5,
        seed=42,
        metrics={'mae'},
        early_stopping_rounds=early_stopping_rounds
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
'''

'\ngridsearch_params = [\n    (max_depth, min_child_weight)\n    for max_depth in range(9,12)\n    for min_child_weight in range(5,8)\n]\n\nmin_mae = float("Inf")\nbest_params = None\nfor max_depth, min_child_weight in gridsearch_params:\n    print("CV with max_depth={}, min_child_weight={}".format(\n                             max_depth,\n                             min_child_weight))\n    # Update our parameters\n    params[\'max_depth\'] = max_depth\n    params[\'min_child_weight\'] = min_child_weight\n    # Run CV\n    cv_results = xgb.cv(\n        xgb_params,\n        train_matrix_2017,\n        num_boost_round=num_boost_round,\n        nfold=5,\n        seed=42,\n        metrics={\'mae\'},\n        early_stopping_rounds=early_stopping_rounds\n    )\n    # Update best MAE\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))\n    if mean_mae < min_mae:\n  

# subsample, colsample_bytree (2017)

In [130]:
'''
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        xgb_params,
        train_matrix_2017,
        num_boost_round=num_boost_round,
        nfold=5,
        seed=42,
        metrics={'mae'},
        early_stopping_rounds=early_stopping_rounds
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
'''

'\ngridsearch_params = [\n    (subsample, colsample)\n    for subsample in [i/10. for i in range(7,11)]\n    for colsample in [i/10. for i in range(7,11)]\n]\n\nmin_mae = float("Inf")\nbest_params = None\n# We start by the largest values and go down to the smallest\nfor subsample, colsample in reversed(gridsearch_params):\n    print("CV with subsample={}, colsample={}".format(\n                             subsample,\n                             colsample))\n    # We update our parameters\n    params[\'subsample\'] = subsample\n    params[\'colsample_bytree\'] = colsample\n    # Run CV\n    cv_results = xgb.cv(\n        xgb_params,\n        train_matrix_2017,\n        num_boost_round=num_boost_round,\n        nfold=5,\n        seed=42,\n        metrics={\'mae\'},\n        early_stopping_rounds=early_stopping_rounds\n    )\n    # Update best score\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} r

# eta (2017)

In [131]:
'''
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(xgb_params,train_matrix_2017,num_boost_round=num_boost_round,nfold=5,seed=42,metrics={'mae'},early_stopping_rounds=early_stopping_rounds)
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))
'''

'\nmin_mae = float("Inf")\nbest_params = None\nfor eta in [.3, .2, .1, .05, .01, .005]:\n    print("CV with eta={}".format(eta))\n    # We update our parameters\n    params[\'eta\'] = eta\n    # Run and time CV\n    %time cv_results = xgb.cv(xgb_params,train_matrix_2017,num_boost_round=num_boost_round,nfold=5,seed=42,metrics={\'mae\'},early_stopping_rounds=early_stopping_rounds)\n    # Update best score\n    mean_mae = cv_results[\'test-mae-mean\'].min()\n    boost_rounds = cv_results[\'test-mae-mean\'].argmin()\n    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))\n    if mean_mae < min_mae:\n        min_mae = mean_mae\n        best_params = eta\nprint("Best params: {}, MAE: {}".format(best_params, min_mae))\n'

In [132]:
sample['parcelid'] = sample['ParcelId']

df_test_2017 = sample.merge(predict_2017, on='parcelid', how='left')

x_2017 = df_test_2017[num_columns]
    
matrix_2017 = xgb.DMatrix(x_2017)

In [133]:
prediction_data = model.predict(matrix_2017)
y_values = []

for num,predict in enumerate(prediction_data):
    y_values.append(str(round(predict,4)))
    
y_values=np.array(y_values)

In [134]:
sample.pop('parcelid')
sample['201710'] = y_values
sample['201711'] = y_values
sample['201712'] = y_values

In [135]:
sample.to_csv('./../submission/xgb.csv', index=False, float_format='%.4f')