# All Imports

In [315]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from sklearn import ensemble
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Reading Data

In [316]:
# Load the preprocessed training / prediction tables for each year.
train_2016, predict_2016 = (
    pd.read_csv('./../preprocessed data/df_train_2016.csv'),
    pd.read_csv('./../preprocessed data/df_predict_2016.csv'),
)
train_2017, predict_2017 = (
    pd.read_csv('./../preprocessed data/df_train_2017.csv'),
    pd.read_csv('./../preprocessed data/df_predict_2017.csv'),
)

# Submission template; its `ParcelId` column is used later as the join key.
sample = pd.read_csv("./../data/sample_submission.csv")

# Processing Data (2016)

In [317]:
# Separate the `logerror` target from the remaining (feature) columns.
y_train_2016 = train_2016['logerror'].to_numpy().astype(np.float32)
x_train_2016 = train_2016.drop(columns=['logerror'])
# Remember the feature columns so the prediction frame can be subset to them.
num_columns = x_train_2016.columns

In [318]:
# Drop outliers: keep rows whose logerror lies strictly inside (-0.4, 0.418).
logerror_2016 = train_2016.logerror
inlier_mask_2016 = (logerror_2016 > -0.4) & (logerror_2016 < 0.418)
train_2016 = train_2016[inlier_mask_2016]

# Rebuild features and target from the filtered rows.
y_train_2016 = train_2016['logerror'].to_numpy().astype(np.float32)
x_train_2016 = train_2016.drop(columns=['logerror'])

In [319]:
# Hold out part of the filtered 2016 rows (used below as the early-stopping
# evaluation set); the fixed random_state keeps the split reproducible.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x_train_2016,
    y_train_2016,
    random_state=0,
)

In [320]:
# Start boosting from the mean training target instead of XGBoost's default.
# Cast to a plain Python float so the parameter is not a NumPy scalar.
mean = float(np.mean(y_train))

# Booster hyper-parameters for the 2016 model.
xgb_params = {
    'eta': 0.06,                      # learning rate
    'max_depth': 5,
    'subsample': 0.75,                # row subsampling per boosting round
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'base_score': mean,
    # 'silent' is ignored by current XGBoost (it only triggers a
    # "Parameters: { "silent" } are not used." warning); 'verbosity' replaces it.
    'verbosity': 0,
}

# Creating DMatrix (2016)

In [321]:
# Wrap the train / hold-out splits (features + labels) in XGBoost DMatrix objects.
train_matrix_2016 = xgb.DMatrix(data=x_train, label=y_train)
test_matrix_2016 = xgb.DMatrix(data=x_test, label=y_test)

In [322]:
# Train the 2016 booster. The round budget is deliberately huge; training
# actually stops once the metric on the LAST entry of `evals` (the hold-out
# set) has not improved for 100 consecutive rounds.
# `evals` is passed by keyword: it is keyword-only in recent XGBoost releases,
# and passing it positionally is deprecated.
model = xgb.train(
    xgb_params,
    train_matrix_2016,
    num_boost_round=100000,
    evals=[(train_matrix_2016, 'train'), (test_matrix_2016, 'test')],
    early_stopping_rounds=100,
    verbose_eval=10,  # log the evaluation metrics every 10 rounds
)

Parameters: { "silent" } are not used.

[0]	train-mae:0.05318	test-mae:0.05350




[10]	train-mae:0.05287	test-mae:0.05328
[20]	train-mae:0.05266	test-mae:0.05317
[30]	train-mae:0.05253	test-mae:0.05312
[40]	train-mae:0.05242	test-mae:0.05310
[50]	train-mae:0.05235	test-mae:0.05309
[60]	train-mae:0.05226	test-mae:0.05308
[70]	train-mae:0.05217	test-mae:0.05307
[80]	train-mae:0.05210	test-mae:0.05305
[90]	train-mae:0.05201	test-mae:0.05302
[100]	train-mae:0.05196	test-mae:0.05302
[110]	train-mae:0.05189	test-mae:0.05302
[120]	train-mae:0.05182	test-mae:0.05303
[130]	train-mae:0.05174	test-mae:0.05303
[140]	train-mae:0.05166	test-mae:0.05303
[150]	train-mae:0.05159	test-mae:0.05304
[160]	train-mae:0.05153	test-mae:0.05304
[170]	train-mae:0.05146	test-mae:0.05302
[180]	train-mae:0.05139	test-mae:0.05303
[190]	train-mae:0.05133	test-mae:0.05302
[200]	train-mae:0.05127	test-mae:0.05303
[202]	train-mae:0.05125	test-mae:0.05303


In [323]:
# Expose the submission id under the lowercase name used as the join key.
sample['parcelid'] = sample['ParcelId']

# Left join: every submission row is kept; parcels missing from predict_2016
# end up with NaN features.
df_test_2016 = pd.merge(sample, predict_2016, how='left', on='parcelid')

# Keep only the columns the model was trained on, then wrap for prediction.
x_2016 = df_test_2016.loc[:, num_columns]
matrix_2016 = xgb.DMatrix(data=x_2016)

In [324]:
# The booster returned by xgb.train contains every round that was run,
# including the rounds executed after the best validation score was reached.
# Booster.predict uses all of them by default, so restrict prediction to the
# best iteration found by early stopping.
best_rounds = model.best_iteration + 1
prediction_data = model.predict(matrix_2016, iteration_range=(0, best_rounds))

# Round to 4 decimals in one vectorised call. The values are kept numeric
# (not converted to strings) so that the `float_format` passed to `to_csv`
# when the submission is written applies to them.
y_values = np.round(prediction_data, 4)

In [325]:
# The temporary join key is not part of the submission; remove it again.
del sample['parcelid']

# The same 2016 prediction is written to each 2016 month column.
for month_2016 in ('201610', '201611', '201612'):
    sample[month_2016] = y_values

# Processing Data (2017)

In [326]:
# Separate the `logerror` target from the remaining (feature) columns.
y_train_2017 = train_2017['logerror'].to_numpy().astype(np.float32)
x_train_2017 = train_2017.drop(columns=['logerror'])
# Remember the feature columns so the prediction frame can be subset to them.
num_columns = x_train_2017.columns

In [327]:
# Drop outliers: keep rows whose logerror lies strictly inside (-0.4, 0.418).
logerror_2017 = train_2017.logerror
inlier_mask_2017 = (logerror_2017 > -0.4) & (logerror_2017 < 0.418)
train_2017 = train_2017[inlier_mask_2017]

# Rebuild features and target from the filtered rows.
y_train_2017 = train_2017['logerror'].to_numpy().astype(np.float32)
x_train_2017 = train_2017.drop(columns=['logerror'])

In [328]:
# Hold out part of the filtered 2017 rows (used below as the early-stopping
# evaluation set); the fixed random_state keeps the split reproducible.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x_train_2017,
    y_train_2017,
    random_state=0,
)

In [329]:
# Start boosting from the mean training target instead of XGBoost's default.
# Cast to a plain Python float so the parameter is not a NumPy scalar.
mean = float(np.mean(y_train))

# Booster hyper-parameters for the 2017 model.
xgb_params = {
    'eta': 0.06,                      # learning rate
    'max_depth': 5,
    'subsample': 0.75,                # row subsampling per boosting round
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'base_score': mean,
    # 'silent' is ignored by current XGBoost (it only triggers a
    # "Parameters: { "silent" } are not used." warning); 'verbosity' replaces it.
    'verbosity': 0,
}

# Creating DMatrix (2017)

In [330]:
# Wrap the train / hold-out splits (features + labels) in XGBoost DMatrix objects.
train_matrix_2017 = xgb.DMatrix(data=x_train, label=y_train)
test_matrix_2017 = xgb.DMatrix(data=x_test, label=y_test)

In [331]:
# Train the 2017 booster. The round budget is deliberately huge; training
# actually stops once the metric on the LAST entry of `evals` (the hold-out
# set) has not improved for 100 consecutive rounds.
# `evals` is passed by keyword: it is keyword-only in recent XGBoost releases,
# and passing it positionally is deprecated.
model = xgb.train(
    xgb_params,
    train_matrix_2017,
    num_boost_round=100000,
    evals=[(train_matrix_2017, 'train'), (test_matrix_2017, 'test')],
    early_stopping_rounds=100,
    verbose_eval=10,  # log the evaluation metrics every 10 rounds
)

Parameters: { "silent" } are not used.

[0]	train-mae:0.05269	test-mae:0.05269




[10]	train-mae:0.05239	test-mae:0.05252
[20]	train-mae:0.05221	test-mae:0.05247
[30]	train-mae:0.05208	test-mae:0.05244
[40]	train-mae:0.05197	test-mae:0.05241
[50]	train-mae:0.05187	test-mae:0.05239
[60]	train-mae:0.05178	test-mae:0.05240
[70]	train-mae:0.05169	test-mae:0.05240
[80]	train-mae:0.05161	test-mae:0.05239
[90]	train-mae:0.05153	test-mae:0.05239
[100]	train-mae:0.05145	test-mae:0.05239
[110]	train-mae:0.05138	test-mae:0.05240
[120]	train-mae:0.05130	test-mae:0.05240
[130]	train-mae:0.05122	test-mae:0.05241
[140]	train-mae:0.05116	test-mae:0.05241
[150]	train-mae:0.05110	test-mae:0.05242
[160]	train-mae:0.05103	test-mae:0.05243
[170]	train-mae:0.05093	test-mae:0.05243
[180]	train-mae:0.05086	test-mae:0.05243
[190]	train-mae:0.05080	test-mae:0.05245


In [332]:
# Expose the submission id under the lowercase name used as the join key.
sample['parcelid'] = sample['ParcelId']

# Left join: every submission row is kept; parcels missing from predict_2017
# end up with NaN features.
df_test_2017 = pd.merge(sample, predict_2017, how='left', on='parcelid')

# Keep only the columns the model was trained on, then wrap for prediction.
x_2017 = df_test_2017.loc[:, num_columns]
matrix_2017 = xgb.DMatrix(data=x_2017)

In [333]:
# The booster returned by xgb.train contains every round that was run,
# including the rounds executed after the best validation score was reached.
# Booster.predict uses all of them by default, so restrict prediction to the
# best iteration found by early stopping.
best_rounds = model.best_iteration + 1
prediction_data = model.predict(matrix_2017, iteration_range=(0, best_rounds))

# Round to 4 decimals in one vectorised call. The values are kept numeric
# (not converted to strings) so that the `float_format` passed to `to_csv`
# when the submission is written applies to them.
y_values = np.round(prediction_data, 4)

In [334]:
# The temporary join key is not part of the submission; remove it again.
del sample['parcelid']

# The same 2017 prediction is written to each 2017 month column.
for month_2017 in ('201710', '201711', '201712'):
    sample[month_2017] = y_values

In [335]:
# Write the completed submission: no index column, floats with 4 decimals.
sample.to_csv(
    path_or_buf='./../submission/xgb.csv',
    float_format='%.4f',
    index=False,
)