In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc
from pathlib import Path

print('Loading data ...')

root_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets')
data_dir = root_dir / 'kaggle_datasets' / 'Zillow-Price'
predict_dir = root_dir / 'kaggle_predict'

train = pd.read_csv(data_dir / 'train_2016_v2.csv')
prop = pd.read_csv(data_dir / 'properties_2016.csv')
sample = pd.read_csv(data_dir / 'sample_submission.csv')

Loading data ...


  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


### training set

In [3]:
df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
x_train.shape, y_train.shape

((90275, 55), (90275,))

In [4]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train
gc.collect()

0

In [5]:
split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

In [6]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid
gc.collect()

49

### Training

In [7]:
params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

In [8]:
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mae:0.48806	valid-mae:0.48112
[10]	train-mae:0.40222	valid-mae:0.39544
[20]	train-mae:0.33268	valid-mae:0.32610
[30]	train-mae:0.27652	valid-mae:0.27013
[40]	train-mae:0.23132	valid-mae:0.22521
[50]	train-mae:0.19506	valid-mae:0.18933
[60]	train-mae:0.16612	valid-mae:0.16073
[70]	train-mae:0.14312	valid-mae:0.13805
[80]	train-mae:0.12497	valid-mae:0.12024
[90]	train-mae:0.11078	valid-mae:0.10635
[100]	train-mae:0.09982	valid-mae:0.09570
[110]	train-mae:0.09145	valid-mae:0.08761
[120]	train-mae:0.08514	valid-mae:0.08160
[130]	train-mae:0.08045	valid-mae:0.07723
[140]	train-mae:0.07700	valid-mae:0.07408
[150]	train-mae:0.07450	valid-mae:0.07184
[160]	train-mae:0.07268	va

### Building test set

In [9]:
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')
del prop
gc.collect()

0

In [10]:
x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes==object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample
gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


21

In [11]:
d_test = xgb.DMatrix(x_test)
del x_test
gc.collect()

28

### predict

In [12]:
p_test = clf.predict(d_test)
del d_test
gc.collect()

21

In [15]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
    
sub.to_csv(predict_dir / 'xgb_starter.csv', index=False, float_format='%.4f')

### test score: 0.06522