In [1]:
FUDGE_FACTOR = 1.12
XGB_WEIGHT = 0.62
BASELINE_WEIGHT = 0.01
OLS_WEIGHT = 0.062
NN_WEIGHT = 0.08
XGB1_WEIGHT = 0.8
BASELINE_PRED = 0.0115

In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import GaussianDropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

### Read raw data

In [5]:
from pathlib import Path

root_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets')
data_dir = root_dir / 'kaggle_datasets' / 'Zillow-Price'
predict_dir = root_dir / 'kaggle_predict'

train = pd.read_csv(data_dir / 'train_2016_v2.csv')
prop = pd.read_csv(data_dir / 'properties_2016.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## LightGBM

### process data for LightGBM

In [6]:
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

In [8]:
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(), inplace=True)

x_train = df_train.drop([
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
x_train.shape, y_train.shape

  


((90275, 53), (90275,))

In [37]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train
gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

## Run LightGBM

In [38]:
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

In [39]:
np.random.seed(0)
random.seed(0)

clf = lgb.train(params, d_train, 430)

del d_train
gc.collect()
del x_train
gc.collect()

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.








0

### predict

In [40]:
sample = pd.read_csv(data_dir / 'sample_submission.csv')
sample['parcelid'] = sample['ParcelId']

df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop
gc.collect()

21

In [41]:
x_test = df_test[train_columns]
del df_test
gc.collect()

21

In [42]:
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

x_test = x_test.values.astype(np.float32, copy=False)

In [43]:
p_test = clf.predict(x_test)
del x_test
gc.collect()

46

In [44]:
pd.DataFrame(p_test).head()

Unnamed: 0,0
0,0.031132
1,0.033375
2,0.010257
3,0.008651
4,0.00966


In [46]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
sub.to_csv(predict_dir / 'lgb.csv', index=False, float_format="%.4f")

### test score: 0.06499

## XGBoost

In [47]:
properties = pd.read_csv(data_dir / 'properties_2016.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### process data for XGBoost

In [48]:
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
        
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)

x_train.shape, x_test.shape

((90275, 57), (2985217, 57))

In [49]:
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df['logerror'].values.astype(np.float32)
y_mean = np.mean(y_train)

In [50]:
x_train.shape, x_test.shape

((88528, 57), (2985217, 57))

### run XGBoost

In [51]:
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [52]:
num_boost_rounds = 250

model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [53]:
xgb_pred1 = model.predict(dtest)
pd.DataFrame(xgb_pred1).head()

Unnamed: 0,0
0,-0.030616
1,-0.028188
2,0.026397
3,0.063728
4,0.004398


In [55]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = xgb_pred1
sub.to_csv(predict_dir / 'xgb1.csv', index=False, float_format="%.4f")

### test score: 0.06457

### Run XGBoost again

In [56]:
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

num_boost_rounds = 150

In [59]:
model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [60]:
xgb_pred2 = model.predict(dtest)
pd.DataFrame(xgb_pred2).head()

Unnamed: 0,0
0,-0.09115
1,-0.034722
2,0.015816
3,0.075518
4,0.029908


### combine XGBoost results

In [61]:
xgb_pred = XGB1_WEIGHT * xgb_pred1 + (1-XGB1_WEIGHT) * xgb_pred2
pd.DataFrame(xgb_pred).head()

Unnamed: 0,0
0,-0.042723
1,-0.029495
2,0.024281
3,0.066086
4,0.0095


In [62]:
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()

184

## Neural Network