In [1]:
FUDGE_FACTOR = 1.12
XGB_WEIGHT = 0.62
BASELINE_WEIGHT = 0.01
OLS_WEIGHT = 0.062
NN_WEIGHT = 0.08
XGB1_WEIGHT = 0.8
BASELINE_PRED = 0.0115

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import GaussianDropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

### Read raw data

In [3]:
from pathlib import Path

root_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets')
data_dir = root_dir / 'kaggle_datasets' / 'Zillow-Price'
predict_dir = root_dir / 'kaggle_predict'

train = pd.read_csv(data_dir / 'train_2016_v2.csv')
prop = pd.read_csv(data_dir / 'properties_2016.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## LightGBM

### process data for LightGBM

In [4]:
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

In [5]:
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(), inplace=True)

x_train = df_train.drop([
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
x_train.shape, y_train.shape

  


((90275, 53), (90275,))

In [6]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train
gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

## Run LightGBM

In [7]:
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

In [8]:
np.random.seed(0)
random.seed(0)

clf = lgb.train(params, d_train, 430)

del d_train
gc.collect()
del x_train
gc.collect()

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.








0

### predict

In [9]:
sample = pd.read_csv(data_dir / 'sample_submission.csv')
sample['parcelid'] = sample['ParcelId']

df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop
gc.collect()

21

In [10]:
x_test = df_test[train_columns]
del df_test
gc.collect()

21

In [11]:
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

x_test = x_test.values.astype(np.float32, copy=False)

In [12]:
p_test = clf.predict(x_test)
del x_test
gc.collect()

46

In [13]:
pd.DataFrame(p_test).head()

Unnamed: 0,0
0,0.031132
1,0.033375
2,0.010257
3,0.008651
4,0.00966


In [14]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
sub.to_csv(predict_dir / 'lgb.csv', index=False, float_format="%.4f")

### test score: 0.06499

## XGBoost

In [15]:
properties = pd.read_csv(data_dir / 'properties_2016.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### process data for XGBoost

In [16]:
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
        
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)

x_train.shape, x_test.shape

((90275, 57), (2985217, 57))

In [17]:
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df['logerror'].values.astype(np.float32)
y_mean = np.mean(y_train)

In [18]:
x_train.shape, x_test.shape

((88528, 57), (2985217, 57))

### run XGBoost

In [19]:
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [20]:
num_boost_rounds = 250

model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [21]:
xgb_pred1 = model.predict(dtest)
pd.DataFrame(xgb_pred1).head()

Unnamed: 0,0
0,-0.030616
1,-0.028188
2,0.026397
3,0.063728
4,0.004398


In [22]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = xgb_pred1
sub.to_csv(predict_dir / 'xgb1.csv', index=False, float_format="%.4f")

### test score: 0.06457

### Run XGBoost again

In [23]:
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

num_boost_rounds = 150

In [24]:
model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [25]:
xgb_pred2 = model.predict(dtest)
pd.DataFrame(xgb_pred2).head()

Unnamed: 0,0
0,-0.09115
1,-0.034722
2,0.015816
3,0.075518
4,0.029908


### combine XGBoost results

In [26]:
xgb_pred = XGB1_WEIGHT * xgb_pred1 + (1-XGB1_WEIGHT) * xgb_pred2
pd.DataFrame(xgb_pred).head()

Unnamed: 0,0
0,-0.042723
1,-0.029495
2,0.024281
3,0.066086
4,0.0095


In [27]:
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()

0

## Neural Network

### read in data for Neural Network

In [28]:
train = pd.read_csv(data_dir / 'train_2016_v2.csv', parse_dates=['transactiondate'])
prop = pd.read_csv(data_dir / 'properties_2016.csv')
sample = pd.read_csv(data_dir / 'sample_submission.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### fitting label encoder

In [29]:
for c in prop.columns:
    prop[c] = prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))

### datetime

In [30]:
df_train = train.merge(prop, how='left', on='parcelid')

df_train['transactiondate'] = pd.to_datetime(df_train['transactiondate'])
df_train['transactiondate_year'] = df_train['transactiondate'].dt.year
df_train['transactiondate_month'] = df_train['transactiondate'].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train['transactiondate'] = df_train['transactiondate'].dt.day

### filling nan values

In [31]:
df_train.fillna(-1.0)

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transactiondate_year,transactiondate_month,transactiondate_quarter
0,11016594,0.0276,1,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,...,360170.0,2015.0,237416.0,6735.88,0,-1.0,6.037107e+13,2016,1,1
1,14366692,-0.1684,1,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,...,585529.0,2015.0,239071.0,10153.02,0,-1.0,-1.000000e+00,2016,1,1
2,12098116,-0.0040,1,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,...,119906.0,2015.0,57912.0,11484.48,0,-1.0,6.037464e+13,2016,1,1
3,12643413,0.0218,2,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,...,244880.0,2015.0,73362.0,3048.74,0,-1.0,6.037296e+13,2016,1,1
4,14432541,-0.0050,2,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,...,434551.0,2015.0,264977.0,5488.96,0,-1.0,6.059042e+13,2016,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90270,10774160,-0.0356,30,1.0,-1.0,-1.0,1.0,1.0,-1.0,4.0,...,191000.0,2015.0,147200.0,2495.24,0,-1.0,6.037132e+13,2016,12,4
90271,12046695,0.0070,30,-1.0,-1.0,-1.0,3.0,3.0,-1.0,4.0,...,161111.0,2015.0,43218.0,1886.54,0,-1.0,6.037301e+13,2016,12,4
90272,12995401,-0.2679,30,-1.0,-1.0,-1.0,2.0,4.0,-1.0,7.0,...,38096.0,2015.0,16088.0,1925.70,1,14.0,6.037433e+13,2016,12,4
90273,11402105,0.0602,30,-1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,...,165869.0,2015.0,32878.0,2285.57,0,-1.0,6.037601e+13,2016,12,4


### creating x_train and y_train

In [32]:
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 
                         'propertyzoningdesc', 'propertycountylandusecode', 
                         'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror']

y_mean = np.mean(y_train)
x_train.shape, y_train.shape

((90275, 56), (90275,))

In [33]:
train_columns = x_train.columns

In [34]:
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

### creating df_test

In [35]:
sample['parcelid'] = sample['ParcelId']

In [36]:
df_test = sample.merge(prop, on='parcelid', how='left')

In [37]:
df_test["transactiondate"] = pd.to_datetime('2016-11-15')  # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day     
x_test = df_test[train_columns]

In [38]:
x_test.shape

(2985217, 56)

In [39]:
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

### preprocessing

In [40]:
imputer = SimpleImputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

In [41]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

len_x = int(x_train.shape[1])
len_x

56

### modeling

In [42]:
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))

nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [43]:
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 400)               22800     
_________________________________________________________________
p_re_lu (PReLU)              (None, 400)               400       
_________________________________________________________________
dropout (Dropout)            (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 160)               64160     
_________________________________________________________________
p_re_lu_1 (PReLU)            (None, 160)               160       
_________________________________________________________________
batch_normalization (BatchNo (None, 160)               640       
_________________________________________________________________
dropout_1 (Dropout)          (None, 160)               0

In [44]:
nn.fit(np.array(x_train), np.array(y_train), batch_size=32, epochs=70, 
      verbose=2)

Epoch 1/70
2822/2822 - 21s - loss: 0.0703
Epoch 2/70
2822/2822 - 15s - loss: 0.0682
Epoch 3/70
2822/2822 - 16s - loss: 0.0681
Epoch 4/70
2822/2822 - 15s - loss: 0.0680
Epoch 5/70
2822/2822 - 16s - loss: 0.0680
Epoch 6/70
2822/2822 - 16s - loss: 0.0680
Epoch 7/70
2822/2822 - 15s - loss: 0.0679
Epoch 8/70
2822/2822 - 16s - loss: 0.0678
Epoch 9/70
2822/2822 - 16s - loss: 0.0678
Epoch 10/70
2822/2822 - 17s - loss: 0.0678
Epoch 11/70
2822/2822 - 16s - loss: 0.0677
Epoch 12/70
2822/2822 - 16s - loss: 0.0677
Epoch 13/70
2822/2822 - 16s - loss: 0.0677
Epoch 14/70
2822/2822 - 16s - loss: 0.0677
Epoch 15/70
2822/2822 - 16s - loss: 0.0677
Epoch 16/70
2822/2822 - 17s - loss: 0.0676
Epoch 17/70
2822/2822 - 16s - loss: 0.0676
Epoch 18/70
2822/2822 - 18s - loss: 0.0676
Epoch 19/70
2822/2822 - 16s - loss: 0.0676
Epoch 20/70
2822/2822 - 16s - loss: 0.0675
Epoch 21/70
2822/2822 - 16s - loss: 0.0676
Epoch 22/70
2822/2822 - 16s - loss: 0.0675
Epoch 23/70
2822/2822 - 16s - loss: 0.0675
Epoch 24/70
2822/282

<tensorflow.python.keras.callbacks.History at 0x1dac10051c8>

### predict

In [45]:
y_pred_ann = nn.predict(x_test)

In [46]:
nn_pred = y_pred_ann.flatten()

In [47]:
nn_pred.shape

(2985217,)

In [48]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = nn_pred
sub.to_csv(predict_dir / 'nn.csv', index=False, float_format="%.4f")

### test score: 0.06460

In [49]:
del train
del prop
del sample
del x_train
del x_test
del df_train
del df_test
del y_pred_ann
gc.collect()

801

## OLS

In [50]:
np.random.seed(17)
random.seed(17)

### Processing data for OLS

In [51]:
train = pd.read_csv(data_dir / 'train_2016_v2.csv', parse_dates=['transactiondate'])
properties = pd.read_csv(data_dir / 'properties_2016.csv')
submission = pd.read_csv(data_dir / 'sample_submission.csv')
len(train), len(properties), len(submission)

  exec(code_obj, self.user_global_ns, self.user_ns)


(90275, 2985217, 2985217)

In [52]:
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df

In [53]:
def MAE(y, ypred):
    #logerror=log(Zestimate)−log(SalePrice)
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)

In [54]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = []

In [55]:
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == object] + ['logerror', 'parcelid']
col = [c for c in train.columns if c not in exc]

In [56]:
train = get_features(train[col])
test['transactiondate'] = '2016-01-01'
test = get_features(test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

### Fitting OLS

In [57]:
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y)
MAE(y, reg.predict(train))

0.06837008810154052

In [58]:
train = []
y = []
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

### predict

In [59]:
reg_pred = reg.predict(get_features(test))

In [60]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = reg_pred
sub.to_csv(predict_dir / 'reg.csv', index=False, float_format="%.4f")

### test score: 0.06511

## Combine

In [61]:
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT - NN_WEIGHT - OLS_WEIGHT 
lgb_weight0 = lgb_weight / (1 - OLS_WEIGHT)
xgb_weight0 = XGB_WEIGHT / (1 - OLS_WEIGHT)
baseline_weight0 =  BASELINE_WEIGHT / (1 - OLS_WEIGHT)
nn_weight0 = NN_WEIGHT / (1 - OLS_WEIGHT)
pred0 = 0
pred0 += xgb_weight0*xgb_pred
pred0 += baseline_weight0*BASELINE_PRED
pred0 += lgb_weight0*p_test
pred0 += nn_weight0*nn_pred

In [62]:
for i in range(len(test_dates)):
    test['transactiondate'] = test_dates[i]
    pred = FUDGE_FACTOR * ( OLS_WEIGHT*reg_pred + (1-OLS_WEIGHT)*pred0 )
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)

predict... 0
predict... 1
predict... 2
predict... 3
predict... 4
predict... 5


### write the results

In [63]:
submission.to_csv(predict_dir / 'combine.csv', index=False)

### test score: 0.06442