In [1]:
FUDGE_FACTOR = 1.12
XGB_WEIGHT = 0.62
BASELINE_WEIGHT = 0.01
OLS_WEIGHT = 0.062
NN_WEIGHT = 0.08
XGB1_WEIGHT = 0.8
BASELINE_PRED = 0.0115

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import GaussianDropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

### Read raw data

In [3]:
from pathlib import Path

root_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets')
data_dir = root_dir / 'kaggle_datasets' / 'Zillow-Price'
predict_dir = root_dir / 'kaggle_predict'

train = pd.read_csv(data_dir / 'train_2016_v2.csv')
prop = pd.read_csv(data_dir / 'properties_2016.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## LightGBM

### process data for LightGBM

In [4]:
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

In [5]:
df_train = train.merge(prop, how='left', on='parcelid')
df_train.fillna(df_train.median(), inplace=True)

x_train = df_train.drop([
    'parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 
    'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror'].values
x_train.shape, y_train.shape

  


((90275, 53), (90275,))

In [6]:
train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)
    
del df_train
gc.collect()

x_train = x_train.values.astype(np.float32, copy=False)
d_train = lgb.Dataset(x_train, label=y_train)

## Run LightGBM

In [7]:
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.345    # feature_fraction (small values => use very different submodels)
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0
params['feature_fraction_seed'] = 2
params['bagging_seed'] = 3

In [8]:
np.random.seed(0)
random.seed(0)

clf = lgb.train(params, d_train, 430)

del d_train
gc.collect()
del x_train
gc.collect()

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.








0

### predict

In [9]:
sample = pd.read_csv(data_dir / 'sample_submission.csv')
sample['parcelid'] = sample['ParcelId']

df_test = sample.merge(prop, on='parcelid', how='left')
del sample, prop
gc.collect()

21

In [10]:
x_test = df_test[train_columns]
del df_test
gc.collect()

21

In [11]:
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

x_test = x_test.values.astype(np.float32, copy=False)

In [12]:
p_test = clf.predict(x_test)
del x_test
gc.collect()

46

In [13]:
pd.DataFrame(p_test).head()

Unnamed: 0,0
0,0.031132
1,0.033375
2,0.010257
3,0.008651
4,0.00966


In [14]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test
sub.to_csv(predict_dir / 'lgb.csv', index=False, float_format="%.4f")

### test score: 0.06499

## XGBoost

In [None]:
properties = pd.read_csv(data_dir / 'properties_2016.csv')

### process data for XGBoost

In [None]:
for c in properties.columns:
    properties[c] = properties[c].fillna(-1)
    if properties[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(properties[c].values))
        properties[c] = lbl.transform(list(properties[c].values))
        
train_df = train.merge(properties, how='left', on='parcelid')
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
x_test = properties.drop(['parcelid'], axis=1)

x_train.shape, x_test.shape

In [None]:
train_df = train_df[train_df.logerror > -0.4]
train_df = train_df[train_df.logerror < 0.419]
x_train = train_df.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
y_train = train_df['logerror'].values.astype(np.float32)
y_mean = np.mean(y_train)

In [None]:
x_train.shape, x_test.shape

### run XGBoost

In [None]:
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [None]:
num_boost_rounds = 250

model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

In [None]:
xgb_pred1 = model.predict(dtest)
pd.DataFrame(xgb_pred1).head()

In [None]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = xgb_pred1
sub.to_csv(predict_dir / 'xgb1.csv', index=False, float_format="%.4f")

### test score: 0.06457

### Run XGBoost again

In [None]:
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

num_boost_rounds = 150

In [None]:
model = xgb.train(dict(xgb_params, silent=1), dtrain, 
                  num_boost_round=num_boost_rounds)

In [None]:
xgb_pred2 = model.predict(dtest)
pd.DataFrame(xgb_pred2).head()

### combine XGBoost results

In [None]:
xgb_pred = XGB1_WEIGHT * xgb_pred1 + (1-XGB1_WEIGHT) * xgb_pred2
pd.DataFrame(xgb_pred).head()

In [None]:
del train_df
del x_train
del x_test
del properties
del dtest
del dtrain
del xgb_pred1
del xgb_pred2
gc.collect()

## Neural Network

### read in data for Neural Network

In [None]:
train = pd.read_csv(data_dir / 'train_2016_v2.csv', parse_dates=['transactiondate'])
prop = pd.read_csv(data_dir / 'properties_2016.csv')
sample = pd.read_csv(data_dir / 'sample_submission.csv')

### fitting label encoder

In [None]:
for c in prop.columns:
    prop[c] = prop[c].fillna(-1)
    if prop[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(prop[c].values))
        prop[c] = lbl.transform(list(prop[c].values))

### datetime

In [None]:
df_train = train.merge(prop, how='left', on='parcelid')

df_train['transactiondate'] = pd.to_datetime(df_train['transactiondate'])
df_train['transactiondate_year'] = df_train['transactiondate'].dt.year
df_train['transactiondate_month'] = df_train['transactiondate'].dt.month
df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
df_train['transactiondate'] = df_train['transactiondate'].dt.day

### filling nan values

In [None]:
df_train.fillna(-1.0)

### creating x_train and y_train

In [None]:
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 
                         'propertyzoningdesc', 'propertycountylandusecode', 
                         'fireplacecnt', 'fireplaceflag'], axis=1)
y_train = df_train['logerror']

y_mean = np.mean(y_train)
x_train.shape, y_train.shape

In [None]:
train_columns = x_train.columns

In [None]:
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

### creating df_test

In [None]:
sample['parcelid'] = sample['ParcelId']

In [None]:
df_test = sample.merge(prop, on='parcelid', how='left')

In [None]:
df_test["transactiondate"] = pd.to_datetime('2016-11-15')  # placeholder value for preliminary version
df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
df_test["transactiondate"] = df_test["transactiondate"].dt.day     
x_test = df_test[train_columns]

In [None]:
x_test.shape

In [None]:
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

### preprocessing

In [None]:
imputer = SimpleImputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

In [None]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

len_x = int(x_train.shape[1])
len_x

### modeling

In [None]:
nn = Sequential()
nn.add(Dense(units = 400 , kernel_initializer = 'normal', input_dim = len_x))
nn.add(PReLU())
nn.add(Dropout(.4))
nn.add(Dense(units = 160 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(units = 64 , kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.5))
nn.add(Dense(units = 26, kernel_initializer = 'normal'))
nn.add(PReLU())
nn.add(BatchNormalization())
nn.add(Dropout(.6))
nn.add(Dense(1, kernel_initializer='normal'))

nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

In [None]:
nn.summary()

In [None]:
nn.fit(np.array(x_train), np.array(y_train), batch_size=32, epochs=70, 
      verbose=2)

### predict

In [None]:
y_pred_ann = nn.predict(x_test)

In [None]:
nn_pred = y_pred_ann.flatten()

In [None]:
nn_pred.shape

In [None]:
sub = pd.read_csv(data_dir / 'sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = nn_pred
sub.to_csv(predict_dir / 'nn.csv', index=False, float_format="%.4f")

### test score: 0.06460

In [None]:
del train
del prop
del sample
del x_train
del x_test
del df_train
del df_test
del y_pred_ann
gc.collect()

## OLS

In [50]:
np.random.seed(17)
random.seed(17)

In [51]:
train = pd.read_csv(data_dir / 'train_2016_v2.csv', parse_dates=['transactiondate'])
properties = pd.read_csv(data_dir / 'properties_2016.csv')
submission = pd.read_csv(data_dir / 'sample_submission.csv')
len(train), len(properties), len(submission)

  exec(code_obj, self.user_global_ns, self.user_ns)


(90275, 2985217, 2985217)

In [52]:
def get_features(df):
    df["transactiondate"] = pd.to_datetime(df["transactiondate"])
    df["transactiondate_year"] = df["transactiondate"].dt.year
    df["transactiondate_month"] = df["transactiondate"].dt.month
    df['transactiondate'] = df['transactiondate'].dt.quarter
    df = df.fillna(-1.0)
    return df

In [53]:
def MAE(y, ypred):
    #logerror=log(Zestimate)−log(SalePrice)
    return np.sum([abs(y[i]-ypred[i]) for i in range(len(y))]) / len(y)

In [54]:
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = []

In [55]:
exc = [train.columns[c] for c in range(len(train.columns)) if train.dtypes[c] == object] + ['logerror', 'parcelid']
col = [c for c in train.columns if c not in exc]

In [56]:
train = get_features(train[col])
test['transactiondate'] = '2016-01-01'
test = get_features(test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [58]:
train.dtypes.value_counts()

float64    52
object      5
int64       3
dtype: int64

In [62]:
train.dtypes == object

transactiondate                 False
airconditioningtypeid           False
architecturalstyletypeid        False
basementsqft                    False
bathroomcnt                     False
bedroomcnt                      False
buildingclasstypeid             False
buildingqualitytypeid           False
calculatedbathnbr               False
decktypeid                      False
finishedfloor1squarefeet        False
calculatedfinishedsquarefeet    False
finishedsquarefeet12            False
finishedsquarefeet13            False
finishedsquarefeet15            False
finishedsquarefeet50            False
finishedsquarefeet6             False
fips                            False
fireplacecnt                    False
fullbathcnt                     False
garagecarcnt                    False
garagetotalsqft                 False
hashottuborspa                   True
heatingorsystemtypeid           False
latitude                        False
longitude                       False
lotsizesquar