In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
import random
import gc
import datetime as dt

In [3]:
# Parameters
XGB_WEIGHT = 0.6500
BASELINE_WEIGHT = 0.0056
BASELINE_PRED = 0.0115

In [4]:
# Load in Zillow dataset
print( "\n--- Loading Data ---")
print("\n - properties2016 [include properties (2016) features: NumOfBedRoom etc]")
print("\n - train2016 contains the logError between Zillow Est and Real Sales (2016)")
print("\n - properties2017 [include properties (2017) features: NumOfBedRoom etc]")
print("\n - train2017 contains the logError between Zillow Est and Real Sales (2017)")

properties2016 = pd.read_csv('../input/properties_2016.csv',low_memory=False)
train2016 = pd.read_csv("../input/train_2016_v2.csv",parse_dates=['transactiondate'],low_memory=False)
properties2017 = pd.read_csv('../input/properties_2017.csv',low_memory=False)
train2017 = pd.read_csv("../input/train_2017.csv",parse_dates=['transactiondate'],low_memory=False)



--- Loading Data ---


In [5]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 +df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

In [6]:
print("\n Add the Time and Date Feature into dataset (Year-Month-Day-Quarter)")
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

In [7]:
print('Merge Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

Merge Train with Properties ...


In [8]:
print('Tax Features 2017  ...')
train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan

Tax Features 2017  ...


In [9]:
print('Loading Submission Sample ...')
sample_submission = pd.read_csv('../input/sample_submission.csv', low_memory = False)

Loading Sample ...


In [10]:
print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')

del properties2016, properties2017, train2016, train2017
gc.collect();

Concat Train 2016 & 2017 ...


In [23]:
print("\n Showing the head of tran_df")
train_df.head()


 Showing the head of tran_df


Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,-999.0,-999.0,2.0,...,-999.0,-999,122754.0,360170.0,2015.0,237416.0,6735.88,-999,-999.0,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,-999.0,-999.0,-999.0,3.5,...,-999.0,-999,346458.0,585529.0,2015.0,239071.0,10153.02,-999,-999.0,-999.0
2,12098116,-0.004,2016,1,1,1,1.0,-999.0,-999.0,3.0,...,-999.0,-999,61994.0,119906.0,2015.0,57912.0,11484.48,-999,-999.0,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,-999.0,-999.0,2.0,...,-999.0,-999,171518.0,244880.0,2015.0,73362.0,3048.74,-999,-999.0,60372960000000.0
4,14432541,-0.005,2016,1,2,1,-999.0,-999.0,-999.0,2.5,...,2.0,-999,169574.0,434551.0,2015.0,264977.0,5488.96,-999,-999.0,60590420000000.0


In [24]:
print("\n Save train_df to csv for more indepth look")
train_df.to_csv('train_df.csv', float_format='%.6f',index=False)


 Save train_df to csv for more indepth look


In [11]:
print('Remove missing data fields ...')

missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print("We exclude: %s" % len(exclude_missing))

del num_rows, missing_perc_thresh
gc.collect();

Remove missing data fields ...
We exclude: 15


In [12]:
# Not sure why we need to remove features with one unique value
print ("Remove features with one unique value !!")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print("We exclude: %s" % len(exclude_unique))

Remove features with one unique value !!
We exclude: 9


In [13]:
print ("Define training features !!")
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other and c not in exclude_unique:
        train_features.append(c)
print("We use these for training: %s" % len(train_features))

Define training features !!
We use these for training: 42


In [14]:
print ("Define categorial features !!")
cat_feature_inds = []
cat_unique_thresh = 1000
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
        
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Define categorial features !!
Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear']
Replacing NaN values by -999 !!


In [15]:
print ("Training time !!")
X_train = train_df[train_features]
y_train = train_df.logerror
y_train_GBM = train_df['logerror'].values
print(X_train.shape, y_train.shape, y_train_GBM.shape)

Training time !!
(167888, 42) (167888,) (167888,)


In [16]:
# Try to debug the LightGBM
print( "\nProcessing data for LightGBM ..." )
for c, dtype in zip(X_train.columns, X_train.dtypes):
    if dtype == np.float64:
        X_train[c] = X_train[c].astype(np.float32)


Processing data for LightGBM ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [17]:
test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

(2985217, 42)


In [18]:
num_ensembles = 5
y_pred = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(
        iterations=630, learning_rate=0.03,
        depth=6, l2_leaf_reg=3,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=i)
    model.fit(
        X_train, y_train,
        cat_features=cat_feature_inds)
    y_pred += model.predict(X_test)
y_pred /= num_ensembles

  0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.0689284	total: 1.21s	remaining: 12m 42s
1:	learn: 0.0687339	total: 2.12s	remaining: 11m 5s
2:	learn: 0.0686147	total: 3.03s	remaining: 10m 32s
3:	learn: 0.0685249	total: 4.62s	remaining: 12m 2s
4:	learn: 0.0684645	total: 5.47s	remaining: 11m 24s
5:	learn: 0.0684146	total: 6.38s	remaining: 11m 3s
6:	learn: 0.0683836	total: 7.3s	remaining: 10m 49s
7:	learn: 0.0683441	total: 8.19s	remaining: 10m 36s
8:	learn: 0.0683068	total: 8.86s	remaining: 10m 11s
9:	learn: 0.0682771	total: 9.62s	remaining: 9m 56s
10:	learn: 0.0682595	total: 10.5s	remaining: 9m 48s
11:	learn: 0.0682460	total: 11.2s	remaining: 9m 34s
12:	learn: 0.0682315	total: 12s	remaining: 9m 27s
13:	learn: 0.0682165	total: 12.7s	remaining: 9m 19s
14:	learn: 0.0681989	total: 13.7s	remaining: 9m 22s
15:	learn: 0.0681881	total: 14.4s	remaining: 9m 11s
16:	learn: 0.0681701	total: 15.3s	remaining: 9m 11s
17:	learn: 0.0681500	total: 16.1s	remaining: 9m 7s
18:	learn: 0.0681442	total: 16.8s	remaining: 8m 58s
19:	learn: 0.0681221

156:	learn: 0.0673782	total: 2m 37s	remaining: 7m 54s
157:	learn: 0.0673748	total: 2m 38s	remaining: 7m 54s
158:	learn: 0.0673733	total: 2m 41s	remaining: 7m 56s
159:	learn: 0.0673696	total: 2m 48s	remaining: 8m 13s
160:	learn: 0.0673661	total: 2m 52s	remaining: 8m 22s
161:	learn: 0.0673634	total: 2m 55s	remaining: 8m 28s
162:	learn: 0.0673622	total: 2m 58s	remaining: 8m 30s
163:	learn: 0.0673579	total: 3m	remaining: 8m 32s
164:	learn: 0.0673542	total: 3m 2s	remaining: 8m 34s
165:	learn: 0.0673505	total: 3m 3s	remaining: 8m 33s
166:	learn: 0.0673442	total: 3m 4s	remaining: 8m 31s
167:	learn: 0.0673411	total: 3m 6s	remaining: 8m 32s
168:	learn: 0.0673360	total: 3m 7s	remaining: 8m 32s
169:	learn: 0.0673324	total: 3m 9s	remaining: 8m 34s
170:	learn: 0.0673293	total: 3m 11s	remaining: 8m 34s
171:	learn: 0.0673242	total: 3m 12s	remaining: 8m 32s
172:	learn: 0.0673202	total: 3m 13s	remaining: 8m 31s
173:	learn: 0.0673162	total: 3m 14s	remaining: 8m 29s
174:	learn: 0.0673114	total: 3m 15s	re

309:	learn: 0.0668972	total: 5m 59s	remaining: 6m 11s
310:	learn: 0.0668950	total: 6m	remaining: 6m 9s
311:	learn: 0.0668933	total: 6m 1s	remaining: 6m 8s
312:	learn: 0.0668911	total: 6m 2s	remaining: 6m 6s
313:	learn: 0.0668904	total: 6m 2s	remaining: 6m 5s
314:	learn: 0.0668882	total: 6m 3s	remaining: 6m 3s
315:	learn: 0.0668848	total: 6m 5s	remaining: 6m 2s
316:	learn: 0.0668822	total: 6m 6s	remaining: 6m 1s
317:	learn: 0.0668781	total: 6m 7s	remaining: 6m
318:	learn: 0.0668745	total: 6m 8s	remaining: 5m 59s
319:	learn: 0.0668738	total: 6m 9s	remaining: 5m 58s
320:	learn: 0.0668710	total: 6m 10s	remaining: 5m 56s
321:	learn: 0.0668679	total: 6m 11s	remaining: 5m 55s
322:	learn: 0.0668654	total: 6m 12s	remaining: 5m 53s
323:	learn: 0.0668631	total: 6m 13s	remaining: 5m 52s
324:	learn: 0.0668599	total: 6m 14s	remaining: 5m 51s
325:	learn: 0.0668554	total: 6m 15s	remaining: 5m 50s
326:	learn: 0.0668538	total: 6m 16s	remaining: 5m 48s
327:	learn: 0.0668497	total: 6m 16s	remaining: 5m 47

463:	learn: 0.0665387	total: 7m 56s	remaining: 2m 50s
464:	learn: 0.0665367	total: 7m 56s	remaining: 2m 49s
465:	learn: 0.0665361	total: 7m 57s	remaining: 2m 48s
466:	learn: 0.0665345	total: 7m 58s	remaining: 2m 46s
467:	learn: 0.0665329	total: 7m 58s	remaining: 2m 45s
468:	learn: 0.0665318	total: 7m 59s	remaining: 2m 44s
469:	learn: 0.0665303	total: 7m 59s	remaining: 2m 43s
470:	learn: 0.0665278	total: 8m	remaining: 2m 42s
471:	learn: 0.0665233	total: 8m 1s	remaining: 2m 41s
472:	learn: 0.0665195	total: 8m 1s	remaining: 2m 39s
473:	learn: 0.0665170	total: 8m 2s	remaining: 2m 38s
474:	learn: 0.0665130	total: 8m 2s	remaining: 2m 37s
475:	learn: 0.0665098	total: 8m 3s	remaining: 2m 36s
476:	learn: 0.0665085	total: 8m 4s	remaining: 2m 35s
477:	learn: 0.0665059	total: 8m 4s	remaining: 2m 34s
478:	learn: 0.0665052	total: 8m 5s	remaining: 2m 32s
479:	learn: 0.0665035	total: 8m 5s	remaining: 2m 31s
480:	learn: 0.0664999	total: 8m 6s	remaining: 2m 30s
481:	learn: 0.0664989	total: 8m 7s	remaini

617:	learn: 0.0662416	total: 9m 39s	remaining: 11.2s
618:	learn: 0.0662406	total: 9m 40s	remaining: 10.3s
619:	learn: 0.0662393	total: 9m 41s	remaining: 9.38s
620:	learn: 0.0662378	total: 9m 42s	remaining: 8.44s
621:	learn: 0.0662370	total: 9m 43s	remaining: 7.51s
622:	learn: 0.0662355	total: 9m 44s	remaining: 6.57s
623:	learn: 0.0662341	total: 9m 45s	remaining: 5.63s
624:	learn: 0.0662331	total: 9m 46s	remaining: 4.69s
625:	learn: 0.0662317	total: 9m 47s	remaining: 3.75s
626:	learn: 0.0662278	total: 9m 48s	remaining: 2.81s
627:	learn: 0.0662257	total: 9m 49s	remaining: 1.88s
628:	learn: 0.0662250	total: 9m 49s	remaining: 938ms
629:	learn: 0.0662240	total: 9m 50s	remaining: 0us


KeyboardInterrupt: 

In [19]:
# Light GBM Parameter setting
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.0021 # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'          # or 'mae'
params['sub_feature'] = 0.5      # feature_fraction -- OK, back to .5, but maybe later increase this
params['bagging_fraction'] = 0.85 # sub_row
params['bagging_freq'] = 40
params['num_leaves'] = 512        # num_leaf
params['min_data'] = 500         # min_data_in_leaf
params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
params['verbose'] = 0

In [21]:
# Create training dataset for LightGBM
d_train = lgb.Dataset(X_train, label=y_train_GBM)

In [22]:
print("\nFitting LightGBM model ...")
clf = lgb.train(params, d_train, 430)  # DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields propertycountylandusecode
# Need to check d_train, d_train is generate from X_train and y_train. Let's check if they are float, int or bool


Fitting LightGBM model ...


ValueError: DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields propertycountylandusecode

In [None]:
del d_train; gc.collect()
del x_train; gc.collect()

In [None]:
print("\nSetting up data for XGBoost ...")
# xgboost params
xgb_params = {
    'eta': 0.037,
    'max_depth': 5,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

In [None]:
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

In [None]:
num_boost_rounds = 242
print("\nXGBoost tuned with CV in:")
print("   https://www.kaggle.com/aharless/xgboost-without-outliers-tweak ")
print("num_boost_rounds="+str(num_boost_rounds))

# train model
print( "\nTraining XGBoost ...")
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print( "\nPredicting with XGBoost ...")
xgb_pred = model.predict(dtest)

print( "\nXGBoost predictions:" )
print( pd.DataFrame(xgb_pred).head() )

In [None]:
print( "\nCombining XGBoost, LightGBM, and baseline predicitons ..." )
lgb_weight = 1 - XGB_WEIGHT - BASELINE_WEIGHT
pred = XGB_WEIGHT*xgb_pred + BASELINE_WEIGHT*BASELINE_PRED + lgb_weight*p_test

print( "\nCombined predictions:" )
print( pd.DataFrame(pred).head() )

In [None]:
submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})

In [None]:
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

In [None]:
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred
    
    
submission.to_csv('Only_CatBoost.csv', float_format='%.6f',index=False)