In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import gc
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the raw competition CSVs. Each year's file is read with the same
# options, so the paired reads are unpacked from a single generator.
print('Loading Properties...')
properties2016, properties2017 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../input/properties_2016.csv', '../input/properties_2017.csv')
)

print('Loading Train...')
# `transactiondate` is parsed to datetime so the `.dt` accessor works later.
train2016, train2017 = (
    pd.read_csv(csv_path, parse_dates=['transactiondate'], low_memory=False)
    for csv_path in ('../input/train_2016_v2.csv', '../input/train_2017.csv')
)

print('Loading Sample ...')
sample_submission = pd.read_csv('../input/sample_submission.csv', low_memory=False)

Loading Properties...
Loading Train...
Loading Sample ...


In [3]:
def add_date_features(df):
    """Replace ``transactiondate`` with calendar-part columns, in place.

    Writes ``transaction_year``, ``transaction_month``, ``transaction_day``
    and ``transaction_quarter`` (read from the ``.dt`` accessor of
    ``df["transactiondate"]``) and then removes ``transactiondate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a datetime-like ``transactiondate`` column. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in.
    """
    stamp = df["transactiondate"].dt
    derived = (
        ("transaction_year", stamp.year),
        ("transaction_month", stamp.month),
        ("transaction_day", stamp.day),
        ("transaction_quarter", stamp.quarter),
    )
    for column, values in derived:
        df[column] = values
    # The raw timestamp is no longer needed once its parts are extracted.
    del df["transactiondate"]
    return df

In [4]:
# Expand `transactiondate` into year/month/day/quarter on both training sets.
train2016, train2017 = add_date_features(train2016), add_date_features(train2017)

# The sample submission spells its key `ParcelId`; mirror it as `parcelid`
# so it can be joined against the properties tables.
sample_submission['parcelid'] = sample_submission['ParcelId']

print('Merge Train & Test with Properties...')
train2016 = train2016.merge(properties2016, how='left', on='parcelid')
train2017 = train2017.merge(properties2017, how='left', on='parcelid')
# NOTE(review): the test frame is joined to the 2016 properties only, yet
# 2017 columns are predicted from it later — confirm this is intended.
test_df = sample_submission.merge(properties2016, how='left', on='parcelid')

print('Concat Train 2016 & 2017...')
# Row-wise stack; the original per-year indices are kept as they are.
train_df = pd.concat([train2016, train2017])

# Release the large intermediates before modelling.
del properties2016, properties2017, train2016, train2017
gc.collect();

print("Train: ", train_df.shape)
print("Test: ", test_df.shape)

Merge Train & Test with Properties...
Concat Train 2016 & 2017...
Train:  (167888, 63)
Test:  (2985217, 65)


In [5]:
# print ("Replacing NaN values by -999 !!")
# train_df.fillna(-999, inplace=True)
# test_df.fillna(-999, inplace=True)

In [6]:
# print(train_df['hashottuborspa'])
# for c in train_df.columns:
#     if c not in ['parcelid', 'logerror']:
#         print(c)
#         print(sum(np.isnan(train_df[c])), len(train_df[c]))
#         #plt.plot(train_df[c][np.isfinite(train_df[c])], train_df['logerror'], marker='o', linestyle = 'None',)
# #         plt.scatter(train_df['logerror'], train_df[c][np.isfinite(train_df[c])])
# #         plt.ylabel(c)
# #         plt.xlabel('logerror')
# #         plt.show()

In [7]:
# Drop a column when more than 98% of its values are missing.
missing_perc_thresh = 0.98
num_rows = train_df.shape[0]
missing_counts = train_df.isnull().sum()
exclude_missing = [
    col for col in train_df.columns
    # Columns without any missing value are skipped before dividing.
    if missing_counts[col] != 0
    and missing_counts[col] / float(num_rows) > missing_perc_thresh
]
print("We exclude: %s" % exclude_missing)
print(len(exclude_missing))

We exclude: ['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']
13


In [8]:
# Collect columns that hold exactly one distinct non-null value.
# `unique()` reports the missing marker as a value of its own, so one is
# subtracted whenever the column contains any nulls.
exclude_unique = [
    col for col in train_df.columns
    if len(train_df[col].unique()) - int(train_df[col].isnull().sum() != 0) == 1
]
print("We exclude: %s" % exclude_unique)
print(len(exclude_unique))

We exclude: ['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
9


In [9]:
# Columns never used as model inputs: the join key, the target, and one
# explicitly excluded column.
exclude_other = ['parcelid', 'logerror','propertyzoningdesc']

# Everything filtered out by any of the three exclusion lists.
excluded_columns = set(exclude_missing) | set(exclude_other) | set(exclude_unique)

# Remaining columns, kept in their original frame order.
train_features = [col for col in train_df.columns if col not in excluded_columns]
print("We use these for training: %s" % train_features)
print(len(train_features))

We use these for training: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyyear', 'censustractandblock']
43


In [10]:
# A feature is treated as categorical when it has fewer than
# `cat_unique_thresh` distinct values and its name carries none of the
# substrings below.
cat_unique_thresh = 1000
non_categorical_markers = ('sqft', 'cnt', 'nbr', 'number')

# Positions (within `train_features`) of the categorical features.
cat_feature_inds = [
    position for position, col in enumerate(train_features)
    if len(train_df[col].unique()) < cat_unique_thresh
    and not any(marker in col for marker in non_categorical_markers)
]

print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'yearbuilt', 'assessmentyear', 'taxdelinquencyyear']


In [11]:
print ("Replacing NaN values by -999 !!")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

# CatBoost only accepts integer or string values for categorical features;
# recent releases raise an error on floats. After the fill above, columns
# that contained NaN are float (e.g. 1.0, -999.0) and object columns mix
# strings with the integer -999, so the categorical columns are normalised
# here, identically for train and test.
for frame in (train_df, test_df):
    for col in (train_features[ind] for ind in cat_feature_inds):
        if col not in frame.columns:
            # e.g. the transaction_* columns are only added to test_df later
            continue
        if pd.api.types.is_numeric_dtype(frame[col]):
            # assumes categorical numeric columns hold whole-number codes
            # (ids, years) — TODO confirm; fractional parts would be truncated
            frame[col] = frame[col].astype('int64')
        else:
            frame[col] = frame[col].astype(str)

Replacing NaN values by -999 !!


In [12]:
def print_feature_importance(model, pool, X_train):
    """Print one ``name<TAB>score`` line per feature, highest score first.

    Parameters
    ----------
    model : object
        Anything exposing ``get_feature_importance(pool)`` that returns one
        score per column of ``X_train``.
    pool : object
        Passed straight through to ``model.get_feature_importance``.
    X_train : pandas.DataFrame
        Only its ``columns`` are read, to label the scores.
    """
    scores = model.get_feature_importance(pool)
    # Sorting (score, name) pairs in reverse puts the largest score first;
    # equal scores fall back to reverse name order.
    ranked = sorted(zip(scores, X_train.columns), reverse=True)
    for importance, column in ranked:
        print('{}\t{}'.format(column, importance))

In [13]:
# Feature matrix and target for the whole training frame.
features_all = train_df[train_features]
target_all = train_df['logerror']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features_all, target_all, test_size=0.2, random_state=99
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# CatBoost pools; `cat_feature_inds` marks the categorical column positions.
all_pool = Pool(data=features_all, label=target_all, cat_features=cat_feature_inds)
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_inds)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feature_inds)

(134310, 43) (134310,)
(33578, 43) (33578,)


In [14]:
# Keyword arguments shared by every CatBoostRegressor built in this notebook.
catboost_parameters = dict(
    iterations=400,
    learning_rate=0.035,
    depth=7,
    verbose=20,
    # l2_leaf_reg=1000,
    task_type='GPU',
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=0,
)

In [15]:
# Baseline model: fit on the training pool, with the held-out pool passed
# as the evaluation set.
model = CatBoostRegressor(**catboost_parameters)
model.fit(X=train_pool, eval_set=test_pool)

0:	learn: 0.0685970	test: 0.0699163	best: 0.0699163 (0)	total: 38.3ms	remaining: 15.3s
20:	learn: 0.0677143	test: 0.0691734	best: 0.0691724 (19)	total: 904ms	remaining: 16.3s
40:	learn: 0.0674822	test: 0.0691351	best: 0.0691332 (39)	total: 1.73s	remaining: 15.1s
60:	learn: 0.0672751	test: 0.0690890	best: 0.0690864 (59)	total: 2.53s	remaining: 14s
80:	learn: 0.0671101	test: 0.0690606	best: 0.0690606 (80)	total: 3.31s	remaining: 13s
100:	learn: 0.0669791	test: 0.0690428	best: 0.0690409 (96)	total: 4.19s	remaining: 12.4s
120:	learn: 0.0668344	test: 0.0690401	best: 0.0690401 (120)	total: 5.01s	remaining: 11.5s
140:	learn: 0.0666984	test: 0.0690549	best: 0.0690401 (120)	total: 5.81s	remaining: 10.7s
160:	learn: 0.0665935	test: 0.0690562	best: 0.0690401 (120)	total: 6.59s	remaining: 9.79s
180:	learn: 0.0664827	test: 0.0690363	best: 0.0690363 (180)	total: 7.4s	remaining: 8.95s
200:	learn: 0.0663762	test: 0.0690351	best: 0.0690342 (194)	total: 8.2s	remaining: 8.11s
220:	learn: 0.0662399	test: 

<catboost.core.CatBoostRegressor at 0x7c41393016d8>

In [16]:
# Rank the baseline model's features on the training pool.
print_feature_importance(model=model, pool=train_pool, X_train=X_train)

regionidzip	7.724040631830069
yearbuilt	6.398873905798383
transaction_month	6.0486148783697855
finishedsquarefeet12	5.887595345735547
propertycountylandusecode	5.366697273336976
transaction_day	5.1986531767461015
regionidcity	5.1939020286442545
taxamount	4.722268908410884
regionidneighborhood	4.259706820111791
calculatedfinishedsquarefeet	3.7029941653460443
lotsizesquarefeet	3.6740485135729957
latitude	2.6329125494010617
structuretaxvaluedollarcnt	2.5889561539003307
propertylandusetypeid	2.5846271892814894
heatingorsystemtypeid	2.5550813763448494
transaction_quarter	2.501783396865115
buildingqualitytypeid	2.4521815972559797
taxvaluedollarcnt	2.4343941689869077
landtaxvaluedollarcnt	2.3977922113805183
longitude	2.39727912674393
assessmentyear	1.903621303438252
censustractandblock	1.8124729004581803
taxdelinquencyyear	1.7388974023329575
rawcensustractandblock	1.407794902449785
airconditioningtypeid	1.3714343119841792
transaction_year	1.3525987989624466
bedroomcnt	1.2370149042560297
fips	

In [17]:
# submission = pd.DataFrame({
#     'ParcelId': test_df['parcelid'],
# })

# test_dates = {
#     '201610': pd.Timestamp('2016-09-30'),
#     '201611': pd.Timestamp('2016-10-31'),
#     '201612': pd.Timestamp('2016-11-30'),
#     '201710': pd.Timestamp('2017-09-30'),
#     '201711': pd.Timestamp('2017-10-31'),
#     '201712': pd.Timestamp('2017-11-30')
# }

# for label, test_date in test_dates.items():
#     print("Predicting for: %s ... " % (label))
#     test_df['transactiondate'] = test_date
#     test_df = add_date_features(test_df)
#     y_pred = model.predict(test_df[train_features])
#     submission[label] = y_pred

# submission_major = 1
# print("Creating submission: submission_%03d.csv ..." % (submission_major))
# submission.to_csv(
#     'submission_%03d.csv' % (submission_major),
#     float_format='%.4f',
#     index=False)
# print("Finished.")

In [18]:
num_ensembles = 5
# Ensemble models: identical configuration, one random seed per member.
models = []
for i in range(num_ensembles):
    print("\nTraining (ensemble): %d ..." % (i))
    # Use a per-member copy of the parameters so the shared
    # `catboost_parameters` dict keeps its configured seed; mutating it here
    # would change what any later re-run of the baseline cell trains.
    member_parameters = dict(catboost_parameters, random_seed=i)
    member = CatBoostRegressor(**member_parameters)
    member.fit(train_pool, eval_set=test_pool)
    models.append(member)
    print('-- Feature Importance --')
    print_feature_importance(member, train_pool, X_train)


Training (ensemble): 0 ...
0:	learn: 0.0685969	test: 0.0699163	best: 0.0699163 (0)	total: 33.1ms	remaining: 13.2s
20:	learn: 0.0677143	test: 0.0691734	best: 0.0691724 (19)	total: 851ms	remaining: 15.4s
40:	learn: 0.0674822	test: 0.0691351	best: 0.0691332 (39)	total: 1.63s	remaining: 14.3s
60:	learn: 0.0672751	test: 0.0690890	best: 0.0690863 (59)	total: 2.44s	remaining: 13.6s
80:	learn: 0.0671101	test: 0.0690606	best: 0.0690606 (80)	total: 3.25s	remaining: 12.8s
100:	learn: 0.0669791	test: 0.0690428	best: 0.0690408 (96)	total: 4.11s	remaining: 12.2s
120:	learn: 0.0668344	test: 0.0690401	best: 0.0690401 (120)	total: 4.93s	remaining: 11.4s
140:	learn: 0.0666984	test: 0.0690549	best: 0.0690401 (120)	total: 5.75s	remaining: 10.6s
160:	learn: 0.0665935	test: 0.0690562	best: 0.0690401 (120)	total: 6.55s	remaining: 9.72s
180:	learn: 0.0664827	test: 0.0690363	best: 0.0690363 (180)	total: 7.35s	remaining: 8.89s
200:	learn: 0.0663761	test: 0.0690351	best: 0.0690342 (194)	total: 8.13s	remaining: 

In [19]:
submission = pd.DataFrame({
    'ParcelId': test_df['parcelid'],
})

# Submission column label -> transaction date fed to the models for it.
# NOTE(review): every label is paired with the last day of the *preceding*
# month (e.g. '201610' -> 2016-09-30) — confirm this offset is intentional.
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}

for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    # Stamp every row with the date, then expand it into the transaction_*
    # feature columns (add_date_features drops `transactiondate` again).
    test_df['transactiondate'] = test_date
    test_df = add_date_features(test_df)
    # Build the feature matrix once per date; selecting the columns inside
    # the ensemble loop would re-copy the whole test frame for every model.
    test_features = test_df[train_features]
    y_pred = np.zeros(len(test_features))
    for i, ensemble_model in enumerate(models):
        print("Ensemble:", i)
        y_pred += ensemble_model.predict(test_features)
    # Plain average over the ensemble members.
    y_pred /= len(models)
    submission[label] = y_pred
    del test_features

submission_major = 2
print("Creating submission: submission_%03d.csv ..." % (submission_major))
submission.to_csv(
    'submission_%03d.csv' % (submission_major),
    float_format='%.4f',
    index=False)
print("Finished.")

Predicting for: 201610 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201611 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201612 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201710 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201711 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Predicting for: 201712 ... 
Ensemble: 0
Ensemble: 1
Ensemble: 2
Ensemble: 3
Ensemble: 4
Creating submission: submission_002.csv ...
Finished.
