In [None]:
MAKE_SUBMISSION = True          # Generate output file.
CV_ONLY = False                 # Do validation only; do not generate predictions.
FIT_FULL_TRAIN_SET = True       # Fit model to full training set after doing validation.
FIT_2017_TRAIN_SET = False      # Use 2017 training data for full fit (no leak correction); ignored when FIT_COMBINED_TRAIN_SET is True
FIT_COMBINED_TRAIN_SET = True   # Fit combined 2016-2017 training set
USE_SEASONAL_FEATURES = True    # Add cos/sin seasonal features computed from the transaction date
VAL_SPLIT_DATE = '2016-09-15'   # Cutoff date for validation split
LEARNING_RATE = 0.007           # shrinkage rate for boosting rounds
ROUNDS_PER_ETA = 20             # maximum number of boosting rounds times learning rate
OPTIMIZE_FUDGE_FACTOR = False   # Optimize factor by which to multiply predictions.
FUDGE_FACTOR_SCALEDOWN = 0.3    # exponent to reduce optimized fudge factor for prediction

In [None]:
import gc
import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import kendalltau
from sklearn import preprocessing
import warnings
matplotlib.style.use('ggplot')
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")
%matplotlib inline

#XGB import comes here
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.quantile_regression import QuantReg
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Load the per-parcel attribute tables for both years.
properties2016 = pd.read_csv('../input/properties_2016.csv', low_memory=False)
properties2017 = pd.read_csv('../input/properties_2017.csv', low_memory=False)

# Lookup tables derived from the 2016 properties *before* missing values are
# filled in below; calculate_features() maps them onto each row.
# Parcels per zip code / per city
zip_count = properties2016['regionidzip'].value_counts().to_dict()
city_count = properties2016['regionidcity'].value_counts().to_dict()
# Per-neighborhood summaries: median build year, mean finished area,
# and median latitude / longitude
med_year = properties2016.groupby('regionidneighborhood')['yearbuilt'].median().to_dict()
mean_area = properties2016.groupby('regionidneighborhood')['calculatedfinishedsquarefeet'].mean().to_dict()
med_lat = properties2016.groupby('regionidneighborhood')['latitude'].median().to_dict()
med_long = properties2016.groupby('regionidneighborhood')['longitude'].median().to_dict()

# Only the 2016 transactions are read here; the 2017 ones are loaded and
# concatenated later, when the model is refit on the full data.
train = pd.read_csv("../input/train_2016_v2.csv")

# Replace missing values with -1 in every column, then turn each object-typed
# column into integer codes with its own LabelEncoder.
for col in properties2016.columns:
    properties2016[col] = properties2016[col].fillna(-1)
    if properties2016[col].dtype != 'object':
        continue
    encoder = LabelEncoder().fit(list(properties2016[col].values))
    properties2016[col] = encoder.transform(list(properties2016[col].values))

In [None]:
# Attach the 2016 property attributes to every 2016 transaction
# (left join on parcelid, so every transaction row is kept).
train2016 = pd.merge(train, properties2016, on='parcelid', how='left')
# Boolean mask: True for transactions dated on or after VAL_SPLIT_DATE.
# Those rows become the validation set further down.
select_qtr4 = pd.to_datetime(train2016["transactiondate"]) >= VAL_SPLIT_DATE
if USE_SEASONAL_FEATURES:
    # Reference day (as an ordinal day number) that the seasonal features
    # measure their day offsets from.
    basedate = pd.Timestamp('2015-11-15').toordinal()


In [None]:
# The raw 2016 transaction table has already been merged into train2016,
# so drop the name and ask the garbage collector to reclaim the memory.
del train
gc.collect()

In [None]:
# Lookup tables for the features that are derived from the target variable.
# They are built from the pre-validation-split rows only (~select_qtr4).
# (Ideally they, and the features that depend on them, would be recomputed
#  when fitting to the full training set; that is not implemented yet.)

# Standard deviation of logerror per city / zip / neighborhood
city_std, zip_std, hood_std = (
    train2016[~select_qtr4].groupby(group_col)['logerror'].std().to_dict()
    for group_col in ('regionidcity', 'regionidzip', 'regionidneighborhood')
)

In [None]:
def clean_type(prop):
    """Normalise the column dtypes of ``prop`` in place.

    * float64 columns are downcast to float32.
    * object columns are replaced by the integer codes produced by a
      LabelEncoder fitted on that column's own values.
    * every other column is left untouched.

    Nothing is returned; a garbage collection pass runs at the end.
    """
    for name in prop.columns:
        kind = prop[name].dtype.name
        if kind == 'float64':
            prop[name] = prop[name].astype(np.float32)
        elif kind == 'object':
            encoder = preprocessing.LabelEncoder()
            encoder.fit(list(prop[name].values))
            prop[name] = encoder.transform(list(prop[name].values))

    gc.collect()


In [None]:
def calculate_features(df):
    """Add the engineered feature columns to ``df`` in place.

    The function reads module-level lookup dictionaries (``zip_count``,
    ``city_count``, ``mean_area``, ``med_year``, ``med_lat``, ``med_long``,
    ``zip_std``, ``city_std``, ``hood_std``) at call time, so whatever those
    names are bound to when the function is called is what gets mapped.
    Keys missing from a dictionary yield NaN in the new column.

    When ``USE_SEASONAL_FEATURES`` is true, ``df`` must also contain a
    ``transactiondate`` column and the module-level ``basedate`` (an ordinal
    day number) must be defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the property columns referenced below.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Nikunj's features
    # Number of properties in the zip
    df['N-zip_count'] = df['regionidzip'].map(zip_count)
    # Number of properties in the city
    df['N-city_count'] = df['regionidcity'].map(city_count)
    # 1 when garagecarcnt > 0 AND pooltypeid10 > 0 AND airconditioningtypeid != 5,
    # otherwise 0
    df['N-GarPoolAC'] = ((df['garagecarcnt']>0) & \
                         (df['pooltypeid10']>0) & \
                         (df['airconditioningtypeid']!=5))*1

    # More features
    # Mean square feet of neighborhood properties
    df['mean_area'] = df['regionidneighborhood'].map(mean_area)
    # Median year of construction of neighborhood properties
    df['med_year'] = df['regionidneighborhood'].map(med_year)
    # Neighborhood latitude and longitude
    df['med_lat'] = df['regionidneighborhood'].map(med_lat)
    df['med_long'] = df['regionidneighborhood'].map(med_long)

    # Standard deviation of the target within each zip / city / neighborhood
    df['zip_std'] = df['regionidzip'].map(zip_std)
    df['city_std'] = df['regionidcity'].map(city_std)
    df['hood_std'] = df['regionidneighborhood'].map(hood_std)

    if USE_SEASONAL_FEATURES:
        # Parse the dates once and take whole-day offsets from basedate with
        # vectorized datetime arithmetic. This replaces two separate passes
        # that each called Timestamp.toordinal() per row in Python; the
        # resulting day counts are the same.
        transaction_dates = pd.to_datetime(df['transactiondate'])
        day_offset = (transaction_dates - pd.Timestamp.fromordinal(basedate)).dt.days
        # Map the day offset onto a yearly cycle (365.25 days per turn).
        phase = day_offset * (2*np.pi/365.25)
        df['cos_season'] = np.cos(phase)
        df['sin_season'] = np.sin(phase)


In [None]:
# Property columns removed from every model matrix (train, validation, test).
dropvars = ['airconditioningtypeid', 'buildingclasstypeid',
            'buildingqualitytypeid', 'regionidcity']
# Additional columns removed from training frames: the row id, the target
# and the raw transaction date.
droptrain = ['parcelid', 'logerror', 'transactiondate']
# Additional columns removed from test frames. The test-set cell extends this
# list with 'transactiondate' when seasonal features are enabled.
droptest = ['ParcelId']

In [None]:
# Add the engineered feature columns to the merged 2016 training frame.
calculate_features(train2016)

# Validation set: rows on/after VAL_SPLIT_DATE, taken BEFORE outlier removal,
# so validation error is measured on unfiltered targets.
x_valid = train2016.drop(dropvars+droptrain, axis=1)[select_qtr4]
y_valid = train2016["logerror"].values.astype(np.float32)[select_qtr4]

print('Shape full training set: {}'.format(train2016.shape))
print('Dropped vars: {}'.format(len(dropvars+droptrain)))
print('Shape valid X: {}'.format(x_valid.shape))
print('Shape valid y: {}'.format(y_valid.shape))

# Outlier removal: keep only rows with -0.4 < logerror < 0.419.
train2016=train2016[ train2016.logerror > -0.4 ]
train2016=train2016[ train2016.logerror < 0.419 ]
print('\nFull training set after removing outliers, before dropping vars:')
print('Shape training set: {}\n'.format(train2016.shape))

if FIT_FULL_TRAIN_SET:
    # Keep a copy of all outlier-filtered rows (both sides of the split date).
    full_train = train2016.copy()

# Training subset: outlier-filtered rows dated before VAL_SPLIT_DATE.
# NOTE(review): select_qtr4 still has the pre-filter row labels; this relies
# on pandas aligning the boolean mask to the remaining rows by index label.
train2016=train2016[~select_qtr4]
x_train=train2016.drop(dropvars+droptrain, axis=1)
y_train = train2016["logerror"].values.astype(np.float32)
y_mean = np.mean(y_train)      # used later as the booster's base_score
n_train = x_train.shape[0]     # used later to scale the number of boosting rounds
print('Training subset after removing outliers:')
print('Shape train X: {}'.format(x_train.shape))
print('Shape train y: {}'.format(y_train.shape))

if FIT_FULL_TRAIN_SET:
    x_full = full_train.drop(dropvars+droptrain, axis=1)
    y_full = full_train["logerror"].values.astype(np.float32)
    n_full = x_full.shape[0]
    print('\nFull trainng set:')
    # Report the full-set shapes (previously this printed x_train / y_train
    # again, so the "full" numbers were just a repeat of the subset above).
    print('Shape train X: {}'.format(x_full.shape))
    print('Shape train y: {}'.format(y_full.shape))

In [None]:
if not CV_ONLY:
    # Generate test set data

    sample_submission = pd.read_csv('../input/sample_submission.csv', low_memory = False)

    # Process properties for 2016
    # One row per submission ParcelId, carrying the 2016 property attributes.
    test_df = pd.merge( sample_submission[['ParcelId']],
                        properties2016.rename(columns = {'parcelid': 'ParcelId'}),
                        how = 'left', on = 'ParcelId' )
    if USE_SEASONAL_FEATURES:
        # Every test row gets the same nominal transaction date for the
        # seasonal features.
        test_df['transactiondate'] = '2016-10-31'
        # Register the helper column for removal exactly once, so re-running
        # this cell does not keep appending duplicates to droptest.
        if 'transactiondate' not in droptest:
            droptest.append('transactiondate')
    calculate_features(test_df)
    x_test = test_df.drop(dropvars+droptest, axis=1)
    print('Shape test: {}'.format(x_test.shape))

    # Process properties for 2017
    # Same imputation and label encoding as for 2016.
    # NOTE(review): these encoders are fitted on the 2017 table only, so the
    # integer code of a given category may differ from its 2016 code - confirm
    # this is acceptable.
    for c in properties2017.columns:
        properties2017[c]=properties2017[c].fillna(-1)
        if properties2017[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(properties2017[c].values))
            properties2017[c] = lbl.transform(list(properties2017[c].values))
    # Rebind the module-level lookup tables to their 2017 versions; every later
    # calculate_features() call uses these. Unlike the 2016 tables, they are
    # computed after the -1 fill above.
    zip_count = properties2017['regionidzip'].value_counts().to_dict()
    city_count = properties2017['regionidcity'].value_counts().to_dict()
    med_year = properties2017.groupby('regionidneighborhood')['yearbuilt'].aggregate('median').to_dict()
    mean_area = properties2017.groupby('regionidneighborhood')['calculatedfinishedsquarefeet'].aggregate('mean').to_dict()
    med_lat = properties2017.groupby('regionidneighborhood')['latitude'].aggregate('median').to_dict()
    med_long = properties2017.groupby('regionidneighborhood')['longitude'].aggregate('median').to_dict()

    test_df = pd.merge( sample_submission[['ParcelId']],
                        properties2017.rename(columns = {'parcelid': 'ParcelId'}),
                        how = 'left', on = 'ParcelId' )
    if USE_SEASONAL_FEATURES:
        test_df['transactiondate'] = '2017-10-31'
    calculate_features(test_df)
    x_test17 = test_df.drop(dropvars+droptest, axis=1)

    del test_df

In [None]:
# The training subset frame and the validation mask are no longer needed;
# drop both names and run a garbage collection pass.
del train2016
del select_qtr4
gc.collect()

In [None]:
# Booster configuration shared by the validation fit and the full refit.
xgb_params = {  # best as of 2017-09-28 13:20 UTC
    'eta': LEARNING_RATE,
    'max_depth': 7,
    'subsample': 0.6,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 5.0,
    'alpha': 0.65,
    'colsample_bytree': 0.5,
    # Start boosting from the mean training target.
    'base_score': y_mean,
    # A stray 'taxdelinquencyyear' literal used to sit before this key; Python
    # concatenated the two adjacent strings into 'taxdelinquencyyearsilent',
    # so 'silent' was never actually passed.
    'silent': 1
}

# Normalise dtypes in place (object -> integer codes, float64 -> float32).
clean_type(x_train)
clean_type(x_valid)

dtrain = xgb.DMatrix(x_train, y_train)
dvalid_x = xgb.DMatrix(x_valid)              # features only, for predict()
dvalid_xy = xgb.DMatrix(x_valid, y_valid)    # features + labels, for evaluation
if not CV_ONLY:
    # x_test / x_test17 are only created when CV_ONLY is False, so cleaning
    # them has to sit behind the same guard (it used to raise NameError in
    # CV-only runs).
    clean_type(x_test)
    clean_type(x_test17)
    dtest = xgb.DMatrix(x_test)
    dtest17 = xgb.DMatrix(x_test17)
    del x_test
    del x_test

In [None]:
# The training features now live in the dtrain DMatrix; release the DataFrame.
del x_train
gc.collect()

In [None]:
# Upper bound on boosting rounds: ROUNDS_PER_ETA divided by the learning rate,
# so a smaller learning rate is given proportionally more rounds.
num_boost_rounds = round(ROUNDS_PER_ETA / xgb_params['eta'])
# Early-stopping patience: one twentieth of that upper bound.
early_stopping_rounds = round(num_boost_rounds / 20)
print('Boosting rounds: {}\nEarly stoping rounds: {}'.format(
    num_boost_rounds, early_stopping_rounds))

In [None]:
# Fit the validation model. Both the training matrix and the labelled
# validation matrix are passed as evaluation sets, and early_stopping_rounds
# lets training end before num_boost_rounds is reached. verbose_eval=10
# controls how often progress is logged.
evals = [(dtrain,'train'),(dvalid_xy,'eval')]
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds,
                  evals=evals, early_stopping_rounds=early_stopping_rounds, 
                  verbose_eval=10)

In [None]:
# Predict the validation rows, limiting the booster to model.best_ntree_limit
# trees (the best iteration recorded during early stopping).
valid_pred = model.predict(dvalid_x, ntree_limit=model.best_ntree_limit)
print( "XGBoost validation set predictions:" )
print( pd.DataFrame(valid_pred).head() )
print("\nMean absolute validation error:")
# Left as a bare final expression so the notebook displays the MAE as the
# cell's output.
mean_absolute_error(y_valid, valid_pred)

In [None]:
# `fudge` is the multiplier applied to every prediction from here on.
if not OPTIMIZE_FUDGE_FACTOR:
    # No optimisation requested: leave predictions unscaled.
    fudge = 1.0
else:
    # Median (q=0.5) quantile regression of the true validation targets on the
    # predictions, without an intercept: the single coefficient is the
    # multiplier that minimises absolute error on the validation set.
    mod = QuantReg(y_valid, valid_pred)
    res = mod.fit(q=.5)
    print("\nLAD Fit for Fudge Factor:")
    print(res.summary())

    fudge = res.params[0]
    print("Optimized fudge factor:", fudge)
    print("\nMean absolute validation error with optimized fudge factor: ")
    print(mean_absolute_error(y_valid, valid_pred * fudge))

    # Raise the fitted factor to the power FUDGE_FACTOR_SCALEDOWN before
    # using it for real predictions.
    fudge = fudge ** FUDGE_FACTOR_SCALEDOWN
    print("Scaled down fudge factor:", fudge)
    print("\nMean absolute validation error with scaled down fudge factor: ")
    print(mean_absolute_error(y_valid, valid_pred * fudge))

In [None]:
if FIT_FULL_TRAIN_SET and not CV_ONLY:
    # Refit on all available training data. When neither of the two flags
    # below is set, x_full / y_full / n_full built in the train/validation
    # cell (outlier-filtered 2016 data) are reused unchanged.
    if FIT_COMBINED_TRAIN_SET or FIT_2017_TRAIN_SET:
        if FIT_COMBINED_TRAIN_SET:
            # Merge 2016 and 2017 data sets
            train16 = pd.read_csv('../input/train_2016_v2.csv')
            train17 = pd.read_csv('../input/train_2017.csv')
            train16 = pd.merge(train16, properties2016, how = 'left', on = 'parcelid')
            train17 = pd.merge(train17, properties2017, how = 'left', on = 'parcelid')
            # Blank out the 2017 tax/value columns for the 2017 transactions.
            # NOTE(review): presumably this is the "leak correction" mentioned
            # on the FIT_2017_TRAIN_SET flag - confirm.
            train17[['structuretaxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxvaluedollarcnt', 'taxamount']] = np.nan
            # ignore_index gives the stacked frame unique row labels instead
            # of two overlapping 0..n ranges; no later step depends on labels.
            train_df = pd.concat([train16, train17], axis = 0, ignore_index = True)
        else:
            # 2017 transactions only, with the tax/value columns left as-is.
            train = pd.read_csv('../input/train_2017.csv')
            train_df = train.merge(properties2017, how='left', on='parcelid')

        # Generate features
        # Recompute the target-dependent lookup tables on the refit data
        # before calling calculate_features().
        # NOTE(review): dtest / dtest17 were built earlier from the previous
        # values of these tables and are not rebuilt here.
        city_std = train_df.groupby('regionidcity')['logerror'].aggregate("std").to_dict()
        zip_std = train_df.groupby('regionidzip')['logerror'].aggregate("std").to_dict()
        hood_std = train_df.groupby('regionidneighborhood')['logerror'].aggregate("std").to_dict()
        calculate_features(train_df)
        # Remove outliers
        train_df=train_df[ train_df.logerror > -0.4 ]
        train_df=train_df[ train_df.logerror < 0.419 ]
        # Create final training data sets
        x_full = train_df.drop(dropvars+droptrain, axis=1)
        y_full = train_df["logerror"].values.astype(np.float32)
        n_full = x_full.shape[0]

    dtrain = xgb.DMatrix(x_full, y_full)
    # Scale the best round count from the validation fit by the ratio of
    # full-set size to validation-fit training size. No early stopping here,
    # since there is no held-out evaluation set.
    num_boost_rounds = int(model.best_ntree_limit*n_full/n_train)
    full_model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds,
                           evals=[(dtrain,'train')], verbose_eval=10)

In [None]:
# Both property tables have been consumed by the train and test frames built
# above; release them before predicting.
del properties2016
del properties2017
gc.collect()

In [None]:
if not CV_ONLY:
    if not FIT_FULL_TRAIN_SET:
        # No refit was done: use the validation model, limited to the trees
        # up to its best early-stopping iteration.
        pred = model.predict(dtest, ntree_limit=model.best_ntree_limit) * fudge
        pred17 = model.predict(dtest17, ntree_limit=model.best_ntree_limit) * fudge
    else:
        # Use the model refit on the full training data.
        pred = full_model.predict(dtest) * fudge
        pred17 = full_model.predict(dtest17) * fudge

    # Show the first few predictions for each year as a sanity check.
    print( "XGBoost test set predictions for 2016:" )
    print( pd.DataFrame(pred).head() )
    print( "XGBoost test set predictions for 2017:" )
    print( pd.DataFrame(pred17).head() )

In [None]:
if MAKE_SUBMISSION and not CV_ONLY:
    # Format every prediction as a string rounded to 4 decimal places.
    y_pred = np.array([str(round(value, 4)) for value in pred])
    y_pred17 = np.array([str(round(value, 4)) for value in pred17])

    # The three months of each year all receive that year's prediction.
    output = pd.DataFrame({'ParcelId': sample_submission['ParcelId'].astype(np.int32),
            '201610': y_pred, '201611': y_pred, '201612': y_pred,
            '201710': y_pred17, '201711': y_pred17, '201712': y_pred17})
    # Put 'ParcelId' first by naming the column order explicitly. The previous
    # "move the last column to the front" trick only worked when pandas sorted
    # the dict keys; with insertion-ordered dicts 'ParcelId' is already first
    # and the trick moved '201712' to the front instead.
    output = output[['ParcelId',
                     '201610', '201611', '201612',
                     '201710', '201711', '201712']]

    # Timestamped file name so successive runs do not overwrite each other.
    output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)

In [None]:
# Closing summary: validation MAE of the raw predictions and, when a fudge
# factor was optimised, of the rescaled predictions.
print("Mean absolute validation error without fudge factor: ")
print(mean_absolute_error(y_valid, valid_pred))
if OPTIMIZE_FUDGE_FACTOR:
    print("Mean absolute validation error with fudge factor:")
    print(mean_absolute_error(y_valid, valid_pred * fudge))