In [1]:
import datetime
import pandas
import xgboost

In [2]:
# Load the training CSV, parse the transaction date into a datetime column,
# and rename the two raw columns used downstream to snake_case.
train = (
    pandas.read_csv('raw_data/train_2016_v2.csv')
    .assign(transactiondate=lambda raw: pandas.to_datetime(raw['transactiondate']))
    .rename(columns={'transactiondate': 'transaction_date', 'parcelid': 'parcel_id'})
)

In [3]:
# Derive calendar features from the transaction date and the absolute target.
# Everything goes through the vectorised `.dt` accessor so the four date
# features are computed the same way.
train['dayofweek'] = train['transaction_date'].dt.dayofweek
# `.dt.strftime('%A')` yields the full weekday name without a Python-level
# call per row.
train['weekday_name'] = train['transaction_date'].dt.strftime('%A')
train['month'] = train['transaction_date'].dt.month
train['year'] = train['transaction_date'].dt.year
train['abs_logerror'] = train['logerror'].abs()
# Keep only these columns, in this order; the model cell slices the merged
# frame by position, so the order here matters.
train = train[['transaction_date', 'parcel_id', 'logerror', 'abs_logerror',
               'dayofweek', 'weekday_name', 'month', 'year']]

In [4]:
# Load the property table; it is joined to `train` on `parcel_id` below.
properties = pandas.read_csv('data/properties_2016_cleaned.csv')

In [5]:
# Inner join: training rows with no matching `parcel_id` in `properties`
# are dropped from `train_data`.
train_data = pandas.merge(train, properties, how='inner', on='parcel_id')

### Model

In [6]:
# Feature matrix: every column from position 5 onward, minus the string
# weekday name.
# NOTE(review): assuming the merge keeps `train`'s columns first, position 5
# is `weekday_name`, so this slice also leaves out `dayofweek` (position 4)
# along with the date, id and target columns -- confirm that is intended.
X_train = train_data.iloc[:, 5:].drop('weekday_name', axis='columns')
Y_train = train_data['logerror']
# `fit` hands back the fitted estimator, so construct and fit in one step.
model = xgboost.XGBRegressor().fit(X_train, Y_train)

In [7]:
# Pair each true `logerror` with the model's prediction for the same row.
# These are predictions on `X_train`, the data the model was fitted on.
predictions = train_data[['logerror']].assign(Y_pred=model.predict(X_train))

In [8]:
# Absolute residual per row, then its mean (mean absolute error).
predictions['abs_err'] = (predictions['logerror'] - predictions['Y_pred']).abs()
predictions['abs_err'].mean()

0.06750932214092656

In [9]:
def direction(x):
    """Return the sign of ``x``: 1 if positive, -1 if negative, 0 if zero.

    Values that compare False against zero in every way (NaN) return None,
    so they stay missing instead of being given a direction.
    """
    if x > 0:
        return 1
    if x < 0:
        return -1
    if x == 0:
        # Previously fell through and implicitly returned None for exact zeros.
        return 0
    return None
# Sign of the true error and of the predicted error, element by element.
predictions['true_direction'] = predictions['logerror'].map(direction)
predictions['pred_direction'] = predictions['Y_pred'].map(direction)

def hit(x):
    """Flag one prediction row as a "hit".

    ``x`` is indexed by the keys 'true_direction', 'pred_direction',
    'Y_pred' and 'logerror'. When the two directions differ the row is
    always a hit; when they match it is a hit only if ``Y_pred`` is
    strictly below ``logerror``.

    NOTE(review): the ``Y_pred < logerror`` test is not symmetric in sign
    (for negative errors it favours the larger-magnitude prediction) --
    confirm this is the intended definition.
    """
    if x['true_direction'] != x['pred_direction']:
        return True
    return x['Y_pred'] < x['logerror']
# Evaluate `hit` once per row, then report the fraction of rows flagged True.
predictions['hit'] = predictions.apply(hit, axis='columns')
predictions['hit'].mean()

0.8437995015231238