Load libraries and import data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the 2016 transactions file; 'transactiondate' is converted to datetimes
# on load so the .dt accessor can be used on it later.
train = pd.read_csv(
    '../input/train_2016.csv',
    parse_dates=['transactiondate'],
)
# (rows, columns) of the transactions table
train.shape

In [None]:
# Read the 2016 properties file with pandas' default parsing options.
prop = pd.read_csv(filepath_or_buffer='../input/properties_2016.csv')
# (rows, columns) of the properties table
prop.shape

In [None]:
# Preview the first five property rows.
prop.head(n=5)

In [None]:
# Show the column labels of the properties table.
prop.columns

Use nearest neighbors regression on bathroomcnt + bedroomcnt + calculatedfinishedsquarefeet + year + month

In [None]:
# Keep only the parcel identifier (used later as the merge key) and the three
# property attributes that feed the model.
X = prop.loc[:, ['parcelid', 'bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']]
X.head(n=5)


Remove rows that contain NaNs or non-positive (≤ 0) values

In [None]:
# Drop rows with any missing value, then keep only the rows in which all three
# attributes are strictly positive.
X = (
    X
    .dropna()
    .loc[lambda df: (df[['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet']] > 0).all(axis=1)]
)
X.shape

In [None]:
# Preview the first five rows of X.
X.head(n=5)

In [None]:
# Inner join on parcelid: only parcels present in both X and the transactions
# table survive, and each row of `train` contributes its own columns.
X = X.merge(train, how='inner', on='parcelid')
X.shape

In [None]:
# Order rows by parcel, then by transaction date within each parcel.
X = X.sort_values(['parcelid', 'transactiondate'])
X.head(n=5)

In [None]:
# Encode the transaction date as one integer of the form YYYYMM
# (year * 100 + month), appended as a new column.
X = X.assign(
    transaction_yearmonth=lambda df: (
        df['transactiondate'].dt.year * 100 + df['transactiondate'].dt.month
    )
)
X.head(n=5)

Create a zero-based index from transaction_yearmonth (0 = earliest month in the data) for use in the distance computation. The data only covers 2016, so differences that span a year boundary do not occur.

In [None]:
# Build a zero-based month index (0 = earliest month present in X).
#
# YYYYMM codes are not evenly spaced across a year boundary: subtracting them
# directly gives 201701 - 201612 = 89 instead of 1. Converting each code to an
# absolute month count (year * 12 + month) first makes consecutive months
# always differ by exactly 1. When all rows fall in a single calendar year the
# result is identical to subtracting the smallest YYYYMM code.
min_transaction_yearmonth = X.transaction_yearmonth.min()  # earliest YYYYMM code; kept available for later cells
months_elapsed = (X.transaction_yearmonth // 100) * 12 + X.transaction_yearmonth % 100
X['transaction_yearmonth_i'] = months_elapsed - months_elapsed.min()

# Sanity check: list the distinct index values that occur.
np.sort(X.transaction_yearmonth_i.unique())

In [None]:
# Pull out the regression target first, then narrow X down to the four model
# inputs (the target and the helper columns are dropped from X here).
y = X['logerror']
X = X.loc[:, ['bathroomcnt', 'bedroomcnt', 'calculatedfinishedsquarefeet', 'transaction_yearmonth_i']]

# Both should report the same number of rows.
print("X shape =", X.shape)
print("y shape =", y.shape)
X.head(n=5)

Fit nearest neighbors regression

In [None]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Sweep the neighbourhood size k over 1..100 and record, for each k, the mean
# and the standard deviation of the 5-fold cross-validated MAE.
n_neighbors_lst = np.arange(1, 100+1, 1)
mae_lst = []  # mean MAE across folds, one entry per k
std_lst = []  # std of the per-fold MAE, one entry per k

# Nearer neighbours get a larger weight in the prediction. This does not depend
# on k, so it is set once outside the loop.
weights = 'distance'

for n_neighbors in n_neighbors_lst:
    # KNN measures distance on the feature values as given. Left unscaled, the
    # square-footage column (typically hundreds to thousands) would dominate
    # the bathroom/bedroom counts and the month index, making them nearly
    # irrelevant to which neighbours are picked. Standardising inside the
    # pipeline puts all four features on a comparable scale, and because the
    # scaler is part of the estimator it is re-fitted on the training part of
    # each fold only (no leakage from the held-out part).
    knn = make_pipeline(
        StandardScaler(),
        neighbors.KNeighborsRegressor(n_neighbors, weights=weights),
    )

    # 'neg_mean_absolute_error' yields the MAE with its sign flipped (so that
    # larger is better); negate to get the MAE back.
    scores = -cross_val_score(knn, X, y, scoring='neg_mean_absolute_error', cv=5)

    mae_lst.append(np.mean(scores))
    std_lst.append(np.std(scores))

# Left axis: mean MAE per k. Right axis (red): spread of the MAE across folds.
fig, ax_mae = plt.subplots()
ax_mae.plot(n_neighbors_lst, mae_lst, linewidth=2)
ax_mae.set_title('MAE by number of neighbors')
ax_mae.set_xlabel('k')
ax_mae.set_ylabel('MAE')

# The two series live on different scales, so the std curve gets its own,
# explicitly labelled y-axis sharing the same x-axis.
ax_std = ax_mae.twinx()
ax_std.plot(n_neighbors_lst, std_lst, linewidth=2, color='red')
ax_std.set_ylabel('Std of MAE across folds', color='red');

Using 5-fold cross-validation, the KNN regressor's Mean Absolute Error improves as more neighbors are used, but at the cost of a larger spread (standard deviation across folds) of the Mean Absolute Error scores.