In this kernel I will show how the logerror of a parcel is related to the logerror of its closest neighbors in the past. These values can be used as features, and this does not use any leak because we are only using values from the past.

In [None]:
import numpy as np #  algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt #Provides MATLAB-like plotting framework
import seaborn as sns #python visualization library
from sklearn.neighbors import NearestNeighbors
color = sns.color_palette()

%matplotlib inline

##### Loading data

In [None]:
# Transaction log; `transactiondate` is parsed into a datetime column on load.
train = pd.read_csv('../input/train_2016_v2.csv', parse_dates=["transactiondate"])

# Only the property attributes used further down are read from the
# properties file (`usecols` restricts which columns are loaded).
property_columns = [
    'longitude', 'latitude', 'parcelid', 'regionidzip',
    'propertyzoningdesc', 'taxvaluedollarcnt', 'structuretaxvaluedollarcnt',
]
prop = pd.read_csv('../input/properties_2016.csv', usecols=property_columns)

#### some boring data cleaning and munging

In [None]:
# Attach the property attributes to every transaction and order the rows
# chronologically. The index is rebuilt after sorting so that row labels
# (0..n-1) coincide with chronological row positions; otherwise the sorted
# frame keeps its pre-sort labels and a label-based lookup such as
# `train.loc[i]` returns a different row than position `i`.
train = (
    pd.merge(train, prop, on=['parcelid'], how='left')
    .sort_values('transactiondate')
    .reset_index(drop=True)
)

# Rescale the coordinates by 1e6.
# NOTE(review): presumably the raw columns hold degrees multiplied by 1e6 — confirm.
train['latitude'] = train['latitude'] / 1e6
train['longitude'] = train['longitude'] / 1e6

#### Let's restrict ourselves to 3 neighbors for now
We will loop over all dates in our train set and, for each date, find the closest neighbors of every parcel sold on that date among the parcels sold on preceding dates.

In [None]:
num_neighbors = 3

# Every transaction date except the first one in the frame: rows on the first
# date have no earlier transactions to search. This relies on `train` already
# being sorted by `transactiondate`, since `unique()` keeps order of appearance.
train_dates = train.transactiondate.unique()[1:]

# Pre-create the neighbour feature columns. They start as NaN (float) rather
# than 0 so that rows which never receive a neighbour (those on the first
# date) are marked as missing instead of looking like a real
# "logerror 0 / distance 0" observation.
for n in range(1, num_neighbors + 1):
    for feature in ('logerror', 'distance', 'taxvaluedollarcnt', 'structuretaxvaluedollarcnt'):
        train['nn{}_{}'.format(n, feature)] = np.nan
        


In [None]:
# Columns used to locate neighbours, and the per-neighbour values copied over.
coord_cols = ['longitude', 'latitude']
neighbor_value_cols = ['logerror', 'taxvaluedollarcnt', 'structuretaxvaluedollarcnt']

# The nn* columns receive float values; cast them once up front so the
# assignments below never depend on implicit integer -> float upcasting.
nn_feature_cols = ['nn{}_{}'.format(n, name)
                   for n in range(1, num_neighbors + 1)
                   for name in ['distance'] + neighbor_value_cols]
train[nn_feature_cols] = train[nn_feature_cols].astype(float)

for d in train_dates:
    is_past = train.transactiondate < d
    is_today = train.transactiondate == d

    # Snapshot of the strictly-earlier transactions; the neighbour search is
    # fitted on exactly these rows, in exactly this order.
    past = train.loc[is_past, coord_cols + neighbor_value_cols]

    # Early dates can have fewer earlier transactions than `num_neighbors`;
    # asking for more neighbours than fitted samples raises in scikit-learn.
    # Columns for neighbour ranks above `k` keep their initial value.
    k = min(num_neighbors, len(past))

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(past[coord_cols])
    # `distances` and `indices` both have shape (rows on date d, k). Distances
    # use the estimator's default metric on the raw (longitude, latitude) values.
    distances, indices = nbrs.kneighbors(train.loc[is_today, coord_cols])

    train.loc[is_today, ['nn{}_distance'.format(n) for n in range(1, k + 1)]] = distances

    for name in neighbor_value_cols:
        # `indices` are POSITIONS within `past`, not index labels of `train`,
        # so the lookup must be positional. Indexing the value array with the
        # 2-D `indices` array yields one column per neighbour rank.
        neighbor_values = past[name].values[indices]
        target_cols = ['nn{}_{}'.format(n, name) for n in range(1, k + 1)]
        train.loc[is_today, target_cols] = neighbor_values
    


#### Let's see if there are any patterns

#### Logerror vs nearest neighbors logerrors

In [None]:
# Parcel logerror against the logerror of its closest earlier-sold neighbour.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train['nn1_logerror'], train['logerror'])
ax.set_xlabel('logerror of 1st nearest neighbor in past')
ax.set_ylabel('logerror of parcel')
plt.show()

##### Whenever the logerror of one is zero, the other is not. This pattern is consistent with the other nearest neighbors as well

In [None]:
# Parcel logerror against the logerror of its 2nd closest earlier-sold neighbour.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train['nn2_logerror'], train['logerror'])
ax.set_xlabel('logerror of 2nd nearest neighbor in past')
ax.set_ylabel('logerror of parcel')
plt.show()

In [None]:
# Parcel logerror against the logerror of its 3rd closest earlier-sold neighbour.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train['nn3_logerror'], train['logerror'])
ax.set_xlabel('logerror of 3rd nearest neighbor in past')
ax.set_ylabel('logerror of parcel')
plt.show()

#### Logerrors vs Distance to nearest neighbors

In [None]:
# Absolute logerror of each parcel against the distance to its closest
# earlier-sold neighbour. The y axis shows |logerror|, so the axis label
# says so explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train.nn1_distance, np.abs(train.logerror))
ax.set_xlabel('Distance to 1st nearest neighbor')
ax.set_ylabel('absolute logerror of parcel')
plt.show()

##### As the distance to its nearest neighbor increases, the absolute logerror of a parcel seems to decrease towards zero

#### It seems to be the same with 2nd nearest neighbor as well

In [None]:
# Absolute logerror of each parcel against the distance to its 2nd closest
# earlier-sold neighbour. The y axis shows |logerror|, so the axis label
# says so explicitly.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(train.nn2_distance, np.abs(train.logerror))
ax.set_xlabel('Distance to 2nd nearest neighbor')
ax.set_ylabel('absolute logerror of parcel')
plt.show()

#### taxvaluedollarcnt vs nearest neighbors taxvaluedollarcnt

In [None]:
# A parcel's taxvaluedollarcnt against the same attribute of each of its three
# nearest earlier-sold neighbours, one colour per neighbour rank.
fig, ax = plt.subplots(figsize=(10, 10))
for palette_pos, neighbor in enumerate(('nn1', 'nn2', 'nn3')):
    ax.scatter(
        train['taxvaluedollarcnt'],
        train[neighbor + '_taxvaluedollarcnt'],
        color=color[palette_pos],
        label=neighbor,
    )
ax.set_xlabel('taxvaluedollarcnt')
ax.set_ylabel('taxvaluedollarcnt of neighbor')
ax.legend(loc='best')
plt.show()

In [None]:
# A parcel's structuretaxvaluedollarcnt against the same attribute of each of
# its three nearest earlier-sold neighbours, one colour per neighbour rank.
fig, ax = plt.subplots(figsize=(10, 10))
for palette_pos, neighbor in enumerate(('nn1', 'nn2', 'nn3')):
    ax.scatter(
        train['structuretaxvaluedollarcnt'],
        train[neighbor + '_structuretaxvaluedollarcnt'],
        color=color[palette_pos],
        label=neighbor,
    )
ax.set_xlabel('structuretaxvaluedollarcnt')
ax.set_ylabel('structuretaxvaluedollarcnt of neighbor')
ax.legend(loc='best')
plt.show()

#### We can try other features of nearest neighbors as well and include them as features during training.

#### In my findings I found that regionidzip and propertyzoningdesc seem to be important to find logerror

##### Let's look at the distribution of logerrors for the 10 most frequent regionidzips

In [None]:
to_look = 10

# Count the zip codes once (value_counts sorts by frequency, most frequent
# first) instead of recomputing the counts twice per subplot.
top_zips = train.regionidzip.value_counts().index[:to_look]

# squeeze=False keeps `ax` two-dimensional even for a single row, so the
# column slice below always yields a sequence of axes.
fig, ax = plt.subplots(to_look, 1, figsize=(15, 15), sharex=True, squeeze=False)

# zip() stops at the shorter sequence, so having fewer than `to_look` distinct
# zip codes leaves trailing axes empty instead of raising an IndexError.
for axis, zip_code in zip(ax[:, 0], top_zips):
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # here so the rendered plots stay the same.
    sns.distplot(train.loc[train.regionidzip == zip_code, 'logerror'],
                 ax=axis, bins=80)
    axis.set_ylabel(zip_code)
plt.show()

The distributions seem to be unimodal even when we split them by zip code. Does that mean Zillow is using a different model for each zip code?

##### Let's look at the distribution of logerrors for the 10 most frequent propertyzoningdesc values

In [None]:
to_look = 10

# Count the zoning descriptions once (value_counts sorts by frequency, most
# frequent first) instead of recomputing the counts twice per subplot.
top_zonings = train.propertyzoningdesc.value_counts().index[:to_look]

# squeeze=False keeps `ax` two-dimensional even for a single row, so the
# column slice below always yields a sequence of axes.
fig, ax = plt.subplots(to_look, 1, figsize=(15, 15), sharex=True, squeeze=False)

# zip() stops at the shorter sequence, so having fewer than `to_look` distinct
# zoning descriptions leaves trailing axes empty instead of raising an IndexError.
for axis, zoning in zip(ax[:, 0], top_zonings):
    # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
    # here so the rendered plots stay the same.
    sns.distplot(train.loc[train.propertyzoningdesc == zoning, 'logerror'],
                 ax=axis, bins=80)
    axis.set_ylabel(zoning)
plt.show()

#### I will be updating this kernel if I find anything useful so stay tuned and I welcome any suggestions
### Please upvote this kernel if you found this to be useful