In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
import sklearn.metrics as sm
import math
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
pd.options.mode.chained_assignment = None

# **Summary**

The data was shuffled and split into a training dataset containing 700 houses and a test dataset containing 220 houses.
I estimated the price per square meter of each house in the test data as the arithmetic mean of the prices per square meter of its 10 spatially closest neighbors in the training dataset. The estimated price per square meter is then simply multiplied by the house's area to obtain the predicted price. I tried various forms of weighting by distance, but this didn't yield any improvement. The results of this approach are generally promising, even though there is still a lot of room for improvement.

* RMSE: 187445.32644908634
* RMSLE: 0.20503366937947035
* Median Absolute Error: 51913.5
* R²: 0.8352636354299533

I'm really looking forward to your input, corrections and general feedback, as I just recently started my data science journey.

In [None]:
# Load the raw Amsterdam listings from the CSV file into `data`.
csv_path = "../input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv"
data = pd.read_csv(csv_path)

In [None]:
# Drop the CSV's leftover index column and every row without a target price.
data = data.drop(columns=['Unnamed: 0'])
data = data[data['Price'].notna()]

# Shuffle reproducibly (fixed seed) and renumber the rows from 0, so the
# first 700 rows -- the training set -- carry the index labels 0..699.
data = data.sample(frac=1, random_state=0).reset_index(drop=True)

# Take explicit copies: both frames are modified below (a column is popped
# from `test`, one is added to `train`). Writing to a plain slice of `data`
# is chained assignment, which only runs quietly here because the pandas
# warning has been switched off in the import cell.
train = data[:700].copy()
test = data[700:].copy()

# Hold the true prices out of the test features.
test_y = test.pop('Price')

# Target used by the neighbour model: price per square meter.
train['Price_m²'] = train['Price'] / train['Area']

In [None]:
# Display the training split: the first 700 shuffled rows, including the
# derived 'Price_m²' column.
train

In [None]:
# Display the test split: the rows from position 700 onwards, with 'Price'
# already removed (it is kept separately in `test_y`).
test

In [None]:
# Descriptive statistics for the columns of the cleaned, shuffled data set
# (train and test rows together).
data.describe()

In [None]:
# Price against area for every listing. An explicit Axes is used so the
# figure carries its own labels, and plt.show() keeps the artist repr out of
# the cell output.
fig, ax = plt.subplots()
ax.scatter(data['Area'], data['Price'])
ax.set_xlabel('Area (m²)')
ax.set_ylabel('Price')
ax.set_title('Price vs. area')
plt.show()

In [None]:
# Price per square meter for every listing (train and test rows alike);
# used here only to colour the map.
data['Price_m²'] = data['Price'] / data['Area']

# Map of all listings, coloured by price per square meter. The colorbar is
# what makes the colour encoding readable.
fig, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(data['Lon'], data['Lat'], c=data['Price_m²'], cmap='viridis')
fig.colorbar(points, ax=ax, label='Price per m²')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Price per m² by location')
plt.show()

In [None]:
tree = BallTree(np.deg2rad(train[['Lon', 'Lat']].values), metric='haversine')

In [None]:
test['Mean_price_m²'] = 0
n = 10
for i in  test.index:
    dist, ind = tree.query(np.deg2rad(np.c_[test['Lon'][i], test['Lat'][i]]), k = n)
    for j in range(n):
        test['Mean_price_m²'][i] += train['Price_m²'][ind[0][j]]
    test['Mean_price_m²'][i] /= n

In [None]:
# Predicted price: the house's area multiplied by the mean price per m² of
# its nearest training neighbours.
test['Pred'] = test['Area'].mul(test['Mean_price_m²'])

In [None]:
def _rooted(metric):
    """Wrap a squared-error style metric so that it returns its square root."""
    return lambda y_true, y_pred: math.sqrt(metric(y_true, y_pred))


# Label -> scoring function. Each score is computed right before it is
# printed, in the order listed.
metric_table = (
    ('RMSE: ', _rooted(sm.mean_squared_error)),
    ('RMSLE: ', _rooted(sm.mean_squared_log_error)),
    ('Median absolute error: ', sm.median_absolute_error),
    ('R²: ', sm.r2_score),
)
for label, score in metric_table:
    print(label + str(score(test_y, test['Pred'])))

In [None]:
# Residuals (true price minus predicted price) plotted against three test
# features, each panel with a first-degree least-squares trend line.
residuals = test_y - test['Pred']
fig, (ax1, ax2, ax3) = plt.subplots(3, figsize=(10,10))
fig.suptitle('Residual Plots')

# (axes, test column, panel title) for each of the three stacked panels.
panels = (
    (ax1, 'Area', 'Area'),
    (ax2, 'Lon', 'Longitude'),
    (ax3, 'Lat', 'Latitude'),
)
for axis, column, title in panels:
    feature = test[column]
    axis.scatter(feature, residuals)
    # Evaluate the fitted line on the sorted unique feature values.
    grid = np.unique(feature)
    trend = np.poly1d(np.polyfit(feature, residuals, 1))
    axis.plot(grid, trend(grid))
    axis.set_title(title)
plt.show()