In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error, r2_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Amsterdam (August 2021) housing-price CSV from the Kaggle input directory.
df_prices = pd.read_csv(
    filepath_or_buffer='../input/amsterdam-house-price-prediction/HousingPrices-Amsterdam-August-2021.csv'
)

In [None]:
# info() prints its report itself, but describe() and head() only *return*
# frames, and a notebook cell renders just its final expression. The summary
# statistics are therefore handed to display() explicitly; otherwise they are
# computed and silently thrown away.
# (display is made available as a builtin by the IPython kernel.)
df_prices.info()
display(df_prices.describe())
df_prices.head()

In [None]:
# Number of missing values in each column.
df_prices.isna().sum()

In [None]:
# Discard every row that has a missing value in any column.
df_prices = df_prices.dropna(axis=0, how='any')

In [None]:
### Lookup table: inclusive zip-code range -> district id
# Each row is (lowest zip in range, highest zip in range, district id).
_district_ranges = [
    (1011, 1018, 1),
    (1019, 1019, 2),
    (1020, 1029, 3),
    (1030, 1039, 4),
    (1040, 1049, 5),
    (1050, 1059, 6),
    (1060, 1069, 7),
    (1070, 1083, 8),
    (1086, 1099, 9),
    (1100, 1108, 10),
    (1109, 1109, 11),
]
postal_code_district = pd.DataFrame(
    data=np.array(_district_ranges),
    columns=['under', 'upper', 'dstrct_id'],
)

postal_code_district

In [None]:
### create district column
# Numeric zip: the part of 'Zip' before the first space, parsed as a number.
# Stored in the helper column 'temp', which is dropped again further down.
df_prices['temp'] = (
    df_prices['Zip'].str.split(' ', n=1).str[0].astype(float).astype('int64')
)

# Closed intervals [under, upper], one per district. A single vectorised
# interval lookup replaces scanning the whole range table once per row.
zip_ranges = pd.IntervalIndex.from_arrays(
    postal_code_district['under'].to_numpy(),
    postal_code_district['upper'].to_numpy(),
    closed='both',
)
range_pos = zip_ranges.get_indexer(df_prices['temp'].to_numpy())

# get_indexer reports "no interval contains this value" as -1. Fail loudly
# with the offending zips rather than letting -1 index the last district.
no_range = range_pos == -1
if no_range.any():
    raise ValueError(
        'Zip codes without a district range: {}'.format(
            sorted(set(df_prices.loc[no_range, 'temp']))
        )
    )

df_prices['district'] = postal_code_district['dstrct_id'].to_numpy()[range_pos]



In [None]:
### Remove the columns not used as features: Zip, Address, temp, Unnamed: 0
df_prices = df_prices.drop(columns=['Zip', 'Address', 'temp', 'Unnamed: 0'])

In [None]:
# Preview the first five rows of the reduced frame.
df_prices.head(n=5)

In [None]:
### start EDA
# Pairwise plots over the columns of the prepared frame.
sns.pairplot(data=df_prices)

In [None]:
### Correlation heatmap (author's note: high correlation between price and area)
plt.figure(figsize=(12, 12))
sns.heatmap(data=df_prices.corr(), annot=True)

In [None]:
# Features: every column except 'Price'; target: the 'Price' column.
X = df_prices.drop(columns='Price')
y = df_prices['Price']

In [None]:
### standardize numeric independent features and oneHotEncode categorical feature
std_sclr = StandardScaler()
test = make_column_transformer((StandardScaler(), ['Area', 'Room', 'Lat', 'Lon']), 
                              (OneHotEncoder(), ['district']))
X_scaled = test.fit_transform(X)

In [None]:
### Train-/Test split
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=44, shuffle =True)

In [None]:
### LinReg
# Fit ordinary least squares on the training split and predict the test split.
linreg_model = LinearRegression()
linreg_model.fit(X_train_scaled, y_train)
linreg_pred = linreg_model.predict(X_test_scaled)

# Printed one per line: R^2 on the training split, R^2 on the test split
# (out-of-sample), and the mean squared error on the test split.
print(
    linreg_model.score(X_train_scaled, y_train),
    linreg_model.score(X_test_scaled, y_test),
    mean_squared_error(y_test, linreg_pred),
    sep='\n',
)

In [None]:
### Random Forest
# random_state fixes the forest's internal randomness so the scores printed
# below are the same on every run; without it each re-run reports different
# numbers. The value matches the seed used for the train/test split.
rndf_model = RandomForestRegressor(n_estimators = 150, random_state = 44).fit(X_train_scaled, y_train)
rndf_predict = rndf_model.predict(X_test_scaled)

### R^2 on the training split (in-sample)
print(rndf_model.score(X_train_scaled, y_train))
### R^2 on the test split (out-of-sample)
print(rndf_model.score(X_test_scaled, y_test))
### mean squared error on the test split
print(mean_squared_error(y_test, rndf_predict))

### Overfitting (author's observation; compare the in-sample score with the out-of-sample score above)

In [None]:
### k-nearest-neighbours regression (k = 7)
knn_model = KNeighborsRegressor(n_neighbors=7)
knn_model.fit(X_train_scaled, y_train)
knn_predict = knn_model.predict(X_test_scaled)

# Printed one per line: R^2 on the training split, R^2 on the test split
# (out-of-sample), and the mean squared error on the test split.
print(
    knn_model.score(X_train_scaled, y_train),
    knn_model.score(X_test_scaled, y_test),
    mean_squared_error(y_test, knn_predict),
    sep='\n',
)