In [50]:
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics

In [6]:
# Load the scraped listings from the CSV export into a DataFrame.
csv_path = "../find-a-house/funda-2018-02-10.csv"
df = pd.read_csv(csv_path)

def clean_number(p):
    """Parse a raw listing value (such as a price string) into a float.

    The value is converted with ``str`` and stripped of the euro prefix,
    the " k.k.", " v.o.n." and " (auction)" suffixes and any commas before
    being handed to ``float``.

    Parameters
    ----------
    p : object
        Raw cell value. Numbers are accepted as well as strings because
        the value goes through ``str`` first.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the value is missing
        (``None`` or blank text) or contains "Price on request".

    Raises
    ------
    ValueError
        If the cleaned text is still not a valid float literal.
    """
    # A missing value has nothing to parse; report it the same way as
    # "Price on request" instead of failing on float("None").
    if p is None:
        return None

    text = str(p)
    for noise in ("€ ", " k.k.", " v.o.n.", " (auction)", ","):
        text = text.replace(noise, "")

    # A plain substring test is enough here; no regex is needed.
    if 'Price on request' in text:
        return None
    # Blank text is treated as missing rather than raising in float("").
    if not text.strip():
        return None
    return float(text)

# Remove the 'plot_size' column before cleaning the remaining ones.
df = df.drop(columns=['plot_size'])

# Parse the price column to floats, then rescale it by a factor of 1000.
parsed_prices = [clean_number(raw_price) for raw_price in df['price']]
df['price'] = parsed_prices
df['price'] = df['price'] / 1000

# Parse the living_area column the same way.
parsed_areas = [clean_number(raw_area) for raw_area in df['living_area']]
df['living_area'] = parsed_areas

# Discard every row that still has a missing value in any column.
df = df.dropna()
print("Total", len(df))
df.head()

Total 2662


Unnamed: 0,address,living_area,price,rooms,url
0,Oterleekstraat 15 1023 ED Amsterdam,63.0,319.0,4.0,https://www.funda.nl/en/koop/amsterdam/huis-40...
1,Bastenakenstraat 142 1066 JG Amsterdam,151.0,850.0,4.0,https://www.funda.nl/en/koop/amsterdam/huis-86...
2,Roeselarestraat 11 1066 SW Amsterdam,95.0,350.0,4.0,https://www.funda.nl/en/koop/amsterdam/huis-86...
3,Jan Vrijmanstraat 165 1087 MB Amsterdam,158.0,699.0,5.0,https://www.funda.nl/en/koop/amsterdam/huis-40...
4,Van Boshuizenstraat 547 1082 AV Amsterdam,155.0,700.0,8.0,https://www.funda.nl/en/koop/amsterdam/huis-40...


$$ var(A)=E[(A−E[A])^2] $$

In [26]:
def variance(a):
    """Return the mean squared deviation of ``a`` from its mean.

    This is E[(A - E[A])^2] computed over all elements, dividing by the
    number of elements (no degrees-of-freedom correction).
    """
    deviations = np.array(a) - np.mean(a)
    squared_deviations = deviations ** 2
    return np.mean(squared_deviations)

# Report the variance of each candidate feature.
living_area_variance = variance(df['living_area'])
print("var(living_area)", living_area_variance)
rooms_variance = variance(df['rooms'])
print("var(rooms)", rooms_variance)

var(living_area) 5533.6381604980015
var(rooms) 3.8513222237337583


$$ Cov(A,B)=E[(A−E[A])(B−E[B])] $$

In [25]:
def covariance(a, b):
    """Return the mean product of the deviations of ``a`` and ``b``.

    This is E[(A - E[A])(B - E[B])] computed element by element, dividing
    by the number of elements (no degrees-of-freedom correction).
    """
    a_deviations = np.array(a) - np.mean(a)
    b_deviations = np.array(b) - np.mean(b)
    return np.mean(a_deviations * b_deviations)

# Report how each candidate feature co-varies with the price.
area_price_cov = covariance(df['living_area'], df['price'])
print("cov(living_area, price)", area_price_cov)
rooms_price_cov = covariance(df['rooms'], df['price'])
print("cov(rooms, price)", rooms_price_cov)

cov(living_area, price) 42288.041208142706
cov(rooms, price) 862.3315915723479


$$ Corr(A,B)=Cov(A,B)/\sqrt{var(A)var(B)} $$

In [37]:
def correlation(a, b):
    """Return Cov(a, b) divided by sqrt(var(a) * var(b)).

    Built on this notebook's ``covariance`` and ``variance`` helpers.
    """
    shared_variation = covariance(a, b)
    scale = np.sqrt(variance(a) * variance(b))
    return shared_variation / scale

# Print the hand-written correlation next to NumPy's for living_area vs price.
area_price_corr = correlation(df['living_area'], df['price'])
print("corr(living_area, price)", area_price_corr)
area_price_corr_np = np.corrcoef(df['living_area'], df['price'])[0, 1]
print("numpy corrcoef", area_price_corr_np)

# Same comparison for rooms vs price.
rooms_price_corr = correlation(df['rooms'], df['price'])
print("\ncorr(rooms, price)", rooms_price_corr)
rooms_price_corr_np = np.corrcoef(df['rooms'], df['price'])[0, 1]
print("numpy corrcoef", rooms_price_corr_np)

corr(living_area, price) 0.8538283284364955
numpy corrcoef 0.8538283284364955

corr(rooms, price) 0.6599753857883435
numpy corrcoef 0.6599753857883441


$$
MSE = \frac{\sum{(y−f(X))^2}}{n}
\\
\\
RMSE = \sqrt{MSE}
$$


In [71]:
# Wrap the living_area column in a list to get a single-row 2-D array, then
# transpose it so there is one row per listing and a single feature column.
living_area_row = np.array([df['living_area']])
X = living_area_row.T
y = df['price']

# fit() returns the estimator itself, so it can be chained with construction.
model = LinearRegression().fit(X, y)

def mse(y, fn_X):
    """Return the mean squared error between targets and predictions.

    Both arguments are converted to NumPy arrays first, mirroring what
    ``variance`` and ``covariance`` do. This lets plain Python sequences be
    passed, and makes pandas objects compare position by position instead
    of being aligned on their index labels (which would introduce NaNs
    that a pandas sum silently skips).

    Parameters
    ----------
    y : array-like
        Observed target values.
    fn_X : array-like
        Predictions f(X), in the same order as ``y``.

    Returns
    -------
    float
        Sum of squared residuals divided by ``len(y)``.
    """
    residuals = np.asarray(y) - np.asarray(fn_X)
    return np.sum(residuals ** 2) / len(y)

# Predict on the same data the model was fitted on.
y_pred = model.predict(X)

# Compare the hand-written MSE with scikit-learn's, then derive the RMSE.
model_mse = mse(y, y_pred)
print("mse", model_mse)
sklearn_mse = sklearn.metrics.mean_squared_error(y, y_pred)
print("sklearn mse", sklearn_mse)
print("rmse", np.sqrt(model_mse))

mse 120120.18393659618
sklearn mse 120120.18393659618
rmse 346.58358867176065


$$ R^2 = 1 − \frac{MSE}{var(y)} $$

In [72]:
def r2(y, fn_X):
    """Return 1 - MSE(y, fn_X) / var(y).

    Built on this notebook's ``mse`` and ``variance`` helpers.
    """
    unexplained_share = mse(y, fn_X) / variance(y)
    return 1 - unexplained_share

# Compare the hand-written R^2 with the score reported by the fitted model.
manual_r2 = r2(y, y_pred)
print("r2", manual_r2)
sklearn_r2 = model.score(X, y)
print("sklearn r2", sklearn_r2)

r2 0.7290228144406602
sklearn r2 0.7290228144406602
