In [35]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

In [36]:
data = pd.read_csv('../data/Real_Estate.csv')

data                

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.01500,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.01440,2,25.012148,121.546990,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.67100,10,25.003850,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.06700,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.40000,6,25.011037,121.479946,21.654710
...,...,...,...,...,...,...,...
409,2013-07-25 15:30:36.565239,18.3,170.12890,6,24.981186,121.486798,29.096310
410,2013-07-26 17:16:34.019780,11.9,323.69120,2,24.950070,121.483918,33.871347
411,2013-07-28 21:47:23.339050,0.0,451.64190,8,24.963901,121.543387,25.255105
412,2013-07-29 13:33:29.405317,35.9,292.99780,5,24.997863,121.558286,25.285620


In [37]:
data['Transaction date'] = pd.to_datetime(data['Transaction date'])
data['Transaction year'] = data['Transaction date'].dt.year
data['Transaction month'] = data['Transaction date'].dt.month
data

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area,Transaction year,Transaction month
0,2012-09-02 16:42:30.519336,13.3,4082.01500,8,25.007059,121.561694,6.488673,2012,9
1,2012-09-04 22:52:29.919544,35.5,274.01440,2,25.012148,121.546990,24.970725,2012,9
2,2012-09-05 01:10:52.349449,1.1,1978.67100,10,25.003850,121.528336,26.694267,2012,9
3,2012-09-05 13:26:01.189083,22.2,1055.06700,5,24.962887,121.482178,38.091638,2012,9
4,2012-09-06 08:29:47.910523,8.5,967.40000,6,25.011037,121.479946,21.654710,2012,9
...,...,...,...,...,...,...,...,...,...
409,2013-07-25 15:30:36.565239,18.3,170.12890,6,24.981186,121.486798,29.096310,2013,7
410,2013-07-26 17:16:34.019780,11.9,323.69120,2,24.950070,121.483918,33.871347,2013,7
411,2013-07-28 21:47:23.339050,0.0,451.64190,8,24.963901,121.543387,25.255105,2013,7
412,2013-07-29 13:33:29.405317,35.9,292.99780,5,24.997863,121.558286,25.285620,2013,7


In [38]:
data = data.drop('Transaction date',axis=1)
x = data.drop('House price of unit area',axis=1)
y = data['House price of unit area']
data

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area,Transaction year,Transaction month
0,13.3,4082.01500,8,25.007059,121.561694,6.488673,2012,9
1,35.5,274.01440,2,25.012148,121.546990,24.970725,2012,9
2,1.1,1978.67100,10,25.003850,121.528336,26.694267,2012,9
3,22.2,1055.06700,5,24.962887,121.482178,38.091638,2012,9
4,8.5,967.40000,6,25.011037,121.479946,21.654710,2012,9
...,...,...,...,...,...,...,...,...
409,18.3,170.12890,6,24.981186,121.486798,29.096310,2013,7
410,11.9,323.69120,2,24.950070,121.483918,33.871347,2013,7
411,0.0,451.64190,8,24.963901,121.543387,25.255105,2013,7
412,35.9,292.99780,5,24.997863,121.558286,25.285620,2013,7


In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

x_train_scaled.shape

(331, 7)

In [40]:
x_test_scaled.shape

(83, 7)

In [42]:
model = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Decision Tree': DecisionTreeRegressor()
}

In [43]:
results = {}
for name,model in model.items():
    model.fit(x_train_scaled,y_train)
    y_pred = model.predict(x_test_scaled)
    mae = mean_absolute_error(y_test,y_pred)
    r2 = r2_score(y_test,y_pred)
    results[name] = {'MAE':mae,'R2':r2}

results_df = pd.DataFrame(results).T
print(results_df)

                         MAE        R2
Linear Regression   9.748246  0.529615
Random Forest       9.781656  0.512875
Gradient Boosting   9.985313  0.476768
Decision Tree      11.418972  0.241517
