In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import sklearn.model_selection as ms

In [2]:
# Load the raw data set from the Excel workbook (path is relative to the working directory).
df = pd.read_excel("Real estate valuation data set.xlsx")

In [3]:
# Quick visual check of the loaded frame (pandas elides the middle rows in the display).
df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500000,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.666667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250000,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000000,8.1,104.81010,5,24.96674,121.54067,52.5


In [4]:
# Count missing entries per column; all zeros means no imputation is needed.
df.isna().sum()

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

In [5]:
# Split the fractional-year transaction date into two features:
#   - the integer year (fraction truncated)
#   - the remaining fraction of the year scaled by 12, kept as a float
df["X1.1 transaction year"] = df["X1 transaction date"].astype(int)
df["X1.1 transaction month"] = (
    df["X1 transaction date"].sub(df["X1.1 transaction year"]).mul(12)
)

In [6]:
# Keep the regression target in its own Series before it is removed from the features.
df_y = df.loc[:, "Y house price of unit area"]

In [7]:
# Remove the row id, the original date (already split into year/month) and the target,
# leaving only feature columns in df. This mutates df in place.
df.drop(columns=["No", "X1 transaction date", "Y house price of unit area"], inplace=True)

In [8]:
# split into train and test sets
# random_state is pinned so that Restart & Run All reproduces the same split,
# and therefore the same coefficients and metrics in the cells that follow.
train_size=0.8
X_train, X_test, y_train, y_test = ms.train_test_split(
    df.to_numpy(), df_y.to_numpy(), train_size=train_size, random_state=42
)

In [9]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both the training and the test features.
normalizer = StandardScaler().fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [10]:
import sklearn.linear_model as lm
from sklearn.metrics import r2_score, mean_squared_error

In [11]:
# Ordinary least squares on the raw (unscaled) training features, so the
# coefficients printed below are expressed in each column's original units.
# NOTE(review): X_train_norm is not used here; the prediction cell likewise uses raw X_test.
lin_reg = lm.LinearRegression().fit(X_train, y_train)
print("Columns:", df.columns.values)
print("Coefficients:", lin_reg.coef_)
print("Intercept:", lin_reg.intercept_)

Columns: ['X2 house age' 'X3 distance to the nearest MRT station'
 'X4 number of convenience stores' 'X5 latitude' 'X6 longitude'
 'X1.1 transaction year' 'X1.1 transaction month']
Coefficients: [-2.58221947e-01 -4.51571768e-03  1.21477518e+00  2.30855210e+02
 -2.45824236e+01  5.13674736e+00  3.71872747e-01]
Intercept: -13074.892844773127


In [12]:
# Predict on the held-out rows (raw features, matching how lin_reg was fitted).
y_pred = lin_reg.predict(X_test)
print(y_pred)

[13.3185523  39.47737646 45.55638984 43.7231895  40.34808506 43.88312597
 24.52549658 31.36238444 47.02222508 34.56499701 43.81471813 42.7186556
 43.64332877 47.28164017 33.12678626 21.69634199 55.33319131 29.06388189
 42.32391755 38.44803778 33.52896817 43.11390844 36.28150146 46.17836931
 37.63404653 44.91134026 44.55994785 30.98965602 13.64642502 48.56022577
 48.2778044  46.95428219 43.53197461 12.46061353 33.96786336 31.51509141
 43.60192772 33.81716442 46.10596562 50.28882189 43.99576047 53.74105094
 40.48413426 36.32771041 37.65388882 39.52653231 52.90718197 43.21374217
 44.7677237  45.2664379  33.82839296 37.75010705 33.25347258 36.28150146
 46.17479852 46.94961745 30.45906397 38.90333979 36.9620275  45.11227321
 50.59277799 32.64376178 30.60329917 37.38217011 26.0137286  44.0894281
 54.17523834 53.74105094 13.3185523  45.73409257 39.31359303 47.38372123
 47.3919769  20.83609097 42.46962388 41.98521648 45.337319   40.5659927
 23.05183541 15.1404946  43.56155511 48.96990565 41.91

In [13]:
# Test-set goodness of fit: R2 and mean squared error of the predictions above.
print("R2 score:", r2_score(y_test, y_pred), "MSE:", mean_squared_error(y_test, y_pred))

R2 score: 0.6457451447859726 MSE: 46.9143072983629


In [14]:
# combining into one function
def train_lin_reg(df, df_y, train_size, random_state=None):
    """Fit ordinary least squares on a random train/test split and print test metrics.

    Parameters
    ----------
    df : object with ``to_numpy()``
        Feature table (a pandas DataFrame in this notebook).
    df_y : object with ``to_numpy()``
        Target values, one per row of ``df``.
    train_size : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` draws a fresh split on
        every call; pass an int to make runs reproducible and to compare
        different feature sets or train sizes on a seeded split.

    Returns
    -------
    None
        The test-set R2 score and MSE are printed.
    """
    X_train, X_test, y_train, y_test = ms.train_test_split(
        df.to_numpy(), df_y.to_numpy(), train_size=train_size, random_state=random_state
    )
    # Fit the scaler on the training rows only so no test-set statistics leak in.
    normalizer = StandardScaler()
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    # The model must see the same standardized features at fit and predict time.
    lin_reg = lm.LinearRegression()
    lin_reg.fit(X_train_norm, y_train)
    y_pred = lin_reg.predict(X_test_norm)
    print("R2 score:", r2_score(y_test, y_pred), "MSE:", mean_squared_error(y_test, y_pred))

In [15]:
# Compare the full feature set against one without the two coordinate columns.
# Each call performs its own train/test split.
train_lin_reg(df, df_y, train_size=0.8)
print()
train_lin_reg(df.drop(columns=["X5 latitude", "X6 longitude"]), df_y, train_size=0.8)

R2 score: 0.6416421836758442 MSE: 68.89969130489158

R2 score: 0.5327204305933325 MSE: 98.3995106004715


In [16]:
# Dropping X5 (latitude) and X6 (longitude) worsens performance (lower R2, higher MSE),
# so we keep both columns.

In [17]:
# Repeat the linear-regression experiment for several train fractions.
# Each call performs its own train/test split, so scores also vary with the split itself.
train_sizes=[0.6, 0.7, 0.9]
for ts in train_sizes:
    print(ts, ":")
    train_lin_reg(df, df_y, train_size=ts)
    print()

0.6 :
R2 score: 0.49643792794562636 MSE: 95.98404111275615

0.7 :
R2 score: 0.6340104800770886 MSE: 63.863748493527886

0.9 :
R2 score: 0.6166874671319 MSE: 73.66259491190655



In [18]:
# A train fraction of 0.7 scores better than 0.6 and 0.9,
# but slightly worse than the 0.8 used earlier.

In [19]:
def train_ridge_reg(df, df_y, lam, train_size=0.8, random_state=None):
    """Fit ridge regression on a random train/test split and print test metrics.

    Parameters
    ----------
    df : object with ``to_numpy()``
        Feature table (a pandas DataFrame in this notebook).
    df_y : object with ``to_numpy()``
        Target values, one per row of ``df``.
    lam : float
        Regularization strength, passed to ``Ridge`` as ``alpha``.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` draws a fresh split on
        every call; pass the same int for every ``lam`` so that score
        differences reflect the penalty rather than the split.

    Returns
    -------
    None
        The test-set R2 score and MSE are printed.
    """
    X_train, X_test, y_train, y_test = ms.train_test_split(
        df.to_numpy(), df_y.to_numpy(), train_size=train_size, random_state=random_state
    )
    # Fit the scaler on the training rows only so no test-set statistics leak in.
    normalizer = StandardScaler()
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    # The L2 penalty acts on coefficient magnitudes, which depend on feature
    # scale, so the model is trained and evaluated on the standardized features.
    ridge_reg = lm.Ridge(alpha=lam)
    ridge_reg.fit(X_train_norm, y_train)
    y_pred = ridge_reg.predict(X_test_norm)
    print("R2 score:", r2_score(y_test, y_pred), "MSE:", mean_squared_error(y_test, y_pred))

In [27]:
# Sweep the ridge regularization strength; each value is passed to train_ridge_reg as lam.
# Each call performs its own train/test split.
lam_vals= [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
for lam in lam_vals:
    print(lam)
    train_ridge_reg(df, df_y, lam=lam)
    print()

0.001
R2 score: 0.5824983028894115 MSE: 73.96154378892558

0.005
R2 score: 0.34800313645632885 MSE: 154.16714655409

0.01
R2 score: 0.6338252820322119 MSE: 70.3278480109657

0.05
R2 score: 0.5599216192160328 MSE: 74.988225386308

0.1
R2 score: 0.4166754099016253 MSE: 87.39604743453768

0.5
R2 score: 0.41543610022323907 MSE: 108.40595729187505



In [28]:
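# 0.01 gave the best R2 of the values tried for ridge regression.
# Caveat: each value was scored on a different random train/test split, so this ranking is noisy.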

In [22]:
def train_lasso_reg(df, df_y, lam, train_size=0.8, random_state=None):
    """Fit lasso regression on a random train/test split and print test metrics.

    Parameters
    ----------
    df : object with ``to_numpy()``
        Feature table (a pandas DataFrame in this notebook).
    df_y : object with ``to_numpy()``
        Target values, one per row of ``df``.
    lam : float
        Regularization strength, passed to ``Lasso`` as ``alpha``.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` draws a fresh split on
        every call; pass the same int for every ``lam`` so that score
        differences reflect the penalty rather than the split.

    Returns
    -------
    None
        The test-set R2 score and MSE are printed.
    """
    X_train, X_test, y_train, y_test = ms.train_test_split(
        df.to_numpy(), df_y.to_numpy(), train_size=train_size, random_state=random_state
    )
    # Fit the scaler on the training rows only so no test-set statistics leak in.
    normalizer = StandardScaler()
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.transform(X_test)
    # The L1 penalty acts on coefficient magnitudes, which depend on feature
    # scale, so the model is trained and evaluated on the standardized features.
    lasso_reg = lm.Lasso(alpha=lam)
    lasso_reg.fit(X_train_norm, y_train)
    y_pred = lasso_reg.predict(X_test_norm)
    print("R2 score:", r2_score(y_test, y_pred), "MSE:", mean_squared_error(y_test, y_pred))

In [23]:
# Sweep the lasso regularization strength; each value is passed to train_lasso_reg as lam.
# Each call performs its own train/test split.
lam_vals= [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
for lam in lam_vals:
    print(lam)
    train_lasso_reg(df, df_y, lam=lam)
    print()

0.001
R2 score: 0.6069307256961058 MSE: 62.92275195490614

0.005
R2 score: 0.5029159378642627 MSE: 120.6968404225883

0.01
R2 score: 0.6010451673716766 MSE: 62.71195858274365

0.05
R2 score: 0.45529417666985594 MSE: 139.15997938789687

0.1
R2 score: 0.652587592774606 MSE: 51.90151847932529

0.5
R2 score: 0.6192201202875424 MSE: 61.595090420832896



In [24]:
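# 0.1 gave the best R2 of the values tried for lasso regression.
# Caveat: each value was scored on a different random train/test split, so this ranking is noisy.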