In [0]:
import pandas as pd
import numpy as np 

In [0]:
input_fname = "realcasehousebitprediction/house1.csv"

In [0]:
!git clone https://github.com/purelyvivid/realcasehousebitprediction.git

fatal: destination path 'realcasehousebitprediction' already exists and is not an empty directory.


In [0]:
# Load the input table; the file is decoded as Big5.
df = pd.read_csv(input_fname, encoding="big5")
# Preview the first rows.
df.head()

Unnamed: 0,次,樓層,棟,號,出售面積(坪),持分土地面積(坪),標售單價(元/m2),標售總價(元),投標數,得標總價(元)/標售總價(元)
0,2019,3,C,_1,29.79,5.31,69118,6806071,2,1.0171
1,2019,5,C,_1,29.79,5.31,69118,6806071,3,1.0088
2,2019,6,C,_1,29.79,5.31,69118,6806071,4,1.0298
3,2019,7,C,_1,29.79,5.31,69723,6865651,6,1.1055
4,2019,4,D,_1,29.79,5.31,68543,6749470,5,1.0214


In [0]:
def to_int(x):
  """Convert a price value to ``int``.

  Strings may carry thousands separators (e.g. ``"6,806,071"``); the commas
  are removed before conversion. Values that are not strings are handed
  straight to ``int``, so applying the conversion to an already-numeric
  column (for instance when the conversion cell is re-run) does not raise
  ``AttributeError``.
  """
  if isinstance(x, str):
    return int(x.replace(",", ""))
  return int(x)

In [0]:
# Run both price columns through `to_int`, element by element.
df["標售單價(元/m2)"] = df["標售單價(元/m2)"].map(to_int)
df["標售總價(元)"] = df["標售總價(元)"].map(to_int)

In [0]:
df_op = df.copy()

In [0]:
df.head()

Unnamed: 0,次,樓層,棟,號,出售面積(坪),持分土地面積(坪),標售單價(元/m2),標售總價(元),投標數,得標總價(元)/標售總價(元)
0,2019,3,C,_1,29.79,5.31,69118,6806071,2,1.0171
1,2019,5,C,_1,29.79,5.31,69118,6806071,3,1.0088
2,2019,6,C,_1,29.79,5.31,69118,6806071,4,1.0298
3,2019,7,C,_1,29.79,5.31,69723,6865651,6,1.1055
4,2019,4,D,_1,29.79,5.31,68543,6749470,5,1.0214


In [0]:
# Boolean row masks: True where the "次" column equals 2019 / 2020.
no_2019 = (df["次"] == 2019).to_numpy()
no_2020 = (df["次"] == 2020).to_numpy()

In [0]:
# Bid counts ("投標數") of the 2019 rows as a float32 array.
y_inter = df.loc[no_2019, "投標數"].to_numpy(dtype=np.float32)

In [0]:
# Final target: winning-price / asking-price ratio of the 2019 rows (float32).
y = df.loc[no_2019, "得標總價(元)/標售總價(元)"].to_numpy(dtype=np.float32)

In [0]:
# Accumulators filled by the following sections: one list of column labels and
# one list of feature matrices, later concatenated column-wise.
feature_names_list = []
X_list = []

# 離散變數 ["棟", "號"]

In [0]:
from sklearn.preprocessing import OneHotEncoder

In [0]:
cnames = ["棟", "號"]

In [0]:
# Fit with the default (sparse) output and densify explicitly. This avoids the
# `sparse=` keyword, which newer scikit-learn releases renamed to
# `sparse_output=` and then removed, so the cell runs on old and new versions.
ohe = OneHotEncoder()
X = ohe.fit_transform(df[cnames]).toarray()

In [0]:
# `get_feature_names` was superseded by `get_feature_names_out` in newer
# scikit-learn releases; call whichever accessor this installation provides.
(ohe.get_feature_names_out() if hasattr(ohe, "get_feature_names_out")
 else ohe.get_feature_names())

array(['x0_A', 'x0_B', 'x0_C', 'x0_D', 'x1__1', 'x1__2', 'x1__3', 'x1__5'],
      dtype=object)

In [0]:
# Build readable labels ("<column>_<category>") directly from the fitted
# encoder's categories instead of parsing generated "x<i>_<category>" strings.
# `categories_` is ordered like the columns the encoder was fitted on
# (`cnames`), and this does not rely on `get_feature_names`, which newer
# scikit-learn releases no longer provide.
feature_names = [
  # Underscores inside the category label are dropped, e.g. "_1" -> "1".
  cname + "_" + str(category).replace("_", "")
  for cname, category_values in zip(cnames, ohe.categories_)
  for category in category_values
]
feature_names

['棟_A', '棟_B', '棟_C', '棟_D', '號_1', '號_2', '號_3', '號_5']

In [0]:
# Register the one-hot labels and their matrix with the accumulators.
feature_names_list.extend(feature_names)
X_list.append(X)

# 連續變數

In [0]:
cnames = [ '樓層', '出售面積(坪)', '持分土地面積(坪)', '標售單價(元/m2)', ]

In [0]:
from sklearn.preprocessing import StandardScaler

In [0]:
ss = StandardScaler()

In [0]:
X = ss.fit_transform(df[cnames])

In [0]:
# Register the continuous column labels and their scaled matrix.
feature_names_list.extend(cnames)
X_list.append(X)

# 總和

In [0]:
# Join the collected feature matrices side by side (column-wise).
X = np.concatenate(X_list, axis=1)

In [0]:
feature_names = feature_names_list

In [0]:
X.shape, len(feature_names)

((42, 12), 12)

# 預測投標數

In [0]:
# 2019 rows (with their bid counts as target) are used for fitting;
# the 2020 rows are the ones to predict.
X_train = X[no_2019]
y_train = y_inter
X_test = X[no_2020]

In [0]:
from sklearn.ensemble import RandomForestRegressor

# A fixed seed makes the predicted bid counts (and everything derived from
# them further down) reproducible across runs.
submodel = RandomForestRegressor(random_state=0)
submodel.fit(X_train, y_train)
# Training-set R^2; printed because a bare expression in the middle of a cell
# is evaluated but never displayed.
print("R-sq (train):", submodel.score(X_train, y_train))
y_test_pred = submodel.predict(X_test)

In [0]:
n_bit_pred = np.round(y_test_pred)
n_bit_pred

array([ 7., 11.,  8.,  9.,  5.,  5.,  5.,  5.,  5.,  5.,  3.,  8., 12.,
        9., 11.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  6.,  9.])

In [0]:
np.mean(n_bit_pred)

6.84

In [0]:
# Place each bid count at the row it belongs to, using the same masks that
# split X. Plain concatenation of (2019 values, 2020 predictions) is only
# aligned with the rows of X when every 2019 row precedes every 2020 row.
# Rows covered by neither mask stay NaN.
n_bit = np.full((len(no_2019), 1), np.nan, dtype=np.float64)
n_bit[no_2019, 0] = y_inter
n_bit[no_2020, 0] = n_bit_pred

In [0]:
ss = StandardScaler()

In [0]:
n_bit_norm = ss.fit_transform(n_bit)
n_bit_norm.shape

(42, 1)

In [0]:
# Append the standardized bid-count column as the last feature column.
# Note: this rebinds X, so running the cell again adds the column a second time.
X = np.concatenate((X,n_bit_norm), 1)
X.shape

(42, 13)

# Model

In [0]:
# Same row split as before, now with the price ratio `y` as the target.
X_train = X[no_2019]
y_train = y
X_test = X[no_2020]

In [0]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [0]:
def get_y_test_pred(model, X_test, print_name="", random_state=0):
  """Fit a fresh estimator on the training data and predict for ``X_test``.

  The training data is read from the notebook-level ``X_train`` and
  ``y_train`` variables. The training-set score and an error bound are
  printed as a side effect.

  Parameters
  ----------
  model : callable
      Estimator class (or any zero-argument factory) whose instances provide
      ``fit``, ``score`` and ``predict``.
  X_test : array-like
      Feature rows to predict for.
  print_name : str
      Label shown in the printed error line.
  random_state : int or None
      Seed applied to estimators that expose a ``random_state`` parameter, so
      repeated calls return the same predictions. Estimators without such a
      parameter are left untouched.

  Returns
  -------
  The result of the fitted estimator's ``predict(X_test)``.
  """
  # Use a separate name for the instance instead of shadowing the `model`
  # argument with it.
  estimator = model()
  # Seed stochastic estimators; deterministic ones (no `random_state`
  # parameter) and objects without `get_params` are used as constructed.
  get_params = getattr(estimator, "get_params", None)
  if callable(get_params) and "random_state" in get_params():
    estimator.set_params(random_state=random_state)
  estimator.fit(X_train, y_train)
  print("R-sq:", estimator.score(X_train, y_train))
  squared_err = (estimator.predict(X_train) - y_train) ** 2
  # NOTE(review): the bound below is mean + 3*std of the *squared* training
  # residuals, scaled by 100, so it is not a percentage of the target itself.
  # Confirm this is the intended statistic.
  upper_bound = (np.mean(squared_err) + np.std(squared_err) * 3) * 100
  print("[{}] 誤差:0% ~ {:.2f}%".format(print_name, upper_bound))
  return estimator.predict(X_test)

In [0]:
# Fit each candidate regressor and keep its predictions for the 2020 rows,
# in the same order as `print_names`.
models = [SVR, RandomForestRegressor, ExtraTreesRegressor]
print_names = ["SVR", "RandomForestRegressor", "ExtraTreesRegressor"]
y_test_pred_list = []
for model, name in zip(models, print_names):
  y_test_pred_list += [get_y_test_pred(model, X_test, print_name=name)]
  # Visual separator between the reports of consecutive models.
  print(64 * "*")


R-sq: 0.38727311314737456
[SVR] 誤差:0% ~ 1.77%
****************************************************************
R-sq: 0.8539072922996184
[RandomForestRegressor] 誤差:0% ~ 0.86%
****************************************************************
R-sq: 1.0
[ExtraTreesRegressor] 誤差:0% ~ 0.00%
****************************************************************


In [0]:
y_test_pred_list[1]

array([1.09013601, 1.21507203, 1.13961801, 1.20914802, 1.059641  ,
       1.17991801, 1.059641  , 1.17991801, 1.059641  , 1.17991801,
       1.04251102, 1.13933501, 1.20788802, 1.13908201, 1.20161901,
       1.079647  , 1.17927502, 1.079647  , 1.17927502, 1.079647  ,
       1.17927502, 1.079647  , 1.17927502, 1.10548701, 1.13860901])

# Test

In [0]:
X_test.shape

(25, 13)

In [0]:
X_test_ = X_test[:, :-1]
X_test_.shape

(25, 12)

In [0]:
# Candidate bid counts 1..20 as a column vector, then passed through `ss`
# (the scaler object currently bound to that name).
n_bit_assumed = np.arange(1, 21).reshape(-1, 1)
n_bit_assumed_norm = ss.transform(n_bit_assumed)
len(n_bit_assumed)

20

In [0]:
# Replicate every test row once per assumed bid count. The assumed count is
# the outer loop, so the output is grouped by count: all rows for 1, then 2, ...
X_list, is_default_n_bit, nbs = [], [], []
for nbn, nb in zip(n_bit_assumed_norm, n_bit_assumed.ravel()):
    for x, nb_ in zip(X_test_, n_bit_pred):
        # Feature row followed by the scaled assumed bid count, kept 2-D.
        X_list.append(np.append(x, nbn)[np.newaxis, :])
        # True where the assumed count equals the model-predicted count.
        is_default_n_bit.append(nb == nb_)
        nbs.append(nb)
X_test_rep = np.vstack(X_list)
X_test_rep.shape

(500, 13)

In [0]:
is_default_n_bit = np.array(is_default_n_bit)
is_default_n_bit.sum()

25

In [0]:
# Spot check on the leading entries. The replicated rows were generated with
# the assumed bid count as the outer loop (1, 2, ...), so with 25 test rows
# the first 26 entries only cover assumed counts 1 and 2.
is_default_n_bit[:26].sum()

0

In [0]:
# Refit each regressor and predict for the replicated test rows
# (one block of rows per assumed bid count).
models = [SVR, RandomForestRegressor, ExtraTreesRegressor]
print_names = ["SVR", "RandomForestRegressor", "ExtraTreesRegressor"]
y_test_pred_list = []
for model, name in zip(models, print_names):
  y_test_pred_list += [get_y_test_pred(model, X_test_rep, print_name=name)]
  # Visual separator between the reports of consecutive models.
  print(64 * "*")


R-sq: 0.38727311314737456
[SVR] 誤差:0% ~ 1.77%
****************************************************************
R-sq: 0.8455097887581928
[RandomForestRegressor] 誤差:0% ~ 0.96%
****************************************************************
R-sq: 1.0
[ExtraTreesRegressor] 誤差:0% ~ 0.00%
****************************************************************


In [0]:
y_test_pred_list[0].shape

(500,)

# 匯出

In [0]:
# Drop the two outcome columns and keep only the rows selected by `no_2020`.
df_op = df_op.drop(columns=["投標數", "得標總價(元)/標售總價(元)"]).iloc[no_2020, :]

In [0]:
# Stack one copy of the table per assumed bid count, printing the shape
# before and after.
print(df_op.shape)
df_op = pd.concat(
  [df_op for _ in range(len(n_bit_assumed_norm))],
  ignore_index=True,
)
print(df_op.shape)

(25, 8)
(500, 8)


In [0]:
# For every model add three columns: the assumed bid count, the predicted
# price ratio, and the predicted winning price (asking price * ratio).
for i in range(len(print_names)):
  model_name = print_names[i]
  ratio_pred = y_test_pred_list[i]
  df_op[model_name + "_" + "投標數(假設為)"] = nbs
  df_op[model_name + "_" + "得標總價/標售總價(預測)"] = ratio_pred
  df_op[model_name + "_" + "得標總價(預測)"] = df_op["標售總價(元)"] * ratio_pred

In [0]:
df_op.head()

Unnamed: 0,次,樓層,棟,號,出售面積(坪),持分土地面積(坪),標售單價(元/m2),標售總價(元),SVR_投標數(假設為),SVR_得標總價/標售總價(預測),SVR_得標總價(預測),RandomForestRegressor_投標數(假設為),RandomForestRegressor_得標總價/標售總價(預測),RandomForestRegressor_得標總價(預測),ExtraTreesRegressor_投標數(假設為),ExtraTreesRegressor_得標總價/標售總價(預測),ExtraTreesRegressor_得標總價(預測)
0,2020,8,C,_1,29.79,5.31,69723,6865651,1,1.118793,7681240.0,1,1.056261,7251919.0,1,1.071988,7359895.0
1,2020,8,C,_2,32.1,5.77,67803,7195300,1,1.129,8123497.0,1,1.104996,7950778.0,1,1.147116,8253844.0
2,2020,9,C,_1,29.79,5.31,70873,6978853,1,1.127093,7865817.0,1,1.056261,7371490.0,1,1.071988,7481247.0
3,2020,9,C,_2,32.1,5.77,68983,7320490,1,1.131865,8285808.0,1,1.099872,8051602.0,1,1.146041,8389582.0
4,2020,1,C,_1,29.79,5.31,72053,7095034,1,1.133001,8038679.0,1,1.04418,7408493.0,1,1.029476,7304167.0


In [0]:
# Write the prediction table to house_pred_2.csv, encoded as Big5.
# `index` is left at its default, so the row index is written as well.
df_op.to_csv("house_pred_2.csv", encoding="big5")

# 分析

In [0]:
from sklearn.svm import SVR

# Linear-kernel SVR fitted on the training rows; the first row of `coef_`
# is kept as the coefficient vector used below.
lr_ = SVR(kernel="linear").fit(X_train, y_train)
coef = lr_.coef_[0]

In [0]:
coef

array([ 0.01905557, -0.01514689,  0.00107597, -0.00498465,  0.0054159 ,
        0.00973099, -0.01514689,  0.        , -0.0131775 ,  0.01112483,
        0.01144824,  0.01613987,  0.03121172])

In [0]:
# Build a new list instead of extending in place. `feature_names` was bound to
# the same list object as `feature_names_list`, so `+=` would also mutate that
# list and append one more duplicate label every time this cell is re-run.
feature_names = list(feature_names_list) + ["投標數(假設為)"]

In [0]:
feature_names = np.array(feature_names)

In [0]:
# Coefficient values in ascending order.
coef_sort = np.sort(coef)

In [0]:
# Feature labels ordered by ascending coefficient (most negative first).
rank = list(feature_names[ np.argsort(coef)])
rank

['棟_B',
 '號_3',
 '樓層',
 '棟_D',
 '號_5',
 '棟_C',
 '號_1',
 '號_2',
 '出售面積(坪)',
 '持分土地面積(坪)',
 '標售單價(元/m2)',
 '棟_A',
 '投標數(假設為)']

In [0]:
# Map position -> (feature label, coefficient rounded to 4 decimals).
# Positions count down from len(rank), so position 1 is the largest coefficient.
{
  position: (label, round(value, 4))
  for position, label, value in zip(range(len(rank), 0, -1), rank, coef_sort)
}

{1: ('投標數(假設為)', 0.0312),
 2: ('棟_A', 0.0191),
 3: ('標售單價(元/m2)', 0.0161),
 4: ('持分土地面積(坪)', 0.0114),
 5: ('出售面積(坪)', 0.0111),
 6: ('號_2', 0.0097),
 7: ('號_1', 0.0054),
 8: ('棟_C', 0.0011),
 9: ('號_5', 0.0),
 10: ('棟_D', -0.005),
 11: ('樓層', -0.0132),
 12: ('號_3', -0.0151),
 13: ('棟_B', -0.0151)}