In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
import xgboost as xgb
from sklearn.decomposition import PCA
from collections import defaultdict

In [2]:
def num_null(df):
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    for i in range(df.shape[1]):
        if missing[i]:
            print(missing.index[i], ':', missing[i])

def metric(truth, pred):
    truth = np.array(truth)
    pred = np.array(pred)
    diff = abs(pred - truth) / truth
    print(list(diff <= 0.1).count(True) / len(diff))
    
def fair_obj(preds, dtrain):
    fair_constant = 0.7
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

**Feature Engineering**

In [3]:
train = pd.read_csv("../input/dataset-0510/train.csv")
test = pd.read_csv("../input/dataset-0510/test.csv")

#train = train[train["total_price"] < 1.5e8]
price_sq = train["total_price"] / train["building_area"]
train = train[(price_sq<12000000) & (price_sq>60000)]

Y_train = np.log1p(train["total_price"].values / train["building_area"].values)
#Y_train = train["total_price"] / train["building_area"]
#offset = Y_train.min()
#Y_train = Y_train / offset
Y_train = np.expand_dims(Y_train, -1)
y_scale = StandardScaler()
Y_train = y_scale.fit_transform(Y_train)

offset = train["building_area"].values
train_greater_taipei_bool = train['city'].isin([7, 9, 13])
test_greater_taipei_bool = test['city'].isin([7, 9, 13])

train = train.drop('total_price', 1)
data = pd.concat([train, test], ignore_index=True)

In [4]:
data.loc[data["parking_way"] == 2, 'parking_area'] = data.loc[data["parking_way"] == 2, 'parking_area'].fillna(0.0)
data.loc[data["parking_way"] != 2, 'parking_area'] = data.loc[data["parking_way"] != 2, 'parking_area'].fillna(data.loc[data["parking_way"] != 2, 'parking_area'].median())
data.loc[data["parking_way"] == 2, 'parking_price'] = data.loc[data["parking_way"] == 2, 'parking_price'].fillna(0.0)
data.loc[data["parking_way"] != 2, 'parking_price'] = data.loc[data["parking_way"] != 2, 'parking_price'].fillna(data.loc[data["parking_way"] != 2, 'parking_price'].median())
data['txn_floor'] = data['txn_floor'].fillna(1)
data['village_income_median'] = data['village_income_median'].fillna(round(data.groupby(['city','town','village'])['village_income_median'].transform('mean')))
data['village_income_median'] = data['village_income_median'].fillna(round(data.groupby(['city','town'])['village_income_median'].transform('mean')))
data['village_income_median'] = data['village_income_median'].fillna(round(data.groupby(['city'])['village_income_median'].transform('mean')))
data["floor_ratio"] = data["txn_floor"] / data["total_floor"]
data.loc[data['land_area']==0, 'land_area'] = data['land_area'].median()
data["floor_area_ratio"] = data["building_area"] / data["land_area"]
data["have_parking"] = (data["parking_way"] != 2) * 1.0
data["have_parking"] = data["have_parking"].astype(int)

cat_data = data[["town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way"]].astype(str)
cat_data['village'] = cat_data["city"] + "_" + cat_data["town"] + "_" + cat_data["village"]
#cat_data["city_town"] = cat_data["city"] + "_" + cat_data["town"]
cat_data["city_town_building_type_use"] = cat_data["city"] + "_" + cat_data["town"] + "_" + cat_data["building_type"] + "_" + cat_data["building_use"]
#cat_data["parking_way_building_type"] = cat_data["parking_way"] + "_" + cat_data["building_type"]
cat_data["building_material_building_use"] = cat_data["building_material"] + "_" + cat_data["building_use"]
#cat_data["building_material_parking_way"] = cat_data["building_material"] + "_" + cat_data["parking_way"]

#cat_data["txn_dt"] = data["txn_dt"] // 365
#cat_data["building_complete_dt"] = data["building_complete_dt"] // 365
data["txn_duration"] = (data["txn_dt"] - data["building_complete_dt"]) / 365
#cat_data["building_type_txn_duration"] = cat_data["building_type"] + "_" + data["txn_duration"].astype(str)
#cat_data["building_use_txn_duration"] = cat_data["building_use"] + "_" + data["txn_duration"].astype(str) 

cat_data = cat_data.apply(LabelEncoder().fit_transform)
#cat_cols = [col for col in data.columns if data[col].dtype == np.object]
#data = data.apply(pd.to_numeric, errors='coerce')

In [5]:
num_null(data)

Show #missing in the columns:


In [6]:
cat_data.nunique()

town                               214
village                           4314
txn_floor                           28
building_material                    9
city                                11
building_type                        5
building_use                        10
parking_way                          3
city_town_building_type_use       2082
building_material_building_use      48
dtype: int64

In [7]:
svd_columns = ['town', 'village', 'city_town_building_type_use', 'building_material_building_use']
svd_dummy = pd.get_dummies(cat_data[svd_columns], columns=svd_columns).astype(float)
pca = PCA(n_components=100)
principalComponents = pca.fit_transform(svd_dummy)

principal_df = pd.DataFrame(data = principalComponents, columns = ['principal_component_{}'.format(i) for i in range(100)])

dummy_columns = ['txn_floor', 'building_material', 'city', 'building_type', 'building_use', 'parking_way']
dummy = pd.get_dummies(cat_data[dummy_columns], columns=dummy_columns)
#cat_data = cat_data.drop(dummy_columns, 1)

cont_data = data.drop(["building_id", 
                       "town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way"], 1)
cont_data = pd.concat([cont_data, principal_df], axis=1, join_axes=[cont_data.index])

skewness = cont_data.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 1.5]
print(str(skewness.shape[0]) + " skewed numerical features to log transform")

skewed_features = skewness.index
cont_data[skewed_features] = np.log1p(cont_data[skewed_features])

cont_data = pd.concat([cont_data, dummy], axis=1, join_axes=[cont_data.index])
#train_data = pd.concat([cont_data, cat_data], axis=1, join_axes=[cont_data.index])
#scale = StandardScaler()
#cont_data = pd.DataFrame(scale.fit_transform(cont_data.values), columns=cont_data.columns, index=cont_data.index)

198 skewed numerical features to log transform


In [8]:
num_null(cont_data)

Show #missing in the columns:


In [9]:
X_train = cont_data.iloc[:-10000]
X_test = cont_data.iloc[-10000:]

**Xgboost**

In [10]:
param_ = {
    'booster': 'gbtree', 
    #'objective': 'reg:squarederror',
    'colsample_bytree': 0.7,
    #'max_depth': 8,
    'learning_rate': 0.01,
    'subsample': 0.9,
    'tree_method': 'gpu_hist',
    #'max_bin': 63,
    'eval_metric':'mae'
    }

**in taipei**

In [11]:
in_taipei_X_train = X_train[train_greater_taipei_bool.values]
in_taipei_Y_train = Y_train[train_greater_taipei_bool.values]

in_taipei_X_test = X_test[test_greater_taipei_bool.values]

In [12]:
in_taipei_valid_preds = []
in_taipei_test_preds = []
kf = KFold(n_splits=7, shuffle=False)

for i, (train_index, val_index) in enumerate(kf.split(in_taipei_X_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    train_data = xgb.DMatrix(
        in_taipei_X_train.iloc[train_index],
        label=in_taipei_Y_train[train_index]
    )
    valid_data = xgb.DMatrix(
        in_taipei_X_train.iloc[val_index],
        label=in_taipei_Y_train[val_index]
    )
    model = xgb.train(
        params=param_,
        dtrain=train_data,
        num_boost_round=2000000,
        evals=[(train_data, 'train'), (valid_data, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=100,
        obj=fair_obj
    )

    # Predict valid
    valid_pred = np.expand_dims(model.predict(valid_data), -1)
    valid_pred = y_scale.inverse_transform(valid_pred)
    in_taipei_valid_preds.append(np.expm1(valid_pred))
    # Predict test
    test_data = xgb.DMatrix(in_taipei_X_test)
    pred = np.expand_dims(model.predict(test_data), -1)
    pred = y_scale.inverse_transform(pred)
    in_taipei_test_preds.append(np.expm1(pred))
        
    #Y_valid_predict = model.predict(valid_data)
    #Y_valid_predict = np.expm1(y_scale.inverse_transform(np.expand_dims(Y_valid_predict, -1)))
    #Y_valid = np.expm1(y_scale.inverse_transform(in_taipei_Y_train[val_index]))
    #metric(Y_valid, Y_valid_predict)

--------------------
Fold 1
--------------------
[0]	train-mae:0.572371	valid-mae:0.572092
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[100]	train-mae:0.230831	valid-mae:0.232871
[200]	train-mae:0.163656	valid-mae:0.170798
[300]	train-mae:0.140197	valid-mae:0.15057
[400]	train-mae:0.1285	valid-mae:0.141366
[500]	train-mae:0.121385	valid-mae:0.136108
[600]	train-mae:0.116723	valid-mae:0.133097
[700]	train-mae:0.113073	valid-mae:0.130774
[800]	train-mae:0.109974	valid-mae:0.128843
[900]	train-mae:0.107134	valid-mae:0.127201
[1000]	train-mae:0.104779	valid-mae:0.125867
[1100]	train-mae:0.102635	valid-mae:0.124656
[1200]	train-mae:0.100502	valid-mae:0.123517
[1300]	train-mae:0.098502	valid-mae:0.122473
[1400]	train-mae:0.096546	valid-mae:0.121477
[1500]	train-mae:0.094886	valid-mae:0.120675
[1600]	train-mae:0.093325	valid-mae:0.119915
[1700]	train-mae:0.091886	valid-mae:0.119232
[1800]	train

In [13]:
in_taipei_y_valid = np.squeeze(np.concatenate(in_taipei_valid_preds, axis=0)) * offset[train_greater_taipei_bool]
in_taipei_y_test = np.squeeze(np.mean(in_taipei_test_preds, axis=0)) * test.loc[test_greater_taipei_bool, 'building_area'].values

**out taipei**

In [14]:
out_taipei_X_train = X_train[~train_greater_taipei_bool.values]
out_taipei_Y_train = Y_train[~train_greater_taipei_bool.values]

out_taipei_X_test = X_test[~test_greater_taipei_bool.values]

In [15]:
out_taipei_valid_preds = []
out_taipei_test_preds = []
kf = KFold(n_splits=7, shuffle=False)

for i, (train_index, val_index) in enumerate(kf.split(out_taipei_X_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    train_data = xgb.DMatrix(
        out_taipei_X_train.iloc[train_index],
        label=out_taipei_Y_train[train_index]
    )
    valid_data = xgb.DMatrix(
        out_taipei_X_train.iloc[val_index],
        label=out_taipei_Y_train[val_index]
    )
    model = xgb.train(
        params=param_,
        dtrain=train_data,
        num_boost_round=2000000,
        evals=[(train_data, 'train'), (valid_data, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=100,
        obj=fair_obj
    )
    
    # Predict valid
    valid_pred = np.expand_dims(model.predict(valid_data), -1)
    valid_pred = y_scale.inverse_transform(valid_pred)
    out_taipei_valid_preds.append(np.expm1(valid_pred))
    # Predict test
    test_data = xgb.DMatrix(out_taipei_X_test)
    pred = np.expand_dims(model.predict(test_data), -1)
    pred = y_scale.inverse_transform(pred)
    out_taipei_test_preds.append(np.expm1(pred))
        
    #Y_valid_predict = model.predict(valid_data)
    #Y_valid_predict = np.expm1(y_scale.inverse_transform(np.expand_dims(Y_valid_predict, -1)))
    #Y_valid = np.expm1(y_scale.inverse_transform(out_taipei_Y_train[val_index]))
    #metric(Y_valid, Y_valid_predict)

--------------------
Fold 1
--------------------
[0]	train-mae:1.24619	valid-mae:1.25513
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[100]	train-mae:0.32313	valid-mae:0.337175
[200]	train-mae:0.212962	valid-mae:0.232202
[300]	train-mae:0.186195	valid-mae:0.209128
[400]	train-mae:0.174376	valid-mae:0.200905
[500]	train-mae:0.166444	valid-mae:0.195736
[600]	train-mae:0.160274	valid-mae:0.19204
[700]	train-mae:0.155338	valid-mae:0.189178
[800]	train-mae:0.151022	valid-mae:0.186779
[900]	train-mae:0.147318	valid-mae:0.184916
[1000]	train-mae:0.143935	valid-mae:0.183194
[1100]	train-mae:0.140777	valid-mae:0.181691
[1200]	train-mae:0.137772	valid-mae:0.180281
[1300]	train-mae:0.134975	valid-mae:0.179066
[1400]	train-mae:0.132327	valid-mae:0.177904
[1500]	train-mae:0.129953	valid-mae:0.177006
[1600]	train-mae:0.127562	valid-mae:0.175985
[1700]	train-mae:0.125297	valid-mae:0.175064
[1800]	train-

In [16]:
out_taipei_y_valid = np.squeeze(np.concatenate(out_taipei_valid_preds, axis=0)) * offset[~train_greater_taipei_bool]
out_taipei_y_test = np.squeeze(np.mean(out_taipei_test_preds, axis=0)) * test.loc[~test_greater_taipei_bool, 'building_area'].values

In [17]:
y_valid = np.zeros(len(X_train))
y_valid[train_greater_taipei_bool] = in_taipei_y_valid
y_valid[~train_greater_taipei_bool] = out_taipei_y_valid

In [18]:
y_valid

array([  684278.8069427 ,  3075512.02307771,  9760546.70703877, ...,
       11931675.83723455, 16905429.59025576,  8195236.51387496])

In [19]:
valid_df = pd.DataFrame(y_valid, columns=["total_price"])
valid_df.to_csv("valid_prediction.csv", index=False)

In [20]:
y_test = np.zeros(len(test))
y_test[test_greater_taipei_bool] = in_taipei_y_test
y_test[~test_greater_taipei_bool] = out_taipei_y_test

In [21]:
y_test

array([11418943.69241195,  3950496.76818577, 10027978.02512846, ...,
        1129047.34312697,  2955567.97312601,  2910124.97006133])

In [22]:
submit = pd.read_csv("../input/dataset-0510/submit_test.csv")

with open("sample_submission.csv", "w") as f:
    f.write('building_id,total_price\n')
    for _id, label in zip(submit["building_id"], y_test):
        f.write(_id + ',' + str(label) + '\n')