In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer
import lightgbm as lgb
from collections import defaultdict

In [2]:
def num_null(df):
    """Print the number of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout: a header line followed by one
        ``<column> : <count>`` line per column containing nulls.
    """
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    # Iterate over (label, count) pairs rather than indexing the Series with
    # an integer position: positional `missing[i]` is deprecated and turns
    # into a label lookup when the column labels are integers.
    for column, count in missing[missing > 0].items():
        print(column, ':', count)

def metric(truth, pred):
    """Print and return the share of predictions within 10% of the truth.

    Both inputs are flattened before comparison, so a column vector
    (``(n, 1)``) and a flat vector (``(n,)``) are compared element by element
    instead of being broadcast against each other.

    Parameters
    ----------
    truth : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, one per ground-truth value.

    Returns
    -------
    float
        Fraction of elements whose relative error ``|pred - truth| / truth``
        is at most 0.1. The same number is printed to stdout.
    """
    truth = np.asarray(truth, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    rel_err = np.abs(pred - truth) / truth
    hit_rate = float(np.mean(rel_err <= 0.1))
    print(hit_rate)
    return hit_rate

**Feature Engineering**

In [3]:
# Load the raw train / test tables.
train = pd.read_csv("../input/dataset-0510/train.csv")
test = pd.read_csv("../input/dataset-0510/test.csv")

# Keep only training rows whose price per unit of building area lies
# strictly between 60,000 and 12,000,000.
price_sq = train["total_price"] / train["building_area"]
train = train[(price_sq<12000000) & (price_sq>60000)]

# Target: log1p of the price per unit of building area, standardised.
# `y_scale` is kept so predictions can be mapped back with inverse_transform.
Y_train = (train["total_price"] / train["building_area"]).to_numpy()
Y_train = np.log1p(Y_train).reshape(-1, 1)
y_scale = StandardScaler()
Y_train = pd.DataFrame(y_scale.fit_transform(Y_train), columns=['total_price'])

# Building area of each kept training row; multiplied back onto predicted
# unit prices later to recover total prices.
offset = train["building_area"].values
# Row masks for cities 7, 9 and 13 (the "greater taipei" group).
train_greater_taipei_bool = train['city'].isin([7, 9, 13])
test_greater_taipei_bool = test['city'].isin([7, 9, 13])

# Use the `columns=` keyword: passing the axis positionally (`drop(..., 1)`)
# is no longer accepted by pandas 2.x.
train = train.drop(columns='total_price')
# Stack train on top of test so feature engineering is applied to both.
data = pd.concat([train, test], ignore_index=True)

In [4]:
# parking_way == 2 is the value that yields have_parking == 0 below.
no_parking = data["parking_way"] == 2
with_parking = data["parking_way"] != 2

# Missing parking figures: 0 where parking_way == 2, otherwise the median of
# the remaining rows.
for parking_col in ("parking_area", "parking_price"):
    data.loc[no_parking, parking_col] = data.loc[no_parking, parking_col].fillna(0.0)
    data.loc[with_parking, parking_col] = data.loc[with_parking, parking_col].fillna(
        data.loc[with_parking, parking_col].median()
    )

data['txn_floor'] = data['txn_floor'].fillna(1)

# Fill village_income_median from progressively coarser group means:
# village, then town, then city.
for group_keys in (['city', 'town', 'village'], ['city', 'town'], ['city']):
    group_mean = data.groupby(group_keys)['village_income_median'].transform('mean')
    data['village_income_median'] = data['village_income_median'].fillna(round(group_mean))

data["floor_ratio"] = data["txn_floor"] / data["total_floor"]
# Replace zero land areas with the column median before dividing by them.
data.loc[data['land_area']==0, 'land_area'] = data['land_area'].median()
data["floor_area_ratio"] = data["building_area"] / data["land_area"]
data["have_parking"] = with_parking.astype(int)

# Categorical features are built on string copies of the raw columns.
cat_source_columns = ["town", "village", "txn_floor", "building_material", "city",
                      "building_type", "building_use", "parking_way"]
cat_data = data[cat_source_columns].astype(str)
# Qualify village with its city and town.
cat_data['village'] = cat_data["city"].str.cat([cat_data["town"], cat_data["village"]], sep="_")
cat_data["city_town_building_type_use"] = cat_data["city"].str.cat(
    [cat_data["town"], cat_data["building_type"], cat_data["building_use"]], sep="_"
)
cat_data["building_material_building_use"] = cat_data["building_material"].str.cat(
    cat_data["building_use"], sep="_"
)

# (txn_dt - building_complete_dt) divided by 365.
data["txn_duration"] = (data["txn_dt"] - data["building_complete_dt"]) / 365

# Integer-encode every categorical column independently.
cat_data = cat_data.apply(lambda column: LabelEncoder().fit_transform(column))

In [5]:
# Sanity check after imputation: list any columns of `data` that still contain nulls.
num_null(data)

Show #missing in the columns:


In [6]:
"""
corr_features = ['XIII_10000', 'VII_10000', 'V_10000', 'XIII_5000',
       'IX_10000', 'VIII_10000', 'III_10000', 'X_10000', 'XII_10000',
       'II_10000', 'VI_10000', 'XI_10000', 'jobschool_rate', 'I_10000',
       'IV_10000', 'V_5000', 'bachelor_rate', 'VII_5000', 'VIII_5000',
       'XII_5000', 'X_5000', 'master_rate', 'III_5000', 'IX_5000', 'II_5000']

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
data_poly = poly.fit_transform(data[corr_features])
data_poly_df = pd.DataFrame(data_poly, columns=poly.get_feature_names(corr_features))
data_poly_df.columns = data_poly_df.columns.str.replace(" ", "_")
"""

'\ncorr_features = [\'XIII_10000\', \'VII_10000\', \'V_10000\', \'XIII_5000\',\n       \'IX_10000\', \'VIII_10000\', \'III_10000\', \'X_10000\', \'XII_10000\',\n       \'II_10000\', \'VI_10000\', \'XI_10000\', \'jobschool_rate\', \'I_10000\',\n       \'IV_10000\', \'V_5000\', \'bachelor_rate\', \'VII_5000\', \'VIII_5000\',\n       \'XII_5000\', \'X_5000\', \'master_rate\', \'III_5000\', \'IX_5000\', \'II_5000\']\n\npoly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)\ndata_poly = poly.fit_transform(data[corr_features])\ndata_poly_df = pd.DataFrame(data_poly, columns=poly.get_feature_names(corr_features))\ndata_poly_df.columns = data_poly_df.columns.str.replace(" ", "_")\n'

In [7]:
# Number of distinct encoded values per categorical column.
cat_data.nunique()

town                               214
village                           4314
txn_floor                           28
building_material                    9
city                                11
building_type                        5
building_use                        10
parking_way                          3
city_town_building_type_use       2082
building_material_building_use      48
dtype: int64

In [8]:
# Continuous part of the feature matrix: everything except the identifier and
# the raw columns that are represented in `cat_data` instead.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
cont_data = data.drop(columns=["building_id",
                               "town", "village", "txn_floor", "building_material", "city",
                               "building_type", "building_use", "parking_way"])

# Log-transform columns whose absolute skewness exceeds 1.5.
skewness = cont_data.apply(skew)
skewness = skewness[abs(skewness) > 1.5]
print(str(skewness.shape[0]) + " skewed numerical features to log transform")

skewed_features = skewness.index
# NOTE(review): log1p yields NaN/-inf for values <= -1; presumably none of the
# selected columns go that low — confirm.
cont_data[skewed_features] = np.log1p(cont_data[skewed_features])

# `cont_data` and `cat_data` are both derived from `data` and share its index,
# so a plain column-wise concat lines the rows up; the `join_axes=` argument
# used previously was removed in pandas 1.0.
train_data = pd.concat([cont_data, cat_data], axis=1)

132 skewed numerical features to log transform


In [9]:
"""
data = data.drop(["building_id", "parking_area", "txn_dt", "building_complete_dt",
                  "town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way",
                  'XIII_10000', 'VII_10000', 'V_10000', 'XIII_5000',
       'IX_10000', 'VIII_10000', 'III_10000', 'X_10000', 'XII_10000',
       'II_10000', 'VI_10000', 'XI_10000', 'jobschool_rate', 'I_10000',
       'IV_10000', 'V_5000', 'bachelor_rate', 'VII_5000', 'VIII_5000',
       'XII_5000', 'X_5000', 'master_rate', 'III_5000', 'IX_5000', 'II_5000'], 1)
cont_data = pd.concat([data, data_poly_df], axis=1, join_axes=[data.index])
scale = StandardScaler()
cont_data = pd.DataFrame(scale.fit_transform(cont_data.values), columns=cont_data.columns, index=cont_data.index)
train_data = pd.concat([cont_data, cat_data], axis=1, join_axes=[cont_data.index])
"""

'\ndata = data.drop(["building_id", "parking_area", "txn_dt", "building_complete_dt",\n                  "town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way",\n                  \'XIII_10000\', \'VII_10000\', \'V_10000\', \'XIII_5000\',\n       \'IX_10000\', \'VIII_10000\', \'III_10000\', \'X_10000\', \'XII_10000\',\n       \'II_10000\', \'VI_10000\', \'XI_10000\', \'jobschool_rate\', \'I_10000\',\n       \'IV_10000\', \'V_5000\', \'bachelor_rate\', \'VII_5000\', \'VIII_5000\',\n       \'XII_5000\', \'X_5000\', \'master_rate\', \'III_5000\', \'IX_5000\', \'II_5000\'], 1)\ncont_data = pd.concat([data, data_poly_df], axis=1, join_axes=[data.index])\nscale = StandardScaler()\ncont_data = pd.DataFrame(scale.fit_transform(cont_data.values), columns=cont_data.columns, index=cont_data.index)\ntrain_data = pd.concat([cont_data, cat_data], axis=1, join_axes=[cont_data.index])\n'

In [10]:
# Columns handed to the model as pandas `category` dtype.
categorical_features = [
    "town",
    "village",
    "txn_floor",
    "building_material",
    "city",
    "building_type",
    "building_use",
    "parking_way",
    "city_town_building_type_use",
    "building_material_building_use",
]

# Cast all of them in a single astype call.
train_data = train_data.astype({name: 'category' for name in categorical_features})

In [11]:
# `data` was built as the filtered `train` rows followed by the `test` rows,
# so the split point is the number of training rows. Deriving it from `train`
# (instead of hard-coding a test size of 10000) keeps the split aligned with
# the train/test masks if the input files change.
n_train = len(train)
X_train = train_data.iloc[:n_train]
X_test = train_data.iloc[n_train:]

**LGBM**

In [12]:
# LightGBM parameters passed to both lgb.train calls in this notebook.
param_ = dict(
    boosting_type='gbdt',
    objective='huber',
    learning_rate=0.01,
    num_leaves=200,
    feature_fraction=0.25,
    seed=42,
    bagging_fraction=0.95,
    bagging_freq=5,
    save_binary=True,
    max_bin=63,
    metric='l1',
)

**Greater Taipei model (cities 7, 9, 13)**

In [13]:
# Rows flagged by the greater-taipei masks: training features, training
# target and test features.
in_taipei_X_train = X_train.loc[train_greater_taipei_bool.values]
in_taipei_Y_train = Y_train.loc[train_greater_taipei_bool.values]

in_taipei_X_test = X_test.loc[test_greater_taipei_bool.values]

In [14]:
# Per-fold predictions, already mapped back to unit-price scale.
in_taipei_valid_preds = []
in_taipei_test_preds = []
# shuffle=False keeps each validation fold a contiguous block, so
# concatenating the per-fold validation predictions later gives them in the
# original row order.
kf = KFold(n_splits=7, shuffle=False)

for i, (train_index, val_index) in enumerate(kf.split(in_taipei_X_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    # Fold-local names: the datasets must not be called `train_data`, which is
    # the full feature DataFrame that the split cell slices; rebinding it here
    # would break re-running the earlier cells.
    fold_train_set = lgb.Dataset(
        in_taipei_X_train.iloc[train_index],
        label=in_taipei_Y_train.iloc[train_index]
    )
    fold_valid_set = lgb.Dataset(
        in_taipei_X_train.iloc[val_index],
        label=in_taipei_Y_train.iloc[val_index],
        reference=fold_train_set
    )
    # The round limit is effectively unbounded; training ends through early
    # stopping on the validation set.
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were removed from lgb.train in LightGBM 4.0 in favour of callbacks —
    # confirm the installed version before upgrading.
    model = lgb.train(
        param_,
        fold_train_set,
        3000000,
        valid_sets=[fold_train_set, fold_valid_set],
        early_stopping_rounds=100,
        verbose_eval=100
    )

    # Predict valid: undo the target standardisation, then the log1p.
    valid_pred = np.expand_dims(model.predict(in_taipei_X_train.iloc[val_index], num_iteration=model.best_iteration), -1)
    valid_pred = y_scale.inverse_transform(valid_pred)
    in_taipei_valid_preds.append(np.expm1(valid_pred))
    # Predict test with the same inverse mapping.
    pred = np.expand_dims(model.predict(in_taipei_X_test, num_iteration=model.best_iteration), -1)
    pred = y_scale.inverse_transform(pred)
    in_taipei_test_preds.append(np.expm1(pred))

--------------------
Fold 1
--------------------




Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 0.280073	valid_1's l1: 0.285071
[200]	training's l1: 0.171406	valid_1's l1: 0.183112
[300]	training's l1: 0.131264	valid_1's l1: 0.150068
[400]	training's l1: 0.11144	valid_1's l1: 0.136173
[500]	training's l1: 0.0985768	valid_1's l1: 0.12799
[600]	training's l1: 0.08897	valid_1's l1: 0.122494
[700]	training's l1: 0.0818115	valid_1's l1: 0.118893
[800]	training's l1: 0.0758628	valid_1's l1: 0.116088
[900]	training's l1: 0.0709435	valid_1's l1: 0.114059
[1000]	training's l1: 0.0667478	valid_1's l1: 0.112515
[1100]	training's l1: 0.0630511	valid_1's l1: 0.111333
[1200]	training's l1: 0.0597163	valid_1's l1: 0.110351
[1300]	training's l1: 0.0567269	valid_1's l1: 0.109504
[1400]	training's l1: 0.0540992	valid_1's l1: 0.10885
[1500]	training's l1: 0.0516019	valid_1's l1: 0.108222
[1600]	training's l1: 0.0493381	valid_1's l1: 0.107704
[1700]	training's l1: 0.0472227	valid_1's l1: 0.107247
[1800]	training's l1

In [15]:
# Out-of-fold unit prices, stacked fold after fold, times each row's building
# area gives total prices.
in_taipei_y_valid = np.concatenate(in_taipei_valid_preds, axis=0).squeeze()
in_taipei_y_valid = in_taipei_y_valid * offset[train_greater_taipei_bool.values]
# Test unit prices are averaged over the folds before the same rescaling.
in_taipei_y_test = np.mean(in_taipei_test_preds, axis=0).squeeze()
in_taipei_y_test = in_taipei_y_test * test.loc[test_greater_taipei_bool, 'building_area'].values

**Outside Greater Taipei model (all other cities)**

In [16]:
# Rows NOT flagged by the greater-taipei masks: training features, training
# target and test features.
out_taipei_X_train = X_train.loc[~train_greater_taipei_bool.values]
out_taipei_Y_train = Y_train.loc[~train_greater_taipei_bool.values]

out_taipei_X_test = X_test.loc[~test_greater_taipei_bool.values]

In [17]:
# Per-fold predictions, already mapped back to unit-price scale.
out_taipei_valid_preds = []
out_taipei_test_preds = []
# shuffle=False keeps each validation fold a contiguous block, so
# concatenating the per-fold validation predictions later gives them in the
# original row order.
kf = KFold(n_splits=7, shuffle=False)

for i, (train_index, val_index) in enumerate(kf.split(out_taipei_X_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    # Fold-local names: the datasets must not be called `train_data`, which is
    # the full feature DataFrame that the split cell slices; rebinding it here
    # would break re-running the earlier cells.
    fold_train_set = lgb.Dataset(
        out_taipei_X_train.iloc[train_index],
        label=out_taipei_Y_train.iloc[train_index]
    )
    fold_valid_set = lgb.Dataset(
        out_taipei_X_train.iloc[val_index],
        label=out_taipei_Y_train.iloc[val_index],
        reference=fold_train_set
    )
    # The round limit is effectively unbounded; training ends through early
    # stopping on the validation set.
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were removed from lgb.train in LightGBM 4.0 in favour of callbacks —
    # confirm the installed version before upgrading.
    model = lgb.train(
        param_,
        fold_train_set,
        3000000,
        valid_sets=[fold_train_set, fold_valid_set],
        early_stopping_rounds=100,
        verbose_eval=100
    )

    # Predict valid: undo the target standardisation, then the log1p.
    valid_pred = np.expand_dims(model.predict(out_taipei_X_train.iloc[val_index], num_iteration=model.best_iteration), -1)
    valid_pred = y_scale.inverse_transform(valid_pred)
    out_taipei_valid_preds.append(np.expm1(valid_pred))
    # Predict test with the same inverse mapping.
    pred = np.expand_dims(model.predict(out_taipei_X_test, num_iteration=model.best_iteration), -1)
    pred = y_scale.inverse_transform(pred)
    out_taipei_test_preds.append(np.expm1(pred))

--------------------
Fold 1
--------------------
Training until validation scores don't improve for 100 rounds.
[100]	training's l1: 0.311838	valid_1's l1: 0.330341
[200]	training's l1: 0.222975	valid_1's l1: 0.252175
[300]	training's l1: 0.178606	valid_1's l1: 0.217103
[400]	training's l1: 0.152508	valid_1's l1: 0.199671
[500]	training's l1: 0.134727	valid_1's l1: 0.189005
[600]	training's l1: 0.121343	valid_1's l1: 0.181868
[700]	training's l1: 0.111073	valid_1's l1: 0.177207
[800]	training's l1: 0.102623	valid_1's l1: 0.173716
[900]	training's l1: 0.0955819	valid_1's l1: 0.171159
[1000]	training's l1: 0.089402	valid_1's l1: 0.169095
[1100]	training's l1: 0.0838408	valid_1's l1: 0.167361
[1200]	training's l1: 0.0788819	valid_1's l1: 0.165916
[1300]	training's l1: 0.0745151	valid_1's l1: 0.164819
[1400]	training's l1: 0.0704019	valid_1's l1: 0.163795
[1500]	training's l1: 0.0668148	valid_1's l1: 0.162974
[1600]	training's l1: 0.0634802	valid_1's l1: 0.162198
[1700]	training's l1: 0.06

In [18]:
# Out-of-fold unit prices, stacked fold after fold, times each row's building
# area gives total prices.
out_taipei_y_valid = np.concatenate(out_taipei_valid_preds, axis=0).squeeze()
out_taipei_y_valid = out_taipei_y_valid * offset[~train_greater_taipei_bool.values]
# Test unit prices are averaged over the folds before the same rescaling.
out_taipei_y_test = np.mean(out_taipei_test_preds, axis=0).squeeze()
out_taipei_y_test = out_taipei_y_test * test.loc[~test_greater_taipei_bool, 'building_area'].values

In [19]:
# Scatter the two regional out-of-fold predictions back into one vector with
# one entry per training row.
y_valid = np.zeros(X_train.shape[0])
y_valid[train_greater_taipei_bool.values] = in_taipei_y_valid
y_valid[~train_greater_taipei_bool.values] = out_taipei_y_valid

In [20]:
# Eyeball the merged out-of-fold total-price predictions.
y_valid

array([  662192.22415542,  3116298.84052955,  9683504.714471  , ...,
       12468510.33563988, 16654874.52168291,  8134810.48408049])

In [21]:
# Persist the out-of-fold predictions as a one-column CSV.
valid_df = pd.DataFrame({"total_price": y_valid})
valid_df.to_csv("valid_prediction.csv", index=False)

In [22]:
# Scatter the two regional test predictions back into one vector with one
# entry per test row.
y_test = np.zeros(test.shape[0])
y_test[test_greater_taipei_bool.values] = in_taipei_y_test
y_test[~test_greater_taipei_bool.values] = out_taipei_y_test

In [23]:
# Eyeball the merged test-set total-price predictions.
y_test

array([10569765.65295199,  3871415.94826811, 10861397.96920879, ...,
        1105961.71842607,  3073177.54404006,  2989690.15574406])

In [24]:
# The submission template supplies the building ids to write out.
# NOTE(review): assumes its rows are in the same order as `test` — confirm.
submit = pd.read_csv("../input/dataset-0510/submit_test.csv")

# zip() stops at the shorter input, so a length mismatch would silently
# produce an incomplete submission; fail loudly instead.
if len(submit) != len(y_test):
    raise ValueError(
        f"submission template has {len(submit)} rows but there are {len(y_test)} predictions"
    )

with open("sample_submission.csv", "w") as f:
    f.write('building_id,total_price\n')
    for _id, label in zip(submit["building_id"], y_test):
        # f-string formatting also works when the ids are not strings.
        f.write(f"{_id},{label}\n")