In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.models import Sequential, load_model, Model
from keras.layers import Dense, Input, Embedding, concatenate, Flatten, Activation, Dropout, Lambda, add, multiply
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical
from keras import backend as K

Using TensorFlow backend.


In [2]:
def num_null(df):
    """Print every column of ``df`` that contains missing values, with its count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Output is a header line followed by one ``<column> : <count>`` line per
    column whose null count is non-zero. Nothing is returned.
    """
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    # Iterate (label, count) pairs instead of indexing the Series with a
    # position: `missing[i]` is label-based when the column labels are
    # integers and its positional fallback is deprecated in recent pandas.
    for column, count in missing.items():
        if count:
            print(column, ':', count)

def metric(truth, pred):
    """Print and return the fraction of predictions within 10% of the truth.

    Parameters
    ----------
    truth : array-like
        Ground-truth values.
    pred : array-like
        Predicted values, same shape as ``truth``.

    Returns
    -------
    float
        Share of elements with ``|pred - truth| / |truth| <= 0.1``.
        ``nan`` when the input is empty.
    """
    truth = np.asarray(truth, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if truth.size == 0:
        # No samples: the ratio is undefined; report nan instead of raising
        # ZeroDivisionError.
        hit_rate = float('nan')
        print(hit_rate)
        return hit_rate
    # Divide by |truth| so a negative target cannot produce a negative
    # relative error that would always count as a hit.
    diff = np.abs(pred - truth) / np.abs(truth)
    hit_rate = int(np.count_nonzero(diff <= 0.1)) / diff.size
    print(hit_rate)
    return hit_rate

In [3]:
train = pd.read_csv("../input/dataset-0510/train.csv")
test = pd.read_csv("../input/dataset-0510/test.csv")

# Keep only training rows whose price per unit of building area lies strictly
# between the two bounds below.
price_sq = train["total_price"] / train["building_area"]
train = train[(price_sq<12000000) & (price_sq>60000)]

# Regression target: log1p of price per unit of building area, standardized.
# y_scale is kept so predictions can be mapped back with inverse_transform.
Y_train = train["total_price"] / train["building_area"]
Y_train = np.expand_dims(np.log1p(Y_train), -1)
y_scale = StandardScaler()
Y_train = y_scale.fit_transform(Y_train)

# Building area of each kept training row; used later to turn per-area
# predictions back into total prices.
offset = train["building_area"].values
# Row masks for the city codes 7, 9 and 13 (the subset this notebook calls
# "greater Taipei"); a separate model is trained for each side of the mask.
train_greater_taipei_bool = train['city'].isin([7, 9, 13])
test_greater_taipei_bool = test['city'].isin([7, 9, 13])

# Use the keyword form: passing the axis positionally (drop('total_price', 1))
# is rejected by pandas >= 2.0.
train = train.drop(columns='total_price')
# Train rows first, test rows last; later cells rely on this ordering.
data = pd.concat([train, test], ignore_index=True)

In [4]:
# List the columns of the combined train+test frame that contain missing values.
num_null(data)

Show #missing in the columns:
parking_area : 66266
parking_price : 53672
txn_floor : 18455
village_income_median : 1317


In [5]:
# ---- Missing-value imputation ----
# Rows with parking_way == 2 get 0 for missing parking fields; every other row
# gets the median of that column over the parking_way != 2 rows.
no_parking = data["parking_way"] == 2
has_parking = data["parking_way"] != 2
for parking_col in ['parking_area', 'parking_price']:
    data.loc[no_parking, parking_col] = data.loc[no_parking, parking_col].fillna(0.0)
    has_parking_median = data.loc[has_parking, parking_col].median()
    data.loc[has_parking, parking_col] = data.loc[has_parking, parking_col].fillna(has_parking_median)

data['txn_floor'] = data['txn_floor'].fillna(1)

# Fill village_income_median from progressively coarser group means:
# village, then town, then city.
for group_keys in (['city', 'town', 'village'], ['city', 'town'], ['city']):
    group_mean = data.groupby(group_keys)['village_income_median'].transform('mean')
    data['village_income_median'] = data['village_income_median'].fillna(round(group_mean))

# ---- Derived continuous features ----
data["floor_ratio"] = data["txn_floor"] / data["total_floor"]
# A land_area of 0 would make the ratio below infinite; replace it with the
# column median (computed before the replacement).
land_area_median = data['land_area'].median()
zero_land = data['land_area'] == 0
data.loc[zero_land, 'land_area'] = land_area_median
data["floor_area_ratio"] = data["building_area"] / data["land_area"]
data["have_parking"] = has_parking.astype(int)

# ---- Categorical features (as strings, then label-encoded) ----
cat_columns = ["town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way"]
cat_data = data[cat_columns].astype(str)
# Qualify village with its city and town so the key is unique across towns.
cat_data['village'] = cat_data["city"].str.cat([cat_data["town"], cat_data["village"]], sep="_")
cat_data["city_town_building_type_use"] = cat_data["city"].str.cat(
    [cat_data["town"], cat_data["building_type"], cat_data["building_use"]], sep="_")
cat_data["building_material_building_use"] = cat_data["building_material"].str.cat(cat_data["building_use"], sep="_")

# Gap between transaction and completion, divided by 365
# (presumably both columns are day counts, giving years; TODO confirm).
data["txn_duration"] = (data["txn_dt"] - data["building_complete_dt"]) / 365

# Replace every categorical column by integer codes; the encoder is refit per column.
label_encoder = LabelEncoder()
cat_data = cat_data.apply(label_encoder.fit_transform)

In [6]:
# Re-run the missing-value report now that the imputation cell has been applied.
num_null(data)

Show #missing in the columns:


In [7]:
"""
for index in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII','VIII','IX','X', 'XI', 'XII', 'XIII', 'XIV']:
    data['{}_index_100'.format(index)] = np.sign(data['{}_100'.format(index)])
corr_features = ['I_index_100', 'II_index_100', 'III_index_100', 'IV_index_100', 'V_index_100', 'VI_index_100', 'VII_index_100', 'VIII_index_100',
                'IX_index_100', 'X_index_100', 'XI_index_100', 'XII_index_100', 'XIII_index_100', 'XIV_index_100']
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
data_poly = poly.fit_transform(data[corr_features])
data_poly_df = pd.DataFrame(data_poly, columns=poly.get_feature_names(corr_features))
data_poly_df.columns = data_poly_df.columns.str.replace(" ", "_")
"""

'\nfor index in [\'I\', \'II\', \'III\', \'IV\', \'V\', \'VI\', \'VII\',\'VIII\',\'IX\',\'X\', \'XI\', \'XII\', \'XIII\', \'XIV\']:\n    data[\'{}_index_100\'.format(index)] = np.sign(data[\'{}_100\'.format(index)])\ncorr_features = [\'I_index_100\', \'II_index_100\', \'III_index_100\', \'IV_index_100\', \'V_index_100\', \'VI_index_100\', \'VII_index_100\', \'VIII_index_100\',\n                \'IX_index_100\', \'X_index_100\', \'XI_index_100\', \'XII_index_100\', \'XIII_index_100\', \'XIV_index_100\']\npoly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)\ndata_poly = poly.fit_transform(data[corr_features])\ndata_poly_df = pd.DataFrame(data_poly, columns=poly.get_feature_names(corr_features))\ndata_poly_df.columns = data_poly_df.columns.str.replace(" ", "_")\n'

In [8]:
# Number of distinct encoded values in each categorical column.
cat_data.nunique()

town                               214
village                           4314
txn_floor                           28
building_material                    9
city                                11
building_type                        5
building_use                        10
parking_way                          3
city_town_building_type_use       2082
building_material_building_use      48
dtype: int64

In [9]:
# Continuous features: everything in `data` except the id and the raw
# categorical columns (those are handled separately in cat_data).
# `columns=` is used because a positional axis (data.drop([...], 1)) is
# rejected by pandas >= 2.0.
cont_data = data.drop(columns=["building_id",
                               "town", "village", "txn_floor", "building_material", "city", "building_type", "building_use", "parking_way"])

# log1p-transform every column whose absolute skewness exceeds 1.5.
skewness = cont_data.apply(skew)
skewness = skewness[abs(skewness) > 1.5]
print(str(skewness.shape[0]) + " skewed numerical features to log transform")

skewed_features = skewness.index
cont_data[skewed_features] = np.log1p(cont_data[skewed_features])

# Standardize all continuous columns; the scaler is fit on train and test rows together.
scale = StandardScaler()
cont_data = pd.DataFrame(scale.fit_transform(cont_data.values), columns=cont_data.columns, index=cont_data.index)

132 skewed numerical features to log transform


In [10]:
# Distinct-value count per categorical column; dnn_model sizes each embedding
# table from the number of distinct values in the matching cat_data column.
cat_data.nunique()

town                               214
village                           4314
txn_floor                           28
building_material                    9
city                                11
building_type                        5
building_use                        10
parking_way                          3
city_town_building_type_use       2082
building_material_building_use      48
dtype: int64

In [11]:
# Names of the cont_data columns used as the continuous model input.
# How this subset was chosen is not shown in this notebook.
selected_features = ['IX_1000', 'I_1000', 'IV_500', 'XI_1000', 'txn_dt', 'I_5000', 'V_10000', 'VII_500', 'XII_10000', 'lat', 'VI_MIN', 'VIII_1000', 'V_500', 'IV_1000', 'II_250', 'XIV_10000', 'VII_MIN', 'XI_250', 'XII_1000', 'X_MIN', 'jobschool_rate', 'XIV_100', 'floor_ratio', 'VII_5000', 'VI_5000', 'III_5000', 'X_500', 'X_250', 'VI_10000', 'village_income_median', 'VIII_10000', 'II_MIN', 'IX_10000', 'parking_price', 'XII_MIN', 'master_rate', 'lon', 'IV_10000', 'VII_10000', 'VII_1000', 'X_10000', 'II_10000', 'txn_duration', 'town_population_density', 'XIV_MIN', 'V_5000', 'IX_5000', 'VIII_100', 'XIV_250', 'XIII_5000', 'building_area', 'XIV_1000', 'floor_area_ratio', 'town_population', 'IX_MIN', 'parking_area', 'II_1000', 'III_1000', 'XII_5000', 'III_10000', 'III_500', 'II_500', 'IV_MIN', 'XI_MIN', 'I_MIN', 'town_area', 'XIII_500', 'V_250', 'XI_10000', 'total_floor', 'bachelor_rate', 'X_5000', 'IX_500', 'V_MIN', 'VIII_5000', 'I_500', 'IX_250', 'II_5000', 'XIII_1000', 'XI_500', 'doc_rate', 'III_250', 'VIII_250', 'I_10000', 'XII_250', 'X_1000', 'XIII_MIN', 'VII_250', 'VIII_MIN', 'XIV_5000', 'III_MIN', 'XIII_10000', 'V_1000', 'VIII_500', 'IV_5000', 'building_complete_dt', 'land_area', 'XI_5000', 'VI_1000', 'XIV_500', 'XII_500', 'XII_100', 'N_50']

In [12]:
# `data` was built as concat([train, test]), so the test rows are the last
# len(test) rows. Derive the boundary from the frames instead of hard-coding
# the test-set size, so the split stays aligned with test_greater_taipei_bool.
n_train = len(cont_data) - len(test)
cont_train = cont_data[selected_features].iloc[:n_train].values
cat_train = cat_data.iloc[:n_train].values
cont_test = cont_data[selected_features].iloc[n_train:].values
cat_test = cat_data.iloc[n_train:].values

**Latent Cross**

In [13]:
def dnn_model(embed_dim=512, hidden_units=(4096, 2048, 1024, 512, 256)):
    """Build the (uncompiled) latent-cross regression network.

    Every categorical column gets its own embedding of width ``embed_dim``.
    The embeddings are summed, shifted by +1 and multiplied element-wise with
    a bias-free linear projection of the continuous features to the same
    width. The product feeds a stack of Dense + PReLU layers and a single
    linear output unit.

    Reads the module-level ``cont_train`` (continuous input width),
    ``cat_train`` (number of categorical columns) and ``cat_data`` (distinct
    values per categorical column).

    Parameters
    ----------
    embed_dim : int, default 512
        Width shared by every embedding and by the continuous projection;
        the two must match for the element-wise product.
    hidden_units : sequence of int, default (4096, 2048, 1024, 512, 256)
        Width of each hidden Dense layer, in order.

    Returns
    -------
    keras.models.Model
        Model whose inputs are ``[continuous] + [one (n, 1) input per
        categorical column]`` and whose output has shape ``(n, 1)``.
    """
    cont_inputs = Input(shape=(cont_train.shape[1],))

    cat_inputs = []
    cat_embeds = []
    for col in range(cat_train.shape[1]):
        cat_input = Input(shape=(1,))
        # Embedding table size = number of distinct values in this column.
        vocab_size = len(set(cat_data.iloc[:, col]))
        embedded = Embedding(vocab_size, embed_dim, input_length=1, embeddings_initializer='glorot_normal')(cat_input)
        cat_inputs.append(cat_input)
        cat_embeds.append(Flatten()(embedded))

    cont_dense = Dense(embed_dim, use_bias=False)(cont_inputs)
    cat_dense = add(cat_embeds)
    # (1 + sum of embeddings) * projection: the +1 keeps the plain continuous
    # projection as an additive term of the product.
    cat_dense = Lambda(lambda x: x + 1)(cat_dense)
    x = multiply([cat_dense, cont_dense])

    for units in hidden_units:
        x = Dense(units)(x)
        x = PReLU()(x)

    predictions = Dense(1)(x)
    model = Model(inputs=[cont_inputs] + cat_inputs, outputs=predictions)
    return model

In [14]:
#cont_X_train, cont_X_valid, cat_X_train, cat_X_valid, y_train, y_valid = train_test_split(cont_train[~train_greater_taipei_bool.values], 
#                                                                                          cat_train[~train_greater_taipei_bool.values], 
#                                                                                          Y_train[~train_greater_taipei_bool.values], test_size = 0.2, random_state = 42)
#X_train = [cont_X_train] + [np.expand_dims(cat_X_train[:, i], -1)  for i in range(cat_train.shape[1])]
#X_valid = [cont_X_valid] + [np.expand_dims(cat_X_valid[:, i], -1)  for i in range(cat_train.shape[1])]

In [15]:
# Slice out the rows flagged by the "greater Taipei" masks (city in [7, 9, 13]).
taipei_train_mask = train_greater_taipei_bool.values
taipei_test_mask = test_greater_taipei_bool.values

in_taipei_cont_train, in_taipei_cat_train, in_taipei_Y_train = (
    arr[taipei_train_mask] for arr in (cont_train, cat_train, Y_train)
)
in_taipei_cont_test, in_taipei_cat_test = (
    arr[taipei_test_mask] for arr in (cont_test, cat_test)
)

In [16]:
# 7-fold cross-validation on the "greater Taipei" subset.
# Collects, per fold, the validation predictions and the test predictions,
# both mapped back from the standardized log1p target to price per unit area.
in_taipei_valid_preds = []
in_taipei_test_preds = []
# shuffle=False yields consecutive, in-order folds, so concatenating the
# per-fold validation predictions reproduces the row order of the subset.
kf = KFold(n_splits=7, shuffle=False)

n_cat = cat_train.shape[1]
# Single source of truth for the checkpoint file (written by ModelCheckpoint,
# read back by load_model).
filepath = "model.h5"
# The test inputs do not depend on the fold, so build them once.
in_taipei_X_test = [in_taipei_cont_test] + [np.expand_dims(in_taipei_cat_test[:, j], -1) for j in range(n_cat)]

for i, (train_index, val_index) in enumerate(kf.split(in_taipei_cont_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    # Model input format: [continuous matrix] + one (n, 1) column per categorical feature.
    X_train = [in_taipei_cont_train[train_index]] + [np.expand_dims(in_taipei_cat_train[train_index, j], -1) for j in range(n_cat)]
    y_train = in_taipei_Y_train[train_index]
    X_valid = [in_taipei_cont_train[val_index]] + [np.expand_dims(in_taipei_cat_train[val_index, j], -1) for j in range(n_cat)]
    y_valid = in_taipei_Y_train[val_index]

    model = dnn_model()
    model.compile(loss='mean_absolute_error', optimizer=optimizers.Adam(0.0001), metrics=['mse'])
    # Keep only the weights with the best validation loss; stop after 100
    # epochs without improvement (the epoch count is just an upper bound).
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=2)
    callbacks_list = [checkpoint, early_stopping]
    model.fit(X_train, y_train, epochs=200000, batch_size=128, validation_data=(X_valid, y_valid), callbacks=callbacks_list)

    # Drop the training graph, then reload the best checkpoint for inference.
    K.clear_session()
    best_model = load_model(filepath)

    # Undo the target transform: inverse-standardize, then expm1.
    valid_pred = best_model.predict(X_valid)
    valid_pred = y_scale.inverse_transform(valid_pred)
    in_taipei_valid_preds.append(np.expm1(valid_pred))

    pred = best_model.predict(in_taipei_X_test)
    pred = y_scale.inverse_transform(pred)
    in_taipei_test_preds.append(np.expm1(pred))

--------------------
Fold 1
--------------------
Train on 26982 samples, validate on 4498 samples
Epoch 1/200000

Epoch 00001: val_loss improved from inf to 0.18431, saving model to model.h5
Epoch 2/200000

Epoch 00002: val_loss improved from 0.18431 to 0.17101, saving model to model.h5
Epoch 3/200000

Epoch 00003: val_loss improved from 0.17101 to 0.14911, saving model to model.h5
Epoch 4/200000

Epoch 00004: val_loss improved from 0.14911 to 0.14018, saving model to model.h5
Epoch 5/200000

Epoch 00005: val_loss did not improve from 0.14018
Epoch 6/200000

Epoch 00006: val_loss improved from 0.14018 to 0.13751, saving model to model.h5
Epoch 7/200000

Epoch 00007: val_loss did not improve from 0.13751
Epoch 8/200000

Epoch 00008: val_loss improved from 0.13751 to 0.13243, saving model to model.h5
Epoch 9/200000

Epoch 00009: val_loss did not improve from 0.13243
Epoch 10/200000

Epoch 00010: val_loss improved from 0.13243 to 0.12815, saving model to model.h5
Epoch 11/200000

Epoch 00

In [17]:
# Predictions are prices per unit of building area; multiply by the area to get total price.
# Validation: stack the per-fold predictions in fold order.
in_taipei_valid_unit_price = np.concatenate(in_taipei_valid_preds, axis=0).squeeze()
in_taipei_y_valid = in_taipei_valid_unit_price * offset[train_greater_taipei_bool]

# Test: average the predictions of the fold models.
in_taipei_test_unit_price = np.mean(in_taipei_test_preds, axis=0).squeeze()
in_taipei_test_area = test.loc[test_greater_taipei_bool, 'building_area'].values
in_taipei_y_test = in_taipei_test_unit_price * in_taipei_test_area

In [18]:
# Slice out the rows NOT flagged by the "greater Taipei" masks (city not in [7, 9, 13]).
non_taipei_train_mask = ~train_greater_taipei_bool.values
non_taipei_test_mask = ~test_greater_taipei_bool.values

out_taipei_cont_train, out_taipei_cat_train, out_taipei_Y_train = (
    arr[non_taipei_train_mask] for arr in (cont_train, cat_train, Y_train)
)
out_taipei_cont_test, out_taipei_cat_test = (
    arr[non_taipei_test_mask] for arr in (cont_test, cat_test)
)

In [19]:
# 7-fold cross-validation on the rows outside the "greater Taipei" subset.
# Collects, per fold, the validation predictions and the test predictions,
# both mapped back from the standardized log1p target to price per unit area.
out_taipei_valid_preds = []
out_taipei_test_preds = []
# shuffle=False yields consecutive, in-order folds, so concatenating the
# per-fold validation predictions reproduces the row order of the subset.
kf = KFold(n_splits=7, shuffle=False)

n_cat = cat_train.shape[1]
# Single source of truth for the checkpoint file (written by ModelCheckpoint,
# read back by load_model).
filepath = "model.h5"
# The test inputs do not depend on the fold, so build them once.
out_taipei_X_test = [out_taipei_cont_test] + [np.expand_dims(out_taipei_cat_test[:, j], -1) for j in range(n_cat)]

for i, (train_index, val_index) in enumerate(kf.split(out_taipei_cont_train)):
    print("-" * 20)
    print(f"Fold {i+1}")
    print("-" * 20)
    # Model input format: [continuous matrix] + one (n, 1) column per categorical feature.
    X_train = [out_taipei_cont_train[train_index]] + [np.expand_dims(out_taipei_cat_train[train_index, j], -1) for j in range(n_cat)]
    y_train = out_taipei_Y_train[train_index]
    X_valid = [out_taipei_cont_train[val_index]] + [np.expand_dims(out_taipei_cat_train[val_index, j], -1) for j in range(n_cat)]
    y_valid = out_taipei_Y_train[val_index]

    model = dnn_model()
    model.compile(loss='mean_absolute_error', optimizer=optimizers.Adam(0.0001), metrics=['mse'])
    # Keep only the weights with the best validation loss; stop after 100
    # epochs without improvement (the epoch count is just an upper bound).
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=2)
    callbacks_list = [checkpoint, early_stopping]
    model.fit(X_train, y_train, epochs=200000, batch_size=128, validation_data=(X_valid, y_valid), callbacks=callbacks_list)

    # Drop the training graph, then reload the best checkpoint for inference.
    K.clear_session()
    best_model = load_model(filepath)

    # Undo the target transform: inverse-standardize, then expm1.
    valid_pred = best_model.predict(X_valid)
    valid_pred = y_scale.inverse_transform(valid_pred)
    out_taipei_valid_preds.append(np.expm1(valid_pred))

    pred = best_model.predict(out_taipei_X_test)
    pred = y_scale.inverse_transform(pred)
    out_taipei_test_preds.append(np.expm1(pred))

--------------------
Fold 1
--------------------
Train on 24332 samples, validate on 4056 samples
Epoch 1/200000

Epoch 00001: val_loss improved from inf to 0.23385, saving model to model.h5
Epoch 2/200000

Epoch 00002: val_loss improved from 0.23385 to 0.20631, saving model to model.h5
Epoch 3/200000

Epoch 00003: val_loss improved from 0.20631 to 0.19459, saving model to model.h5
Epoch 4/200000

Epoch 00004: val_loss did not improve from 0.19459
Epoch 5/200000

Epoch 00005: val_loss did not improve from 0.19459
Epoch 6/200000

Epoch 00006: val_loss improved from 0.19459 to 0.18791, saving model to model.h5
Epoch 7/200000

Epoch 00007: val_loss did not improve from 0.18791
Epoch 8/200000

Epoch 00008: val_loss did not improve from 0.18791
Epoch 9/200000

Epoch 00009: val_loss improved from 0.18791 to 0.18415, saving model to model.h5
Epoch 10/200000

Epoch 00010: val_loss improved from 0.18415 to 0.18229, saving model to model.h5
Epoch 11/200000

Epoch 00011: val_loss improved from 0.

In [20]:
# Predictions are prices per unit of building area; multiply by the area to get total price.
# Validation: stack the per-fold predictions in fold order.
out_taipei_valid_unit_price = np.concatenate(out_taipei_valid_preds, axis=0).squeeze()
out_taipei_y_valid = out_taipei_valid_unit_price * offset[~train_greater_taipei_bool]

# Test: average the predictions of the fold models.
out_taipei_test_unit_price = np.mean(out_taipei_test_preds, axis=0).squeeze()
out_taipei_test_area = test.loc[~test_greater_taipei_bool, 'building_area'].values
out_taipei_y_test = out_taipei_test_unit_price * out_taipei_test_area

In [21]:
# Merge the two regional validation predictions back into training-row order.
y_valid = np.zeros(cont_train.shape[0])
for region_mask, region_pred in (
    (train_greater_taipei_bool, in_taipei_y_valid),
    (~train_greater_taipei_bool, out_taipei_y_valid),
):
    y_valid[region_mask] = region_pred

In [22]:
# Display the assembled validation predictions (total price per training row).
y_valid

array([  705712.15497045,  2943522.62512413,  9485504.91882316, ...,
       12440902.37730765, 19240969.71950816,  8409342.63559498])

In [23]:
# Persist the validation predictions as a single-column CSV without the index.
valid_df = pd.DataFrame({"total_price": y_valid})
valid_df.to_csv("valid_prediction.csv", index=False)

In [24]:
# Merge the two regional test predictions back into test-row order.
y_test = np.zeros(test.shape[0])
for region_mask, region_pred in (
    (test_greater_taipei_bool, in_taipei_y_test),
    (~test_greater_taipei_bool, out_taipei_y_test),
):
    y_test[region_mask] = region_pred

In [25]:
# Display the assembled test predictions (total price per test row).
y_test

array([14354232.54925925,  3893668.91815371, 12840578.66261289, ...,
        1161379.37050462,  3004094.31468554,  3009364.16235374])

In [26]:
submit = pd.read_csv("../input/dataset-0510/submit_test.csv")

# Predictions are paired with ids by position; fail loudly on a length
# mismatch instead of silently truncating the file.
# NOTE(review): this assumes submit_test.csv lists buildings in the same order
# as test.csv; confirm.
if len(submit) != len(y_test):
    raise ValueError(
        "submit_test.csv has {} rows but there are {} predictions".format(len(submit), len(y_test))
    )

# Write through pandas (as for valid_prediction.csv) so ids of any dtype are
# serialized and quoted correctly.
submission = pd.DataFrame({"building_id": submit["building_id"], "total_price": y_test})
submission.to_csv("sample_submission.csv", index=False)