In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import copy
import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from sklearn.impute import SimpleImputer
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils


# Let wide and long frames render (up to 500 rows / columns) in cell output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Read the raw train and test tables from disk.
train, test = map(pd.read_csv, ('cat_in_dat/train.csv', 'cat_in_dat/test.csv'))

In [3]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of ``frame`` that start with ``prefix``, in frame order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Tag test rows with a sentinel target (-1) so the stacked frame can be split
# back into train and test later on.
test["target"] = -1
data = pd.concat([train, test]).reset_index(drop=True)

# Ordinal labels -> integer ranks (position in the list is the rank).
ord_1 = {
    label: rank
    for rank, label in enumerate(
        ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
    )
}

ord_2 = {
    label: rank
    for rank, label in enumerate(
        ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']
    )
}

# Labels absent from the mappings (including missing values) become NaN.
data = data.assign(
    ord_1=data['ord_1'].map(ord_1),
    ord_2=data['ord_2'].map(ord_2),
)

bin_col = _columns_with_prefix(data, 'bin_')
nom_col = _columns_with_prefix(data, 'nom_')

# Split nominal columns by cardinality; a missing value counts as one level.
nom_col_low, nom_col_high = [], []
for c in nom_col:
    bucket = nom_col_low if data[c].unique().size <= 20 else nom_col_high
    bucket.append(c)

In [4]:
%%time
# One-hot encode the binary, low-cardinality nominal, `day` and `month` columns.
#
# Missing values are first replaced by an explicit sentinel (-1 for numeric
# columns, 'missing' for object columns) so that "missing" gets its own
# indicator column.
#
# The encoded frames are collected and attached with ONE concat at the end
# instead of re-concatenating and in-place dropping on the full frame once per
# column; the resulting columns and their order are unchanged.
#
# NOTE(review): `sparse=` and `get_feature_names()` were renamed in newer
# scikit-learn releases (`sparse_output=` / `get_feature_names_out()`); confirm
# against the installed version before upgrading.
ohe_source_cols = bin_col + nom_col_low + ['day'] + ['month']
ohe_frames = []
for col in ohe_source_cols:
    fill_value = -1
    if data[col].dtype == 'O':
        fill_value = 'missing'
    si = SimpleImputer(strategy='constant', fill_value=fill_value)
    tr = preprocessing.OneHotEncoder(categories='auto', sparse=False)
    # The imputer already returns an (n_rows, 1) array, so it can be passed
    # straight to the encoder without another reshape.
    temp = tr.fit_transform(si.fit_transform(data[col].values.reshape(-1, 1)))
    columns = [col + '_' + col_names for col_names in tr.get_feature_names()]
    ohe_frames.append(pd.DataFrame(temp, columns=columns))

# Replace the raw columns with their indicator columns: remaining original
# columns first, then the encodings in the order they were produced.
data = pd.concat(
    [data.drop(ohe_source_cols, axis=1).reset_index(drop=True)] + ohe_frames,
    axis=1,
)

CPU times: user 7.05 s, sys: 2.02 s, total: 9.06 s
Wall time: 9.07 s


In [5]:
# Every column except the row id and the label is a model feature.
features = [name for name in data.columns if name not in ("id", "target")]

# Per feature: cast to string (missing -> "-1"), map the distinct strings to
# integer codes, then standardise the codes to zero mean / unit variance.
for name in features:
    as_text = data[name].fillna("-1").astype(str).values
    codes = preprocessing.LabelEncoder().fit_transform(as_text)
    data[name] = preprocessing.StandardScaler().fit_transform(codes.reshape(-1, 1))

In [6]:
# Preview the first rows of the encoded and standardised feature table.
data.head()

Unnamed: 0,id,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,target,bin_0_x0_-1.0,bin_0_x0_0.0,bin_0_x0_1.0,bin_1_x0_-1.0,bin_1_x0_0.0,bin_1_x0_1.0,bin_2_x0_-1.0,bin_2_x0_0.0,bin_2_x0_1.0,bin_3_x0_F,bin_3_x0_T,bin_3_x0_missing,bin_4_x0_N,bin_4_x0_Y,bin_4_x0_missing,nom_0_x0_Blue,nom_0_x0_Green,nom_0_x0_Red,nom_0_x0_missing,nom_1_x0_Circle,nom_1_x0_Polygon,nom_1_x0_Square,nom_1_x0_Star,nom_1_x0_Trapezoid,nom_1_x0_Triangle,nom_1_x0_missing,nom_2_x0_Axolotl,nom_2_x0_Cat,nom_2_x0_Dog,nom_2_x0_Hamster,nom_2_x0_Lion,nom_2_x0_Snake,nom_2_x0_missing,nom_3_x0_Canada,nom_3_x0_China,nom_3_x0_Costa Rica,nom_3_x0_Finland,nom_3_x0_India,nom_3_x0_Russia,nom_3_x0_missing,nom_4_x0_Bassoon,nom_4_x0_Oboe,nom_4_x0_Piano,nom_4_x0_Theremin,nom_4_x0_missing,day_x0_-1.0,day_x0_1.0,day_x0_2.0,day_x0_3.0,day_x0_4.0,day_x0_5.0,day_x0_6.0,day_x0_7.0,month_x0_-1.0,month_x0_1.0,month_x0_2.0,month_x0_3.0,month_x0_4.0,month_x0_5.0,month_x0_6.0,month_x0_7.0,month_x0_8.0,month_x0_9.0,month_x0_10.0,month_x0_11.0,month_x0_12.0
0,0,1.311572,0.61613,-0.282442,-1.642618,-1.597645,1.226857,-0.434369,0.579122,-0.918435,1.03841,-0.569633,0,-0.175243,0.369005,-0.314583,-0.175987,0.516557,-0.469454,-0.175567,0.654898,-0.608557,0.799079,-0.749562,-0.175757,0.959286,-0.90313,-0.175857,-0.721667,-0.309652,0.923445,-0.17681,-0.460679,-0.583435,-0.214282,-0.155731,2.003362,-0.614078,-0.176174,-0.583748,-0.214323,-0.460171,1.626968,-0.498759,-0.155333,-0.176509,-0.214647,-0.155866,-0.582391,-0.45927,-0.615181,2.000419,-0.176759,1.431019,-0.301501,-0.215507,-1.028728,-0.175948,-0.175794,-0.405564,-0.350332,-0.484009,-0.202636,-0.475081,2.275033,-0.410403,-0.175778,-0.308234,-0.26932,2.74025,-0.157424,-0.360126,-0.334774,-0.312772,-0.390846,-0.187813,-0.060459,-0.305435,-0.358131
1,1,-1.022687,-0.85045,-1.188238,-0.614555,1.588644,1.226857,1.606769,0.005499,-0.517297,1.424121,1.08941,0,-0.175243,-2.709989,3.178808,-0.175987,-1.935894,2.130133,-0.175567,0.654898,-0.608557,0.799079,-0.749562,-0.175757,-1.042442,1.107261,-0.175857,-0.721667,-0.309652,0.923445,-0.17681,-0.460679,-0.583435,-0.214282,6.421316,-0.499161,-0.614078,-0.176174,1.713069,-0.214323,-0.460171,-0.61464,-0.498759,-0.155333,-0.176509,-0.214647,-0.155866,-0.582391,-0.45927,-0.615181,-0.499895,5.657435,-0.698803,-0.301501,-0.215507,0.972074,-0.175948,-0.175794,-0.405564,-0.350332,-0.484009,-0.202636,-0.475081,-0.439554,2.436631,-0.175778,-0.308234,-0.26932,-0.36493,-0.157424,-0.360126,-0.334774,3.197219,-0.390846,-0.187813,-0.060459,-0.305435,-0.358131
2,2,0.764437,-0.09922,-0.237152,-0.115642,-1.640435,1.226857,-1.795128,-1.141747,1.287828,0.395558,0.295188,0,-0.175243,0.369005,-0.314583,-0.175987,-1.935894,2.130133,-0.175567,0.654898,-0.608557,0.799079,-0.749562,-0.175757,0.959286,-0.90313,-0.175857,-0.721667,-0.309652,0.923445,-0.17681,-0.460679,-0.583435,-0.214282,-0.155731,-0.499161,-0.614078,5.676203,-0.583748,-0.214323,-0.460171,1.626968,-0.498759,-0.155333,-0.176509,4.658816,-0.155866,-0.582391,-0.45927,-0.615181,-0.499895,-0.176759,1.431019,-0.301501,-0.215507,-1.028728,-0.175948,-0.175794,-0.405564,-0.350332,-0.484009,-0.202636,2.104904,-0.439554,-0.410403,-0.175778,-0.308234,-0.26932,-0.36493,-0.157424,-0.360126,-0.334774,-0.312772,-0.390846,5.324435,-0.060459,-0.305435,-0.358131
3,3,-0.288591,-1.115063,-0.825919,0.912422,1.672695,-0.984019,-1.114748,1.726368,-1.319574,-1.275857,-1.575648,0,5.706372,-2.709989,-0.314583,-0.175987,0.516557,-0.469454,-0.175567,0.654898,-0.608557,0.799079,-0.749562,-0.175757,0.959286,-0.90313,-0.175857,-0.721667,-0.309652,0.923445,-0.17681,2.170707,-0.583435,-0.214282,-0.155731,-0.499161,-0.614078,-0.176174,-0.583748,-0.214323,-0.460171,1.626968,-0.498759,-0.155333,-0.176509,-0.214647,-0.155866,-0.582391,2.177366,-0.615181,-0.499895,-0.176759,-0.698803,-0.301501,-0.215507,0.972074,-0.175948,-0.175794,-0.405564,-0.350332,2.066077,-0.202636,-0.475081,-0.439554,-0.410403,-0.175778,-0.308234,-0.26932,2.74025,-0.157424,-0.360126,-0.334774,-0.312772,-0.390846,-0.187813,-0.060459,-0.305435,-0.358131
4,4,-0.071387,-0.845965,1.151736,-1.672856,1.030853,1.226857,1.606769,-0.568124,0.084412,-1.275857,-0.675529,0,-0.175243,0.369005,-0.314583,5.682239,-1.935894,-0.469454,-0.175567,0.654898,-0.608557,-1.25144,1.334112,-0.175757,0.959286,-0.90313,-0.175857,-0.721667,-0.309652,0.923445,-0.17681,-0.460679,-0.583435,-0.214282,-0.155731,-0.499161,1.628458,-0.176174,-0.583748,-0.214323,-0.460171,1.626968,-0.498759,-0.155333,-0.176509,-0.214647,-0.155866,1.717059,-0.45927,-0.615181,-0.499895,-0.176759,-0.698803,-0.301501,-0.215507,-1.028728,5.683507,-0.175794,-0.405564,-0.350332,-0.484009,-0.202636,2.104904,-0.439554,-0.410403,-0.175778,-0.308234,-0.26932,-0.36493,-0.157424,-0.360126,-0.334774,-0.312772,-0.390846,-0.187813,-0.060459,-0.305435,2.792271


In [7]:
# Rows whose target is not the -1 sentinel came from the training file.
is_train_row = data["target"] != -1

# Feature matrices as plain NumPy arrays (id / target columns excluded).
train = data.loc[is_train_row, features].values
test = data.loc[~is_train_row, features].values

In [8]:
# Dense autoencoder: n_features -> 100 -> 20 -> 100 -> n_features.
n_features = len(features)
inputs = layers.Input(shape=(n_features,))

encoder_hidden = layers.Dense(100, activation="relu")(inputs)
# Linear 20-unit bottleneck; named so it can be looked up again below.
bottleneck = layers.Dense(20, activation="linear", name='encoded')(encoder_hidden)
decoder_hidden = layers.Dense(100, activation="relu")(bottleneck)
reconstruction = layers.Dense(n_features, activation="linear")(decoder_hidden)

model = Model(inputs=inputs, outputs=reconstruction)

# Second model over the same layers that stops at the bottleneck output.
layer_name = 'encoded'
encoder = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [9]:
%%time
from sklearn.model_selection import train_test_split

# Train the autoencoder to reconstruct its own input on all rows (train and
# test stacked), holding out 10% of them to drive the callbacks.

# `data` is rebound from a DataFrame to a NumPy matrix here; the guard keeps
# the cell re-runnable after that rebinding has already happened.
if isinstance(data, pd.DataFrame):
    data = data.loc[:, features].values
X_train, X_test = train_test_split(data, test_size=0.10, random_state=42)

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

# Monitor `val_loss` rather than `val_mean_squared_error`: the log key of the
# 'mse' metric differs between Keras versions (e.g. `val_mse`), and a key that
# does not exist makes both callbacks silently inactive. The loss IS the MSE
# here, so `val_loss` tracks the same quantity under a stable name.
es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5,
                             verbose=1, mode='min', baseline=None, restore_best_weights=True)

rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                  patience=3, min_lr=1e-6, mode='min', verbose=1)

# Input and target are the same matrix: the network learns to reproduce it.
model.fit(X_train,
          X_train,
          validation_data=(X_test, X_test),
          verbose=1,
          batch_size=1024,
          callbacks=[es, rlr],
          epochs=100
         )

# Reconstructions of the hold-out rows, inspected in the following cells.
test_preds = model.predict(X_test)

Train on 900000 samples, validate on 100000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 00045: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 00052: early stopping
CPU times: user 5min 49s, sys: 40.7 s, total: 6min 29s
Wall time: 1min 40s


In [18]:
# First hold-out row as reconstructed by the autoencoder.
test_preds[0]

array([ 0.6169328 , -0.2866975 ,  0.9114652 , -0.0864603 ,  0.3102562 ,
       -0.30266073, -0.3026658 ,  0.23149244, -0.12209244,  0.11587837,
        0.31851727, -0.17507222,  0.46805716, -0.44409624, -0.18371047,
        0.51616234, -0.48290092, -0.16576144, -1.3495262 ,  1.4631467 ,
       -1.3502947 ,  1.4105562 , -0.14125492, -1.0905625 ,  1.1601479 ,
       -0.1915737 , -0.62457615, -0.3253173 ,  0.7989827 , -0.18966049,
       -0.40072036, -0.406928  , -0.19405457, -0.17696643,  1.9928397 ,
       -0.82404816, -0.16235083,  1.0171858 , -0.18207306,  0.3013304 ,
       -0.8891585 , -0.13391063, -0.1588852 , -0.19687621, -0.26125297,
        6.380669  , -0.59423095, -0.45748317, -0.61435306, -0.45500365,
       -0.18785888, -0.760279  , -0.36884233, -0.25070715,  1.1079426 ,
       -0.20316422, -0.16385056, -0.39958584, -0.32305723, -0.35398334,
       -0.14247933, -0.3846414 , -0.47316778,  2.181378  , -0.1579098 ,
       -0.29717654, -0.28540555, -0.33983368, -0.25159714, -0.35

In [19]:
# The corresponding original hold-out row, for comparison with its reconstruction.
X_test[0]

array([ 1.73498277,  1.38081453,  1.10644605, -1.40072116, -0.6211279 ,
        1.22685677,  0.24601018, -1.14174709, -0.9184353 ,  1.03840976,
       -0.37548925, -0.1752427 ,  0.36900516, -0.31458336, -0.17598697,
        0.5165573 , -0.46945426, -0.17556677, -1.52695442,  1.64323129,
       -1.25144029,  1.33411204, -0.17575733, -1.04244188,  1.1072607 ,
       -0.17585707, -0.7216674 , -0.30965207,  0.92344506, -0.17680969,
       -0.46067943, -0.58343494, -0.21428204, -0.15573132,  2.00336246,
       -0.61407801, -0.17617411,  1.71306942, -0.21432288, -0.46017057,
       -0.61464008, -0.49875907, -0.1553334 , -0.17650872, -0.2146468 ,
        6.41576737, -0.58239102, -0.45927047, -0.61518058, -0.49989531,
       -0.17675856, -0.69880296, -0.30150082, -0.21550695,  0.97207378,
       -0.1759477 , -0.1757936 , -0.40556369, -0.35033219, -0.48400895,
       -0.2026365 , -0.47508104, -0.43955407,  2.43663088, -0.17577849,
       -0.30823433, -0.26932003, -0.36493014, -0.15742398, -0.36

In [20]:
# Hold-out reconstruction error, next to the error of an all-zeros prediction.
reconstruction_mse = metrics.mean_squared_error(X_test, test_preds)
zero_baseline_mse = metrics.mean_squared_error(X_test, np.zeros_like(X_test))

print(f"Overall MSE={reconstruction_mse}")
print(f"Random MSE={zero_baseline_mse}")

Overall MSE=0.11258384567243777
Random MSE=0.9996229299549075


In [21]:
# NOTE(review): bare literal that matches the "Overall MSE" printed by the
# previous cell; it appears to be a pasted reference value and has no effect.
0.11258384567243777

0.11258384567243777

In [22]:
# Project both feature matrices onto the bottleneck layer.
train_codes = encoder.predict(train)
test_codes = encoder.predict(test)

# Both outputs come from the same encoder, so they share one set of column names.
enc_columns = [f'enc_{i}' for i in range(train_codes.shape[1])]

encode_train = pd.DataFrame(train_codes, columns=enc_columns)
encode_test = pd.DataFrame(test_codes, columns=enc_columns)

In [23]:
# Preview the bottleneck encodings of the training rows.
encode_train.head()

Unnamed: 0,enc_0,enc_1,enc_2,enc_3,enc_4,enc_5,enc_6,enc_7,enc_8,enc_9,enc_10,enc_11,enc_12,enc_13,enc_14,enc_15,enc_16,enc_17,enc_18,enc_19
0,-3.628197,-4.214615,3.162791,-2.851017,0.733684,-2.10517,-3.654043,3.368169,8.733227,-9.304165,-6.155349,-13.96172,-1.860988,3.261696,-2.552499,-7.462784,-6.648254,1.215031,3.039558,-7.45793
1,5.473229,-12.043126,-3.110718,-7.11655,0.137809,0.137545,0.894782,-1.195416,0.025465,-9.84832,-3.185985,-2.082869,1.452042,1.212522,-2.001684,1.752465,-6.622913,-4.465862,1.056876,-2.89613
2,-4.723306,-9.212559,6.908667,-5.521707,-1.525551,-4.305672,-9.828537,-2.415355,7.607178,-13.40895,2.264282,-14.757087,1.142129,2.798655,2.902924,-7.999272,-5.391744,0.190947,-16.148094,-1.135958
3,-0.615331,-11.242261,7.183455,-5.536698,8.040099,-1.410003,-8.376881,-4.480841,1.970797,-9.971006,-12.468792,-6.622226,5.265586,-0.054815,-0.782956,-1.878253,-2.096591,1.068002,2.522064,-1.252572
4,1.041569,1.142792,6.858114,-1.561865,-4.468106,-2.49181,-5.631959,3.342344,2.752319,-14.936157,-5.771559,-10.802589,-0.177584,6.790238,4.165288,-2.998211,-1.888003,3.092746,4.096474,-8.714581


In [24]:
# Preview the bottleneck encodings of the test rows.
encode_test.head()

Unnamed: 0,enc_0,enc_1,enc_2,enc_3,enc_4,enc_5,enc_6,enc_7,enc_8,enc_9,enc_10,enc_11,enc_12,enc_13,enc_14,enc_15,enc_16,enc_17,enc_18,enc_19
0,-1.601917,-6.748988,-0.873609,-10.028065,-0.82667,-2.201148,-0.071471,0.754162,6.66808,-12.246215,4.091609,-2.580008,-2.345709,5.427414,0.998928,-1.292549,-5.212106,-5.284838,1.957668,2.370052
1,-0.278443,-2.701591,5.499518,-1.492787,1.074574,0.608068,-5.391108,4.827563,2.817517,-11.315014,3.631675,-9.750927,-0.58809,1.555272,9.241634,-4.863257,-3.699225,1.486904,7.172283,-6.245288
2,-0.124808,-6.116314,2.509565,-4.807785,-2.905211,2.361596,-0.425926,6.399082,0.469024,-12.790155,5.471367,-4.353714,-1.568642,0.223909,8.715662,-4.585989,-3.442437,0.423855,6.123113,-4.636481
3,4.536647,-7.661001,3.603499,-6.089979,-2.297252,-2.953677,-1.661912,8.35961,-0.131453,-10.906069,-0.79726,0.944871,5.838966,-0.948586,6.635248,-2.916905,-1.52934,-0.501608,7.456985,1.407279
4,0.167632,-10.014113,-4.307736,-0.614916,6.423235,4.479852,-2.872722,4.649438,8.898916,-13.349398,1.766877,-8.594969,-0.846956,4.281952,-6.167549,2.251335,-5.337626,-1.727542,-1.617362,-0.052094


In [25]:
# Persist both encoded tables as CSV, without the index column.
for frame, path in (
    (encode_train, "cat_in_dat/train_autoencoder_ohe_20.csv"),
    (encode_test, "cat_in_dat/test_autoencoder_ohe_20.csv"),
):
    frame.to_csv(path, index=False)

In [None]:
# NOTE(review): this cell has no execution count and cannot run against the
# state built by the cells above:
#   * `test` was rebound to a NumPy array (via `.values`), so `test.id` has no
#     `id` attribute to read;
#   * `test_preds` is the autoencoder's reconstruction of `X_test` (a 2-D array
#     with one column per feature), not a vector of target predictions, so it
#     cannot serve as a single `target` column;
#   * the divisor 50 is not explained anywhere in this notebook. Presumably it
#     averaged predictions over 50 models/folds in the code this was copied
#     from; confirm, or remove the cell.
# No classifier is trained in this notebook, so there is nothing to submit here.
test_preds /= 50
test_ids = test.id.values
print("Saving submission file")
submission = pd.DataFrame.from_dict({
    'id': test_ids,
    'target': test_preds
})
submission.to_csv("cat_in_dat/submission_embeddings.csv", index=False)