In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import copy
import os
import gc
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils


# Raise pandas' display limits so that wide/long frames are rendered
# without truncation (up to 500 rows and 500 columns).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [2]:
def auc(y_true, y_pred):
    """ROC AUC metric usable in ``model.compile(metrics=[...])``.

    The score is computed by scikit-learn's ``roc_auc_score`` and wrapped
    in ``tf.py_function`` so it can be evaluated as part of the Keras
    metric pipeline.

    Args:
        y_true: ground-truth labels for the current batch.
        y_pred: model outputs for the current batch.

    Returns:
        A ``tf.double`` tensor holding the AUC, or 0.5 when scikit-learn
        cannot compute it for this batch.
    """
    def fallback_auc(y_true, y_pred):
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except ValueError:
            # roc_auc_score signals an undefined AUC (e.g. a batch that
            # contains a single class) with ValueError; report the chance
            # level instead of aborting training.  Only this error is
            # swallowed so that KeyboardInterrupt and genuine bugs still
            # surface instead of silently becoming 0.5.
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [3]:
def create_model(data, catcols):
    """Build an entity-embedding classifier plus a feature-extractor view of it.

    Every column in ``catcols`` gets its own single-value input and its own
    embedding layer (named after the column).  The embedding width is half
    the column's number of distinct values, rounded up and capped at 50.
    The embeddings are concatenated and passed through two
    Dense -> Dropout -> BatchNormalization stages (300 and 50 units) before
    a 2-unit softmax output.

    Args:
        data: column-indexable table (used as ``data[col].nunique()``) from
            which the cardinality of each categorical column is read.
        catcols: iterable of column names; also used as embedding layer names.

    Returns:
        ``(model, intermediate_layer_model)`` where ``model`` is the full
        classifier and ``intermediate_layer_model`` shares its inputs but
        outputs the activations of the 50-unit ``output_dense`` layer.
    """
    embedding_inputs = []
    embedding_outputs = []
    for col in catcols:
        cardinality = int(data[col].nunique())
        # Half the cardinality (rounded up), never wider than 50.
        width = int(min(np.ceil(cardinality / 2), 50))

        col_input = layers.Input(shape=(1,))
        embedded = layers.Embedding(cardinality + 1, width, name=col)(col_input)
        embedded = layers.SpatialDropout1D(0.3)(embedded)
        # Drop the length-1 sequence axis: (1, width) -> (width,).
        embedded = layers.Reshape(target_shape=(width, ))(embedded)

        embedding_inputs.append(col_input)
        embedding_outputs.append(embedded)

    hidden = layers.Concatenate()(embedding_outputs)
    hidden = layers.BatchNormalization()(hidden)

    # Two identical Dense -> Dropout -> BatchNorm stages; only the second
    # Dense is named because it is looked up again below.
    for units, dense_name in ((300, None), (50, 'output_dense')):
        hidden = layers.Dense(units, activation="relu", name=dense_name)(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    class_probabilities = layers.Dense(2, activation="softmax")(hidden)
    model = Model(inputs=embedding_inputs, outputs=class_probabilities)

    # Second model over the same graph, stopping at the named dense layer.
    bottleneck_output = model.get_layer('output_dense').output
    intermediate_layer_model = Model(inputs=model.input,
                                     outputs=bottleneck_output)
    return model, intermediate_layer_model

In [4]:
# Read the raw train and test splits from the local data directory.
train_csv = 'cat_in_dat/train.csv'
test_csv = 'cat_in_dat/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [5]:
# Tag test rows with a sentinel target so both splits can be stacked,
# encoded with one shared vocabulary, and separated again afterwards.
test["target"] = -1
data = pd.concat([train, test], ignore_index=True)

# Every column except the row id and the label is treated as categorical.
features = [col for col in train.columns if col not in ("id", "target")]

for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    # Missing values become their own category ("-1"); everything is
    # compared as text before being mapped to integer codes.
    values_as_text = data[feat].fillna("-1").astype(str).values
    data[feat] = lbl_enc.fit_transform(values_as_text)

In [6]:
# Undo the stacking: rows carrying the -1 sentinel target are the test split.
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

# Materialise each feature matrix once instead of re-building it for every
# column inside the comprehension.
train_matrix = train.loc[:, features].values
test_matrix = test.loc[:, features].values

# The model takes one input per categorical column, so hand Keras a list
# with one 1-D array per feature.
train_data = [train_matrix[:, k] for k in range(train_matrix.shape[1])]
test_data = [test_matrix[:, k] for k in range(test_matrix.shape[1])]

In [7]:
%%time
oof_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, train.target.values,
                                                      test_size=0.10, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
model, intermediate_layer_model = create_model(data, features)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])
X_train = [X_train.loc[:, features].values[:, k] for k in range(X_train.loc[:, features].values.shape[1])]
X_test = [X_test.loc[:, features].values[:, k] for k in range(X_test.loc[:, features].values.shape[1])]

es = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5,
                             verbose=1, mode='max', baseline=None, restore_best_weights=True)

rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5,
                                  patience=3, min_lr=1e-6, mode='max', verbose=1)

model.fit(X_train,
          utils.to_categorical(y_train),
          validation_data=(X_test, utils.to_categorical(y_test)),
          verbose=1,
          batch_size=1024,
          callbacks=[es, rlr],
          epochs=100
         )
valid_fold_preds = model.predict(X_test)[:, 1]

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 540000 samples, validate on 60000 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 00007: early stopping
CPU times: user 4min 56s, sys: 16.5 s, total: 5min 12s
Wall time: 53.4 s


In [8]:
# ROC AUC of the trained model on the held-out 10% split.
holdout_auc = metrics.roc_auc_score(y_test, valid_fold_preds)
print(f"Overall AUC={holdout_auc}")

Overall AUC=0.7882208880624542


In [10]:
# Run every train/test row through the truncated model to obtain the
# activations of the `output_dense` layer, then wrap them in DataFrames
# with columns named emb_0, emb_1, ...
train_activations = intermediate_layer_model.predict(train_data)
test_activations = intermediate_layer_model.predict(test_data)

train_emb_columns = [f'emb_{i}' for i in range(train_activations.shape[1])]
test_emb_columns = [f'emb_{i}' for i in range(test_activations.shape[1])]

embedding_train = pd.DataFrame(train_activations, columns=train_emb_columns)
embedding_test = pd.DataFrame(test_activations, columns=test_emb_columns)

In [11]:
# Preview the first five rows of the training activations.
embedding_train.head(n=5)

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,emb_35,emb_36,emb_37,emb_38,emb_39,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,1.533888,0.0,0.0,0.0,1.824041,2.29249,0.0,1.941234,1.827931,2.054269,1.661875,0.391904,0.0,3.224514,1.603753,3.088054,2.118457,0.321351,1.132453,1.017076,2.976085,2.653766,0.376504,0.0,1.789676,3.040848,0.0,1.772582,2.419187,2.085474,0.0,0.202399,0.0,1.583567,1.69706,0.0,2.751202,0.0,1.831249,1.928904,1.950607,3.830657,2.242719,2.62659,1.217229,1.435701,0.0,0.0,0.0,1.834835
1,0.051431,0.0,0.0,0.600885,1.301777,1.299913,0.0,0.0,0.0,1.468437,0.547771,0.312431,0.0,2.463684,0.0,1.580257,0.214438,0.0,0.0,1.904642,0.617467,2.539844,0.0,0.0,0.819365,1.553037,0.0,0.406872,1.184847,1.738591,0.0,0.0,0.0,1.274886,2.028154,0.0,0.832975,0.0,1.333206,0.504538,2.028522,1.314011,1.560901,1.550431,2.855623,1.496621,0.741084,0.0,0.0,0.535495
2,0.0,2.581199,1.592893,1.071668,0.578497,0.0,1.534026,0.0,0.0,0.0,0.731763,0.0284,2.285823,0.0,0.0,0.0,0.0,0.392852,0.0,0.0,0.0,0.0,0.0,3.057581,0.0,0.0,1.955781,0.0,0.0,0.0,0.658292,1.113949,0.0,0.0,0.0,0.0,0.0,0.654163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.148353,0.0,1.192848,0.0
3,2.537157,0.0,0.0,0.0,1.770057,3.817923,0.0,0.974968,1.911693,1.970397,2.841591,3.249789,0.0,1.545645,2.517019,0.982935,1.59369,0.0,2.768407,2.272049,0.448413,2.980695,1.70362,0.0,2.612808,2.29779,0.0,2.504112,2.75888,3.474554,1.720196,0.0,0.0,2.11504,3.302052,0.0,1.711625,0.266314,3.316986,0.756862,3.574126,4.357124,1.711538,3.592577,3.671941,2.985166,0.0,0.0,0.0,2.112608
4,0.942372,0.0,0.0,0.387796,1.164678,0.0,0.0,1.084644,1.750746,0.0,0.915064,0.0,0.0,0.0703,0.354098,0.458158,0.0,0.0,0.161678,0.0,0.0,0.0,2.11275,0.0,0.0,0.125145,0.0,1.352719,1.137936,0.744787,1.028595,0.841407,0.059518,0.830795,1.186101,0.0,0.383482,0.0,0.054586,0.14565,0.0,0.0,0.623416,0.784463,0.302571,1.67404,0.0,0.0,0.0,1.391667


In [12]:
# Preview the first five rows of the test activations.
embedding_test.head(n=5)

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,emb_10,emb_11,emb_12,emb_13,emb_14,emb_15,emb_16,emb_17,emb_18,emb_19,emb_20,emb_21,emb_22,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32,emb_33,emb_34,emb_35,emb_36,emb_37,emb_38,emb_39,emb_40,emb_41,emb_42,emb_43,emb_44,emb_45,emb_46,emb_47,emb_48,emb_49
0,1.046769,0.728432,0.0,0.509048,1.229386,1.039716,0.0,0.982355,0.028051,1.396201,1.046623,1.528544,0.453556,0.801414,0.419727,0.0,0.0,0.113246,0.265247,0.977642,0.623905,0.0,0.0,0.0,0.0,0.0,0.0,0.616754,1.512125,0.450576,0.652933,0.0,0.0,0.0,0.64132,0.0,1.341419,0.0,2.618965,0.0,0.0,0.198591,0.036226,0.0,0.0,0.9668,0.752,0.0,0.0,0.102758
1,0.0,0.327545,0.75083,0.0,0.0,0.0,0.06203,1.222194,0.0,1.017997,0.0,0.0,0.0,0.0,0.0,0.021146,0.0,0.0,0.0,0.0,0.183902,0.0,1.154608,0.203736,0.0,0.0,0.241218,0.0,0.0,0.0,0.433447,1.970543,1.481383,0.0,0.0,0.987869,0.0,0.496116,0.0,0.958832,0.0,0.0,0.050639,0.0,0.493869,0.0,2.040608,0.442298,0.0,0.0
2,1.328279,0.546664,0.0,0.0,0.0,0.0,0.159536,0.0,0.283018,0.595212,0.0,0.512192,0.313514,0.0,0.0,0.0,0.0,0.0,0.093655,0.0,1.868047,0.0,0.775932,0.615454,0.0,0.346407,0.467912,0.0,0.0,0.0,0.50064,0.0,0.0,1.158364,0.010715,0.0,0.0,0.0,1.208182,0.331149,0.0,0.0,0.0,0.0,0.0,0.0,0.661792,0.0,0.0,0.0
3,0.043035,0.0,0.0,1.4231,0.0,0.0,0.0,0.928615,0.766797,0.0,0.889139,0.0,0.0,0.537549,2.038993,2.902122,0.987285,0.0,0.407033,0.522625,3.825804,0.0,0.9368,0.0,0.0,0.0,0.0,0.57235,0.0,0.806817,0.74418,0.0,0.0,1.539574,0.0,0.0,2.921119,0.595921,1.084843,0.0,0.0,0.0,0.0,0.46258,0.312929,0.981194,0.226139,0.0,0.0,0.0
4,0.0,0.0,0.0,0.038074,0.0,0.415552,0.0,0.0,1.955534,0.30417,0.055279,0.036174,0.579166,1.402426,0.328859,0.0,0.044824,0.0,0.38315,0.225904,0.88763,0.0,0.0,0.0,0.0,0.626964,0.0,0.0,0.741929,0.0,1.211724,0.0,0.0,1.208405,0.0,0.113313,0.0,0.0,0.335711,1.301381,0.0,0.0,0.0,0.0,0.082485,0.0,0.0,0.0,0.441095,1.339985


In [13]:
# Sanity check: one row per training sample, one column per unit of the
# 50-unit `output_dense` layer.
embedding_train.shape

(600000, 50)

In [14]:
# Sanity check: one row per test sample, one column per unit of the
# 50-unit `output_dense` layer.
embedding_test.shape

(400000, 50)

In [16]:
# Persist both activation tables as CSV files, without the index column.
embedding_outputs = (
    (embedding_train, "cat_in_dat/train_embeddings_50.csv"),
    (embedding_test, "cat_in_dat/test_embeddings_50.csv"),
)
for embedding_frame, csv_path in embedding_outputs:
    embedding_frame.to_csv(csv_path, index=False)

In [None]:
# Build the submission from the trained model's test predictions.
# `test_preds` was only ever initialised to zeros, and the former
# `test_preds /= 50` averaged over a fold loop that no longer exists, so
# the file would have contained all-zero targets.  Predict here instead;
# column 1 of the softmax output is the positive-class probability.
test_preds = model.predict(test_data)[:, 1]
test_ids = test.id.values
print("Saving submission file")
submission = pd.DataFrame.from_dict({
    'id': test_ids,
    'target': test_preds
})
submission.to_csv("cat_in_dat/submission_embeddings.csv", index=False)