In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model
import tensorflow as tf
from tensorflow.keras import activations,callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from keras.models import Sequential
import keras 
from keras.layers import Dense, Flatten, Conv1D,MaxPooling1D, Dropout,BatchNormalization,Embedding,Concatenate, Activation,Input
from keras.callbacks import ModelCheckpoint
from keras.models import model_from_json
from keras import backend as K


In [None]:
# Load the competition train and test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
    )
)

<h2> Basic data preparation

<h3> Preparation for the Onehot encoding model

In [None]:
# to prepare data, I use my simple package (to clean data and Onehot encode)
!pip install git+https://github.com/Lpourchot/dfencoding.git

In [None]:
from dfencoding import utilities # Import package

In [None]:
# del dfe 

In [None]:
# Work on copies so the raw frames stay untouched by the one-hot preparation.
train_dum, test_dum = train.copy(), test.copy()

In [None]:
# The package cleans and encodes categories stored as objects, so every column
# except the first one is kept and cast to str, for both frames.
train_dum, test_dum = (
    frame.iloc[:, 1:].astype('str')
    for frame in (train_dum, test_dum)
)

In [None]:
# Encoder settings are grouped so they can be read (and tuned) in one place.
# NOTE(review): the meaning of each option is defined by the dfencoding package — check its documentation.
encoder_options = dict(missing_value='Y', cat_limit=150, dummies_limit=150)
dfe = utilities.dfencoding(train_dum, 'target', test_dum, **encoder_options)

In [None]:
# Run the package's one-hot encoding step.
# NOTE(review): presumably this fills dfe.data, which the next cell slices — confirm in the package.
dfe.get_dummies() # OneHot encoding

In [None]:
# Split the encoded table back into its train and test parts for the one-hot stream.
# The first column of dfe.data is dropped (presumably the target — confirm in the package);
# the first len(train_dum) rows are the training rows, the remaining rows the test rows.
n_train_rows = len(train_dum)
onehot_features = dfe.data.iloc[:, 1:]
X_Onehot = onehot_features.iloc[:n_train_rows]
test_Onehot = onehot_features.iloc[n_train_rows:]
for onehot_part in (X_Onehot, test_Onehot):
    print(onehot_part.shape)

<h3> Other preparation for the Embedding and Conv1D models

In [None]:
# Preparation of the inputs shared by the Embedding and Conv1D streams (no label encoding).
# Constant added to every category code; intended to avoid negative values,
# which an Embedding layer cannot look up.
CATEGORY_SHIFT = 8

target = pd.get_dummies(train['target'])   # one-hot targets used for fitting
y = train['target']                        # raw labels, used for stratification and the global score
X = train.iloc[:, 1:-1] + CATEGORY_SHIFT   # drop the first column and the trailing 'target' column

# `test` is rebound under the same name, so this step is guarded: it only runs while
# `test` still carries its extra leading column. Re-running the cell is then a no-op
# instead of dropping one more column and shifting the values a second time.
if test.shape[1] == X.shape[1] + 1:
    test = test.iloc[:, 1:] + CATEGORY_SHIFT
assert test.shape[1] == X.shape[1], "test and X must have the same number of feature columns"

X.shape, test.shape, y.shape, target.shape

In [None]:
# Callbacks, metric and loss shared by every fold of the training loop.

# Stop training once the validation cross-entropy has not improved for 6 epochs,
# and roll the model back to its best weights.
es = callbacks.EarlyStopping(
                monitor = 'val_categorical_crossentropy', 
                min_delta = 0.0000001, 
                patience = 6,
                mode = 'min',
                baseline = None, 
                restore_best_weights = True,
                verbose = 1)

# Halve the learning rate after 3 epochs without improvement (never below min_lr).
# Fix: the keyword was misspelled `min_delt`, so the intended threshold never
# reached the callback's `min_delta` parameter.
plateau  = callbacks.ReduceLROnPlateau(
                monitor = 'val_categorical_crossentropy',
                factor = 0.5, 
                patience = 3, 
                mode = 'min', 
                min_delta = 0.0000001,
                cooldown = 0, 
                min_lr = 1e-7,
                verbose = 1) 

# The metric's reported name ('categorical_crossentropy', prefixed with 'val_' on the
# validation data) is what both callbacks above monitor.
metrics = [tf.keras.metrics.CategoricalCrossentropy()]
loss = tf.keras.losses.CategoricalCrossentropy(
                from_logits=False,   # the model ends with a softmax, so it outputs probabilities
                label_smoothing=0,
                reduction="auto",
                name="categorical_crossentropy")


<h2> K-fold training of the 3-stream functional API model: Embedding + Conv1D + one-hot dense

In [None]:
N_FOLDS = 20
SEED = 2021

# Layer sizes are derived from the prepared frames instead of being hard-coded,
# so the model keeps matching its inputs if the encoding yields a different width.
N_FEATURES = X.shape[1]        # width of the Embedding / Conv1D inputs
N_ONEHOT = X_Onehot.shape[1]   # width of the one-hot input
N_CLASSES = target.shape[1]    # number of one-hot target columns

# The fold indices computed on X are reused positionally on X_Onehot.
assert len(X_Onehot) == len(X), "X_Onehot and X must have the same number of rows"


def build_merged_model(n_features, n_onehot, n_classes):
    """Build the three-stream model (Conv1D + Embedding + one-hot dense).

    Each stream ends in a ``Dense(n_classes, relu)`` layer; the three outputs are
    concatenated and fed to a softmax layer named 'out'.

    Parameters
    ----------
    n_features : int
        Number of columns fed to 'API_input_Embedding' and 'API_input_Conv1D'.
    n_onehot : int
        Number of columns fed to 'API_input_Onehot'.
    n_classes : int
        Number of output classes.

    Returns
    -------
    An uncompiled Model whose inputs are ordered [Conv1D, Embedding, Onehot].
    """
    # API functional for OneHot
    inputs_API_Onehot = Input(shape=(n_onehot,), name = 'API_input_Onehot')
    w = Dense(n_onehot, activation="relu")(inputs_API_Onehot)
    w = Dropout(0.3)(w)
    w = Dense(80, activation="relu")(w)
    w = Dropout(0.3)(w)
    w = Dense(20, activation="relu")(w)
    outputs_API_Onehot = Dense(n_classes, activation="relu")(w)

    # API functional for Embedding
    # The embedding table has 80 rows, so inputs must be integer codes in [0, 80).
    # NOTE(review): confirm that the shifted category codes stay below 80.
    inputs_API_Embedding = Input(shape=(n_features,), name = 'API_input_Embedding')
    x = Embedding(80, 10, input_length=n_features)(inputs_API_Embedding)
    x = Flatten()(x)
    x = Dense(80, activation="relu")(x)
    x = Dense(20, activation='relu')(x)
    outputs_API_Embedding = Dense(n_classes, activation='relu')(x)

    # API functional for Conv1D
    inputs_API_Conv1D = Input(shape=(n_features,1), name = 'API_input_Conv1D')
    v = Conv1D(
            filters=512, #256
            kernel_size=5, #4
            padding='same',
            activation='relu',
            )(inputs_API_Conv1D)
    v = MaxPooling1D(pool_size=3)(v)
    v = Flatten()(v)
    v = Dense(80, activation='relu')(v)
    v = Dense(20, activation='relu')(v)
    outputs_API_Conv1D = Dense(n_classes, activation='relu')(v)

    # Final step: concatenation of the Conv1D, Embedding and Onehot streams
    z = Concatenate(axis=1)([outputs_API_Conv1D, outputs_API_Embedding,outputs_API_Onehot])
    out = Dense(n_classes, activation = 'softmax', name = 'out')(z)

    return Model(
                 inputs=[inputs_API_Conv1D,inputs_API_Embedding,inputs_API_Onehot],
                 outputs=out,
                 name="model_merged")


oof = np.zeros((X.shape[0], N_CLASSES))      # out-of-fold predictions on the training rows
pred = np.zeros((test.shape[0], N_CLASSES))  # test predictions averaged over the folds
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (tr_idx, ts_idx) in enumerate(skf.split(X, y)):
    print(f"===== FOLD {fold} =====")

    # A new model is built for every fold; clearing the Keras global state first
    # keeps memory from growing with each rebuilt model.
    K.clear_session()

    x_tr = X.iloc[tr_idx] # X_train
    x_Onehot_tr = X_Onehot.iloc[tr_idx] # X for Onehot encoding
    y_tr = target.iloc[tr_idx] # y_train
    x_ts = X.iloc[ts_idx] # X_valid
    x_Onehot_ts = X_Onehot.iloc[ts_idx] # X_valid for Onehot encoding
    y_ts = target.iloc[ts_idx] # y_valid

    # Creation of the merged model :
    model_merged = build_merged_model(N_FEATURES, N_ONEHOT, N_CLASSES)

    # Compile and fit of the merged model :
    model_merged.compile(tf.keras.optimizers.Adam(learning_rate=0.0001),loss=loss ,metrics=metrics)
    model_merged.fit(
                    {'API_input_Conv1D':x_tr, 'API_input_Embedding':x_tr,'API_input_Onehot':x_Onehot_tr},
                    {'out':y_tr},
                    validation_data = ([x_ts,x_ts,x_Onehot_ts], y_ts),
                    batch_size=256,
                    epochs=50,
                    verbose=1,
                    callbacks=[es,plateau])

    # Out-of-fold predictions and score for this fold
    oof[ts_idx] = model_merged.predict([x_ts,x_ts,x_Onehot_ts])
    score = log_loss(y_ts, oof[ts_idx])
    print(f"FOLD {fold} Score {score}\n")

    # Each fold contributes 1/N_FOLDS of the final test prediction
    pred += model_merged.predict([test,test,test_Onehot]) / N_FOLDS

# Global score over all out-of-fold predictions
score = log_loss(y, oof)
print(f"Score total {score}\n")   

In [None]:
# Load the sample submission; its 'id' column is reused when the final submission file is built.
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
# Assemble the submission: the averaged fold predictions, one column per class,
# preceded by the ids taken from the sample submission.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
submission_df = pd.DataFrame(pred, columns=class_columns)
submission_df.insert(0, 'id', submission['id'])
submission_df.to_csv("submission_Keras_3.csv", index=False)
display(submission_df.head())