In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder #Encode Categorical Features
import lightgbm as lgb #Gradient Boosting Machine
import matplotlib.pyplot as plt #Visualization
import seaborn as sns #Visualization
from sklearn.model_selection import KFold #N-Fold Validation
from sklearn.metrics import mean_squared_error #Evaluation Metric
import optuna #hyperparams Tuning
import scipy
import random
import keras
from keras import layers
import tensorflow as tf

In [None]:
# Load the competition training data from a path relative to the notebook's working directory.
trainSet = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')

# Data Preprocessing

In [None]:
# Preview the first rows of the raw training frame.
trainSet.head()

In [None]:
# Plot the distribution of the target column with a KDE curve overlaid (kde=True).
sns.displot(data=trainSet, x="target", kde=True)

In [None]:
# Share of training rows whose target is below 5.
low_target_rows = trainSet.loc[trainSet["target"] < 5]
len(low_target_rows) / len(trainSet)

In [None]:
# Share of training rows whose target is above 10.
high_target_rows = trainSet.loc[trainSet["target"] > 10]
len(high_target_rows) / len(trainSet)

In [None]:
# Based on the distribution plot, drop rows whose target is < 5 or > 10 to limit outliers.
# between() is inclusive on both ends, so targets of exactly 5 or 10 are kept, as the rule says.
# copy() detaches the result from the original frame (no SettingWithCopyWarning on later
# column assignments); reset_index() restores a 0..n-1 index so frames built later from
# NumPy arrays line up row-for-row when concatenated with axis=1.
trainSet = trainSet[trainSet.target.between(5, 10)].copy().reset_index(drop=True)

In [None]:
# Integer-encode each categorical column, keeping one fitted encoder per column
# so the same mapping can be applied to other data later.
cat_feat = ["cat" + str(val) for val in range(10)]

labelEnc = [LabelEncoder() for _ in cat_feat]

for col_name, col_encoder in zip(cat_feat, labelEnc):
    trainSet[col_name] = col_encoder.fit_transform(trainSet[col_name])

In [None]:
# Pairwise correlation between all features and the target (the id column is excluded).
corr = trainSet.drop(columns=['id']).corr()

# Hide the upper triangle (including the diagonal) so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Draw the annotated heatmap on a wide figure.
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, mask=mask, cmap='BrBG', vmin=-1, vmax=1, annot=True)

The correlation matrix shows that no single feature is highly correlated with the target, so this notebook uses all of the features.

# Implement Denoising AutoEncoder

As a first attempt, I apply a denoising autoencoder (DAE) to the continuous features and label-encode the categorical features.

In [None]:
# Names of the 14 continuous feature columns: cont0 ... cont13.
cont_var = ["cont" + str(val) for val in range(14)]

In [None]:
# Autoencoder input: the continuous columns only, as a NumPy array.
feature_frame = trainSet.drop(columns=['id', 'target'])
is_continuous = feature_frame.columns.isin(cont_var)
X_input = feature_frame.loc[:, is_continuous].values

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns (one encoder per column).
# The columns are read from trainSet: X_input is a NumPy array holding only the
# continuous columns, so it cannot be indexed by column name.
oheEnc = [OneHotEncoder() for _ in cat_feat]
X_ohe = []
for col_name, col_encoder in zip(cat_feat, oheEnc):
    column = trainSet[col_name]
    if column.nunique() <= 2:
        # At most two distinct values: keep the single encoded column as it is.
        X_ohe.append(column)
    else:
        # fit_transform expects a 2-D input, hence the reshape to one column.
        X_ohe.append(col_encoder.fit_transform(column.values.reshape([-1, 1])))

In [None]:
# Turn every encoded block into a dense 2-D array. Sparse encoder outputs expose
# toarray(); the untouched binary columns are converted and reshaped to one column.
X_ohe = [
    block.toarray() if hasattr(block, "toarray") else np.asarray(block).reshape(-1, 1)
    for block in X_ohe
]

# Stack all blocks side by side in one step. Stacking is positional, so rows stay
# aligned regardless of the index carried by the original pandas columns.
X_ohe_df = np.hstack(X_ohe)

In this attempt, I use a bottleneck DAE. The noise for a cell is taken from the same feature (column) of another, randomly chosen row. For a deeper explanation, check out this great notebook: https://www.kaggle.com/springmanndaniel/1st-place-turn-your-data-into-daeta#denoising-autoencoders .

In [None]:
def add_random_noise(X, randomize_rate=0.4, row_random_rate=0.4):
    """Return a copy of ``X`` in which some cells are replaced by uniform random values.

    Rows are drawn with replacement, and for each drawn row a number of columns is
    drawn with replacement; every selected cell is overwritten with a value from
    ``random.random()`` (i.e. in [0, 1)). Because of the sampling with replacement,
    the number of distinct cells changed can be lower than the nominal amount.

    Parameters
    ----------
    X : 2-D numpy array
        Clean input. It is never modified.
    randomize_rate : float
        Nominal fraction of all cells to overwrite.
    row_random_rate : float
        Fraction of the row count used as the number of row draws.

    Returns
    -------
    numpy array
        Noisy copy with the same shape and dtype as ``X``. When
        ``int(rows * row_random_rate)`` is 0 the copy is returned unchanged.
    """
    # Work on a copy so the caller still holds the clean data as a training target.
    X_noisy = np.array(X, copy=True)
    row_size = int(X_noisy.shape[0])
    col_size = int(X_noisy.shape[1])

    randomize_size = int(row_size * col_size * randomize_rate)
    row_random_size = int(row_size * row_random_rate)

    # No row would be selected; this also avoids dividing by zero below.
    if row_random_size == 0:
        return X_noisy

    # Number of cells to overwrite in each selected row.
    col_random_size = randomize_size // row_random_size

    idx_randomize = [random.randint(0, row_size - 1) for _ in range(row_random_size)]

    for i in idx_randomize:
        col_feat = [random.randint(0, col_size - 1) for _ in range(col_random_size)]

        for k in col_feat:
            X_noisy[i, k] = random.random()

    return X_noisy

In [None]:
def add_feat_noise(X, randomize_rate=0.4, row_random_rate=0.4):
    """Return a copy of ``X`` with swap noise: cells replaced by same-column values of other rows.

    Rows are drawn with replacement, and for each drawn row a number of columns is
    drawn with replacement; every selected cell is overwritten with the value found
    in the same column of a randomly drawn row of the clean input.

    Parameters
    ----------
    X : 2-D numpy array
        Clean input. It is never modified, so it can be used as the reconstruction
        target of a denoising autoencoder.
    randomize_rate : float
        Nominal fraction of all cells to overwrite.
    row_random_rate : float
        Fraction of the row count used as the number of row draws.

    Returns
    -------
    numpy array
        Noisy copy with the same shape and dtype as ``X``. When
        ``int(rows * row_random_rate)`` is 0 the copy is returned unchanged.
    """
    clean = np.asarray(X)
    # Work on a copy: mutating X in place would also corrupt the caller's target.
    X_noisy = np.array(clean, copy=True)
    row_size = int(clean.shape[0])
    col_size = int(clean.shape[1])

    randomize_size = int(row_size * col_size * randomize_rate)
    row_random_size = int(row_size * row_random_rate)

    # No row would be selected; this also avoids dividing by zero below.
    if row_random_size == 0:
        return X_noisy

    # Number of cells to overwrite in each selected row.
    col_random_size = randomize_size // row_random_size

    idx_randomize = [random.randint(0, row_size - 1) for _ in range(row_random_size)]

    for i in idx_randomize:
        col_feat = [random.randint(0, col_size - 1) for _ in range(col_random_size)]

        for k in col_feat:
            # Donor values come from the clean data, never from already-noised cells.
            idx = random.randint(0, row_size - 1)
            X_noisy[i, k] = clean[idx, k]

    return X_noisy

In [None]:
def create_batch_set(X, batch_size):
    """Draw a random mini-batch of rows from ``X``, sampling with replacement.

    Parameters
    ----------
    X : numpy array
        Data to sample from; rows are selected along the first axis.
    batch_size : int
        Number of rows to draw.

    Returns
    -------
    numpy array
        New array holding the ``batch_size`` selected rows (integer-list indexing
        copies the data, so the result does not share memory with ``X``).
    """
    row_size = X.shape[0]
    idx = [random.randint(0, row_size - 1) for _ in range(batch_size)]

    return X[idx]

In [None]:
# Layer widths of the autoencoder
encoding_dim = 1000  # units in each hidden encoder/decoder layer
bottleneck_dim = 500  # units in the middle layer; its activations become the new features
input_shape = X_input.shape[1]  # number of columns fed to the network

# Symmetric stack: input -> 2 hidden layers -> middle layer -> 2 hidden layers -> output
input_set = keras.Input(shape=(input_shape,))
input_encode1 = layers.Dense(encoding_dim, activation='relu')(input_set)
input_encode2 = layers.Dense(encoding_dim, activation='relu')(input_encode1)
encoded = layers.Dense(bottleneck_dim, activation='relu')(input_encode2)
input_decode1 = layers.Dense(encoding_dim, activation='relu')(encoded)
input_decode2 = layers.Dense(encoding_dim, activation='relu')(input_decode1)
# NOTE(review): the output layer is ReLU, so reconstructions cannot be negative —
# confirm that the input columns are non-negative.
decoded = layers.Dense(input_shape, activation='relu')(input_decode2)

# autoencoder maps an input to its reconstruction; encoder shares the same layers
# and stops at the middle layer.
autoencoder = keras.Model(input_set, decoded)
encoder = keras.Model(input_set, encoded)

In [None]:
# Number of training steps; each "epoch" below processes one random mini-batch.
epochs = 500

# Optimizer and reconstruction loss for the autoencoder.
optimizer = keras.optimizers.Adam(learning_rate=1e-4)
loss_fn = keras.losses.MeanSquaredError()

for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    # Clean mini-batch: this is the reconstruction target.
    X_sample = create_batch_set(X_input, 1024)
    # Corrupt a copy of the batch. add_feat_noise writes into the array it is given,
    # so passing X_sample itself would make the target as noisy as the input and the
    # network would only learn to reproduce its input instead of denoising it.
    X_noise = add_feat_noise(X_sample.copy(), 0.3, 0.6)

    # Record the forward pass so gradients can be derived from it.
    with tf.GradientTape() as tape:
        # Reconstruction of the clean batch from the corrupted batch.
        logits = autoencoder(X_noise, training=True)

        # Loss between the clean batch and its reconstruction.
        loss_value = loss_fn(X_sample, logits)

    # Gradients of the loss with respect to every trainable weight.
    grads = tape.gradient(loss_value, autoencoder.trainable_weights)

    # One optimizer step.
    optimizer.apply_gradients(zip(grads, autoencoder.trainable_weights))

    print(
        "Training loss (for one batch) at step %d: %.4f"
        % (epoch, float(loss_value))
    )

In [None]:
# Encode every training row with the trained encoder (middle-layer activations).
# The resulting frame gets a default 0..n-1 index.
# NOTE(review): trainSet was row-filtered earlier; verify both frames share the same
# index before combining them column-wise.
X_encode = pd.DataFrame(encoder.predict(X_input))

In [None]:
# Name the encoded columns cont_0 ... cont_<bottleneck_dim - 1>.
cont_var = ["cont_" + str(val) for val in range(bottleneck_dim)]
X_encode.columns = cont_var

Next, I plan to tune the DAE hyperparameters in the hope of getting a better result.

# End DAE Training

In [None]:
cat_feat = [f"cat{val}" for val in range(0,10)]

# X_encode was built from a NumPy array and therefore has a 0..n-1 index, while
# trainSet may still carry the index left over from row filtering. pd.concat(axis=1)
# aligns on index labels, so every part is re-indexed positionally first; otherwise
# rows would be mismatched and padded with NaN.
assert len(X_encode) == len(trainSet), "encoded rows must match the training rows one-to-one"
trainSet = pd.concat(
    [
        trainSet.loc[:, trainSet.columns.isin(cat_feat)].reset_index(drop=True),
        X_encode.reset_index(drop=True),
        trainSet.target.reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Preview the rebuilt training frame (categorical columns, encoded columns, target).
trainSet.head()

In [None]:
# Separate the target column from the feature columns.
y = trainSet['target']
X = trainSet.drop(columns=['target'])

# Optuna Hyperparams Tuning on Light GBM Model

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a LightGBM regressor.

    Reads the notebook-level feature frame ``X`` and target series ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Average RMSE over the validation folds (lower is better).
    """
    # Search space; see the Optuna examples at https://optuna.org/#code_examples
    params = {
        'num_iterations' : trial.suggest_int('num_iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 256),
        'num_leaves': trial.suggest_int('num_leaves', 15, 256),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 25.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 25.0),
        # bagging_freq only has an effect when bagging_fraction is below 1.0,
        # so both are tuned together.
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 25),
        'random_state': 47,
        'boosting_type': 'gbdt',
        'verbose': -1,
        'device' : 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }

    # Use 5 folds cross-validation
    N_FOLDS = 5
    kf = KFold(n_splits = N_FOLDS)
    fold_scores = []

    for folds, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"folds: {folds}")
        train_data = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])

        model = lgb.train(params, train_data)
        y_pred = model.predict(X.iloc[val_idx])

        # RMSE as the square root of the MSE: computed once per fold and independent
        # of the `squared` keyword of mean_squared_error.
        fold_rmse = float(np.sqrt(mean_squared_error(y.iloc[val_idx], y_pred)))
        fold_scores.append(fold_rmse)

        print(fold_rmse)

    return sum(fold_scores) / N_FOLDS

In [None]:
import warnings
# Silence every warning for the rest of the session (this hides deprecation warnings too).
warnings.filterwarnings("ignore")

# Run the hyperparameter search: minimise the objective and stop after 5 minutes (timeout is in seconds).
study = optuna.create_study(direction='minimize')
study.optimize(objective, timeout=5*60)

In [None]:
# Keep the best trial's sampled hyperparameters for the final training and show them.
best_params = study.best_params
print(best_params)

In [None]:
# Objective value of the best trial (the value returned by objective for that trial).
study.best_value

# End Hyperparam Tuning

In [None]:
# State for the final cross-validated training run.
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS)

rmse_score = 0                                  # running mean of the fold RMSEs
lgbm_models = []                                # one trained model per fold
eval_results = [dict() for _ in range(N_FOLDS)]  # per-fold metric history

In [None]:
# Train one LightGBM model per fold with the best hyperparameters found by Optuna.

import warnings
warnings.filterwarnings("ignore")

# study.best_params only contains the sampled values, so the fixed settings used
# during tuning are re-attached here to train under the same configuration.
# 'metric' makes the recorded evaluation history use RMSE, the score reported below.
final_params = {
    **best_params,
    'random_state': 47,
    'boosting_type': 'gbdt',
    'verbose': -1,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'metric': 'rmse',
}

# Reset the accumulators so that re-running this cell does not double-count.
rmse_score = 0
lgbm_models = []

for folds, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"folds: {folds}")
    # Local names: the trainSet DataFrame is not overwritten by a Dataset object.
    train_data = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_data = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    # The record_evaluation callback stores the per-iteration metrics of both
    # datasets in eval_results[folds].
    model = lgb.train(
        final_params,
        train_data,
        valid_sets=[train_data, val_data],
        callbacks=[lgb.record_evaluation(eval_results[folds])],
    )
    lgbm_models.append(model)
    y_pred = model.predict(X.iloc[val_idx])

    # RMSE as the square root of the MSE, computed once per fold.
    fold_rmse = float(np.sqrt(mean_squared_error(y.iloc[val_idx], y_pred)))
    rmse_score += fold_rmse / N_FOLDS

    print(fold_rmse)

In [None]:
# Mean validation RMSE accumulated over the folds.
print(rmse_score)

In [None]:
# Plot the metric history recorded during training of the 5th fold's model (index 4).
lgb.plot_metric(eval_results[4])

In [None]:
# Plot the feature importances of the 5th fold's model (index 4).
lgb.plot_importance(lgbm_models[4])

# Predict the Test Set

In [None]:
# Load the test data and apply the label encoders fitted on the training data.
testSet = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

for col_name, col_encoder in zip(cat_feat, labelEnc):
    testSet[col_name] = col_encoder.transform(testSet[col_name])

In [None]:
# The models were trained on the categorical columns plus the encoder outputs
# (cont_0 ... cont_<bottleneck_dim - 1>); the raw continuous columns were never
# log-transformed during training. Build exactly the same features for the test set.
cont_var = [f"cont{val}" for val in range(14)]
test_encoded = pd.DataFrame(
    encoder.predict(testSet[cont_var].values),
    columns=[f"cont_{val}" for val in range(bottleneck_dim)],
    index=testSet.index,
)
# Replace the raw continuous columns with their encoded representation.
testSet = pd.concat([testSet.drop(columns=cont_var), test_encoded], axis=1)

In [None]:
# Keep the row identifiers for the submission and remove them from the features.
# (The name `id` shadows the Python built-in; it is kept because later cells read it.)
id = testSet['id']
testSet = testSet.drop(columns=['id'])

In [None]:
# Accumulator for the summed test predictions, one slot per test row.
y_pred = np.zeros(testSet.shape[0])

In [None]:
# Sum the test predictions of all fold models. The accumulator is reset here so
# that re-running this cell does not add the predictions a second time.
y_pred = np.zeros(len(testSet))
for model in lgbm_models:
    y_pred += model.predict(testSet)

In [None]:
# Average over the number of models whose predictions were summed; this stays
# correct even if lgbm_models does not hold exactly N_FOLDS models.
y_pred = pd.DataFrame(y_pred / len(lgbm_models))

# Create Submission File as in sample_submission.csv

In [None]:
# Put the ids and the averaged predictions side by side under the submission column names.
submission_parts = [id, y_pred]
submFile = pd.concat(submission_parts, axis=1)
submFile.columns = ['id', 'target']

In [None]:
# Write the submission file without the DataFrame index column.
submFile.to_csv('submFile.csv', index=False)