In [None]:
import gc
import numpy as np
import os
import pandas as pd
import warnings
from sklearn.exceptions import ConvergenceWarning

pd.set_option('display.max_columns', None)
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Utils
import lightgbm as lgb
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from statsmodels.graphics.gofplots import qqplot

# Normalization
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Autoencoder
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.metrics as T
import tensorflow.keras.models as M
import tensorflow.keras.optimizers as O
import tensorflow.keras.regularizers as R
import tensorflow.keras.initializers as I
import tensorflow.keras.callbacks as C
import tensorflow.keras.losses as S

# Model
from imblearn.over_sampling import RandomOverSampler as ROS
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# Utils

In [None]:
def plot_data_stats(X):
    """Plot per-feature summary statistics of a 2-D dataset.

    Three stacked panels are drawn:

    1. distribution of the per-feature means,
    2. distribution of the per-feature standard deviations,
    3. percentage of rows in each feature whose absolute z-score exceeds 3,
       with a dashed line at the average percentage across features.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data; it must support ``.shape``, ``.mean(axis=0)`` and
        ``.std(axis=0)`` (NumPy array or pandas DataFrame).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(3, 1, figsize=(12, 12))

    # One histogram bin per feature.
    bins = X.shape[1]

    # `sns.distplot` is deprecated; `histplot` with a density scale and a KDE
    # overlay is its documented replacement.
    sns.histplot(X.mean(axis=0), bins=bins, color='blue', kde=True,
                 stat='density', ax=ax[0])
    sns.histplot(X.std(axis=0), bins=bins, color='red', kde=True,
                 stat='density', ax=ax[1])

    # Flag values lying more than three standard deviations from their
    # column mean, then express the count per column as a percentage of rows.
    outlier = (np.abs(stats.zscore(X)) > 3) * 1
    outlier_col_count = np.sum(outlier, axis=0)
    X_total = X.shape[0]
    outlier_col_perc = outlier_col_count/X_total*100

    pd.DataFrame(outlier_col_perc).plot.bar(color='purple', ax=ax[2], legend=None)
    # There is one bar per feature, so tick labels would be unreadable.
    ax[2].axes.get_xaxis().set_ticklabels([])
    ax[2].axhline(y=outlier_col_perc.mean(), color='orange', ls='--', lw=2.0)

    ax[0].set_title('Mean')
    ax[1].set_title('Standard deviation')
    # The panel shows the share of outliers, not raw z-scores.
    ax[2].set_title('Outliers per feature (% of rows with |z-score| > 3)')

    plt.show()

In [None]:
def train_eval_model(Xtrain, ytrain, Xvalid, yvalid, 
                     parameters, verbose=True):
    """Fit a LightGBM classifier and report its validation ROC AUC.

    Parameters
    ----------
    Xtrain, ytrain : array-like
        Training features and binary labels.
    Xvalid, yvalid : array-like
        Validation features and labels; also used as the LightGBM eval set.
    parameters : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    verbose : bool, default True
        When true, print the score and a classification report and plot the
        training metric next to the train/validation ROC curves.

    Returns
    -------
    float
        ROC AUC on the validation set, computed from the predicted
        probability of the positive class.
    """
    model = lgb.LGBMClassifier(**parameters)
    log_period = 200 if verbose else 1000
    model.fit(
        Xtrain, ytrain,
        eval_set=[(Xvalid, yvalid)],
        eval_metric='auc',
        # The `verbose=` fit argument is deprecated; the logging callback is
        # its documented replacement.
        callbacks=[lgb.log_evaluation(period=log_period)])

    # ROC AUC is a ranking metric, so it needs scores. `predict` returns hard
    # 0/1 labels, which collapses the curve to a single operating point.
    valid_proba = model.predict_proba(Xvalid)[:, 1]
    score = roc_auc_score(yvalid, valid_proba)

    if verbose:
        print(f'\n\nScore = {score:.5f}', end='\n\n')
        # The classification report needs labels: threshold at 0.5.
        print(classification_report(yvalid, (valid_proba > 0.5).astype(int)))

        fig, ax = plt.subplots(1, 2, figsize=(15, 6))
        lgb.plot_metric(model, ax=ax[0])

        # Train curve (solid blue) against validation curve (dashed orange).
        train_proba = model.predict_proba(Xtrain)[:, 1]
        fpr, tpr, _ = roc_curve(ytrain, train_proba)
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax[1], color='blue')

        fpr, tpr, _ = roc_curve(yvalid, valid_proba)
        RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=ax[1], color='orange', linestyle="--")

        plt.show()
    return score

In [None]:
# Colour list taken from matplotlib's active property cycle; the plotting
# helpers in this notebook index into it by position.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
def plot_loss(history, label='', n=0):
    """Draw train and validation loss curves on the current axes.

    ``history`` is expected to expose ``epoch`` and a ``history`` dict with
    ``'loss'`` and ``'val_loss'`` entries. ``label`` is appended to the legend
    text and ``n`` selects the colour from the module-level ``colors`` list.
    The y-axis is logarithmic so that a wide range of loss values stays
    readable.
    """
    series_color = colors[n]
    epoch_axis = history.epoch
    curves = (
        ('loss', 'Train ' + label, {}),
        ('val_loss', 'Val ' + label, {'linestyle': "--"}),
    )
    for key, legend_text, extra_style in curves:
        plt.semilogy(epoch_axis, history.history[key],
                     color=series_color, label=legend_text, **extra_style)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

In [None]:
def summarize_diagnostics(history):
    """Plot the loss and accuracy curves recorded in a training history.

    Parameters
    ----------
    history : object with a ``history`` dict (e.g. the value returned by
        Keras ``Model.fit``). The keys ``'loss'``, ``'val_loss'``,
        ``'accuracy'`` and ``'val_accuracy'`` are plotted when present.

    Notes
    -----
    Missing series are skipped explicitly rather than hidden behind a bare
    ``except``, and a panel with nothing to show is hidden. A history that
    only recorded ``'loss'`` therefore yields a single, clean loss plot.
    """
    recorded = history.history
    panels = (
        ('Cross Entropy Loss', 'loss', 'val_loss'),
        ('Classification Accuracy', 'accuracy', 'val_accuracy'),
    )

    # Create the two stacked axes directly, so no unused background axes is
    # left underneath them.
    fig, axes = plt.subplots(len(panels), 1, figsize=(15, 8))

    for axis, (title, train_key, valid_key) in zip(axes, panels):
        axis.set_title(title)
        if train_key in recorded:
            axis.plot(recorded[train_key], color='blue', label='train')
        if valid_key in recorded:
            axis.plot(recorded[valid_key], color='orange', label='test')

        if axis.has_data():
            axis.legend()
        else:
            # Nothing was recorded for this panel.
            axis.set_visible(False)

    fig.tight_layout()

In [None]:
def plot_metrics(history):
    """Plot train/validation curves for loss, PRC, precision and recall.

    Parameters
    ----------
    history : object exposing ``epoch`` and a ``history`` dict that contains
        each metric in ``['loss', 'prc', 'precision', 'recall']`` together
        with its ``'val_'``-prefixed counterpart. A missing key raises
        ``KeyError``.

    One panel per metric is drawn in a 2x2 grid; training curves are solid and
    validation curves dashed, coloured from the module-level ``colors`` list.
    """
    metrics = ['loss', 'prc', 'precision', 'recall']

    # Build the 2x2 grid directly, so the panels are not stacked on top of an
    # unused 1x1 axes.
    fig, axes = plt.subplots(2, 2, figsize=(15, 8))

    for n, (metric, axis) in enumerate(zip(metrics, axes.ravel())):
        name = metric.replace("_"," ").capitalize()
        axis.plot(history.epoch, history.history[metric], color=colors[n], label='Train')
        axis.plot(history.epoch, history.history['val_'+metric], color=colors[n], linestyle="--", label='Val')
        axis.set_xlabel('Epoch')
        axis.set_ylabel(name)
        if metric == 'loss':
            # Anchor the loss axis at zero and keep the automatic upper bound.
            axis.set_ylim([0, axis.get_ylim()[1]])
        elif metric == 'auc':
            # Only reached if 'auc' is added to `metrics` above.
            axis.set_ylim([0.8,1])
        else:
            axis.set_ylim([0,1])
        axis.legend()

    fig.tight_layout()

In [None]:
class ResidualBlock(tf.keras.Model):
    """Two Dense -> BatchNorm -> Dropout stages wrapped by a skip connection.

    The block output is ``relu(f(x) + x)``, where ``f`` is the two-stage
    stack. Because the input is added element-wise to the stack output, the
    last dimension of the input must equal ``filters``.

    Parameters
    ----------
    filters : int
        Number of units in each Dense layer.
    dropout : float
        Rate passed to both Dropout layers.
    """

    def __init__(self, filters, dropout):
        super(ResidualBlock, self).__init__(name='')
        # Layer 0
        self.dense_0 = L.Dense(units=filters, activation='relu')
        self.batch_0 = L.BatchNormalization()
        self.dropt_0 = L.Dropout(dropout)
        # Layer 1
        self.dense_1 = L.Dense(units=filters, activation='relu')
        self.batch_1 = L.BatchNormalization()
        self.dropt_1 = L.Dropout(dropout)
        # Merge layers are built once here, not on every forward pass, so they
        # are tracked as sub-layers of the model.
        self.merge_add = L.Add()
        self.merge_act = L.Activation('relu')

    def call(self, input_tensor, training=False):
        """Run the block; ``training`` toggles BatchNorm/Dropout behaviour."""
        x_skip = input_tensor
        # Layer 0
        x = self.dense_0(input_tensor)
        x = self.batch_0(x, training=training)
        x = self.dropt_0(x, training=training)
        # Layer 1
        x = self.dense_1(x)
        x = self.batch_1(x, training=training)
        x = self.dropt_1(x, training=training)
        # Add the residue, then apply the final non-linearity.
        x = self.merge_add([x, x_skip])
        x = self.merge_act(x)
        return x

# Stop

In [None]:
# Manual stop switch. As written, the assertion always passes and execution
# continues. Edit the condition to `True == False` to make this cell raise
# AssertionError (the original note says this is "for quick saves").
assert True == True

# Data loading

In [None]:
# Collect every file under the Kaggle input directory. The list is sorted
# because `os.walk` yields entries in arbitrary order, which would otherwise
# make "the first match" depend on the file system.
fnames = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        fnames.append(os.path.join(dirname, filename))
fnames.sort()


def _find_input_file(paths, keyword):
    """Return the first path in ``paths`` whose file name contains ``keyword``.

    Only the base name is inspected, so a directory that happens to contain
    the keyword cannot cause an unrelated file to be picked. Raises
    ``FileNotFoundError`` when nothing matches.
    """
    for path in paths:
        if keyword in os.path.basename(path):
            return path
    raise FileNotFoundError(
        f"No input file with '{keyword}' in its name was found under /kaggle/input")


# Training data: drop the identifier, then split features from the label.
train = pd\
    .read_csv(_find_input_file(fnames, 'train'))\
    .drop('ID_code', axis=1)
X, y = train.drop(columns=['target']), train['target']

# Test data: keep the identifiers aside for the submission file.
test = pd\
    .read_csv(_find_input_file(fnames, 'test'))
test_id = test.ID_code
test = test.drop('ID_code', axis=1)

### Base estimator

In [None]:
# Hold out 20% of the raw rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline LightGBM settings; this dict is reused by later cells until it is
# redefined.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'n_estimators': 400,
    'early_stopping_round': 10,
    'learning_rate': 0.1,
}

# Baseline score on the untransformed features.
train_eval_model(X_train, y_train, X_valid, y_valid, params)

# Data analysis

In [None]:
# Print the DataFrame.info() summary of the training frame.
train.info()

### Normally distributed

In [None]:
describe = train.describe()
columns_ordered_by_scale = describe.transpose().sort_values(by=['max']).index

# Only columns whose name contains 'var_' are plotted (this excludes `target`).
feature_columns = [
    column for column in columns_ordered_by_scale
    if 'var_' in column
]

cols = 8
# Size the grid from the number of plotted columns using ceiling division.
# Deriving it from `train.shape[1]` would count `target` and round down,
# which only fits by coincidence.
rows = max(1, -(-len(feature_columns)//cols))

fig, ax = plt.subplots(rows, cols, figsize=(4*cols, 3*rows), squeeze=False)

for idx, col in enumerate(feature_columns):
    axis = ax[idx//cols, idx%cols]
    # Blue: train rows with target == 0.
    sns.kdeplot(
        data=train.loc[train.target == 0][[col]],
        legend=False, ax=axis,
        palette=sns.color_palette('Blues', 1)
    )
    # Green: train rows with target == 1.
    sns.kdeplot(
        data=train.loc[train.target == 1][[col]],
        legend=False, ax=axis,
        palette=sns.color_palette('Greens', 1)
    )
    # Orange: the test set.
    sns.kdeplot(
        data=test[[col]],
        legend=False, ax=axis,
        palette=sns.color_palette('Oranges', 1)
    )
    axis.set_title(col)

# Hide any grid cells left over when the count is not a multiple of `cols`.
for axis in ax.ravel()[len(feature_columns):]:
    axis.set_visible(False)
plt.show()

In [None]:
describe = train.describe()
columns_ordered_by_scale = describe.transpose().sort_values(by=['max']).index

# Only columns whose name contains 'var_' are plotted (this excludes `target`).
feature_columns = [
    column for column in columns_ordered_by_scale
    if 'var_' in column
]

cols = 8
# Size the grid from the number of plotted columns using ceiling division,
# rather than from `train.shape[1]`, which also counts `target`.
rows = max(1, -(-len(feature_columns)//cols))

fig, ax = plt.subplots(rows, cols, figsize=(4*cols, 3*rows), squeeze=False)

for idx, col in enumerate(feature_columns):
    axis = ax[idx//cols, idx%cols]
    # Q-Q plot of the column against a normal distribution, with a
    # standardized reference line.
    qqplot(
        train[col], line='s',
        marker='.', markerfacecolor='b', markeredgecolor='b', alpha=0.3,
        ax=axis)
    axis.set_title(col)

# Hide any grid cells left over when the count is not a multiple of `cols`.
for axis in ax.ravel()[len(feature_columns):]:
    axis.set_visible(False)
plt.show()

In [None]:
describe = train.describe()
columns_ordered_by_scale = describe.transpose().sort_values(by=['max']).index

# Only columns whose name contains 'var_' are plotted (this excludes `target`).
feature_columns = [
    column for column in columns_ordered_by_scale
    if 'var_' in column
]

cols = 8
# Size the grid from the number of plotted columns using ceiling division,
# rather than from `train.shape[1]`, which also counts `target`.
rows = max(1, -(-len(feature_columns)//cols))

fig, ax = plt.subplots(rows, cols, figsize=(4*cols, 3*rows), squeeze=False)

for idx, col in enumerate(feature_columns):
    axis = ax[idx//cols, idx%cols]
    sns.boxplot(
        data=train[col], ax=axis,
        palette=sns.color_palette('Blues', 1)
    )
    axis.set_title(col)

# Hide any grid cells left over when the count is not a multiple of `cols`.
for axis in ax.ravel()[len(feature_columns):]:
    axis.set_visible(False)
plt.show()

### Different scales

In [None]:
# Summary statistics per column, shown to compare the value ranges of the features.
train.describe()

In [None]:
# Min-max scaler with default settings, fitted on the full feature frame.
# The fitted scaler is kept because it is reused for the test set later.
min_max_scl = MinMaxScaler()

# X_scl is the scaled copy of X returned by fit_transform.
X_scl = min_max_scl.fit_transform(X, y)

### Estimate scaled data

In [None]:
# Same 80/20 split and seed as the baseline, now on the min-max-scaled features.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scl, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

### No null values

In [None]:
# Total number of missing cells across the whole training frame.
train.isna().sum().to_numpy().sum()

### Outliers

In [None]:
# Mean / standard-deviation / outlier overview of the min-max-scaled features.
plot_data_stats(X_scl)

### Principal component analysis

In [None]:
# With a float n_components in (0, 1), scikit-learn keeps as many components
# as needed to explain that fraction of the variance (here 20%).
pca = PCA(n_components=0.2)
# The fitted PCA is kept because it is reused to project the test set later.
X_pca = pca.fit_transform(X_scl, y)

In [None]:
# Same overview for the PCA projection.
plot_data_stats(X_pca)

### Estimate PCA data

In [None]:
# Same 80/20 split and seed, using only the PCA components as features.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_pca, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

### Independent features

In [None]:
# Pairwise correlation of every column in `train` (features and target).
correlation = train.corr()

# Keep only coefficients with |r| > 0.5, treat the rest as 0, and total them
# per column. Each column's self-correlation contributes 1.0, so a total
# above 1 means at least one other strong positive correlation exists.
res = (
    correlation
    .where(correlation.abs() > 0.5)
    .fillna(0)
    .sum()
    .reset_index()
)
res.columns = ['feature', 'correlations']
res.loc[res['correlations'] > 1]

In [None]:
# Heatmap of the correlation matrix; the trailing ';' suppresses the cell's text output.
fig, ax = plt.subplots(1, 1, figsize=(10,8))
sns.heatmap(correlation);

### Imbalanced dataset

In [None]:
# Number of rows per target class, to visualise the class imbalance.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
sns.countplot(x=y);

# Estimate scaled and PCA data

In [None]:
# Same settings as the baseline plus `is_unbalance`, which tells LightGBM that
# the binary classes are imbalanced. This replaces the earlier `params`.
params = {
    'objective': 'binary',
    'is_unbalance': True,
    'metric': 'auc',
    'n_estimators': 400,
    'early_stopping_round': 10,
    'learning_rate': 0.1,
}

In [None]:
# PCA components only, evaluated with the imbalance-aware parameters.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_pca, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

In [None]:
# Min-max-scaled features only, evaluated with the imbalance-aware parameters.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scl, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

In [None]:
# Column-wise concatenation: scaled features followed by the PCA components.
X_scl_pca = np.concatenate((X_scl, X_pca), axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_scl_pca, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

As we can see, combining the min-max-scaled and PCA datasets, and telling the model that the dataset is imbalanced, improves results.

In [None]:
# Overview of the combined scaled + PCA feature matrix.
plot_data_stats(X_scl_pca)

In [None]:
# Drop the standalone scaled and PCA matrices (X_scl_pca already holds both),
# then ask the garbage collector to reclaim the memory.
del X_scl, X_pca

gc.collect()

### Autoencoder

We will use an autoencoder in order to remove unwanted characteristics from the dataset.

In [None]:
# The autoencoder reads and reconstructs vectors as wide as the combined
# scaled + PCA matrix.
input_dim = X_scl_pca.shape[1] 

# Symmetric stack: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.
autoencoder = tf.keras.Sequential([
    # Encoder
    L.Dense(64, input_shape=(input_dim, )),
    L.BatchNormalization(),
    L.LeakyReLU(),
    L.Dense(32),
    L.BatchNormalization(),
    L.LeakyReLU(),
    # Bottleneck: 16 units, with no explicit activation or normalisation.
    L.Dense(16),
    # Decoder
    L.Dense(32),
    L.BatchNormalization(),
    L.LeakyReLU(),
    L.Dense(64),
    L.BatchNormalization(),
    L.LeakyReLU(),    
    # Linear output, so reconstructions are not squashed.
    L.Dense(input_dim, activation = 'linear')
])

# Functional wrapper around the Sequential model's own input and output tensors.
autoencoder_model = tf.keras.Model(inputs=autoencoder.input, outputs=autoencoder.output)
autoencoder_model.summary()

In [None]:
# Reconstruction objective: mean squared error between input and output.
autoencoder_model.compile(loss='mse', optimizer='adam')

epochs = 30
batch_size = 32
# Train on a 25% subsample of the rows to keep the fit fast.
sample_perc = 0.25
sample_size = int(X.shape[0]*sample_perc)
# Draw distinct rows with a fixed seed. Sampling with replacement from an
# unseeded generator would yield duplicate rows and a different subset on
# every run.
random_indexes = np.random.default_rng(42).choice(
    X.shape[0], size=sample_size, replace=False)

# Materialise the subsample once; it is both the input and the target.
X_autoencoder_sample = X_scl_pca[random_indexes]

autoencoder_history = autoencoder_model.fit(
    X_autoencoder_sample,
    X_autoencoder_sample,
    batch_size=batch_size, epochs=epochs,
    steps_per_epoch=sample_size//batch_size,
)

The `summarize_diagnostics` plot was used to choose an appropriate number of epochs.

In [None]:
# Plot the curves recorded while fitting the autoencoder.
summarize_diagnostics(autoencoder_history)

In [None]:
# X_enc is the autoencoder's output (its reconstruction) for every row, so it
# has the same width as X_scl_pca, not the 16-unit bottleneck width.
X_enc = autoencoder_model.predict(X_scl_pca)

# Same 80/20 split and seed, using only the reconstructed features.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_enc, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

The autoencoded dataset alone does not improve results.

In [None]:
# Overview of the autoencoder reconstruction.
plot_data_stats(X_enc)

In [None]:
# Column-wise concatenation: scaled + PCA features followed by their
# autoencoder reconstruction.
X_scl_pca_enc = np.concatenate((X_scl_pca, X_enc), axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_scl_pca_enc, y, test_size=0.2, random_state=42)

train_eval_model(X_train, y_train, X_valid, y_valid, params)

The combination of min-max scaling, PCA and autoencoder features does not improve results either, although the gap is small — at least with the LightGBM classifier.

In [None]:
# Overview of the full scaled + PCA + autoencoder feature matrix.
plot_data_stats(X_scl_pca_enc)

In [None]:
# X_scl_pca_enc already contains both matrices, so release the parts and ask
# the garbage collector to reclaim the memory.
del X_scl_pca, X_enc

gc.collect()

# Model

In [None]:
# Push the test set through the transformers fitted on the training data:
# min-max scaling, then PCA, then the autoencoder. Each new block of columns
# is appended on the right, mirroring how X_scl_pca_enc was assembled.
X_test_scl = min_max_scl.transform(test)
X_test_pca = pca.transform(X_test_scl)
X_test_scl_pca = np.hstack((X_test_scl, X_test_pca))
X_test_enc = autoencoder_model.predict(X_test_scl_pca)
X_test_scl_pca_enc = np.hstack((X_test_scl_pca, X_test_enc))

In [None]:
# Only X_test_scl_pca_enc is needed from here on; release the intermediate
# test matrices and ask the garbage collector to reclaim the memory.
del X_test_scl, X_test_pca, X_test_scl_pca, X_test_enc

gc.collect()

In [None]:
# Resample the full training matrix with RandomOverSampler (default,
# unseeded settings) before any train/validation split is made.
# NOTE(review): because this happens before splitting, any cross-validation
# run on these resampled arrays can place copies of the same row on both
# sides of a fold. Confirm how they are consumed downstream.
X_resampled, y_resampled = ROS().fit_resample(X_scl_pca_enc, y)
std_scl = StandardScaler()

# The scaler is fitted on the resampled training matrix and then reused,
# unchanged, for the test matrix.
X_train_scaled = std_scl.fit_transform(X_resampled)
X_test_scaled = std_scl.transform(X_test_scl_pca_enc)

# Accumulator for the test predictions, one slot per test row.
prediction = np.zeros(test.shape[0])

In [None]:
# A seeded shuffle makes the folds, and therefore the scores, reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'learning_rate': 0.05,
    'boosting': 'gbdt',
    'force_col_wise': True,
    'feature_fraction': 0.8,
}

# Split the ORIGINAL rows and oversample only inside each training fold.
# Splitting an already oversampled matrix puts copies of the same minority row
# in both the training and validation parts of a fold, which inflates the
# reported AUC and weakens early stopping.
X_cv = std_scl.transform(X_scl_pca_enc)
y_cv = np.asarray(y)

# Reset the accumulator so re-running this cell does not add a second set of
# fold predictions on top of the first.
prediction = np.zeros(X_test_scaled.shape[0])

for train_index, test_index in skf.split(X_cv, y_cv):
    # Oversample the training part only; the validation part keeps the
    # natural class distribution.
    X_train, y_train = ROS(random_state=42).fit_resample(
        X_cv[train_index], y_cv[train_index])
    X_test, y_test = X_cv[test_index], y_cv[test_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test)

    model = lgb.train(
        params,
        train_data,
        valid_sets=[test_data],
        num_boost_round=200,
        # Callback form of early stopping; `verbose=False` keeps it silent,
        # as the original `verbose_eval=False` did.
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )

    y_pred = model.predict(X_test)
    score = roc_auc_score(y_test, y_pred)

    print(f'Score = {score:.5f}')

    # Average the test predictions over the folds.
    prediction += model.predict(X_test_scaled)/skf.n_splits

# Submission

In [None]:
# Build the submission in its own frame. Adding columns to `test` would leave
# it with extra columns, so re-running the earlier cell that feeds `test` to
# the fitted scaler would fail.
submission = pd.DataFrame({'ID_code': test_id, 'target': prediction})

submission.to_csv('submission.csv', index=False)

# References

https://www.kaggle.com/code/fatemetardasti/santander-transaction-prediction-lgbm

https://www.kaggle.com/code/alirezahanifi/santander-customer-pca-dae-using-lr-lgbm

https://www.kaggle.com/code/ricardopennaleite/internship-work

https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

https://www.tensorflow.org/tutorials/customization/custom_layers