In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, KBinsDiscretizer # standartization of vars
from sklearn.decomposition import PCA
from collections import Counter
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import backend as K # for the definition of the bce metrics
from sklearn.metrics import roc_auc_score
import datatable as dt
from sklearn.feature_selection import VarianceThreshold

In [None]:
def dt_read_data(path,file_type):
    """Read ``<path>/<file_type>.csv`` with datatable and return it as a pandas DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the CSV file.
    file_type : str
        File name without the ``.csv`` extension (e.g. ``'train'``).
    """
    csv_file = f'{path}/{file_type}.csv'
    frame = dt.fread(csv_file)
    return frame.to_pandas()

In [None]:
# convert the boolean variables to 0/1
def convert_boolian(df, excluded_vars):
    """Cast every boolean column of ``df`` (except ``excluded_vars``) to integer 0/1.

    The frame is modified in place and also returned.
    """
    candidate_columns = np.setdiff1d(df.columns, excluded_vars)
    flag_columns = df[candidate_columns].select_dtypes(include='bool').columns
    for column in flag_columns:
        as_int = df[column].astype(int)
        df[column] = as_int
    return df

In [None]:
def memory_reduction(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Signed-integer columns are tried against int8/16/32/64 and floating-point
    columns against float16/32/64; the first (narrowest) candidate whose
    ``[min, max]`` limits contain the column's observed min and max is used.
    Columns are never widened, and non-numeric / boolean / extension-dtype
    columns are left untouched.

    Notes
    -----
    * The choice for floats is based on value *range* only, so precision can be
      lost when a column is narrowed (e.g. to float16).
    * Columns whose min/max cannot be matched to any candidate (all-NaN, empty,
      or containing +/-inf) are skipped instead of raising.
    * ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same frame with downcast columns.
    """
    mem = df.memory_usage().sum() / 1024 ** 2
    print(f'start memory size is {mem}')

    # Candidate dtypes per numpy kind ('i' = signed int, 'f' = float), narrowest first,
    # each paired with the object exposing its representable min/max.
    candidates = {
        'i': [(np.dtype(f'int{bits}'), np.iinfo(f'int{bits}')) for bits in (8, 16, 32, 64)],
        'f': [(np.dtype(f'float{bits}'), np.finfo(f'float{bits}')) for bits in (16, 32, 64)],
    }

    for col in df.columns:
        current = df[col].dtype
        # Extension dtypes (nullable ints, categoricals, ...) are not plain numpy dtypes.
        if not isinstance(current, np.dtype) or current.kind not in candidates:
            continue
        low, high = df[col].min(), df[col].max()
        for target, limits in candidates[current.kind]:
            if target.itemsize >= current.itemsize:
                # Nothing narrower fits; keep the column as it is.
                break
            # Inclusive bounds: a value sitting exactly on a limit still fits.
            # NaN bounds compare False everywhere, so such columns fall through unchanged.
            if limits.min <= low and high <= limits.max:
                df[col] = df[col].astype(target)
                break

    mem_new = df.memory_usage().sum() / 1024 ** 2
    print(f'memory post reduction is {np.round(mem_new,2)}MB-reduced by {np.round((1-mem_new/mem)*100,2)}%')
    return df

In [None]:
def missing_values(df, excluded_vars):
    """Return ``{column: number_of_missing_values}`` for the columns that have any.

    Columns listed in ``excluded_vars`` are not inspected; columns without
    missing values are omitted from the result.
    """
    na_counts = {}
    for column in np.setdiff1d(df.columns, excluded_vars):
        n_missing = df[column].isna().sum()
        if n_missing > 0:
            na_counts[column] = n_missing
    return na_counts

In [None]:
def IQR_scores(df, excluded_vars, IQR_treshhold=1.5, outlier_treshhold = 0.15):
    """Return the positional indices of rows that are IQR outliers in many columns.

    For every column not listed in ``excluded_vars`` a value is an outlier when
    it lies below ``Q1 - IQR_treshhold * IQR`` or above ``Q3 + IQR_treshhold * IQR``.
    A row is returned when the fraction of *examined* columns in which it is an
    outlier exceeds ``outlier_treshhold``.

    Parameters
    ----------
    df : pandas.DataFrame
    excluded_vars : list
        Columns that are not examined (and not counted in the denominator).
    IQR_treshhold : float
        Multiplier of the inter-quartile range that defines the fences.
    outlier_treshhold : float
        Minimum fraction (exclusive) of examined columns flagging a row.

    Returns
    -------
    list of int
        Row positions (not index labels), in order of first detection.
    """
    examined_vars = np.setdiff1d(df.columns, excluded_vars)
    hits = Counter()
    for var in examined_vars:
        q1, q3 = np.quantile(df[var], [0.25, 0.75])
        spread = q3 - q1
        is_outlier = (df[var] < (q1 - IQR_treshhold * spread)) | (df[var] > (q3 + IQR_treshhold * spread))
        hits.update(np.where(is_outlier)[0].tolist())

    # Divide by the number of columns actually examined; counting the excluded
    # columns would silently raise the effective threshold.
    n_examined = len(examined_vars)
    return [row for row, count in hits.items() if count / n_examined > outlier_treshhold]

In [None]:
# create three features per row: sd, min, max (for the non-boolean variables)
def create_features(df, excluded_vars):
    """Add row-wise ``sd_var``, ``min_var`` and ``max_var`` columns to ``df``.

    The statistics are computed over all columns that are not in
    ``excluded_vars``. The source column set is fixed once, before any new
    column is written, so the generated features never feed into each other
    (and re-running the function on an already-augmented frame gives the same
    result).

    The frame is modified in place and also returned.
    """
    generated = ['sd_var', 'min_var', 'max_var']
    source_columns = np.setdiff1d(df.columns, list(excluded_vars) + generated)
    # Column selection returns a separate frame, so the assignments below do not affect it.
    source = df[source_columns]
    df['sd_var'] = source.std(axis=1)
    df['min_var'] = source.min(axis=1)
    df['max_var'] = source.max(axis=1)
    return df

In [None]:
# return the names of the columns that pass the variance threshold (the caller drops the rest)
def drop_columns_with_low_var(df, threshold):
    """Return the list of column names kept by a ``VarianceThreshold`` fitted on ``df``.

    ``df`` itself is not modified; the caller is expected to subset it with the
    returned names.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(df)
    keep_mask = selector.get_support()
    return df.columns[keep_mask].tolist()

In [None]:
# standardize the variables (zero mean, unit variance)
def standartization(df, excluded_vars=None):
    """Standardize columns of ``df`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
    excluded_vars : list, optional
        Columns that are passed through unscaled. When ``None`` every column
        is scaled.

    Returns
    -------
    pandas.DataFrame
        When ``excluded_vars`` is ``None``: the scaled frame with the original
        column order. Otherwise: the excluded columns first, followed by the
        scaled columns in sorted order (as produced by ``np.setdiff1d``).
        The result carries the index of ``df``.
    """
    if excluded_vars is None:
        scaled = StandardScaler().fit_transform(df)
        return pd.DataFrame(scaled, columns=list(df.columns), index=df.index)

    vars_to_transform = np.setdiff1d(df.columns, excluded_vars)
    scaled = StandardScaler().fit_transform(df[vars_to_transform])
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaNs) whenever df
    # has been filtered or re-indexed.
    df_scaled = pd.DataFrame(scaled, columns=vars_to_transform, index=df.index)
    return pd.concat([df[excluded_vars], df_scaled], axis=1)

In [None]:
# function that creates random noise for denoising autoencoders
def denoise(df, noise_type, noise_fill, noise_percent, seed):
    """Return a corrupted copy of ``df`` for training a denoising autoencoder.

    Parameters
    ----------
    df : array-like or pandas.DataFrame
        2-D data. It is always copied; the caller's object is never modified.
    noise_type : {'random', 'row_wise'}
        ``'random'``   - corrupt ``noise_percent`` of all cells, chosen anywhere
        in the matrix.
        ``'row_wise'`` - corrupt ``noise_percent`` of the columns in every row,
        chosen independently per row.
    noise_fill : {'zero', 'random_normal'}
        Replacement value: 0, or a draw from N(0, 1) (the result is then float).
    noise_percent : float
        Fraction of cells (``'random'``) or of columns per row (``'row_wise'``).
    seed : int
        Seed passed to ``np.random.seed`` (this sets numpy's global state).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If ``noise_type`` or ``noise_fill`` is not one of the supported values.
    """
    if noise_type not in ('random', 'row_wise'):
        raise ValueError(f"noise_type must be 'random' or 'row_wise', got {noise_type!r}")
    if noise_fill not in ('zero', 'random_normal'):
        raise ValueError(f"noise_fill must be 'zero' or 'random_normal', got {noise_fill!r}")

    np.random.seed(seed)
    # np.array copies by default, so ndarray inputs are protected just like DataFrames.
    data = np.array(df)
    n_cols = data.shape[1]

    if noise_type == 'random':
        # The fraction applies to the whole matrix, not to a single row's worth of cells.
        n_noisy = int(noise_percent * data.size)
        flat_indices = np.random.choice(data.size, n_noisy, replace=False)
    else:
        per_row = int(noise_percent * n_cols)
        # One independent draw of column positions per row, shifted by the
        # row's offset into the flattened array.
        flat_indices = np.array(
            [np.random.choice(n_cols, per_row, replace=False) + row_start
             for row_start in range(0, data.size, n_cols)],
            dtype=np.intp).reshape(-1)

    if noise_fill == 'zero':
        np.put(data, flat_indices, 0)
    else:
        data = data.astype(float)
        np.put(data, flat_indices, np.random.normal(0, 1, len(flat_indices)))
    return data

In [None]:
# create the denoise model
def denoise_architecture(train_denoise, train, epochs, batch_size, seed):
    """Build and fit a single-hidden-layer denoising autoencoder.

    The network maps the corrupted input ``train_denoise`` to the clean target
    ``train`` through one 64-unit ReLU layer named ``'encoding_layer'`` and a
    linear output layer with one unit per column of ``train``.
    It is trained with MSE loss and Adam (learning rate 0.001), holding out
    the last 20% of the rows as validation data.

    Returns
    -------
    tf.keras.Model
        The fitted autoencoder.
    """
    tf.random.set_seed(seed)
    layers = tf.keras.layers
    n_inputs = train_denoise.shape[1]
    n_outputs = train.shape[1]

    # corrupted input -> 64-unit encoding -> linear reconstruction of the clean columns
    noisy_input = layers.Input(shape=(n_inputs,), name='input_denoise')
    encoded = layers.Dense(units=64, activation='relu', name='encoding_layer')(noisy_input)
    reconstruction = layers.Dense(units=n_outputs, activation='linear', name='output_layer')(encoded)

    autoencoder = tf.keras.models.Model(inputs=noisy_input, outputs=reconstruction)
    autoencoder.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    # Early stopping watches the training loss; the LR scheduler uses its default monitor.
    training_callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='loss',patience=3,min_delta=0.0001,restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(factor=0.7,patience=3,min_delta=0.00001),
    ]
    autoencoder.fit(
        train_denoise,
        train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=training_callbacks,
        verbose=1,
    )
    return autoencoder

In [None]:
# extract the denoising new features
def denoise_features(denoising_model, encoding_layer_name, df):
    """Return the activations of one layer of ``denoising_model`` for ``df``.

    A sub-model from the autoencoder's input to the layer called
    ``encoding_layer_name`` is built and used to predict on ``df``. The result
    is a DataFrame whose columns are named ``dae_0``, ``dae_1``, ...
    """
    bottleneck = denoising_model.get_layer(encoding_layer_name)
    encoder = tf.keras.models.Model(inputs=denoising_model.input,
                                    outputs=bottleneck.output)
    activations = pd.DataFrame(encoder.predict(df))
    activations.columns = [f'dae_{position}' for position in activations.columns]
    return activations

In [None]:
def bin_partition(df,excluded_vars,n_bins,random_state,output_distribution='uniform'):
    """Discretize every non-excluded column of ``df`` into ordinal integer bins.

    Parameters
    ----------
    df : pandas.DataFrame
    excluded_vars : list
        Columns passed through unchanged; they come first in the result.
    n_bins : int
        Number of bins per column.
    random_state : int
        Seed passed to ``np.random.seed`` before fitting (sets numpy's global state).
    output_distribution : str
        Forwarded to ``KBinsDiscretizer`` as its ``strategy`` argument.

    Returns
    -------
    pandas.DataFrame
        Excluded columns followed by the binned columns (integer codes),
        carrying the index of ``df``.
    """
    # keep the excluded columns aside; they are re-attached after binning
    excluded_vars_data = df[excluded_vars]
    to_bin = df.drop(excluded_vars, axis=1)
    np.random.seed(random_state)
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=output_distribution)
    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows whenever df has a non-default index.
    binned = pd.DataFrame(est.fit_transform(to_bin),
                          columns=to_bin.columns,
                          index=df.index).astype(int)
    return pd.concat([excluded_vars_data, binned], axis=1)

In [None]:
# create callbacks
def callbacks():
    """Return the Keras callbacks used when fitting the residual model.

    * ``EarlyStopping`` stops after 3 epochs without a training-loss
      improvement of at least 1e-4 and restores the best weights.
    * ``ModelCheckpoint`` writes the weights to ``'model.h5'`` only when the
      training loss improves. Without ``save_best_only`` the file would be
      overwritten every epoch, so loading it after training would bring back
      the last-epoch weights rather than the best ones. The monitored quantity
      matches ``EarlyStopping`` and is available even without validation data.
    * ``ReduceLROnPlateau`` multiplies the learning rate by 0.7 after 3 epochs
      without improvement of its default monitored quantity.
    """
    return [tf.keras.callbacks.EarlyStopping(monitor='loss',patience=3,min_delta=0.0001,restore_best_weights=True),
            tf.keras.callbacks.ModelCheckpoint('model.h5',monitor='loss',save_best_only=True,save_weights_only=True),
            tf.keras.callbacks.ReduceLROnPlateau(factor=0.7,patience=3,min_delta=0.00001)]

In [None]:
# residual block architecture
def residual_architecture(df, number_of_blocks, input_dim, output_dim, output_shape, seed):
    """Build and compile the embedding + concatenated-dense-block classifier.

    Block 0 embeds the integer inputs, flattens, batch-normalizes, applies
    dropout (0.2) and a 64-unit ReLU dense layer. Every later block applies
    batch-norm, dropout and a 64-unit dense layer to the previous block's
    output and concatenates the result with that previous output. A final
    batch-norm and a sigmoid dense layer with ``output_shape`` units form the
    head. The model is compiled with binary cross-entropy, Adam
    (learning rate 0.08) and an AUC metric.

    Parameters
    ----------
    df : array-like
        Only ``df.shape[1]`` (number of input features) is used.
    number_of_blocks : int
    input_dim, output_dim : int
        Vocabulary size and vector size of the embedding layer.
    output_shape : int
        Number of units of the output layer.
    seed : int
        Passed to ``tf.random.set_seed``.
    """
    layers = tf.keras.layers
    block_outputs = {}
    tf.random.set_seed(seed)
    auc_metric = tf.keras.metrics.AUC()
    model_input = layers.Input(shape=(df.shape[1],), name='input_layer')

    for idx in range(number_of_blocks):
        if idx > 0:
            # later blocks: transform the previous output and concatenate it back in
            previous = block_outputs[idx - 1]
            normalized = layers.BatchNormalization()(previous)
            regularized = layers.Dropout(0.2)(normalized)
            dense = layers.Dense(units=64, activation='relu', name=f'block_{idx}')(regularized)
            block_outputs[idx] = layers.concatenate([dense, previous])
        else:
            # first block: embed the integer codes before the dense layer
            embedded = layers.Embedding(input_dim=input_dim,
                                        output_dim=output_dim,
                                        input_length=1,
                                        name='embedding')(model_input)
            flattened = layers.Flatten()(embedded)
            normalized = layers.BatchNormalization()(flattened)
            regularized = layers.Dropout(0.2)(normalized)
            block_outputs[idx] = layers.Dense(units=64, activation='relu', name=f'block_{idx}')(regularized)

    head_input = layers.BatchNormalization()(block_outputs[number_of_blocks - 1])
    prediction = layers.Dense(units=output_shape, activation='sigmoid')(head_input)

    classifier = tf.keras.models.Model(inputs=model_input, outputs=prediction)
    classifier.compile(loss='binary_crossentropy',
                       optimizer=tf.keras.optimizers.Adam(learning_rate=0.08),
                       metrics=[auc_metric])
    return classifier

In [None]:
# build the prediction pipeline
def predict_residual_model(df, test_df, splits,
                           seeds, number_of_blocks,
                           input_dim, output_dim,
                           output_shape, epochs, batch_size):
    """Train the residual model with stratified K-fold CV and predict the test set.

    For every seed a ``StratifiedKFold`` with ``splits`` folds is run. In each
    fold a fresh model is built, fitted, reloaded from ``'model.h5'`` (written
    by the checkpoint callback), scored with ROC-AUC on the held-out fold and
    used to predict ``test_df``. Test predictions are averaged over folds and
    seeds.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with ``'id'`` and ``'target'`` columns.
    test_df : pandas.DataFrame
        Test data with an ``'id'`` column.
    splits : int
        Number of CV folds.
    seeds : list of int
        One full CV run is performed per seed.
    number_of_blocks, input_dim, output_dim, output_shape
        Forwarded to ``residual_architecture``.
    epochs, batch_size
        Forwarded to ``model.fit``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'id'`` and ``'target'`` (averaged predicted probability).
    """
    n_splits = splits
    label = df['target']
    features = df.drop(['id','target'], axis=1)
    test_ids = test_df['id']
    test_features = test_df.drop('id', axis=1)

    # running sum of per-fold test predictions, each already divided by n_splits
    pred_test = 0
    for seed in seeds:
        np.random.seed(seed)
        tf.random.set_seed(seed)
        skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
        score = []
        for fold, (train_index, valid_index) in enumerate(skf.split(features, label)):
            tr, ts = np.array(features.iloc[train_index]), np.array(features.iloc[valid_index])
            tr_label, ts_label = np.array(label.iloc[train_index]), np.array(label.iloc[valid_index])
            model = residual_architecture(tr,
                                          number_of_blocks,
                                          input_dim,
                                          output_dim,
                                          output_shape,
                                          seed)
            hist = model.fit(tr,
                             tr_label,
                             validation_data=(ts, ts_label),
                             epochs=epochs,
                             batch_size=batch_size,
                             callbacks=callbacks(),
                             verbose=0)
            # reload the weights written by the checkpoint callback
            model.load_weights('model.h5')
            pred_ts = roc_auc_score(ts_label, model.predict(ts))
            # epoch with the lowest training loss (looked up by key, not by dict position)
            best_step = pd.Series(hist.history['loss']).idxmin()
            print(f'roc_auc_score value for seed {seed} at fold {fold} at step {best_step} is {pred_ts}')
            pred_test += model.predict(test_features)/n_splits
            score.append(pred_ts)
        # blank line before and after the per-seed summary
        print(f'\nthe average roc_auc_score value for seed value {seed} is {np.mean(score)}\n')

    # average the per-seed fold-averaged predictions
    general_prediction = pred_test/len(seeds)
    return pd.DataFrame({'id': test_ids, 'target': general_prediction.flatten()})

In [None]:
# Location of the competition CSV files (relative to the notebook's working directory)
path = '../input/tabular-playground-series-oct-2021'
# Start a timer so the load duration can be reported below
t1 = time.time()
print('reading the train and test data')
# dt_read_data reads '<path>/<name>.csv' with datatable and converts it to pandas
train = dt_read_data(path, 'train')
test = dt_read_data(path, 'test')
# Store the label as an integer column
train['target']=train['target'].astype(int)
print(f'reading the train and test data in {time.time()-t1} seconds with {len(train)} rows in the train data')

In [None]:
# Create a list of the boolean feature columns for later use.
# This must run before the bool -> int conversion below, because it selects columns by dtype 'bool'.
bool_vars = train[np.setdiff1d(train.columns, ['id','target'])].select_dtypes(include='bool').columns.to_list()

In [None]:
# Cast the boolean feature columns of both frames to 0/1 integers ('id'/'target' are skipped)
print('convert boolian vars to int (0/1)')
train = convert_boolian(train, ['id','target'])
test = convert_boolian(test, ['id'])

In [None]:
# Check for missing values: build {column: NaN count} for the train feature columns that contain NaNs
dictionary_missing = missing_values(train, ['id','target'])
print(f'the number of variables that have missing values is {len(dictionary_missing)}')

In [None]:
# Remove train rows that are flagged as IQR outliers in many feature columns
print('dropping outliers based on IQR index')
# IQR_scores returns row positions, not index labels
outlier_indices = IQR_scores(train, ['id','target'])
print('drop rows with high outlier occurrences')
# Keep every position that was not flagged, then rebuild a 0..n-1 index
train = train.iloc[np.setdiff1d(range(len(train)), outlier_indices)].reset_index(drop=True)
print(f'the number of rows in the train data is {len(train)}')

In [None]:
# Create new row-wise features (sd_var, min_var, max_var) from the non-boolean feature columns.
# The same exclusion list is passed for both frames; presumably test has no 'target' column,
# in which case that entry simply has no effect there.
train = create_features(train, excluded_vars=['id','target']+bool_vars)
test = create_features(test, excluded_vars=['id','target']+bool_vars)

In [None]:
# Run the denoise function to build the corrupted input for the denoising autoencoder (DAE)
print('running dae analysis on the train and test data')
# Columns fed to the DAE: everything except id, target and the boolean columns
# (this includes the sd_var/min_var/max_var columns created above)
columns_for_dae = np.setdiff1d(train.columns, ['id','target']+bool_vars)
# Zero out half of the selected columns in every row
train_denoise = denoise(train[columns_for_dae], noise_type='row_wise', noise_fill='zero', noise_percent=0.5, seed=1974)

In [None]:
# Fit the denoising autoencoder: corrupted columns as input, clean columns as target
model_dae = denoise_architecture(train_denoise, train[columns_for_dae], epochs=100, batch_size=512, seed=1974)
# The corrupted copy is no longer needed; release its memory
del train_denoise
gc.collect()

In [None]:
# Extract the activations of the autoencoder's 'encoding_layer' for the clean train and test columns
denoise_train = denoise_features(model_dae, 'encoding_layer', train[columns_for_dae])
denoise_test = denoise_features(model_dae, 'encoding_layer', test[columns_for_dae])

In [None]:
# Keep the DAE feature columns that pass a variance threshold of 0.05.
# The selection is fitted on the train features only and the same columns are kept in test.
columns_to_keep = drop_columns_with_low_var(denoise_train, 0.05)
print(columns_to_keep)
denoise_train = denoise_train[columns_to_keep]
denoise_test = denoise_test[columns_to_keep]

In [None]:
# Add the new DAE features to the train and test data (column-wise concat, aligned on the index)
train = pd.concat([train,denoise_train], axis=1)
test = pd.concat([test,denoise_test], axis=1)
# Drop the now-redundant intermediate frames and reclaim memory
del [denoise_train,denoise_test]
gc.collect()

In [None]:
# Standardize every column except id/target and the boolean columns.
# NOTE(review): each call fits its own StandardScaler, so train and test are scaled with
# different means/variances — confirm this is intended rather than reusing the train scaler.
print('standardized the train and test data')
train = standartization(train,['id','target']+bool_vars)
test = standartization(test,['id']+bool_vars)

In [None]:
# Divide every column except id/target into 128 ordinal bins (the boolean columns are binned too,
# since only 'id'/'target' are excluded here).
# NOTE(review): the discretizer is fitted separately on train and on test, so the bin edges
# of the two frames are not guaranteed to match — confirm this is intended.
train = bin_partition(train,
                      excluded_vars=['id','target'],
                      n_bins=128,
                      random_state=1974,
                      output_distribution='uniform')

test = bin_partition(test,
                     excluded_vars=['id'],
                     n_bins=128,
                     random_state=1974,
                     output_distribution='uniform')

In [None]:
# Downcast numeric columns of both frames to smaller dtypes before model training
print('memory reduction function')
train = memory_reduction(train)
test = memory_reduction(test)
gc.collect()

In [None]:
# Train the residual model with 5-fold stratified CV for a single seed and predict the test set.
# input_dim (embedding vocabulary size) matches the 128 bins produced by bin_partition above.
test_prediction = predict_residual_model(train,
                                         test,
                                         splits = 5,
                                         seeds = [1974],
                                         number_of_blocks = 4,
                                         input_dim = 128,
                                         output_dim = 11,
                                         output_shape = 1,
                                         epochs = 100,
                                         batch_size = 512)

In [None]:
# Write the id/target predictions to 'resnet.csv' in the working directory, without the index column
test_prediction.to_csv('resnet.csv', index=False)