In [None]:
# Notebook-wide switch: when True, the exploratory `if DEBUG` cells further down
# (correlation heatmap, scaling split, KMeans elbow plot) are executed.
DEBUG = False
# In the visible cells torch is only used to detect whether CUDA is available.
import torch
import numpy as np
import pandas as pd
# cudf is imported (as `cd`) only when CUDA is available; every later use of `cd`
# is guarded by the same check.
if torch.cuda.is_available() :
    import cudf as cd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import express as px, graph_objects as go
# Pick the cuml implementations when CUDA is available, otherwise the
# scikit-learn ones; both branches bind the same three names.
if torch.cuda.is_available() :
    from cuml.preprocessing import StandardScaler
    from cuml.metrics import roc_auc_score
    from cuml.cluster import KMeans
else :
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import roc_auc_score
    from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm.auto import tqdm
# Registers tqdm's `progress_*` methods on pandas objects.
tqdm.pandas()
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
# Silences every warning for the rest of the session.
import warnings; warnings.filterwarnings('ignore')
import gc; gc.enable()

In [None]:
%%time
train = cd.read_csv('../input/tabular-playground-series-nov-2021/train.csv').to_pandas() if torch.cuda.is_available() else pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
test = cd.read_csv('../input/tabular-playground-series-nov-2021/test.csv').to_pandas() if torch.cuda.is_available() else pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
sample = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Render both raw frames as rich notebook output.
display(train, test)

In [None]:
# Remove the `id` column from both frames; the names are rebound to the result.
train = train.drop(columns = ['id'])
test = test.drop(columns = ['id'])

In [None]:
# Move the label column out of `train` into `y` (`pop` removes it from the frame,
# so this cell cannot be re-run without reloading `train`).
y = train.pop('target')
y

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that covers their value range.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    limits contain the column's min and max (limits inclusive). Float columns
    are cast to float16 or float32 on the same range test and to float64
    otherwise. Columns of any other dtype are left untouched.

    Only the value *range* is checked: casting floats to float16/float32 keeps
    them in range but discards precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons fail).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid a 0/0 percentage when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
def add_feat(df, verbose = True) :
    """Append row-wise summary statistics of the feature columns to ``df``.

    The source columns are every column of ``df`` except the twelve names this
    function itself creates, so calling it again on an already-augmented frame
    recomputes the same values instead of folding the engineered columns into
    the statistics.

    Added columns: skew, sum, ranksum (percentile rank of ``sum`` within
    ``df``), max, min, mean, rankmean (percentile rank of ``mean`` within
    ``df``), std, mad (mean absolute deviation around the row mean), median,
    sem and var.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding only the numeric feature columns. Modified in place.
    verbose : bool, default True
        Print a progress line after each statistic.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    generated = ('skew', 'sum', 'ranksum', 'max', 'min', 'mean', 'rankmean',
                 'std', 'mad', 'median', 'sem', 'var')
    floating = [col for col in df.columns if col not in generated]
    # Snapshot of the source columns; the engineered columns never feed back in.
    base = df[floating]

    # (column name, progress label, lazily evaluated statistic), in output order.
    steps = [
        ('skew', 'Skew', lambda: base.skew(axis = 1)),
        ('sum', 'Sum', lambda: base.sum(axis = 1)),
        ('ranksum', 'Ranksum', lambda: df['sum'].rank(pct = True)),
        ('max', 'Max', lambda: base.max(axis = 1)),
        ('min', 'Min', lambda: base.min(axis = 1)),
        ('mean', 'Mean', lambda: base.mean(axis = 1)),
        ('rankmean', 'Rankmean', lambda: df['mean'].rank(pct = True)),
        ('std', 'STD', lambda: base.std(axis = 1)),
        # DataFrame.mad() no longer exists in pandas 2.x; compute it explicitly.
        ('mad', 'MAD', lambda: base.sub(base.mean(axis = 1), axis = 0).abs().mean(axis = 1)),
        ('median', 'Median', lambda: base.median(axis = 1)),
        ('sem', 'SEM', lambda: base.sem(axis = 1)),
        ('var', 'Var', lambda: base.var(axis = 1)),
    ]
    for name, label, compute in steps :
        df[name] = compute()
        if verbose :
            print(f'{label} Done')
    return df

In [None]:
%%time
# Add the row-wise statistic columns to both frames.
train = add_feat(train)
test = add_feat(test)

In [None]:
# Column list used for scaling and clustering below. It is taken after add_feat
# ran, so it includes the engineered statistic columns.
feat = test.columns.tolist()
print(feat)

In [None]:
# Debug only: correlation matrix of the training features drawn as a heatmap.
# NOTE(review): `annot_kws` is passed without `annot=True`, so it presumably has
# no visible effect — confirm whether cell annotations were intended.
if DEBUG :
    correlation = train.corr()
    plt.figure(figsize = (27, 18))
    sns.heatmap(correlation, annot_kws = {'fontsize' : 14})
    plt.show()

In [None]:
if DEBUG :
    # Only a cell's last top-level expression is auto-rendered; inside an `if`
    # block the frame has to be displayed explicitly.
    display(correlation)

In [None]:
# Debug only: halt a "Run All" here so the training cells below do not execute.
if DEBUG :
    raise RuntimeError('This is a debug, Stop here.')

In [None]:
# Debug only: a single 75/25 hold-out split, standardised with a scaler that is
# fitted on the training part and then applied to the validation and test frames.
if DEBUG :
    xtrain, xval, ytrain, yval = train_test_split(train, y, test_size = 0.25, random_state = 11)
    xtrain = xtrain.reset_index(drop = True)
    xval = xval.reset_index(drop = True)
    xtest = test.copy()
    scaler = StandardScaler()
    xtrain[feat] = scaler.fit_transform(xtrain[feat])
    xval[feat] = scaler.transform(xval[feat])
    xtest[feat] = scaler.transform(xtest[feat])
    del scaler
    gc.collect()

In [None]:
# Debug only: fit KMeans for 1..10 clusters on the scaled training split and
# plot the inertia (`wcss`) of each fit against its position in the list.
if DEBUG :
    wcss = []
    for x in range(1, 11) :
        kmeans = KMeans(n_clusters = x, random_state = 11)
        kmeans.fit(xtrain)
        wcss.append(kmeans.inertia_)
        print(f'CLUSTER {x} DONE')
        # Drop the fitted model before the next iteration.
        del kmeans
        gc.collect()
    px.line(y = wcss).show()

In [None]:
# Debug only: halt a "Run All" here so the training cells below do not execute.
if DEBUG :
    raise RuntimeError('This is a debug, Stop here.')

In [None]:
# Number of KMeans clusters used by the training loop.
# NOTE(review): presumably read off the debug elbow plot — confirm.
n_clusters = 7
gc.collect()

In [None]:
# Debug only: release the exploratory splits, then halt before training.
if DEBUG :
    # Must precede the raise, otherwise the cleanup is unreachable.
    del xtrain, xval, xtest
    raise RuntimeError('This is a debug, stop here.')

In [None]:
# 50-fold stratified, shuffled splitter with a fixed seed; the training loop
# fits one model per fold.
skf = StratifiedKFold(n_splits = 50,
                      shuffle = True,
                      random_state = 11)

In [None]:
def build_model(df) :
    """Build and compile the fully connected binary classifier.

    Architecture: BatchNormalization on the inputs, five Dense(128, swish)
    layers, Dropout(0.2), then a single sigmoid output unit. Compiled with
    Adam (learning rate 1e-3), binary cross-entropy and an AUC metric named
    ``auc``.

    Parameters
    ----------
    df : object with a ``shape`` attribute
        Only ``df.shape[1:]`` is read; it sets the model's input shape.

    Returns
    -------
    tf.keras.Model
        The compiled Sequential model.
    """
    layers = tf.keras.layers
    hidden_units = 128
    n_hidden = 5

    net = tf.keras.models.Sequential()
    net.add(layers.BatchNormalization(input_shape = df.shape[1:]))
    for _ in range(n_hidden) :
        net.add(layers.Dense(hidden_units, activation = 'swish'))
    # A single dropout layer sits between the last hidden layer and the output.
    net.add(layers.Dropout(0.2))
    net.add(layers.Dense(1, activation = 'sigmoid'))

    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    auc_metric = tf.keras.metrics.AUC(name = 'auc')
    net.compile(optimizer = optimizer, loss = loss_fn, metrics = [auc_metric])
    return net

In [None]:
# Cross-validated training: for every fold, scale the features, append KMeans
# transform columns, train a fresh network, record its validation AUC and keep
# its predictions on the test set.
preds = []   # one test-set prediction array per fold
scores = []  # validation AUC of each fold
tf.random.set_seed(11)
for i, (t, v) in enumerate(skf.split(train, y)) :
    print('='*40, f'FOLD {i}', '='*40)
    xtrain = train.iloc[t, :].reset_index(drop = True)
    xval = train.iloc[v, :].reset_index(drop = True)
    xtest = test.copy()
    # NOTE(review): `y[t]` indexes by label; it matches the positional split
    # only if `y` still has its default integer index — confirm.
    ytrain = y[t]
    yval = y[v]
    
    # Scaler statistics come from the training fold only.
    scaler = StandardScaler()
    xtrain[feat] = scaler.fit_transform(xtrain[feat])
    xval[feat] = scaler.transform(xval[feat])
    xtest[feat] = scaler.transform(xtest[feat])
    
    kmeans = KMeans(n_clusters = n_clusters, random_state = 11)

    # The KMeans transform output becomes `n_clusters` extra feature columns
    # (presumably distances to the centroids — confirm for the backend in use).
    xtrain_cluster = pd.DataFrame(kmeans.fit_transform(xtrain[feat]), columns = [f'cluster{f}' for f in range(n_clusters)])
    xval_cluster = pd.DataFrame(kmeans.transform(xval[feat]), columns = [f'cluster{f}' for f in range(n_clusters)])
    xtest_cluster = pd.DataFrame(kmeans.transform(xtest[feat]), columns = [f'cluster{f}' for f in range(n_clusters)])
    
    # Join the cluster columns onto the scaled features and downcast dtypes.
    xtrain = reduce_memory_usage(pd.concat([xtrain, xtrain_cluster], axis = 1))
    xval = reduce_memory_usage(pd.concat([xval, xval_cluster], axis = 1))
    xtest = reduce_memory_usage(pd.concat([xtest, xtest_cluster], axis = 1))
    
#    lgr = LogisticRegression()
#    lgr.fit(xtrain, ytrain)
#    xtrain['lgr_pred'] = lgr.predict(xtrain)
#    xval['lgr_pred'] = lgr.predict(xval)
#    xtest['lgr_pred'] = lgr.predict(xtest)
    
    del xtrain_cluster, xval_cluster, xtest_cluster, kmeans
    
    # build_model only reads the frame's shape to size the input layer.
    model = build_model(xtest)
    # All three callbacks track validation AUC (higher is better): stop after 8
    # stagnant epochs, cut the learning rate after 4, and keep the best weights.
    cb = [tf.keras.callbacks.EarlyStopping(patience = 8,
                                       mode = 'max',
                                       monitor = 'val_auc'),
          tf.keras.callbacks.ReduceLROnPlateau(patience = 4,
                                           mode = 'max',
                                           monitor = 'val_auc',
                                           factor = 0.08),
          tf.keras.callbacks.ModelCheckpoint(f'best_model{i}.h5',
                                         monitor = 'val_auc',
                                         mode = 'max',
                                         save_best_only = True,
                                         save_weights_only = True)]
    history = model.fit(xtrain, ytrain,
                    validation_data = (xval, yval),
                    epochs = 50,
                    batch_size = 256,
                    verbose = 0,
                    callbacks = cb)
    # Plot every series recorded in the training history for this fold.
    plt.figure(figsize = (12, 7))
    sns.lineplot(data = history.history)
    plt.show()
    # Restore the checkpointed best weights before scoring and predicting.
    model.load_weights(f'best_model{i}.h5')
    loss, auc = model.evaluate(xval, yval)
    print(i, ":", auc)
    scores.append(auc)
    pred = model.predict(xtest)
    preds.append(pred)
    # Free per-fold objects and reset Keras state before the next fold.
    del xtrain, xval, xtest, ytrain, yval, model, cb, history
    tf.keras.backend.clear_session()
    gc.collect()
    print('='*40, f'FOLD {i}', '='*40)
    print('')

In [None]:
# Mean and standard deviation of the per-fold validation AUCs.
print(np.mean(scores), np.std(scores))

In [None]:
def better_than_median(inputs, axis):
    """Blend predictions with the mean where they agree and the median where they don't.

    For each slice along ``axis`` the spread (max - min) is computed. Slices
    whose spread is strictly below the median spread are treated as inliers
    and averaged; every other slice is treated as an outlier and reduced with
    the median. A three-line summary of the counts is printed.

    Parameters
    ----------
    inputs : ndarray
        Predictions, e.g. shape (n_samples, n_folds).
    axis : int
        Axis to reduce over (1 for the shape above).

    Returns
    -------
    ndarray
        ``inputs`` reduced along ``axis``.
    """
    spread = inputs.max(axis=axis) - inputs.min(axis=axis)
    spread_lim = np.median(spread)
    # Compute the mask once so the printed counts and the result cannot diverge.
    is_inlier = spread < spread_lim
    n_inliers = int(np.sum(is_inlier))
    # Count reduced entries rather than len(inputs), which is only right for axis=1.
    n_total = int(np.size(spread))
    print(f"Inliers:  {n_inliers:7} -> compute mean")
    print(f"Outliers: {n_total - n_inliers:7} -> compute median")
    print(f"Total:    {n_total:7}")
    return np.where(is_inlier,
                    np.mean(inputs, axis=axis),
                    np.median(inputs, axis=axis))

In [None]:
# Stack the per-fold test predictions side by side, blend them row-wise and
# write the submission file.
# NOTE: `preds` is rebound from the list of fold outputs to the blended array,
# so re-running this cell alone operates on the already blended values.
preds = np.column_stack(preds)
preds = better_than_median(preds, axis = 1)
sample['target'] = preds
sample.to_csv('submission.csv', index = False)
display(sample)