In [None]:
!pip install pytorch-tabnet

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from skimage.filters import threshold_otsu
from tqdm import tqdm
import gc

# Notebook-wide random seed; pass this to any seeded call (data splits,
# models) so that runs are repeatable.
SEED = 0

In [None]:
# Read the competition train/test tables, using the `id` column as the index
# of each frame.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    )
)

## contribution 1
I inspected the distribution of each feature and labelled it as either unimodal with a single sharp peak ("pointy") or bimodal.
The features are then split into these two groups, and the same row-wise summary statistics (sum, std, mean, max, min) are engineered separately for each group.

In [None]:
# Hand-picked indices of the features treated as "pointy" (single-peaked).
pointy = [0,2,4,9,12,16,19,20,21,23,24,27,28,30,31,32,33,35,39,42,44,46,48,49,51,52,53,56,58,59,60,61,62,63,64,68,69,72,73,75,76,78,79,81,83,84,87,88,89,90,92,93,94,95,98,99]

# Every remaining index in 0..99 is treated as bimodal. Both lists are turned
# into column names of the form "f<index>"; `bimodal` is built first because
# it still needs the integer version of `pointy`.
bimodal = [f"f{idx}" for idx in range(100) if idx not in pointy]
pointy = [f"f{idx}" for idx in pointy]

# Names of all training columns whose first character is "f".
features = [column for column in train.columns.values if column[0] == "f"]

In [None]:
def create_features(df, cols, prefix='new_'):
    """Append row-wise summary statistics of ``cols`` to ``df``.

    Five columns are written into ``df`` in place, named ``prefix`` followed
    by ``sum``, ``std``, ``avg``, ``max`` and ``min`` (created in that order).
    Each is the matching pandas reduction taken across ``cols`` with
    ``axis=1``; ``avg`` holds the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from and to extend; it is modified in place.
    cols : list
        Labels of the columns to aggregate over.
    prefix : str, default 'new_'
        Text prepended to each generated column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # (column suffix, DataFrame reduction name) pairs in output-column order.
    reductions = (
        ('sum', 'sum'),
        ('std', 'std'),
        ('avg', 'mean'),
        ('max', 'max'),
        ('min', 'min'),
    )
    for suffix, method_name in reductions:
        # The source columns are re-selected on every pass so each statistic
        # is computed from the current contents of df.
        row_stat = getattr(df[cols], method_name)(axis=1)
        df[prefix + suffix] = row_stat

    return df

In [None]:
# Add the row-wise statistics of the pointy group and then of the bimodal
# group, first to the training frame and then to the test frame.
train, test = (
    create_features(create_features(frame, pointy, 'point_'), bimodal, 'bimodal_')
    for frame in (train, test)
)

## contribution 2
The bimodal features clearly influence the target. For each of them we can create a boolean feature that indicates which of the two peaks a value falls under, using a threshold computed from the training data.

See: https://www.kaggle.com/realtimshady/eda-feature-exploration

In [None]:
def check_peak(df, test_df, cols, suffix='_peak'):
    """Add boolean "above the Otsu threshold" columns for each of ``cols``.

    For every column an Otsu threshold is computed from ``df`` only. That same
    threshold is then applied to both ``df`` and ``test_df``, so the test
    frame is never used to choose the split point.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the thresholds are computed from; modified in place.
    test_df : pandas.DataFrame
        Second frame that receives the same columns; modified in place.
    cols : iterable
        Labels of the columns to threshold. Each must exist in both frames.
    suffix : str, default '_peak'
        Text appended to ``str(col)`` to name the new boolean column.

    Returns
    -------
    None
        Both frames are updated in place.
    """
    for col in cols:
        # threshold_otsu is documented to take an ndarray, so hand it the raw
        # values rather than the pandas Series.
        peak = threshold_otsu(df[col].to_numpy())
        flag_name = str(col) + suffix
        df[flag_name] = df[col] > peak
        test_df[flag_name] = test_df[col] > peak

In [None]:
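# Disabled: uncomment to add one boolean `<col>_peak` column per bimodal feature to both train and test.
#check_peak(train, test, bimodal)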

In [None]:
# Preview the first rows of the test frame, which at this point also carries
# the engineered point_* / bimodal_* columns.
test.head()

In [None]:
# Separate the label column from the predictors; the test frame is used as-is.
y = train["target"]
X = train.drop(columns=["target"])
X_test = test

In [None]:
# Learn the per-column min/max from the training predictors only, then apply
# that one fitted scaling to both the training and the test predictors.
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

In [None]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch
import shutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Hold out a stratified 20% validation split. The notebook-level SEED is used
# instead of a second hard-coded seed so there is one place to change it.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

In [None]:
# load_model is given a .zip file, so the pretrained-model directory is first
# packed into ./tabnet_model.zip in the working directory.
# NOTE(review): presumably the input directory holds the unzipped contents of
# a saved TabNetPretrainer — confirm.
shutil.make_archive(
    base_name='tabnet_model',
    format='zip',
    root_dir='../input/nov-tabnet-pretrain/test_pretrain2',
)

# Restore the pretrainer from the archive that was just created.
loaded_pretrain = TabNetPretrainer()
loaded_pretrain.load_model('./tabnet_model.zip')

In [None]:
# Define the classifier.
clf1_nopreproc = TabNetClassifier(
    n_steps=4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    # StepLR multiplies the learning rate by `gamma` every `step_size`
    # scheduler steps.
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 2, "gamma": 0.85},
    mask_type='entmax',  # alternative: "sparsemax"
)

# Fit the classifier, starting from the pretrained (unsupervised) weights.
# pytorch-tabnet documents the targets as numpy arrays, so the pandas Series
# coming out of train_test_split are converted explicitly; np.asarray is a
# no-op if they are already arrays.
clf1_nopreproc.fit(
    x_train, np.asarray(y_train),
    eval_set=[(x_val, np.asarray(y_val))],
    eval_name=['valid'],
    eval_metric=['auc'],
    # Training stops early once validation AUC has not improved for
    # `patience` consecutive epochs.
    max_epochs=1000, patience=30,
    batch_size=8192, virtual_batch_size=4096,
    num_workers=0,
    weights=1,  # 1 = sample classes with inverse-frequency weights
    drop_last=False,
    from_unsupervised=loaded_pretrain
)

In [None]:
# Keep only column 1 of the class-probability matrix — presumably
# P(target == 1); confirm the class order if the labels are not 0/1.
preds = clf1_nopreproc.predict_proba(X_test)[:,1]

In [None]:
submission = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv', index_col='id')

# Attach predictions by id rather than by position: `preds` is in the row
# order of `test`, so wrapping it in a Series indexed by test.index lets
# pandas align it to the submission rows even if their order differs.
submission['target'] = pd.Series(preds, index=test.index)

# An id in the sample file with no matching test row would become NaN; fail
# loudly instead of writing an incomplete submission.
if submission['target'].isna().any():
    raise ValueError("sample_submission contains ids with no matching prediction from the test set")

submission.to_csv('submission.csv')