In [None]:
!pip install pytorch_tabnet 

# Import

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
np.random.seed(0)
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from sklearn.impute import SimpleImputer

import torch
# from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

%matplotlib inline

# Load data 

In [None]:
%%time
# Read the competition tables (train first, then test) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
%%time
# Store the target column as strings. Consequence: every label later read from
# `train.claim` is string-valued rather than numeric.
# NOTE(review): presumably done to keep `claim` out of numeric-only row
# statistics -- confirm, since `claim` is also moved into the index before
# those statistics are computed.
train['claim'] = train['claim'].astype(str)
## Row-wise summary features (feature engineering, not cleaning):
def get_stats_per_row(data):
    """Append per-row summary statistics of the numeric columns to ``data``.

    The statistics are computed from a single snapshot of the numeric columns
    taken *before* any new column is added. Re-selecting the numeric columns
    for every statistic (as was done previously) silently folds the already
    appended features (``n_missing``, ``max_row``, ...) into the later ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the raw features. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with these columns added: ``n_missing``,
        ``max_row``, ``min_row``, ``std_row``, ``mean_row``, ``row_nunique``,
        ``abs_max``, ``skew``, ``median`` and ``positive_counts``.
    """
    # select_dtypes returns a new frame, so columns added to `data` below
    # cannot leak into the statistics.
    numeric = data.select_dtypes(include='number')

    # Missing values are counted over every column, numeric or not.
    data['n_missing'] = data.isna().sum(axis=1)
    data['max_row'] = numeric.max(axis=1)
    data['min_row'] = numeric.min(axis=1)
    data['std_row'] = numeric.std(axis=1)
    data['mean_row'] = numeric.mean(axis=1)
    data['row_nunique'] = numeric.nunique(axis=1)
    data['abs_max'] = numeric.abs().max(axis=1)
    data['skew'] = numeric.skew(axis=1)
    data['median'] = numeric.median(axis=1)
    # NaN > 0 evaluates to False, so missing cells are not counted as positive.
    data['positive_counts'] = numeric.gt(0).sum(axis=1)
    ## https://www.kaggle.com/bernhardklinger/tps-lightgbm-feature-eng
    # Disabled extra feature from the notebook above:
    #     data['mean2'] = (numeric ** 2).mean(axis=1)
    return data
# Move the non-feature columns into the index so they are excluded from the
# row statistics, then restore them as ordinary columns afterwards.
# Only the key columns actually present in a frame are used: a test frame
# without the `claim` target would otherwise make set_index raise KeyError.
key_cols = ['claim', 'id']
train = get_stats_per_row(
    train.set_index([col for col in key_cols if col in train.columns])
).reset_index()
test = get_stats_per_row(
    test.set_index([col for col in key_cols if col in test.columns])
).reset_index()

# Every remaining column (raw features plus the new row statistics) is a model input.
features = [col for col in train.columns if col not in key_cols]

##ORIG:
# pipe = Pipeline([
#         ('imputer', SimpleImputer(strategy='median',missing_values=np.nan)),
#         ("scaler", QuantileTransformer(n_quantiles=200, output_distribution='normal'))
#         ])

# mine:
# Preprocessing: median imputation (add_indicator=True also emits
# missing-value indicator columns), followed by a quantile mapping of every
# column to a normal distribution.
preprocessing_steps = [
    (
        'imputer',
        SimpleImputer(missing_values=np.nan, strategy='median', add_indicator=True),
    ),
    (
        "scaler",
        QuantileTransformer(output_distribution='normal', n_quantiles=250),
    ),
]
pipe = Pipeline(preprocessing_steps)

# Fit on the training features only; the test features reuse the fitted transform.
X = pipe.fit_transform(train[features])
X_test = pipe.transform(test[features])

# Training

In [None]:
# Constructor arguments shared by the TabNetClassifier built for each fold.
tabnet_params = {
    'n_steps': 1,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 1e-2, 'weight_decay': 5e-4},
    # StepLR with step_size=1 multiplies the learning rate by gamma on every
    # scheduler step.
    'scheduler_params': {"step_size": 1, "gamma": 0.7},
    'scheduler_fn': torch.optim.lr_scheduler.StepLR,
    'mask_type': 'entmax',
    # NOTE(review): presumably logs every 5th epoch -- confirm against pytorch-tabnet.
    'verbose': 5,
}

In [None]:
# K-fold cross-validation: train one TabNet per fold and average the
# positive-class probabilities the fold models predict for the test set.
n_splits = 5
kf = KFold(n_splits=n_splits, random_state = 42, shuffle = True)

# Plain array of labels, so the positional indices produced by KFold are
# applied positionally regardless of the DataFrame's index.
labels = train['claim'].to_numpy()

# One accumulator slot per test row; sized from the data rather than hard-coded.
preds = np.zeros(X_test.shape[0])
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    # `test_index` holds this fold's held-out (validation) rows of the training data.
    print(20*"*")
    print("Fold {}:".format(fold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = labels[train_index], labels[test_index]

    clf = TabNetClassifier(**tabnet_params)
    clf.fit(
        X_train=X_train, y_train=y_train,
        # NOTE(review): pytorch-tabnet is understood to early-stop on the last
        # eval set listed ('valid') -- confirm against the library docs.
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
#         max_epochs= 10,  ##ORIG
        max_epochs= 14,
        patience=4,
        batch_size=1024*10, virtual_batch_size=128*10,
        num_workers=0,
        # NOTE(review): weights=1 presumably enables class-balanced sampling -- confirm.
        weights=1,
        drop_last=False
    )
    # Column 1 is taken as the positive-class probability; each fold
    # contributes an equal share of the final average.
    preds += clf.predict_proba(X_test)[:,1] / n_splits
    print(preds.shape)


In [None]:
# Write the averaged predictions into the second column of the sample
# submission and save it with `id` as the index column.
sub = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-sep-2021/sample_solution.csv"
)
sub.iloc[:, 1] = preds
sub = sub.set_index(keys='id')
sub.to_csv(path_or_buf='submission.csv')
