In [None]:
!pip install --upgrade scikit-learn

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from skimage.filters import threshold_otsu
from tqdm import tqdm
import gc

# Global random seed; passed as random_state to the MLPClassifier below.
SEED = 0

In [None]:
# Load the competition train and test tables, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    )
)

## contribution 1
I looked at whether each feature's distribution was unimodal (a single sharp "pointy" peak) or bimodal.
I then split the features according to which kind of distribution they follow and applied the same feature engineering to each group separately.

In [None]:
# Indices of the features judged "pointy" (unimodal); every other index in 0..99 is bimodal.
pointy = [
    0, 2, 4, 9, 12, 16, 19, 20, 21, 23, 24, 27, 28, 30,
    31, 32, 33, 35, 39, 42, 44, 46, 48, 49, 51, 52, 53, 56,
    58, 59, 60, 61, 62, 63, 64, 68, 69, 72, 73, 75, 76, 78,
    79, 81, 83, 84, 87, 88, 89, 90, 92, 93, 94, 95, 98, 99,
]
bimodal = [idx for idx in range(100) if idx not in pointy]

# Turn the integer indices into column names of the form f<index>.
pointy = ['f' + str(idx) for idx in pointy]
bimodal = ['f' + str(idx) for idx in bimodal]

# Names of all columns in train whose first character is "f" (the raw feature columns).
features = [name for name in train.columns.values if name[0] == "f"]

In [None]:
def create_features(df, cols, prefix='new_'):
    """Append row-wise summary statistics of ``cols`` to ``df`` in place.

    Six columns are added, in this order, each named ``prefix`` + suffix:
    ``abs_sum`` (sum of absolute values), ``sem`` (standard error of the
    mean), ``std``, ``avg`` (mean), ``max`` and ``min``. Every statistic is
    computed across the selected columns for each row (``axis=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : list
        Labels of the columns to aggregate.
    prefix : str, default 'new_'
        Prefix for the names of the added columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # (suffix, row-wise reducer) pairs; the order fixes the order of the new columns.
    reducers = (
        ('abs_sum', lambda block: block.abs().sum(axis=1)),
        ('sem', lambda block: block.sem(axis=1)),
        ('std', lambda block: block.std(axis=1)),
        ('avg', lambda block: block.mean(axis=1)),
        ('max', lambda block: block.max(axis=1)),
        ('min', lambda block: block.min(axis=1)),
    )
    for suffix, reduce_rows in reducers:
        # Re-select the columns on every pass, exactly as a sequence of
        # independent assignments would.
        df[prefix + suffix] = reduce_rows(df[cols])

    return df

In [None]:
# Add the row-wise summary statistics to each frame: first over the pointy
# feature group, then over the bimodal group.
train = create_features(
    create_features(train, pointy, 'point_'),
    bimodal,
    'bimodal_',
)
test = create_features(
    create_features(test, pointy, 'point_'),
    bimodal,
    'bimodal_',
)

## contribution 2
The bimodal features clearly influence the target, so for each of them we create a boolean feature indicating which of the two peaks a value falls under.

See: https://www.kaggle.com/realtimshady/eda-feature-exploration

In [None]:
def check_peak(df, test_df, cols, suffix='_peak'):
    """Add boolean "above the Otsu threshold" columns for each column in ``cols``.

    For every column, a threshold is computed with ``threshold_otsu`` from the
    values in ``df`` only, and the same threshold is then applied to both
    ``df`` and ``test_df``. Both frames are modified in place; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used to compute the thresholds; receives the new columns.
    test_df : pandas.DataFrame
        Second frame that receives the new columns, using ``df``'s thresholds.
    cols : iterable
        Labels of the columns to threshold; must be present in both frames.
    suffix : str, default '_peak'
        Appended to ``str(col)`` to name each new boolean column.
    """
    for col in cols:
        # threshold_otsu is documented for ndarray input, so hand it the raw
        # values rather than a pandas Series (which lacks parts of the ndarray API).
        peak = threshold_otsu(df[col].to_numpy())
        flag_name = str(col) + suffix
        df[flag_name] = df[col] > peak
        test_df[flag_name] = test_df[col] > peak

In [None]:
# Add one boolean `<feature>_peak` column per bimodal feature to train and test (in place).
# The thresholds are computed from train only and reused for test.
check_peak(train, test, bimodal)

In [None]:
# Preview the first rows of the test frame after the engineered columns were added.
test.head()

In [None]:
# Separate the label from the predictors; the test frame is used as-is.
y = train["target"]
X = train.drop(columns=["target"])
X_test = test

In [None]:
# Fit the scaler on the training features only, then apply the fitted
# transform to the test features (evaluated left to right: fit first).
scaler = RobustScaler()
X, X_test = scaler.fit_transform(X), scaler.transform(X_test)

In [None]:
# The remaining cells only use X, X_test and y, so drop the names of the source
# frames and the scaler, then trigger a garbage-collection pass.
del test, train, scaler
gc.collect()

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
%%time
model = MLPClassifier((5,128),random_state=SEED, verbose=1, warm_start=True, early_stopping=True)
model.fit(X, y)

gc.collect()

0.735633

In [None]:
# Keep column 1 of predict_proba, i.e. the probability of the second class in model.classes_
# (presumably target == 1 — confirm the labels are 0/1).
preds = model.predict_proba(X_test)[:,1]

In [None]:
# Read the sample submission from the same absolute input directory as the
# train/test CSVs, rather than a path relative to the current working directory.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv', index_col='id')
# preds is a positional array: this assumes the sample submission rows are in
# the same order as test.csv — TODO confirm.
submission['target'] = preds
submission.to_csv('submission.csv')