In [None]:
!pip install --upgrade scikit-learn-intelex --progress-bar off

In [None]:
# Just in case I want to use an algorithm supported by scikit-learn-intelex
# in the future
# patch_sklearn() is invoked here, ahead of every sklearn import further down
# in the notebook.
# NOTE(review): presumably it swaps supported scikit-learn estimators for the
# Intel-accelerated versions and has to stay before those imports to take
# effect -- confirm against the scikit-learn-intelex docs.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the training table, the test table and the submission template.
train_df, test_df, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

# The label is the 'target' column; the features are everything except the
# label and the 'id' column (the test table has no label to drop).
y = train_df['target'].copy()
X = train_df.drop(columns = ['target', 'id']).copy()
X_test = test_df.drop(columns = 'id').copy()

# Only the copies above are used from here on, so release the raw frames.
del test_df, train_df

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import QuantileTransformer

# Map every feature onto a normal output distribution before handing it to the
# Gaussian naive Bayes classifier.
# random_state is fixed because QuantileTransformer draws a random subsample of
# rows to estimate the quantiles whenever the data has more rows than its
# `subsample` setting; left unseeded, repeated runs of the notebook would give
# slightly different predictions. 2311 matches the seed of the CV splitter.
scaler = QuantileTransformer(output_distribution = 'normal', random_state = 2311)
clf = GaussianNB()

# Chain both steps so the transformer is always fitted on exactly the rows the
# classifier is fitted on.
model = make_pipeline(scaler, clf)

In [None]:
# Simple variation: one pipeline fitted on the complete training set.

model.fit(X, y)
# Keep only column 1 of predict_proba as the score to submit.
simple_submit = model.predict_proba(X_test)[:, 1]

In [None]:
# "Blended", inverted variation ("bootstrapped?")

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# 25 stratified folds; the shuffle is seeded so the partition is reproducible.
kf_splitter = StratifiedKFold(25, shuffle = True, random_state = 2311)

# Per-fold results, appended in fold order:
#   test_predictions   - column 1 of predict_proba for every row of X_test
#   validation_metrics - ROC AUC on the rows the fold's model was NOT fitted on
test_predictions = []
validation_metrics = []

# Deliberately invert train and validation indices.
# I want to train many models on many small subsets.
# split() yields (large part, small part) for each fold; unpacking the pair as
# (idx_val, idx_train) fits each model on the small part and scores it on the
# large one.
for fold, (idx_val, idx_train) in enumerate(kf_splitter.split(X, y)):
    print("==========", f"FOLD {fold}", "==========")

    # split() returns positional row numbers, so select with .iloc. Label-based
    # .loc / [] only give the same rows while the index is the default
    # 0..n-1 range, and would silently pick wrong rows (or raise) otherwise.
    X_train = X.iloc[idx_train, :]
    X_val = X.iloc[idx_val, :]

    y_train = y.iloc[idx_train]
    y_val = y.iloc[idx_val]

    # `model` is the shared pipeline object; it is refitted on this fold's rows.
    model.fit(X_train, y_train)

    val_prediction = model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_prediction)
    print("Validation AUC:", val_auc)

    validation_metrics.append(val_auc)

    # Keep this fold model's test-set scores for blending afterwards.
    test_prediction = model.predict_proba(X_test)[:, 1]
    test_predictions.append(test_prediction)

In [None]:
# Arrange the per-fold test predictions as columns: one row per test sample,
# one column per fold model.
preds = np.stack(test_predictions, axis = 1)

# Blend 1: plain average over the fold models.
uniform_blend = preds.mean(axis = 1)

# Blend 2: weight each fold model by its validation AUC minus 0.5, with the
# weights rescaled to sum to one.
auc_weighting = np.asarray(validation_metrics) - 0.5
auc_weighting /= auc_weighting.sum()

weighted_blend = preds.dot(auc_weighting)

In [None]:
# Out of curiosity: pairwise correlation between the three candidate
# submissions. Each candidate is a column, hence rowvar = False.
candidate_matrix = np.stack([simple_submit, uniform_blend, weighted_blend], axis = 1)
np.corrcoef(candidate_matrix, rowvar = False)

In [None]:
# Write one submission file per candidate, reusing the sample-submission frame
# as the template: its 'target' column is overwritten before each write.
for csv_name, target_values in (
    ('simple.csv', simple_submit),
    ('uniform_blend.csv', uniform_blend),
    ('weighted_blend.csv', weighted_blend),
):
    ss['target'] = target_values
    ss.to_csv(csv_name, index = False)