In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.impute import  SimpleImputer, MissingIndicator
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import RobustScaler, QuantileTransformer, KBinsDiscretizer, StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

import datatable as dt



In [None]:
# Choose the CSV locations: when the KAGGLE_CONTAINER_NAME environment variable
# is set, read from the competition input directory; otherwise read from the
# current working directory.
running_on_kaggle = 'KAGGLE_CONTAINER_NAME' in os.environ
if running_on_kaggle:
    train_csv = '../input/tabular-playground-series-sep-2021/train.csv'
    test_csv = '../input/tabular-playground-series-sep-2021/test.csv'
else:
    train_csv = './train.csv'
    test_csv = './test.csv'

# datatable parses the files; everything downstream works on pandas frames.
train = dt.fread(train_csv).to_pandas()
test = dt.fread(test_csv).to_pandas().set_index('id')

# Keep the target aside, then remove the target and the row id from the features.
y = train['claim']
train = train.drop(columns=['claim', 'id'])


In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric data to the narrowest dtype that covers its value range.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to shrink. A DataFrame is modified in place, column by column,
        and the same object is returned. A Series cannot change dtype in
        place, so a converted Series is returned instead.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The downcast data (same kind of object as the input).

    Notes
    -----
    Float columns are narrowed based on their min/max only, so casting to
    ``float16``/``float32`` can lose precision.
    """
    is_series = isinstance(df, pd.Series)

    def _megabytes(obj):
        # Series.memory_usage() is already a scalar; DataFrame returns per-column bytes.
        usage = obj.memory_usage()
        return (usage if is_series else usage.sum()) / 1024 ** 2

    start_mem = _megabytes(df)

    if is_series:
        # A Series has no `.columns`; downcast the values directly.
        df = _downcast_numeric(df)
    else:
        for col in df.columns:
            column = df[col]
            converted = _downcast_numeric(column)
            # The helper hands back the very same object when nothing changed.
            if converted is not column:
                df[col] = converted

    end_mem = _megabytes(df)

    if verbose:
        # Guard the ratio so a zero-sized input cannot raise ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df


def _downcast_numeric(values):
    """Return ``values`` cast to the smallest int/float dtype spanning its min and max.

    Non-numeric input (and integer input whose range cannot be determined,
    e.g. an empty Series) is returned unchanged, as the same object.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    col_type = values.dtype
    if col_type not in numerics:
        return values

    c_min = values.min()
    c_max = values.max()

    if str(col_type)[:3] == "int":
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            bounds = np.iinfo(candidate)
            # Inclusive: the limits themselves are representable in `candidate`.
            if bounds.min <= c_min and c_max <= bounds.max:
                return values.astype(candidate)
        return values

    for candidate in (np.float16, np.float32):
        bounds = np.finfo(candidate)
        # NaN or infinite extrema fail both comparisons and fall through to float64.
        if bounds.min <= c_min and c_max <= bounds.max:
            return values.astype(candidate)
    return values.astype(np.float64)

In [None]:
# Add the number of missing cells in each row as an extra feature on both frames.
for frame in (train, test):
    frame['missing'] = frame.isnull().sum(axis=1)

# Downcast the training features and the target (wrapped in a one-column frame).
# NOTE(review): only the training frame goes through reduce_memory_usage; `test`
# keeps its original dtypes -- confirm that this difference is intended.
X_train = reduce_memory_usage(train)
Y_train = reduce_memory_usage(y.to_frame())
X_train = X_train.copy()  # reduce fragmentation


## Preprocessing: impute missing values, then quantile-transform and bin the features for the NN; the per-row missing-value count is used as a feature

In [None]:

# Per-column preprocessing:
#   1. fill NaNs with the column median,
#   2. map each column onto a uniform distribution via its empirical quantiles,
#   3. cut the result into 128 equal-width bins encoded as ordinal values.
numeric_column_transform = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='median'),
    # QuantileTransformer estimates quantiles from a random subsample of large
    # inputs; pinning random_state makes the fitted quantiles repeatable.
    QuantileTransformer(n_quantiles=1000, output_distribution='uniform', random_state=42),
    KBinsDiscretizer(n_bins=128, encode='ordinal', strategy='uniform'),
)

# Kept as a FeatureUnion so further feature branches can be added alongside 'features'.
input_pipeline = FeatureUnion([
    ('features', numeric_column_transform),
])



In [None]:
# Fit the preprocessing on the training features and replace them with the
# transformed array. Re-running this cell alone would refit on the already
# transformed data, so re-run from the cell that builds X_train instead.
X_train = input_pipeline.fit_transform(X_train)

# Stratified 90/10 split; a fixed random_state keeps the hold-out set (and the
# metrics computed on it) identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    Y_train.values,
    shuffle=True,
    stratify=Y_train.values,
    test_size=0.1,
    random_state=42,
)

## Load pretrained model

In [None]:
# Location of the saved stacked Keras model (HDF5 file).
pretrained_model_path = '../input/tpspretrained/stacked_model.h5'
stacked_model = keras.models.load_model(pretrained_model_path)

In [None]:
def predict_stacked_model(model, inputX):
    """Predict with a stacked model by feeding the same data to every input.

    Parameters
    ----------
    model
        Model exposing ``inputs`` (a list with one entry per input) and
        ``predict``.
    inputX
        Feature array; the same object is passed to each model input.

    Returns
    -------
    Whatever ``model.predict`` returns for the replicated inputs.
    """
    # `model.inputs` is always a list, whereas `model.input` may be a single
    # tensor when the model has only one input, in which case len() on it does
    # not give the number of inputs.
    X = [inputX] * len(model.inputs)
    # verbose=0 keeps the progress bar out of the notebook output.
    return model.predict(X, verbose=0)

## Import IsotonicRegression and fit it on the training-set predictions against the training targets

In [None]:
from sklearn.calibration import calibration_curve
from sklearn.isotonic import IsotonicRegression as IR


# Raw model outputs for every (transformed) training row.
p_train = predict_stacked_model(stacked_model, X_train)

# Isotonic regression expects 1-D inputs, so flatten predictions and targets.
scores_flat = p_train.flatten()
labels_flat = Y_train.values.flatten()

# Fit the monotone calibration map; inputs outside the fitted range are clipped.
# NOTE(review): the calibrator is fit on all of X_train, which still contains
# the rows later used as the x_test hold-out -- confirm this is intended.
ir = IR(out_of_bounds='clip').fit(scores_flat, labels_flat)
p_calibrated = ir.transform(scores_flat)

## Plot the calibration curves before and after calibration

In [None]:
# Reliability diagram: one curve for the raw predictions, one for the
# isotonic-calibrated ones, both computed on the training rows with 20 bins.
fig, ax = plt.subplots()
for curve_label, probabilities in (("Not calibrated", p_train), ("Calibrated", p_calibrated)):
    fraction_of_positives, mean_predicted_value = calibration_curve(
        Y_train.values, probabilities, n_bins=20
    )
    ax.plot(mean_predicted_value, fraction_of_positives, "s-", label=curve_label)

# Diagonal reference: predicted value equals observed fraction of positives.
ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Fraction of positives")
ax.set_title("Calibration curve")
# Without a legend the curve labels are never displayed.
ax.legend()
plt.show()

## Let's check how ROC changed

In [None]:
# Score the hold-out split with the stacked model, then apply the fitted
# isotonic map to the flattened predictions.
# NOTE(review): `ir` was fit on predictions for all of X_train, from which
# x_test was split, so the calibrated score is not a clean hold-out estimate.
test_preds = predict_stacked_model(stacked_model, x_test)
calibrated_test_preds = ir.transform(test_preds.flatten())

auc_raw = roc_auc_score(y_test, test_preds)
auc_calibrated = roc_auc_score(y_test, calibrated_test_preds)
print(f'ROC AUC: {auc_raw}')
print(f'ROC AUC calibrated: {auc_calibrated}')

## Make predictions on test set and transform before submitting

In [None]:
# Run the competition test set through the already-fitted preprocessing, then
# score it with the stacked model; these are the raw, uncalibrated outputs.
transformed_x = input_pipeline.transform(test)
preds = predict_stacked_model(model=stacked_model, inputX=transformed_x)


In [None]:
# Map the raw test predictions through the fitted isotonic calibrator.
# `preds` is rebound here, so running this cell twice calibrates twice;
# re-run the prediction cell first.
raw_test_scores = preds.flatten()
preds = ir.transform(raw_test_scores)

In [None]:
# Assemble the submission: one row per test id with its calibrated prediction,
# written without the pandas index column.
submission = pd.DataFrame({'id': test.index, 'claim': preds.flatten()})
submission.to_csv('nn_residual_ensemble_calibrated.csv', index=False)