In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import lightgbm as lgbm
from IPython.display import display
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error
import optuna
from tensorflow import keras
import tensorflow as tf
from sklearn.preprocessing import normalize, RobustScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from catboost import CatBoostClassifier


import datatable as dt
pd.set_option('display.max_rows', None, 'display.max_columns', None)
import xgboost as xgb
from sklearn import model_selection, metrics
from functools import partial
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def reduce_memory_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified in place (columns are reassigned) and also returned.
    Only columns whose dtype is one of the signed-integer or float dtypes listed
    in ``numerics`` are touched; every other column is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.
    use_float16 : bool, default True
        When true, float columns whose range fits are stored as ``float16``.
        float16 keeps only ~3 significant decimal digits, so pass ``False``
        when that rounding matters; ``float32`` is then the narrowest float
        dtype considered.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # If no candidate matches (e.g. min/max are NaN), the column is kept.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or min/max are NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame with a zero starting footprint reports 0%
        # instead of nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# Validation-side prediction files of the base models (presumably out-of-fold
# predictions, going by the file names -- confirm against the dataset).
# Each valid_N is paired positionally with test_N when the frames are concatenated.
valid_1 = pd.read_csv('../input/tps10/catboost_valid.csv')
# This previously re-read catboost_valid.csv, duplicating the CatBoost column
# while the matching test column (test_2) comes from xgb_test.csv. Read the
# XGBoost validation file so train and test features line up.
# NOTE(review): assumes xgb_valid.csv exists next to xgb_test.csv -- confirm.
valid_2 = pd.read_csv('../input/tps10/xgb_valid.csv')
valid_3 = pd.read_csv('../input/tps10/oof_df.csv').drop('target', axis=1)
valid_4 = pd.read_parquet('../input/tps10/cat_oof.parquet')

# Test-side predictions of the same models; the 'id' column is dropped where present
# so only prediction columns remain.
test_1 = pd.read_csv('../input/tps10/catboost_test.csv').drop('id', axis=1)
test_2 = pd.read_csv('../input/tps10/xgb_test.csv').drop('id', axis=1)
test_3 = pd.read_csv('../input/tps10/test_df.csv')
test_4 = pd.read_csv('../input/tps10/cat_submission.csv').drop('id', axis=1)

In [None]:
# Place the validation prediction frames side by side, give the columns
# positional names (col_0, col_1, ...) so they are unique, downcast, and keep
# only the underlying array.
stacked_valid = pd.concat([valid_1, valid_2, valid_3, valid_4], axis=1)
stacked_valid.columns = ["col_" + str(position) for position in range(stacked_valid.shape[1])]
X = reduce_memory_usage(stacked_valid).values
del stacked_valid
del valid_1, valid_2, valid_3, valid_4

# Same treatment for the test prediction frames.
stacked_test = pd.concat([test_1, test_2, test_3, test_4], axis=1)
stacked_test.columns = ["col_" + str(position) for position in range(stacked_test.shape[1])]
test = reduce_memory_usage(stacked_test).values
del stacked_test
del test_1, test_2, test_3, test_4

In [None]:
# Only the label column is used from the training file, so parse just that
# column instead of loading the whole table and deleting it afterwards.
y = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/train.csv',
    usecols=['target'],
)['target'].values

# Submission template; later cells overwrite its second column with predictions.
sample = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')

In [None]:
# --- MLP meta-model: stratified K-fold training on the stacked predictions ---
n_splits = 5
BATCH_SIZE=1024
# Use n_splits here as well so the fold count and the test-averaging divisor
# below cannot drift apart.
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)
scores_train = []
scores_valid = []

# Out-of-fold predictions (one slot per training row) and fold-averaged test predictions.
preds_valid_array_mlp = np.zeros(X.shape[0])
preds_test_array_mlp = np.zeros(test.shape[0])

checkpoint_filepath = './my_best_model.hdf5'

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    
    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]
    
    # ExponentialDecay is a per-optimizer-step schedule: decay from 1e-4 by a
    # factor of 1e-5 over the number of steps in 400 epochs of this fold.
    steps_per_epoch = len(x_train) / BATCH_SIZE
    scheduler = tf.keras.optimizers.schedules.ExponentialDecay(1e-4, 400 * steps_per_epoch, 1e-5)
    
    #with tpu_strategy.scope():
    model = keras.models.Sequential([
        # shape must be a tuple; (n) without the trailing comma is just an int.
        keras.layers.Input(shape=(X.shape[1],)),
        keras.layers.Dense(512, activation='selu'),
        #keras.layers.Dropout(0.30),
        keras.layers.Dense(256, 'selu'),
        keras.layers.Dense(128, 'selu'),
        keras.layers.Dense(64, 'selu'),
        keras.layers.Dense(32, activation='selu'),
        keras.layers.Dense(16, activation='selu'),
        keras.layers.Dense(1, activation='sigmoid'),
    ])
    
    model.compile(
        # The schedule is handed to the optimizer so it is evaluated per training
        # step. It was previously wrapped in a LearningRateScheduler callback,
        # which calls its function once per epoch with the epoch index, so the
        # step-based decay was effectively never applied.
        optimizer=tf.keras.optimizers.Adam(learning_rate=scheduler),
        loss=tf.keras.losses.BinaryCrossentropy(),
        # Explicit name: without it Keras may auto-number the metric in later
        # folds (auc_1, auc_2, ...), and the checkpoint's 'val_auc' monitor would
        # then not be found.
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )
    
    # Keep the epoch with the highest validation AUC for this fold; a fresh
    # callback per fold restarts the "best so far" tracking.
    checkpoint = ModelCheckpoint(
          checkpoint_filepath, monitor='val_auc', verbose=1, save_best_only=True,
          save_weights_only=False, mode='max', save_freq='epoch',
          options=None
                  )
    
    model.fit(
        x_train, y_train,
        validation_data=(x_valid, y_valid),
        epochs=10,
        batch_size=BATCH_SIZE,
        callbacks=[checkpoint],
    )
    
    # Predict with the best checkpoint rather than the last-epoch weights.
    # compile=False: the model is only used for inference from here on.
    model = load_model(checkpoint_filepath, compile=False)

    preds_train = model.predict(x_train).reshape(-1,)
    preds_valid = model.predict(x_valid).reshape(-1,)
    preds_test = model.predict(test).reshape(-1,)

    preds_valid_array_mlp[valid_idx] += preds_valid
    preds_test_array_mlp += preds_test / n_splits

    try:
        score_train = metrics.roc_auc_score(y_train.reshape(-1,), preds_train)
        score_valid = metrics.roc_auc_score(y_valid.reshape(-1,), preds_valid)
    except ValueError as err:
        # Scoring stays best-effort (e.g. NaN predictions or a single-class
        # fold), but the failure is reported instead of silently skipped.
        print(f"Fold {fold+1}: ROC AUC could not be computed ({err})")
    else:
        print(score_valid)
        scores_train.append(score_train)
        scores_valid.append(score_valid)
    
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# Persist OOF predictions and the fold-averaged test predictions.
pd.DataFrame({'target': preds_valid_array_mlp}).to_csv('mlp_valid.csv', index=False)
sample.iloc[:, 1] = preds_test_array_mlp
sample.to_csv('mlp_test.csv', index=False)

In [None]:
# --- XGBoost meta-model: stratified K-fold training on the stacked predictions ---
params = {
        "grow_policy": "lossguide",
        "max_depth": 6,
        #"min_child_weight": 88.86018015023126,
        #"colsample_bytree": 0.6732013209557288,
        #"lambda": 0.25233965104214506
        }


# KFold
n_splits=5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
# Out-of-fold predictions and fold-averaged test predictions.
preds_valid_array_xgb = np.zeros((X.shape[0], ))
preds_test_array_xgb = np.zeros((test.shape[0], ))

# The test matrix does not change between folds, so cast it to float32 once
# rather than on every iteration.
test_f32 = np.float32(test)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    # Cast once per fold and use the same float32 arrays for fitting and predicting.
    x_train, y_train = np.float32(X[train_idx]), y[train_idx]
    x_valid, y_valid = np.float32(X[valid_idx]), y[valid_idx]

    model= xgb.XGBClassifier(
                               **params,
                               eval_metric='auc',
                               subsample=0.7,
                               tree_method='gpu_hist',
                               learning_rate=0.001,
                               n_estimators=10000,
                               objective='binary:logistic',
                                )
    # Stop when validation AUC has not improved for 50 rounds.
    model.fit(
            x_train, y=y_train,
            eval_set=[(x_valid, y_valid)],
            early_stopping_rounds=50,
            verbose=100
            )

    # Probability of the positive class.
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test_f32)[:, 1]
    
    preds_valid_array_xgb[valid_idx] += preds_valid
    preds_test_array_xgb += preds_test / n_splits
    
    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)
        
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# Persist OOF predictions and the fold-averaged test predictions.
pd.DataFrame({'target': preds_valid_array_xgb}).to_csv('xgb_valid.csv', index=False)
sample.iloc[:, 1] = preds_test_array_xgb
sample.to_csv('xgb_test.csv', index=False)

In [None]:
# --- CatBoost meta-model: stratified K-fold training on the stacked predictions ---
params = {
        "depth": 6,
        "grow_policy": "Depthwise",
        "l2_leaf_reg": 3.0,
        "random_strength": 1.0,
        }


# KFold
n_splits=5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
# Out-of-fold predictions and fold-averaged test predictions.
preds_valid_array_cb = np.zeros((X.shape[0], ))
preds_test_array_cb = np.zeros((test.shape[0], ))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # use_best_model / early_stopping_rounds are configured once here; they rely
    # on the eval_set handed to fit() below.
    model = CatBoostClassifier(
                           **params,
                           learning_rate=0.03,
                           iterations=10000,
                           loss_function='CrossEntropy',
                           eval_metric='AUC',
                           use_best_model=True,
                           early_stopping_rounds=100,
                           task_type='GPU'
                           )

    model.fit(
          x_train, y=y_train,
          #embedding_features=None,
          eval_set=[(x_valid, y_valid)],
          verbose=100
             )

    # Probability of the positive class.
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]
    
    preds_valid_array_cb[valid_idx] += preds_valid
    preds_test_array_cb += preds_test / n_splits
    
    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)
        
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# Persist OOF predictions and the fold-averaged test predictions.
pd.DataFrame({'target': preds_valid_array_cb}).to_csv('catboost_valid.csv', index=False)
sample.iloc[:, 1] = preds_test_array_cb
sample.to_csv('catboost_test.csv', index=False)

In [None]:
# --- LightGBM meta-model: stratified K-fold training on the stacked predictions ---
# Tuned hyperparameters. NOTE: they are currently NOT applied, because the
# `**params` line in the constructor below is commented out.
params = {
        "min_child_weight": 638.7295413674256,
        "num_leaves": 32,
        "reg_alpha": 0.7635991288488166,
        "reg_lambda": 93.08626337603258
        }

# construct the model
model= lgbm.LGBMClassifier(
                       #**params,
                       objective='binary',
                       metric='auc',
                       subsample=0.7,
                       # LightGBM only performs row subsampling (bagging) when the
                       # bagging frequency is non-zero; with the default of 0 the
                       # subsample=0.7 setting above is ignored.
                       subsample_freq=1,
                       learning_rate=0.03,
                       n_estimators=10000,
                       n_jobs=-1
                       )

# construct stratified K-fold cross validation
n_splits=5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

# lists collecting the per-fold ROC AUC scores
scores_train = []
scores_valid = []

# zero-initialised arrays for out-of-fold and fold-averaged test predictions
preds_valid_array_lgb = np.zeros((X.shape[0], ))
preds_test_array_lgb = np.zeros((test.shape[0], ))

# K-fold cross validation; the same estimator object is re-fitted on each fold
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    # fit the model, stopping once validation AUC stalls for 100 rounds
    model.fit(
            x_train, y_train,
            eval_set=[(x_valid,y_valid)],
            verbose=100,
            early_stopping_rounds=100
            )

    # predicted probability of the positive class
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]
    
    # store this fold's out-of-fold predictions and accumulate the test average
    preds_valid_array_lgb[valid_idx] += preds_valid
    preds_test_array_lgb += preds_test / n_splits
    
    # ROC AUC on the training and validation parts, to spot overfitting
    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    
    # print the fold score
    print(score_valid)
    
    # append the fold score
    scores_train.append(score_train)
    scores_valid.append(score_valid)

print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

# save the out-of-fold predictions and populate the submission dataframe
pd.DataFrame({'target': preds_valid_array_lgb}).to_csv('lgbm_valid.csv', index=False)
sample.iloc[:, 1] = preds_test_array_lgb
sample.to_csv('lgbm_test.csv', index=False)

In [None]:
# Overlay the prediction histograms of the four meta-models: out-of-fold
# predictions in the left panel, fold-averaged test predictions in the right.
plt.figure(figsize=(20, 10))

# Per-model drawing options, in the order the models are plotted.
hist_styles = [
    {},
    {'color': 'grey'},
    {'color': 'green', 'alpha': 0.3},
    {'color': 'red', 'alpha': 0.3},
]
panels = [
    ('OOF Preds', [preds_valid_array_mlp, preds_valid_array_xgb,
                   preds_valid_array_cb, preds_valid_array_lgb]),
    ('TEST Preds', [preds_test_array_mlp, preds_test_array_xgb,
                    preds_test_array_cb, preds_test_array_lgb]),
]

for position, (panel_title, prediction_sets) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for predictions, style in zip(prediction_sets, hist_styles):
        sns.histplot(predictions, kde=False, **style)
    # Legend labels are matched to the histograms by plotting order.
    plt.legend(['MLP', 'XGBoost', 'CatBoost', 'LightGBM'])
    plt.title(panel_title)

plt.show()