In [None]:
import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics
import seaborn as sns
from functools import partial
import xgboost as xgb
from tqdm import tqdm
import datatable as dt

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Signed-integer columns are moved to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max (bounds inclusive). Float
    columns are moved to float16 or float32 when the column's range fits, and
    to float64 otherwise. Columns of any other dtype are left untouched.

    Note: only the value *range* is checked. Casting to float16/float32 keeps
    fewer significant digits than float64, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Pick the first (narrowest) integer type whose range covers the data.
            # If min/max are NaN (e.g. an empty column) nothing matches and the
            # column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type covers the
            # range (this also catches NaN/inf extrema, which fail the checks).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a zero starting footprint to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time
# Read each competition CSV with datatable, convert it to pandas and downcast
# its numeric columns. The `id` column is dropped from train and test; the
# sample-submission frame is kept whole.
train = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/train.csv')
    .to_pandas()
    .drop('id', axis=1)
)
test = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/test.csv')
    .to_pandas()
    .drop('id', axis=1)
)
sample = reduce_memory_usage(
    dt.fread('../input/tabular-playground-series-oct-2021/sample_submission.csv')
    .to_pandas()
)

In [None]:
# Positional indices of every boolean-typed column in the training frame.
bool_cols_train = [
    position
    for position, name in enumerate(train.columns)
    if train[name].dtypes == bool
]

print(bool_cols_train)

In [None]:
# Positional indices of every boolean-typed column in the test frame.
# The dtype is read from `test` itself, so the indices stay correct even if
# the train and test frames do not share identical column dtypes.
bool_cols_test = []
for i, col in enumerate(test.columns):
    if test[col].dtypes == bool:
        bool_cols_test.append(i)

print(bool_cols_test)

In [None]:
# Cast the boolean columns to 8-bit integers. int8 occupies one byte per
# value, whereas a plain `int` cast would widen each flag to the platform
# integer. Each column is replaced by label rather than written through
# `.iloc[:, cols] = ...`, so the new dtype is adopted regardless of how the
# installed pandas version treats positional in-place assignment.
for frame, positions in ((train, bool_cols_train), (test, bool_cols_test)):
    for position in positions:
        column = frame.columns[position]
        frame[column] = frame[column].astype(np.int8)

In [None]:
# Report the (rows, columns) shape of both frames as a quick sanity check.
print("Train set shape", train.shape, "\n", "Test set shape", test.shape)

In [None]:
# Separate the label vector from the feature matrix; both are taken as the
# frames' underlying array values.
y = train.loc[:, 'target'].values
X = train.drop(columns=['target']).values

In [None]:
# Drop the name `train` to reduce memory usage; the training data is
# accessed through the arrays X and y from here on.
del train

In [None]:
# Stratified K-fold cross-validation of an XGBoost classifier.
#
# For every fold the model is fitted on the training split with early stopping
# monitored on the validation split (AUC). The loop collects:
#   * out-of-fold predictions for every training row (preds_valid_array),
#   * test-set predictions averaged over the folds (preds_test_array),
#   * per-fold train/validation ROC-AUC scores.
# Finally the out-of-fold and test predictions are written to CSV files.
#
# NOTE(review): `tree_method='gpu_hist'` and passing `early_stopping_rounds`
# to `fit()` are tied to particular XGBoost releases — confirm against the
# installed version.

# Hyper-parameters shared by every fold; the commented-out entries are
# currently disabled.
params = {
    "grow_policy": "lossguide",
    "max_depth": 3,
    #"min_child_weight": 88.86018015023126,
    #"colsample_bytree": 0.6732013209557288,
    #"lambda": 0.25233965104214506
}


# KFold
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
scores_train = []
scores_valid = []
preds_valid_array = np.zeros((X.shape[0], ))
preds_test_array = np.zeros((test.shape[0], ))

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    print(f"Fold {fold+1} -------------->")
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    model = xgb.XGBClassifier(
        **params,
        eval_metric='auc',
        subsample=0.7,
        tree_method='gpu_hist',
        learning_rate=0.03,
        n_estimators=10000,
        objective='binary:logistic',
    )
    # n_estimators is only an upper bound: training stops once the validation
    # AUC has not improved for 50 consecutive rounds.
    model.fit(
        x_train, y=y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=100
    )

    # Probability of the positive class for each split.
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    preds_test = model.predict_proba(test)[:, 1]

    # Every row belongs to exactly one validation fold, so a plain assignment
    # fills the out-of-fold vector.
    preds_valid_array[valid_idx] = preds_valid
    # Accumulate the mean of the per-fold test predictions.
    preds_test_array += preds_test / n_splits

    score_train = metrics.roc_auc_score(y_train, preds_train)
    score_valid = metrics.roc_auc_score(y_valid, preds_valid)
    print(score_valid)
    scores_train.append(score_train)
    scores_valid.append(score_valid)

# ddof=1 -> sample standard deviation across the folds.
print('Mean train score =', np.mean(scores_train), 'STD train =', np.std(scores_train, ddof=1))
print('Mean valid score =', np.mean(scores_valid), 'STD valid =', np.std(scores_valid, ddof=1))

pd.DataFrame({'target': preds_valid_array}).to_csv('xgb_valid.csv', index=False)
# Replace the second column by label instead of writing through `.iloc`:
# `sample` was downcast by reduce_memory_usage, and a positional in-place write
# could try to squeeze the float64 predictions into that narrower dtype.
sample[sample.columns[1]] = preds_test_array
sample.to_csv('xgb_test.csv', index=False)