# Libraries and Data import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_squared_error, roc_auc_score 
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, train_test_split

# Notebook display preferences: limit how many rows/columns are rendered,
# never truncate cell contents, and print floats with five decimals.
pd.options.display.max_rows = 30
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = None
pd.options.display.float_format = lambda value: '%.5f' % value

# Silence every UserWarning for the rest of the session.
import warnings
warnings.simplefilter('ignore', UserWarning)

In [None]:
%%time
# Name of the label column in the training frame.
TARGET = "target"

# Read the test and training frames from their Feather files.
test = pd.read_feather('../input/oct21tp-feathercreator/Oct21TP_test.feather')
train = pd.read_feather('../input/oct21tp-feathercreator/Oct21TP_train.feather')

In [None]:
# Move the "id" column out of the columns and into the row index of both frames.
train, test = (frame.set_index("id") for frame in (train, test))
# Label column of the full training frame.
target = train[TARGET]

In [None]:
# Feature-engineering setup.
# `feature_generator` (defined further down in this cell) generates new
# features by several statistic methods:
#     dataset: the chosen dataset
#     numerical_features: the numerical features in a list
#     categorical_features: the categorical features in a list
# It returns None.

# Two-row samples are enough to inspect column names and dtypes.
train_head = train.head(2)
test_head = test.head(2)

# Split the predictors by dtype: boolean columns versus everything else.
# select_dtypes is applied to the target-free frame directly, so no boolean
# mask has to be re-aligned against a differently indexed (transposed) frame.
bool_features = train_head.drop(columns=TARGET).select_dtypes(include="bool").columns.tolist()
numeric_features = train_head.drop(columns=[TARGET, *bool_features]).columns.tolist()

def feature_generator(dataset, numerical_features, categorical_features):
    """Generate new features from row-wise statistics, modifying ``dataset`` in place.

    Columns added:
        n_min, n_max, n_std, n_mean: minimum, maximum, standard deviation and
            mean across ``numerical_features`` for each row.
        c_sum: sum across ``categorical_features`` for each row.

    Args:
        dataset: The DataFrame to extend (new columns are assigned onto it).
        numerical_features: The numerical feature names in a list.
        categorical_features: The categorical feature names in a list.

    Returns:
        None. A progress message is printed after each statistic.
    """
    # Slice the numerical columns once instead of re-selecting (and copying)
    # them for every statistic; the new columns are never part of this slice.
    numerical = dataset[numerical_features]

    # Numerical features
    dataset['n_min'] = numerical.min(axis=1)
    print("min done")
    dataset['n_max'] = numerical.max(axis=1)
    print("max done")
    dataset['n_std'] = numerical.std(axis=1)
    print("std done")
    dataset['n_mean'] = numerical.mean(axis=1)
    print("mean done")

    # Categorical features
    dataset['c_sum'] = dataset[categorical_features].sum(axis=1)
    print("cat sum done")
    # A row-wise mode ('c_mode') is not generated: DataFrame.mode(axis=1)
    # returns a DataFrame rather than a single Series, so it cannot be
    # assigned to one column as-is.

# Names of the engineered columns that feature_generator actually creates.
# 'c_mode' is left out because the generator never produces that column, so
# indexing a frame with it would raise a KeyError.
new_features = ['n_min', 'n_max', 'n_std', 'n_mean', 'c_sum']

# Extend both frames in place with the row-wise statistics.
feature_generator(train, numeric_features, bool_features)
feature_generator(test, numeric_features, bool_features)

In [None]:
# Hold out a stratified 25% of the training rows; the base models' predictions
# on these rows are later used to fit the ensemble blender.
RANDOM_SEED = 42

train, holdout = train_test_split(
    train,
    stratify=train[TARGET],
    test_size=0.25,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# Pull the integer labels out of each partition, then drop the label column
# so that only predictors remain.
target, oof_target = (part[TARGET].astype('int') for part in (train, holdout))
train, holdout = (part.drop(columns=TARGET) for part in (train, holdout))


In [None]:
# Prediction tables keyed by row id: one for the holdout rows, one for the
# test rows. Each model later adds its own probability column.
oof_df = pd.DataFrame({"id": holdout.index})
test_df = pd.DataFrame({"id": test.index})

# XGBoost hyper-parameters.
xgb_params = dict(
    random_state=0,
    n_estimators=8000,
    learning_rate=0.008,
    eval_metric="auc",
    objective="binary:logistic",
    use_label_encoder=False,
    booster="gbtree",
    # GPU
    gpu_id=0,
    tree_method="gpu_hist",
    predictor="gpu_predictor",
)

# Train on the non-holdout rows.
xgbc = XGBClassifier(**xgb_params)
xgbc.fit(train, target, verbose=False)
print("fitted")

In [None]:
# Store the second predict_proba column of the XGBoost model for the holdout
# and test rows, then report its AUC on the holdout labels.
oof_df["xgb"], test_df["xgb"] = (
    xgbc.predict_proba(rows)[:, 1] for rows in (holdout, test)
)
auc_xgbc = roc_auc_score(oof_target, oof_df["xgb"])
print(f'AUC: {auc_xgbc}')

In [None]:
# CatBoost hyper-parameters.
catb_params = dict(
    random_seed=0,
    iterations=8000,
    learning_rate=0.008,
    eval_metric="AUC",
    verbose=0,
    # GPU
    task_type="GPU",
    devices="0",
)

# Train on the non-holdout rows.
catbc = CatBoostClassifier(**catb_params)
catbc.fit(train, target, verbose=False)
print("fitted")

In [None]:
# Store the second predict_proba column of the CatBoost model for the holdout
# and test rows, then report its AUC on the holdout labels.
oof_df["ctb"], test_df["ctb"] = (
    catbc.predict_proba(rows)[:, 1] for rows in (holdout, test)
)
auc_catbc = roc_auc_score(oof_target, oof_df["ctb"])
print(f'AUC: {auc_catbc}')

In [None]:

# LightGBM hyper-parameters.
lgbc_params = {
    "n_estimators":8000, 
    "learning_rate":0.008, 
    "objective":'binary',                      
    "metric":'auc',                       
    "reg_alpha":10,
    "reg_lambda":0.1,                     
    "num_leaves":31,
    "max_depth":-1,
    "subsample":0.6,
    "subsample_freq":1, 
    "colsample_bytree":0.4,
    "min_child_weight":256,
    "min_child_samples":20, 
    "random_state":0,
    # GPU
    "device": "gpu"
}

lgbc = LGBMClassifier(**lgbc_params)

# Train on the non-holdout rows.
# NOTE(review): the `verbose` argument of fit() appears to have been removed in
# LightGBM 4.0 — confirm against the installed version before upgrading.
lgbc.fit(train, target, eval_metric='auc', verbose=-1)
print("fitted")

# Holdout probabilities (second predict_proba column) and their AUC.
oof_df["lgb"] = lgbc.predict_proba(holdout)[:,1]
auc_lgbc = roc_auc_score(oof_target, oof_df["lgb"])

# The "lgb" test column must come from the LightGBM model itself. It was
# previously filled from the CatBoost model, which duplicated the "ctb" column
# and made the test features inconsistent with the holdout features the
# blender is fitted on.
test_df["lgb"] = lgbc.predict_proba(test)[:,1]
print(f'AUC: {auc_lgbc}')

In [None]:
import optimizeauc as oa

In [None]:
# Create the blender object from the `optimizeauc` module (imported as `oa`).
# NOTE(review): presumably it searches for model weights that maximise AUC —
# confirm in that module.
opt = oa.OptimizeAUC()

In [None]:
# Fit `opt` on the base models' holdout predictions (id column removed)
# against the holdout labels.
opt.fit(oof_df.drop(columns="id"), oof_target)

In [None]:
# Apply the fitted `opt` to the base models' test-set predictions (id column removed).
predictions = opt.predict(test_df.drop(columns="id"))


In [None]:
# Assemble the submission table: one row per test id with its blended prediction.
submissions = pd.DataFrame({"id": test.index}).assign(target=list(predictions))

# Write the CSV without the row index, then preview the first rows.
submissions.to_csv('submission.csv', index=False, header=submissions.columns)
submissions.head()