In [None]:
import datatable as dt
import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
%%time
train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')

memory_usage = train.memory_usage(deep=True) / 2 ** 11
start_memory = memory_usage.sum()

In [None]:
# Shrink the training/test frames by down-casting float64 -> float32 and
# int64 -> uint8, then report how much memory the training frame saved.

# Every column except the first and the last one of `train`.
feature_cols = train.columns.tolist()[1:-1]
con_features = train.select_dtypes(include = 'float64').columns.tolist()
# The [1:-1] slice drops the first and last int64 columns.
# NOTE(review): presumably these are `id` and `target` -- confirm the column order.
cat_features = train.select_dtypes(include = 'int64').columns.tolist()[1:-1]

BYTES_PER_MB = 2 ** 20

# The baseline is measured here, immediately before the cast, so that it is
# guaranteed to be expressed in the same unit (MB) as `end_memory` below.
start_memory = train.memory_usage(deep=True).sum() / BYTES_PER_MB

train[con_features] = train[con_features].astype('float32')
# NOTE(review): astype('uint8') wraps silently for values outside 0..255 --
# confirm that every column in cat_features fits that range.
train[cat_features] = train[cat_features].astype('uint8')

test[con_features] = test[con_features].astype('float32')
test[cat_features] = test[cat_features].astype('uint8')

memory_usage = train.memory_usage(deep=True) / BYTES_PER_MB
end_memory = memory_usage.sum()

# All three fields use two decimals ('{:2f}' is a minimum *width* of 2 and
# would print the default six decimals).
print('Memory usage decreased from {:.2f} MB to {:.2f} MB ({:.2f} % reduction)'.format(start_memory, end_memory, 100 * (start_memory - end_memory) / start_memory))

In [None]:
# Build the model matrices and standardise every feature column.
# `StandardScaler` is imported with the other dependencies at the top of the notebook.

# drop() already returns a new frame, so no extra .copy() is needed.
X = train.drop(columns=["id", "target"])
y = train["target"].copy()
X_test = test.drop(columns=["id"])

# Fit the scaler on the training features only and reuse the fitted scaler
# for the test features.
scaler = StandardScaler()

# Keep the original row index on the scaled frames so that X stays aligned
# with y (and X_test with test) for label-based operations as well.
X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns, index=X.index)
X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Base keyword arguments passed to every LGBMClassifier built in this notebook.
# `learning_rate` is intentionally absent: the training cell supplies it per stage.
# NOTE(review): subsample_for_bin is 2000 here, while the disabled Optuna search
# in this notebook mentions 200000 -- confirm that 2000 is intended.
params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=8,
    n_estimators=1000,
    subsample_for_bin=2000,
    min_split_gain=0.0,
    min_child_weight=1e-3,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=0,
)

In [None]:
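# NOTE: disabled Optuna hyper-parameter search; every line of this cell is commented out, so nothing here runs.
# def objective(trial):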
    
#     kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=786)

#     scores = []
    
#     params = {
#         'device_type':'gpu',
#         'boosting_type':'dart', # 'dart' accuracy, 'gbdt' speed
#         'num_leaves':trial.suggest_int("num_leaves", 20, 3000, step=20), # overfitting, accuracy
#         'max_depth': trial.suggest_int("max_depth", 3, 12), # overfitting
#         'n_estimators':trial.suggest_int("n_estimators", 100, 10000), 
# #         'subsample_for_bin':200000, 
# #         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
# #         "max_bin": trial.suggest_int("max_bin", 200, 300), # accuracy
# #         "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5), # overfitting
# #         "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5), # overfitting
# #         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15), # regularization
# #         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
# #         "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
# #         "feature_fraction": trial.suggest_float(
# #             "feature_fraction", 0.2, 0.95, step=0.1
# #         ),
#         'min_split_gain':0.0, 
#         'min_child_weight':trial.suggest_int('min_child_weight', 1, 100), 
#         'min_child_samples':20, 
#         'subsample':1.0, 
#         'subsample_freq':0, 
#         'colsample_bytree':trial.suggest_float("colsample_bytree", 1e-8, 1), 
#         'reg_alpha':trial.suggest_float("reg_alpha", 1e-8, 1, log=True), 
#         'reg_lambda':trial.suggest_int("reg_lambda", 1, 100), 
# #         'random_state':2021, 
# #         'n_jobs':- 1, 
# #         'silent':'warn'

# #         'max_depth': trial.suggest_int("max_depth", 1, 20),
# #         'n_estimators': trial.suggest_int("n_estimators", 200, 10000),
# #         'subsample': trial.suggest_float("subsample", 0.2, 1),
# #         'colsample_bytree': trial.suggest_float("colsample_bytree", 1e-8, 1),
# #         'colsample_bylevel': trial.suggest_float("colsample_bylevel", 1e-8, 1),
# #         'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
# #         'reg_lambda': trial.suggest_int("reg_lambda", 1, 100),
# #         'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 1, log=True),
# #         'gamma': trial.suggest_float("gamma", 0, 1),
# #         'booster': 'gbtree',
# #         'eval_metric': 'auc',
# #         'tree_method': 'gpu_hist',
# #         'predictor': 'gpu_predictor',
# #         'use_label_encoder': False
#     }
    
    
#     for fold, (idx_train, idx_valid) in enumerate(kf.split(X.iloc[:10000], y[:10000])):
        
        
#         X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
#         X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

#         params['learning_rate']=trial.suggest_categorical(
#             'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
#         model1 = LGBMClassifier(objective="binary", **params)

#         model1.fit(X_train,y_train,
#                   eval_set=[(X_train, y_train),(X_valid,y_valid)],
# #                   early_stopping_rounds=200,
#                   verbose=False)

# #         params['learning_rate']=trial.suggest_categorical(
# #             'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
# #         model2 = LGBMClassifier(objective='binary', **params)

# #         model2.fit(X_train,y_train,
# #                   eval_set=[(X_train, y_train),(X_valid,y_valid)],
# #                   early_stopping_rounds=200,
# #                   verbose=False,
# #                   init_model=model1)

# #         params['learning_rate']=trial.suggest_categorical(
# #             'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
# #         model3 = LGBMClassifier(objective='binary', **params)

# #         model3.fit(X_train,y_train,
# #                   eval_set=[(X_train, y_train),(X_valid,y_valid)],
# #                   early_stopping_rounds=200,
# #                   verbose=False,
# #                   init_model=model2)

# #         params['learning_rate']=trial.suggest_categorical(
# #             'learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
# #         model4 = LGBMClassifier(objective='binary', **params)

# #         model4.fit(X_train,y_train,
# #                   eval_set=[(X_train, y_train),(X_valid,y_valid)],
# #                   early_stopping_rounds=200,
# #                   verbose=False,
# #                   init_model=model3)

#         pred_valid = model1.predict_proba(X_valid)[:,1]
#         fpr, tpr, _ = roc_curve(y_valid, pred_valid)
#         score = auc(fpr, tpr)
#         scores.append(score)

#         print(f"Fold: {fold + 1} Score: {score}")

#     print(f"Overall Validation Score: {np.mean(scores)}")
#     return np.mean(scores)
    
    
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# print('Number of finished trials: {}'.format(len(study.trials)))

# print('Best trial:')
# trial = study.best_trial

# print('  Value: {}'.format(trial.value))

# print('  Params: ')
# for key, value in trial.params.items():
#     print('     {}: {}'.format(key, value))

In [None]:
# Select a tf.distribute strategy: a TPU strategy when a TPU can be resolved,
# otherwise TensorFlow's current default strategy.
#
# Both `strategy` and `tpu_strategy` are bound on either path, so code that
# refers to either name works with or without a TPU. (Previously the fallback
# only bound `strategy`, leaving `tpu_strategy` undefined.)
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    strategy = tpu_strategy
    # 64 samples per replica.
    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")

except ValueError:
    # No TPU could be resolved: fall back to the default strategy.
    strategy = tf.distribute.get_strategy()
    tpu_strategy = strategy
    BATCH_SIZE = 512
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")

In [None]:
# Stratified K-fold training of a staged LightGBM classifier.
#
# For every fold, one model is trained per entry in LEARNING_RATES; each stage
# is passed the previous stage's model as `init_model`. The final stage is used
# both for the fold's validation AUC and for the fold's test predictions (the
# test predictions used to come from the third stage, which did not match the
# model behind the reported score).
#
# This cell is no longer wrapped in a tf.distribute strategy scope: nothing in
# it builds TensorFlow objects, and the scope made the cell fail with a
# NameError whenever no TPU strategy had been created.

N_FOLDS = 7
CV_SEED = 786
N_TRAIN_ROWS = 10000  # leading rows used for cross-validation; None uses every row
LEARNING_RATES = (0.05, 0.01, 0.007, 0.001)  # one training stage per value, in order

X_cv, y_cv = X.iloc[:N_TRAIN_ROWS], y.iloc[:N_TRAIN_ROWS]

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=CV_SEED)

test_preds = []
scores = []

for fold, (idx_train, idx_valid) in enumerate(kf.split(X_cv, y_cv)):

    X_train, y_train = X_cv.iloc[idx_train], y_cv.iloc[idx_train]
    X_valid, y_valid = X_cv.iloc[idx_valid], y_cv.iloc[idx_valid]

    model = None  # the first stage starts from scratch (init_model=None)
    for stage, learning_rate in enumerate(LEARNING_RATES, start=1):
        # Build the stage's arguments without mutating the shared `params`
        # dict, so re-running this cell always starts from the same settings.
        stage_model = LGBMClassifier(**{**params, 'learning_rate': learning_rate})

        print(f'Processing Model{stage} ...')
        # NOTE(review): the `verbose` argument of fit() is not accepted by
        # newer LightGBM releases -- confirm against the installed version.
        stage_model.fit(X_train, y_train,
                        eval_set=[(X_valid, y_valid)],
                        eval_metric='auc',
                        verbose=False,
                        init_model=model)
        model = stage_model

    pred_valid = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('Predicting test data ...')

    # Predict with the same (final-stage) model that produced the score above.
    test_preds.append(model.predict_proba(X_test)[:, 1])

print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Blend the folds by arithmetic mean: one column per fold, averaged row-wise,
# then written out as the mean-blend submission file.
fold_columns = np.column_stack(test_preds)
predictions = fold_columns.mean(axis=1)

sample_submission['target'] = predictions
sample_submission.to_csv('lgbm_sub_mean.csv', index=False)
sample_submission.head()

In [None]:
# Blend the folds by median: one row per fold, median taken down each column,
# then written out as the median-blend submission file.
fold_rows = np.vstack(test_preds)
predictions = np.median(fold_rows, axis=0)

sample_submission['target'] = predictions
sample_submission.to_csv('lgbm_sub_median.csv', index=False)
sample_submission.head()