![](https://lightgbm.readthedocs.io/en/latest/_images/LightGBM_logo_black_text.svg)

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
import optuna
from optuna.integration import LightGBMPruningCallback
from lightgbm import LGBMClassifier

# Loading the data

In [None]:
# Read the competition's train, test and sample-submission tables in one pass;
# the three frames are unpacked in the same order as the paths are listed.
df_train, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

# Preprocessing

In [None]:
# Raw feature columns: every test-set column whose name contains the letter 'f'.
# NOTE(review): this is a substring match; presumably the columns are named
# f0..fN and nothing else contains an 'f' — confirm against the data.
base_features = [c for c in df_test.columns if 'f' in c]

# Row-wise aggregates. Both are computed from the raw features only: the
# previous version appended 'sum' to the feature list before taking the
# standard deviation, so 'std' was computed over the raw features *plus their
# own sum*, which lets the sum dominate the statistic.
for frame in (df_train, df_test):
    row_sum = frame[base_features].sum(axis=1)
    row_std = frame[base_features].std(axis=1)
    frame['sum'] = row_sum
    frame['std'] = row_std

# Final model inputs: raw features followed by the two engineered columns.
features = base_features + ['sum', 'std']

# Scale every model input; the scaler is fitted on the training rows and the
# same fitted transform is applied to the test rows.
# NOTE: scaling overwrites the columns in place, so re-running this cell
# without reloading the data would scale already-scaled values.
scaler = preprocessing.RobustScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Creating folds

In [None]:
# Start with a placeholder fold id; the loop below writes the real one.
df_train['kfold'] = -1

# Target and remaining columns, as passed to the stratified splitter.
y_train = df_train['target']
X_train = df_train.drop(columns='target')

# Five shuffled, stratified folds with a fixed seed.
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = skf.split(X_train, y_train)

# Tag each row with the number of the fold in which it is a validation row.
for fold, (train_rows, valid_rows) in enumerate(fold_splits):
    df_train.loc[valid_rows, 'kfold'] = fold

# Hyperparameter optimization with Optuna

In [None]:
# Disabled hyperparameter search, kept for reference only (every line below
# is commented out, so this cell does nothing when run).
# When enabled, it runs an Optuna study that maximizes the validation AUC of
# an LGBMClassifier trained on every fold except fold 0 and evaluated on
# fold 0, with early stopping and an Optuna pruning callback.
# NOTE(review): `trial.suggest_loguniform` is deprecated in recent Optuna
# releases in favour of `suggest_float(..., log=True)` — confirm against the
# installed version before re-enabling.
# NOTE(review): `bagging_freq` is searched over the single value 1, i.e. it
# is effectively a constant.
# NOTE(review): the training cell uses max_depth=0, which lies outside the
# 4-10 range searched here — confirm where the final parameters came from.

# s = 0

# def objective(trial):
#     fold = 0
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 30, 70),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2000, 4000),
#         'max_depth': trial.suggest_int('max_depth', 4, 10),
#         'max_bin': trial.suggest_int('max_bin', 200, 400),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.00001, 50),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.00001, 50),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 10),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.9),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.9),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 1)        
#     }

#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)
        
#     y_train = X_train.target
#     y_valid = X_valid.target
    
#     X_train = X_train[features]
#     X_valid = X_valid[features]
    
#     model = LGBMClassifier(
#             objective='binary',
#             tree_learner='serial',
#             random_state=s,
#             n_estimators=30000,
#             **params)
    
#     model.fit(X_train,
#               y_train,
#               early_stopping_rounds=200,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric='auc',
#               callbacks=[LightGBMPruningCallback(trial, 'auc')],
#               verbose=1000)
    
#     valid_pred = model.predict_proba(X_valid)[:,1]
#     auc = roc_auc_score(y_valid, valid_pred)
#     return auc

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=5000)

# Model training

In [None]:
%%time

m = 1
s = 0

valid_preds = {}
test_preds = []
scores = []

for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    X_test = df_test[features].copy()

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.target
    y_valid = X_valid.target

    X_train = X_train[features]
    X_valid = X_valid[features]
                                                
    params = {'num_leaves': 68,
              'min_data_in_leaf': 2213,
              'max_depth': 0,
              'max_bin': 352,
              'learning_rate': 0.01475315458728817,
              'lambda_l1': 0.18507317882864482,
              'lambda_l2': 4.884457066789047,
              'min_gain_to_split': 1.1320575715988301,
              'feature_fraction': 0.49146056893550066,
              'bagging_fraction': 0.5615998320661925,
              'bagging_freq': 1}

    model = LGBMClassifier(
        objective='binary',
        importance_type='split',
        boosting_type='gbdt',
        tree_learner='serial',
        num_threads=-1,
        random_state=s,
        n_estimators=30000,
        **params)

    model.fit(X_train,
              y_train,
              early_stopping_rounds=200,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:,1]
    test_pred = model.predict_proba(X_test)[:,1]

    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)

    score = roc_auc_score(y_valid, valid_pred)    
    scores.append(score)

print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

# Column name that tags predictions with the model number and seed.
pred_col = f'm{m}s{s}_pred'

# Out-of-fold predictions for later use.
# `valid_preds` is rebound from the id -> prediction dict to a two-column frame.
valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', pred_col]
valid_preds.to_csv(f'm{m}s{s}_valid_pred.csv', index=False)

# Test prediction = mean of the per-fold test predictions.
test_mean = np.mean(np.column_stack(test_preds), axis=1)

# Use item assignment, not `sample_submission.target = ...`. Attribute
# assignment only writes the column while a column of that name exists; the
# previous version renamed the columns in between, so the second attribute
# assignment created a plain instance attribute (with a pandas UserWarning),
# and on a re-run in the same kernel that attribute intercepted the first
# assignment too, leaving stale predictions in the saved files.
sample_submission['target'] = test_mean

# Test predictions for later use (same values, model/seed-tagged column name).
# Built from a renamed copy so the submission frame keeps its own column names.
test_pred_frame = sample_submission.rename(columns={'target': pred_col})
test_pred_frame[['id', pred_col]].to_csv(f'm{m}s{s}_test_pred.csv', index=False)

# Submission
sample_submission[['id', 'target']].to_csv('submission.csv', index=False)