# Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import roc_auc_score
import optuna
from optuna.integration import LightGBMPruningCallback
from lightgbm import LGBMClassifier

# Loading the data

In [None]:
# Read the competition's train / test tables and the sample submission in one pass.
# The order of the paths below matches the order of the names on the left-hand side.
df_train, df_test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

# Preprocessing

In [None]:
# NOTE: this cell mutates df_train / df_test in place (imputation + scaling), so it is
# not idempotent -- re-run the data-loading cell before re-running this one.

# Feature columns: every test-set column whose name contains 'f'.
features = [c for c in df_test.columns if 'f' in c]

# Placeholder fold id; the real assignment is done in the fold-creation cell.
df_train['kfold'] = -1

# Per-row count of missing feature values, computed BEFORE imputation.
# Restricted to the feature columns so that train and test are counted over the same
# set of columns (non-feature columns such as the target never contribute).
df_train['missing'] = df_train[features].isnull().sum(axis=1)
df_test['missing'] = df_test[features].isnull().sum(axis=1)

features.append('missing')

# Target and (un-imputed) predictors; these are what the fold-creation cell splits on.
y_train = df_train.claim
X_train = df_train.drop('claim', axis=1)

# Impute both tables with the TRAIN medians so the test set goes through exactly the
# transformation that was learned on the training data (same policy as the scaler below).
train_medians = df_train[features].median()
df_train[features] = df_train[features].fillna(train_medians)
df_test[features] = df_test[features].fillna(train_medians)

# Scale with statistics fitted on train only, then apply them to test.
scaler = preprocessing.RobustScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Creating folds

In [None]:
# Tag every training row with the index of the stratified fold in which it is held out.
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, split in enumerate(skf.split(X_train, y_train)):
    # split is (train positions, validation positions); only the latter is needed here.
    # .loc works with these positions as long as df_train keeps its default integer
    # index, which appears to be the case since it comes straight from read_csv.
    i_train, i_valid = split
    df_train.loc[i_valid, 'kfold'] = fold

# Hyperparameter tuning with Optuna

In [None]:
# seed = 0

# def objective(trial):
#     fold = 0
#     params = {
#         'num_leaves': trial.suggest_int('num_leaves', 21, 30),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1000, 20000),
#         'max_depth': trial.suggest_int('max_depth', 0, 0),
#         'max_bin': trial.suggest_int('max_bin', 200, 400),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01),
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 0.00001, 5),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 0.00001, 1),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 1, 3),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.05, 0.75),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.78, 0.9),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 1)        
#     }

#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)
        
#     y_train = X_train.claim
#     y_valid = X_valid.claim
    
#     X_train = X_train[features]
#     X_valid = X_valid[features]
    
#     model = LGBMClassifier(
#             objective='binary',
#             tree_learner='serial',
#             seed=seed,
#             n_estimators=50000,
#             **params)
    
#     model.fit(X_train,
#               y_train,
#               early_stopping_rounds=500,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric='auc',
#               callbacks=[LightGBMPruningCallback(trial, 'auc')],
#               verbose=1000)
    
#     valid_pred = model.predict_proba(X_valid)[:,1]
        
#     auc = roc_auc_score(y_valid, valid_pred)
#     return auc

# for i in range(3):
#     study = optuna.create_study(direction="maximize")
#     study.optimize(objective, n_trials=40)

# Model training

In [None]:
%%time

m = 7
s = 26

test_preds = []
valid_preds = {}
scores = []
    
for fold in range(5):
    X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
    X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

    X_test = df_test[features].copy()

    valid_ids = X_valid.id.values.tolist()

    y_train = X_train.claim
    y_valid = X_valid.claim

    X_train = X_train[features]
    X_valid = X_valid[features]

    params = {'num_leaves': 15,
              'min_data_in_leaf': 2200,
              'max_depth': 4,
              'max_bin': 220,
              'learning_rate': 0.010799862652877246,
              'lambda_l1': 1.8358777923407985,
              'lambda_l2': 0.0003751344396869442,
              'min_gain_to_split': 2.850800313709466,
              'feature_fraction': 0.44543603862631437,
              'bagging_fraction': 0.7701524274455008,
              'bagging_freq': 1}

    model = LGBMClassifier(
        objective='binary',
        importance_type='split', #default=split. try gain
        boosting_type='gbdt', #default=gbdt. try dart, goss, rf
        tree_learner='serial',
        num_threads=-1,
        random_state=s,
        n_estimators=50000,
        **params)

    model.fit(X_train,
              y_train,
              early_stopping_rounds=500,
              eval_set=[(X_valid, y_valid)],
              eval_metric='auc',
              verbose=1000)

    valid_pred = model.predict_proba(X_valid)[:,1]
    test_pred = model.predict_proba(X_test)[:,1]

    valid_preds.update(dict(zip(valid_ids, valid_pred)))
    test_preds.append(test_pred)
    
    score = roc_auc_score(y_valid, valid_pred)    
    scores.append(score)
    
print(f'Mean auc {np.mean(scores)}, std {np.std(scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', f'm{m}s{s}_pred']
valid_preds.to_csv(f'm{m}s{s}_valid_pred.csv', index=False)

sample_solution.claim = np.mean(np.column_stack(test_preds), axis=1)
sample_solution.columns = ['id', f'm{m}s{s}_pred']
sample_solution.to_csv(f'm{m}s{s}_test_pred.csv', index=False)

In [None]:
# %%time

# m = 7
# s = 35

# test_preds = []
# valid_preds = {}
# scores = []
    
# for fold in range(5):
#     X_train = df_train[df_train.kfold != fold].reset_index(drop=True)
#     X_valid = df_train[df_train.kfold == fold].reset_index(drop=True)

#     X_test = df_test[features].copy()

#     valid_ids = X_valid.id.values.tolist()

#     y_train = X_train.claim
#     y_valid = X_valid.claim

#     X_train = X_train[features]
#     X_valid = X_valid[features]

#     params = {'num_leaves': 15,
#               'min_data_in_leaf': 2200,
#               'max_depth': 4,
#               'max_bin': 220,
#               'learning_rate': 0.010799862652877246,
#               'lambda_l1': 1.8358777923407985,
#               'lambda_l2': 0.0003751344396869442,
#               'min_gain_to_split': 2.850800313709466,
#               'feature_fraction': 0.44543603862631437,
#               'bagging_fraction': 0.7701524274455008,
#               'bagging_freq': 1}

#     model = LGBMClassifier(
#         objective='binary',
#         importance_type='split', #default=split. try gain
#         boosting_type='gbdt', #default=gbdt. try dart, goss, rf
#         tree_learner='serial',
#         num_threads=-1,
#         random_state=s,
#         n_estimators=50000,
#         **params)

#     model.fit(X_train,
#               y_train,
#               early_stopping_rounds=500,
#               eval_set=[(X_valid, y_valid)],
#               eval_metric='auc',
#               verbose=1000)

#     valid_pred = model.predict_proba(X_valid)[:,1]
#     test_pred = model.predict_proba(X_test)[:,1]

#     valid_preds.update(dict(zip(valid_ids, valid_pred)))
#     test_preds.append(test_pred)
    
#     score = roc_auc_score(y_valid, valid_pred)    
#     scores.append(score)
    
# print(f'Mean auc{np.mean(scores)}, std {np.std(scores)}')

# valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
# valid_preds.columns = ['id', f'm{m}s{s}_pred']
# valid_preds.to_csv(f'm{m}s{s}_valid_pred.csv', index=False)

# sample_solution.claim = np.mean(np.column_stack(test_preds), axis=1)
# sample_solution.columns = ['id', f'm{m}s{s}_pred']
# sample_solution.to_csv(f'm{m}s{s}_test_pred.csv', index=False)