LightGBM with CV
* This notebook scores approx. 0.89200 (the exact value depends on the random seed)
* To optimize the parameters, use this script: https://www.kaggle.com/jmargni/tabular-mar-lightgbm-hyperopt
* The LightGBM parameters come from a hyperopt run that reached a loss of -0.89918 using the script linked above. Loss values closer to -1 should improve the final score.
* Find the best parameter combination and climb to the top. Good luck!!! ;-)



In [None]:
import pandas as pd
import numpy as np
from keras.utils import np_utils
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, mean_squared_error, make_scorer, roc_auc_score
import lightgbm as lgb

import warnings
# Suppress every warning category to keep the notebook output compact.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

In [None]:
# All categorical features encoded onehot 
def preprocess(df):
    categorical_cols = [c for c in df.columns if 'cat' in c]
    numerical_cols = [c for c in df.columns if 'cat' not in c]
    
    onehot_encoded_df = pd.get_dummies(df[categorical_cols])
    numerical_df = df[numerical_cols]
    
    return pd.concat([numerical_df, onehot_encoded_df], axis=1)

In [None]:
# Load the competition's train and test tables from the Kaggle input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2021/train.csv',
        '../input/tabular-playground-series-mar-2021/test.csv',
    )
)

In [None]:
# Stack train on top of test so both can be encoded in a single pass; the row
# counts are recorded so the combined frame can be split again by position.
all_data = pd.concat([train_df, test_df], axis=0)
train_size = len(train_df)
test_size = len(test_df)

In [None]:
# Encode train and test together, then cut the frame back apart by position:
# the first `train_size` rows are the training set, the remainder the test set.
all_data = preprocess(all_data)
train_data = all_data.iloc[:train_size]
# The test portion is not used with a label, so its 'target' column is removed.
test_data = all_data.iloc[train_size:].drop(columns='target')

In [None]:
# Separate the label vector from the feature matrices. The 'id' column is
# excluded from the features of both the training and the test set.
X = train_data.drop(columns=['id', 'target'])
y = train_data['target'].values
X_ = test_data.drop(columns=['id'])

In [None]:
# LightGBM hyper-parameters, grouped by purpose (values unchanged).
params = {
    # Task definition
    'objective': 'binary',
    'metric': 'auc',

    # Boosting schedule: many rounds at a small learning rate
    'n_estimators': 20000,
    'learning_rate': 0.007930236488607134,

    # Tree shape
    'num_leaves': 263,
    'max_depth': 98,
    'min_data_in_leaf': 60,
    'max_bin': 270,

    # Regularisation
    'lambda': 0.0001,  # LightGBM alias for lambda_l2
    'sub_feature': 0.2098021977637481,  # LightGBM alias for feature_fraction
}

In [None]:
# 50-fold cross-validation. The shuffle is seeded with the same value the
# classifier uses (random_state=42) so that fold membership, and therefore the
# resulting score, is reproducible from run to run.
folds = KFold(n_splits=50, shuffle=True, random_state=42)
# Buffer for out-of-fold predictions, one slot per training row.
oof = np.zeros(X.shape[0])
# Accumulator for the fold-averaged test-set probabilities.
predictions = np.zeros(X_.shape[0])

In [None]:
# Train one classifier per fold. Each model is early-stopped on its held-out
# fold, scores that fold (stored in `oof`), and contributes an equal share to
# the averaged test-set prediction.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print("Fold {}".format(fold_))
    X_train = X.iloc[trn_idx]
    y_train = y[trn_idx]
    # Despite the names, X_test / y_test are the validation fold taken from the
    # training data; the real test features live in `X_`.
    X_test = X.iloc[val_idx]
    y_test = y[val_idx]
    clf = lgb.LGBMClassifier(**params, random_state=42)
    # NOTE: `early_stopping_rounds` and `verbose` are fit() keywords of the
    # LightGBM 3.x scikit-learn API; LightGBM 4.0 removed them in favour of
    # callbacks (lgb.early_stopping / lgb.log_evaluation).
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train),(X_test, y_test)],
        eval_metric='auc', early_stopping_rounds=250, verbose=250  )
    # Record the out-of-fold probabilities so an overall CV score can be
    # computed once every row has been held out exactly once.
    oof[val_idx] = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:,1]
    predictions += clf.predict_proba(X_, num_iteration=clf.best_iteration_)[:,1] / folds.n_splits

# Overall cross-validated AUC over all out-of-fold predictions.
print("OOF AUC: {:.5f}".format(roc_auc_score(y, oof)))

In [None]:
# Start from the sample submission to reuse its 'id' column, then overwrite the
# placeholder 'target' with the fold-averaged probabilities. Direct assignment
# raises on a length mismatch instead of silently padding with NaN the way an
# axis=1 concat would.
# NOTE(review): assumes sample_submission.csv has columns ('id', 'target') and
# lists rows in the same order as test.csv - confirm.
submission = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
submission['target'] = predictions
submission.to_csv('submission.csv', index=False)