## LGBM is Powerful!

I want to prove LGBM is better than Logistic Regression. 

Let's try :)

## Import Libraries & Read CSV

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import seaborn as sns
import numpy as np 
import pandas as pd
import os, gc
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Apply seaborn's default plot settings.
sns.set()

In [None]:
%%time
# Load the competition's training and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv('/kaggle/input/cat-in-the-dat/train.csv'),
    pd.read_csv('/kaggle/input/cat-in-the-dat/test.csv'),
)

In [None]:
# Dimensions (rows, columns) of the training table as loaded, before any column is removed.
train_df.shape

In [None]:
# Pull the label and the id columns out of the frames. `pop` returns the column
# and removes it from the frame in place, so afterwards train_df and test_df
# hold only the feature columns.
# NOTE: this cell mutates train_df / test_df, so it can only be run once per
# kernel session without reloading the CSVs.
target = train_df.pop('target')
train_id = train_df.pop('id')
test_id = test_df.pop('id')

In [None]:
# Shapes of the train and test frames, one per line.
print(train_df.shape, test_df.shape, sep='\n')

In [None]:
# Preview the first rows of the training features ('target' and 'id' were removed above).
train_df.head()

## Feature Engineering (One-Hot Encoding)

Every feature, including the remaining long ones, is one-hot encoded into a sparse matrix. (No target encoding is applied in this notebook.)

In [None]:
%%time

# Stack train on top of test so both are encoded in a single get_dummies call
# and end up with the same dummy columns.
n_train = train_df.shape[0]
traintest = pd.concat([train_df, test_df])

# One-hot encode every column into sparse dummy columns; drop_first=True drops
# the first level of each feature.
dummies = pd.get_dummies(
    traintest,
    columns=traintest.columns,
    drop_first=True,
    sparse=True,
)

# Split back into train / test by row position and convert each part to a
# SciPy CSR matrix.
train = dummies.iloc[:n_train, :].sparse.to_coo().tocsr()
test = dummies.iloc[n_train:, :].sparse.to_coo().tocsr()

In [None]:
# Cast both sparse matrices to float32.
# NOTE(review): presumably needed because LightGBM expects floating-point data
# for sparse input -- confirm.
train, test = (matrix.astype('float32') for matrix in (train, test))

## LightGBM model

This was my first single LightGBM model (its public leaderboard score was low). The cell below is kept commented out for reference.

In [None]:
# %%time 
# X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=97)

# param = {   
#     'boost': 'gbdt',
#     'learning_rate': 0.005,
#     'feature_fraction':0.3,
#     'bagging_freq':1,
#     'max_depth': -1,
#     'num_leaves':18,
#     'lambda_l2': 3,
#     'lambda_l1': 3,
#     'metric':{'auc'},
#     'tree_learner': 'serial',
#     'objective': 'binary',
#     'verbosity': 1,
#     'seed': 97,
#     'feature_fraction_seed': 97,
#     'bagging_seed': 97,
#     'drop_seed': 97,
#     'data_random_seed': 97,
# }


# evals_result = {}
# predictions = np.zeros(test.shape[0])

# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_valid = lgb.Dataset(X_test, y_test)

# num_round = 20000
# clf = lgb.train(param, lgb_train, num_round, valid_sets = [lgb_train, lgb_valid],
#       verbose_eval=100, early_stopping_rounds = 1000, evals_result = evals_result)

# ## Prediction
# predictions = clf.predict(test, num_iteration=clf.best_iteration)


## LGBM with CV

In [None]:
%%time

# CV function original : @Peter Hurford : Why Not Logistic Regression? https://www.kaggle.com/peterhurford/why-not-logistic-regression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc

def _take_rows(data, index):
    """Select rows of ``data`` by integer position.

    Uses ``.iloc`` when ``data`` provides it (pandas objects) and plain ``[]``
    indexing otherwise (e.g. NumPy arrays, SciPy sparse matrices).
    """
    return getattr(data, 'iloc', data)[index]


def run_cv_model(train, test, target, model_fn, params=None, label='model', n_splits=5):
    """Run K-fold cross-validation for ``model_fn`` and average its test predictions.

    Parameters
    ----------
    train : row-indexable feature container with a ``.shape`` attribute
        Training features; rows are selected by integer position.
    test : any
        Test features, passed unchanged to ``model_fn`` for every fold.
    target : row-indexable labels aligned with ``train``
        Rows are selected by position, so a pandas Series with a non-default
        index is handled correctly.
    model_fn : callable
        Called as ``model_fn(dev_X, dev_y, val_X, val_y, test, params)``; must
        return ``(pred_val_y, pred_test_y)``.
    params : dict, optional
        Passed through to ``model_fn``. Defaults to an empty dict.
    label : str
        Name used in progress messages and stored in the result.
    n_splits : int
        Number of folds (default 5).

    Returns
    -------
    dict
        ``'label'``: the label, ``'train'``: out-of-fold predictions for every
        training row, ``'test'``: test predictions averaged over the folds,
        ``'cv'``: list of per-fold AUC scores.
    """
    if params is None:
        params = {}

    kf = KFold(n_splits=n_splits)

    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0]))
    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print('Started {} fold {}/{}'.format(label, i, n_splits))
        dev_X, val_X = _take_rows(train, dev_index), _take_rows(train, val_index)
        dev_y, val_y = _take_rows(target, dev_index), _take_rows(target, val_index)

        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test, params)

        # Accumulate test predictions (averaged after the loop) and store the
        # out-of-fold predictions at the validation rows' positions.
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y

        cv_score = auc(val_y, pred_val_y)
        cv_scores.append(cv_score)
        print('{} cv score {}: {}\n'.format(label, i, cv_score))

    print('{} cv scores : {}'.format(label, cv_scores))
    print('{} cv mean score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std score : {}'.format(label, np.std(cv_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label, 'train': pred_train, 'test': pred_full_test, 'cv': cv_scores}
    return results


def runLGBM(X_train, y_train, X_val, y_val, X_test, params):
    """Train one LightGBM booster on a fold and predict the validation and test rows.

    Parameters
    ----------
    X_train, y_train : training features and labels for this fold.
    X_val, y_val : validation features and labels; also used as the
        evaluation set for early stopping.
    X_test : test features to predict.
    params : dict of LightGBM parameters passed to ``lgb.train``.

    Returns
    -------
    tuple
        ``(pred_val_y, pred_test_y)``: predictions for ``X_val`` and
        ``X_test``, both made at the booster's best iteration.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_val, y_val)

    num_round = 5000
    # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword
    # arguments are believed to have been removed from `lgb.train` in
    # LightGBM 4.0 in favour of callbacks -- confirm against the installed
    # version before upgrading.
    clf = lgb.train(
        params,
        lgb_train,
        num_round,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=1000,
        early_stopping_rounds=1000,
    )

    pred_val_y = clf.predict(X_val, num_iteration=clf.best_iteration)
    pred_test_y = clf.predict(X_test, num_iteration=clf.best_iteration)
    return pred_val_y, pred_test_y

# One seed shared by every LightGBM random-number stream configured below.
SEED = 97

params = {
    # Task and evaluation metric.
    'objective': 'binary',
    'metric': {'auc'},
    'verbosity': 1,
    # Booster type and learner.
    'boost': 'gbdt',
    'tree_learner': 'serial',
    'learning_rate': 0.005,
    # Tree shape.
    'max_depth': 32,
    'num_leaves': 18,
    # Sampling and regularisation.
    # NOTE(review): bagging_freq is set but bagging_fraction is not; with
    # LightGBM's default bagging_fraction this presumably performs no row
    # subsampling -- confirm intent.
    'feature_fraction': 0.3,
    'bagging_freq': 1,
    'lambda_l1': 0.9,
    'lambda_l2': 0.9,
    # Seeds.
    'seed': SEED,
    'feature_fraction_seed': SEED,
    'bagging_seed': SEED,
    'drop_seed': SEED,
    'data_random_seed': SEED,
}

# Cross-validate LightGBM on the one-hot encoded data; `results` holds the
# out-of-fold predictions, the fold-averaged test predictions and the fold scores.
results = run_cv_model(train, test, target, model_fn=runLGBM, params=params, label='LGBM')


## Result

In [None]:
# Pair every test id with its fold-averaged prediction and write the submission file.
submission_columns = {'id': test_id, 'target': results['test']}
sub_df = pd.DataFrame(submission_columns)
sub_df.to_csv("lightgbm_onehotencoding_cv.csv", index=False)

# Show the first rows as a quick sanity check.
sub_df.head()