# Baseline

In this notebook, we generate our first LightGBM model using mostly default settings except for the following:

* We set a high value for `n_estimators` and use `early_stopping_rounds` so that training stops before too many trees are added and the model overfits.

Since we do not have access to GPUs locally, we will test XGBoost and CatBoost using Kaggle Kernels.

In [1]:
# Notebook-wide settings, gathered here so experiments can be changed in one place

# Cross-validation / reproducibility
NUM_FOLDS = 5        # selects the '<NUM_FOLDS>fold' column and the number of CV rounds
MODEL_SEED = 0       # passed to the model as random_state

# Boosting budget
NUM_TREES = 25_000   # upper bound on n_estimators; early stopping decides the real count
EARLY_STOP = 250     # passed to fit() as early_stopping_rounds

# Output
SUBMIT = False       # when True, the test predictions are written to a submission file

In [2]:
import numpy as np
import pandas as pd
import pyarrow
import time

# Model and Evaluation
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Print the paths to all of the feather files in the data directory
import os
# Build the directory from its components: the literal '..\data' contains the
# invalid escape sequence '\d' and only resolves on Windows, whereas this form
# yields the same string on Windows and a valid path everywhere else.
for dirname, _, filenames in os.walk(os.path.join(os.pardir, 'data')):
    for filename in filenames:
        if filename.endswith('.feather'):
            print(os.path.join(dirname, filename))

..\data\test.feather
..\data\train.feather


# Loading Data

We use the feather files created in the previous notebook rather than the raw `.csv` data.

In [3]:
%%time
train = pd.read_feather('../data/train.feather')
test = pd.read_feather('../data/test.feather')


features = [x for x in train.columns if x not in ['id','claim','kfold','3fold','4fold','5fold','6fold']]

print("Train Size (Mb):", 
      round(train.memory_usage().sum() / 1024 ** 2, 2))

print("Test Size (Mb):", 
      round(test.memory_usage().sum() / 1024 ** 2, 2))

train.head()

Train Size (Mb): 512.5
Test Size (Mb): 248.48
Wall time: 1.06 s


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f114,f115,f116,f117,f118,claim,3fold,4fold,5fold,6fold
0,0,0.10859,0.004314,-37.566002,0.017364,0.28915,-10.251,135.119995,168900.0,399240000000000.0,...,4378.799805,1.2096,861340000000000.0,140.100006,1.0177,1,2,3,4,5
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.900024,119810.0,3874100000000000.0,...,913.22998,1.2464,7575100000000000.0,1861.0,0.28359,0,1,1,2,2
2,2,0.17803,-0.00698,907.27002,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,45119.0,1.1764,321810000000000.0,3838.199951,0.4069,1,1,1,2,2
3,3,0.15236,0.007259,780.099976,0.025179,0.51947,7.4914,112.510002,259490.0,77814000000000.0,...,4952.399902,1.1784,4533000000000.0,4889.100098,0.51486,1,0,0,0,0
4,4,0.11623,0.5029,-109.150002,0.29791,0.3449,-0.40932,2538.899902,65332.0,1907200000000000.0,...,3856.5,1.483,-8991300000000.0,,0.23049,1,1,1,1,2


## LightGBM

In [4]:
# Cross-validated LightGBM baseline.
# Produces:
#   preds     - test-set probabilities averaged over the NUM_FOLDS models
#   oof_preds - out-of-fold probability for every training row
#   scores    - validation AUC of each fold
preds = np.zeros((test.shape[0],))
oof_preds = np.zeros((train.shape[0],))
scores = np.zeros(NUM_FOLDS)

# Fail early with a clear message if NUM_FOLDS has no matching fold column
fold_col = f'{NUM_FOLDS}fold'
if fold_col not in train.columns:
    raise KeyError(f"train has no '{fold_col}' column; set NUM_FOLDS to match an existing fold column")

# The test features are identical for every fold, so select them once
X_test = test[features]

for i in range(NUM_FOLDS):
    start = time.time()

    # Compute the validation mask once and reuse it for all four splits;
    # .loc with a mask and a column list avoids intermediate full-width copies.
    valid_mask = (train[fold_col] == i).to_numpy()
    X_train = train.loc[~valid_mask, features]
    X_valid = train.loc[valid_mask, features]
    y_train = train.loc[~valid_mask, 'claim']
    y_valid = train.loc[valid_mask, 'claim']

    model = LGBMClassifier(random_state=MODEL_SEED,
                           n_estimators = NUM_TREES)
    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keyword
    # arguments were removed in LightGBM 4.0; on newer versions they must be
    # supplied through `callbacks=` instead. Kept as-is to match the version
    # this notebook was written against.
    model =  model.fit(X_train, y_train,
                       verbose = False,
                       eval_set = [(X_valid, y_valid)],
                       eval_metric = "auc",
                       early_stopping_rounds = EARLY_STOP
                      )

    # Generate predictions on test set and validation set
    valid_preds = model.predict_proba(X_valid)[:,1]
    preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS

    # Store scores and out-of-fold predictions
    oof_preds[valid_mask] = valid_preds
    scores[i] = roc_auc_score(y_valid, valid_preds)

    end = time.time()

    print(f"LightGBM Fold {i} (AUC):",
          round(scores[i], 6), " ",
          str(round(end-start, 3))+"s")

print("\nLightGBM (Avg):", round(scores.mean(), 6))
print("LightGBM (Min):", round(scores.min(), 6))
print('OOF AUC: ', roc_auc_score(train['claim'], oof_preds))

LightGBM Fold 0 (AUC): 0.804226   79.442s
LightGBM Fold 1 (AUC): 0.805669   88.275s
LightGBM Fold 2 (AUC): 0.805595   99.473s
LightGBM Fold 3 (AUC): 0.805048   104.03s
LightGBM Fold 4 (AUC): 0.802824   128.966s

LightGBM (Avg): 0.804673
LightGBM (Min): 0.802824
OOF AUC:  0.804627509973545


In [5]:
# Generate Submission
# Pair the predictions with their row ids. `oof_preds` is rebound from an array
# to a DataFrame here, so only wrap it the first time: re-running this cell
# must not try to build a frame out of an existing frame.
if not isinstance(oof_preds, pd.DataFrame):
    oof_preds = pd.DataFrame({'id': train.id, 'claim': oof_preds})
test_preds = pd.DataFrame({'id': test.id, 'claim': preds})

if SUBMIT:
    # Create the output folder on first use instead of failing inside to_csv
    os.makedirs('../submissions', exist_ok=True)
    # Timestamp the file name so earlier submissions are never overwritten
    timestr = time.strftime("%Y%m%d-%H%M%S")
    test_preds.to_csv('../submissions/lightgbm_baseline_'+timestr+'.csv', index=False)