In [1]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.impute import SimpleImputer
import gc
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Folder that holds the input CSVs and receives the output CSV. With the empty
# string, os.path.join() returns the bare file name, so every path below is
# resolved relative to the current working directory.
DATA_DIRECTORY = ""

In [3]:
# Read the three input tables from DATA_DIRECTORY, in the order
# train features, test features, training labels.
train, test, labels = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'labels.csv')
)

In [4]:
# Flatten the label table to a 1-D vector. Converting a one-column DataFrame
# yields an (n, 1) column vector, which makes scikit-learn emit a
# DataConversionWarning on every fit. np.asarray() also accepts an ndarray, so
# this line can be re-run safely.
# NOTE(review): assumes labels.csv contains exactly one column - confirm.
labels = np.asarray(labels).ravel()

# Keep the test IDs: the SK_ID_CURR column is dropped from the feature frames
# further down, but it is still needed to key the submission rows.
test_id = test['SK_ID_CURR']

In [5]:
# Remove the row identifier from both feature tables so that only the
# remaining columns are passed on to imputation and training.
train, test = (
    frame.drop(columns=['SK_ID_CURR'])
    for frame in (train, test)
)

In [6]:
# Fill missing values with per-column medians. The medians are learned from the
# training table only and then re-used, unchanged, for the test table.
# NOTE(review): the imputer sees the full training table before model() splits
# it into folds, so validation rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
train = imputer.fit_transform(train)
test = imputer.transform(test)

In [7]:
def model(features, test_features, labels, test_ids, n_folds = 5, params = None):
    """Train a LightGBM classifier with K-fold cross-validation.

    One classifier is fitted per fold. Each fitted classifier predicts the full
    test set (the fold predictions are averaged) and its own validation rows
    (collected as out-of-fold predictions).

    Parameters
    ----------
    features : array-like, shape (n_train, n_features)
        Training feature matrix.
    test_features : array-like, shape (n_test, n_features)
        Test feature matrix.
    labels : array-like, shape (n_train,) or (n_train, 1)
        Training targets. A single-column 2-D input is flattened to 1-D.
    test_ids : array-like, length n_test
        Identifiers written to the 'SK_ID_CURR' column of the submission.
    n_folds : int, default 5
        Number of cross-validation folds.
    params : dict or None, default None
        LightGBM parameters merged over the base ``random_state``/``nthread``
        settings. When None, the module-level ``LIGHTGBM_PARAMS`` is used.

    Returns
    -------
    submission : pandas.DataFrame
        Columns 'SK_ID_CURR' and 'TARGET' (fold-averaged test probability).
    metrics : pandas.DataFrame
        Columns 'fold', 'train', 'valid' with one row per fold plus an
        'overall' row (mean train AUC, out-of-fold validation AUC).

    Raises
    ------
    ValueError
        If ``labels`` and ``features`` do not have the same number of rows.
    """
    cat_indices = 'auto'

    # Convert to np arrays first so that .shape and positional indexing work
    # for any array-like input
    features = np.asarray(features)
    test_features = np.asarray(test_features)

    # A column vector (n, 1) triggers a DataConversionWarning on every fit;
    # hand the estimators a flat (n,) vector instead
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    if labels.shape[0] != features.shape[0]:
        raise ValueError('labels has %d rows but features has %d rows'
                         % (labels.shape[0], features.shape[0]))

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Caller-supplied parameters win; otherwise fall back to the notebook-level
    # dictionary (looked up at call time, so it may be defined in a later cell)
    lgb_params = LIGHTGBM_PARAMS if params is None else params
    model_params = {'random_state': 88888, 'nthread': -1, **lgb_params}

    # LightGBM 4.0 removed the early_stopping_rounds/verbose arguments of fit()
    # in favour of callbacks; older releases without log_evaluation keep using
    # the original keyword arguments
    use_callbacks = hasattr(lgb, 'log_evaluation')

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 88)

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the estimator (named clf so it does not shadow this function)
        clf = lgb.LGBMClassifier(**model_params)

        # Fresh callbacks per fold so no early-stopping state is shared
        if use_callbacks:
            stopping_kwargs = {'callbacks': [lgb.early_stopping(2500),
                                             lgb.log_evaluation(500)]}
        else:
            stopping_kwargs = {'early_stopping_rounds': 2500, 'verbose': 500}

        # Train the model
        # NOTE(review): the training log shows both auc and binary_logloss
        # being tracked, so early stopping may be driven by either metric -
        # confirm that this is intended.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                **stopping_kwargs)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Each fold contributes an equal share to the averaged test prediction
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best scores
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory before the next fold is materialised
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Overall validation score, computed on the out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # One row label per fold plus the summary row
    fold_names = list(range(n_folds)) + ['overall']

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, metrics

In [8]:
# LightGBM settings used by model(); they are merged over (and therefore
# override) the base random_state / nthread parameters set inside model().
LIGHTGBM_PARAMS = dict(
    # boosting setup
    boosting_type='goss',
    n_estimators=8888,
    learning_rate=0.0098,
    # tree shape
    num_leaves=58,
    max_depth=11,
    # regularisation
    reg_alpha=3.564,
    reg_lambda=4.930,
    # sampling
    colsample_bytree=0.613,
    subsample=0.708,
    # logging
    silent=-1,
    verbose=-1,
    # binning and leaf constraints
    max_bin=407,
    min_child_weight=6,
    min_child_samples=165,
)

In [9]:
# Run the cross-validated training, then show the per-fold AUC table.
submission, metrics = model(features=train, test_features=test,
                            labels=labels, test_ids=test_id)
print('LightGBM metrics', metrics, sep='\n')

Training Data Shape:  (307506, 649)
Testing Data Shape:  (48744, 649)


  return f(*args, **kwargs)


Training until validation scores don't improve for 2500 rounds
[500]	train's auc: 0.826064	train's binary_logloss: 0.223354	valid's auc: 0.791938	valid's binary_logloss: 0.238104
[1000]	train's auc: 0.858942	train's binary_logloss: 0.20909	valid's auc: 0.799395	valid's binary_logloss: 0.234918
[1500]	train's auc: 0.883269	train's binary_logloss: 0.19815	valid's auc: 0.801804	valid's binary_logloss: 0.233958
[2000]	train's auc: 0.903606	train's binary_logloss: 0.188329	valid's auc: 0.802856	valid's binary_logloss: 0.233541
[2500]	train's auc: 0.920452	train's binary_logloss: 0.179361	valid's auc: 0.803469	valid's binary_logloss: 0.233312
[3000]	train's auc: 0.934787	train's binary_logloss: 0.171018	valid's auc: 0.803397	valid's binary_logloss: 0.23333
[3500]	train's auc: 0.946688	train's binary_logloss: 0.163258	valid's auc: 0.803211	valid's binary_logloss: 0.233441
[4000]	train's auc: 0.956663	train's binary_logloss: 0.155919	valid's auc: 0.803147	valid's binary_logloss: 0.233519
[4500

  return f(*args, **kwargs)


Training until validation scores don't improve for 2500 rounds
[500]	train's auc: 0.826911	train's binary_logloss: 0.223024	valid's auc: 0.786204	valid's binary_logloss: 0.239061
[1000]	train's auc: 0.859289	train's binary_logloss: 0.208892	valid's auc: 0.794015	valid's binary_logloss: 0.235986
[1500]	train's auc: 0.88386	train's binary_logloss: 0.197859	valid's auc: 0.796429	valid's binary_logloss: 0.235057
[2000]	train's auc: 0.90385	train's binary_logloss: 0.188147	valid's auc: 0.797313	valid's binary_logloss: 0.234676
[2500]	train's auc: 0.920616	train's binary_logloss: 0.179164	valid's auc: 0.797562	valid's binary_logloss: 0.234599
[3000]	train's auc: 0.934665	train's binary_logloss: 0.170887	valid's auc: 0.797736	valid's binary_logloss: 0.234575
[3500]	train's auc: 0.94644	train's binary_logloss: 0.163161	valid's auc: 0.797518	valid's binary_logloss: 0.234663
[4000]	train's auc: 0.956393	train's binary_logloss: 0.155883	valid's auc: 0.797217	valid's binary_logloss: 0.234852
[4500

  return f(*args, **kwargs)


Training until validation scores don't improve for 2500 rounds
[500]	train's auc: 0.827779	train's binary_logloss: 0.223384	valid's auc: 0.781905	valid's binary_logloss: 0.237462
[1000]	train's auc: 0.860219	train's binary_logloss: 0.209162	valid's auc: 0.7896	valid's binary_logloss: 0.234682
[1500]	train's auc: 0.884773	train's binary_logloss: 0.198131	valid's auc: 0.79184	valid's binary_logloss: 0.233873
[2000]	train's auc: 0.904719	train's binary_logloss: 0.188321	valid's auc: 0.792612	valid's binary_logloss: 0.23361
[2500]	train's auc: 0.921314	train's binary_logloss: 0.179352	valid's auc: 0.792879	valid's binary_logloss: 0.233529
[3000]	train's auc: 0.935267	train's binary_logloss: 0.171067	valid's auc: 0.792757	valid's binary_logloss: 0.233609
[3500]	train's auc: 0.946811	train's binary_logloss: 0.163376	valid's auc: 0.792741	valid's binary_logloss: 0.233693
[4000]	train's auc: 0.95676	train's binary_logloss: 0.156074	valid's auc: 0.792539	valid's binary_logloss: 0.233853
[4500]	

  return f(*args, **kwargs)


Training until validation scores don't improve for 2500 rounds
[500]	train's auc: 0.826926	train's binary_logloss: 0.223796	valid's auc: 0.787658	valid's binary_logloss: 0.23567
[1000]	train's auc: 0.859935	train's binary_logloss: 0.209453	valid's auc: 0.795021	valid's binary_logloss: 0.232752
[1500]	train's auc: 0.884716	train's binary_logloss: 0.198299	valid's auc: 0.797376	valid's binary_logloss: 0.231837
[2000]	train's auc: 0.904586	train's binary_logloss: 0.188559	valid's auc: 0.798381	valid's binary_logloss: 0.231425
[2500]	train's auc: 0.921252	train's binary_logloss: 0.179621	valid's auc: 0.798539	valid's binary_logloss: 0.231329
[3000]	train's auc: 0.935128	train's binary_logloss: 0.171346	valid's auc: 0.798422	valid's binary_logloss: 0.231322
[3500]	train's auc: 0.946885	train's binary_logloss: 0.163587	valid's auc: 0.798312	valid's binary_logloss: 0.231422
[4000]	train's auc: 0.956634	train's binary_logloss: 0.156353	valid's auc: 0.79799	valid's binary_logloss: 0.231585
[450

  return f(*args, **kwargs)


Training until validation scores don't improve for 2500 rounds
[500]	train's auc: 0.827042	train's binary_logloss: 0.223303	valid's auc: 0.785061	valid's binary_logloss: 0.237874
[1000]	train's auc: 0.860014	train's binary_logloss: 0.209026	valid's auc: 0.793145	valid's binary_logloss: 0.234871
[1500]	train's auc: 0.884326	train's binary_logloss: 0.198024	valid's auc: 0.795705	valid's binary_logloss: 0.234018
[2000]	train's auc: 0.904434	train's binary_logloss: 0.188149	valid's auc: 0.79668	valid's binary_logloss: 0.23373
[2500]	train's auc: 0.921183	train's binary_logloss: 0.179166	valid's auc: 0.79717	valid's binary_logloss: 0.233606
[3000]	train's auc: 0.935167	train's binary_logloss: 0.170861	valid's auc: 0.797244	valid's binary_logloss: 0.233606
[3500]	train's auc: 0.946873	train's binary_logloss: 0.163089	valid's auc: 0.797117	valid's binary_logloss: 0.233715
[4000]	train's auc: 0.956744	train's binary_logloss: 0.155823	valid's auc: 0.797127	valid's binary_logloss: 0.23378
[4500]

In [10]:
# Write the averaged test predictions next to the input data, without the
# DataFrame index column.
submission_path = os.path.join(DATA_DIRECTORY, 'lgb.csv')
submission.to_csv(submission_path, index=False)