In [1]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Directory that holds train.csv, test.csv and labels.csv. It is joined with
# each file name via os.path.join, so the empty string resolves the files
# relative to the current working directory.
DATA_DIRECTORY = ""

In [3]:
# Read the three input tables (in this order) from DATA_DIRECTORY.
train, test, labels = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'labels.csv')
)

In [4]:
# Convert the labels to a NumPy array. np.asarray accepts both the DataFrame
# loaded above and an ndarray, so re-running this cell is harmless
# (DataFrame.to_numpy() would fail on the second run).
labels = np.asarray(labels)
# A single-column table becomes an (n, 1) column vector; flatten it to (n,)
# so downstream estimators receive a 1-D target.
if labels.ndim == 2 and labels.shape[1] == 1:
    labels = labels.ravel()

In [5]:
def model(features, test_features, labels, n_folds = 5):
    """Train LightGBM with K-fold cross validation and predict on the test set.

    One ``lgb.LGBMClassifier`` is fitted per fold using the module-level
    ``LIGHTGBM_PARAMS`` (which must be defined before this function is
    called). Test predictions are averaged over the folds and out-of-fold
    predictions are used for the overall validation AUC.

    Parameters
    ----------
    features : pd.DataFrame
        Training features; must contain an ``'SK_ID_CURR'`` column, which is
        dropped before training.
    test_features : pd.DataFrame
        Test features with the same layout; its ``'SK_ID_CURR'`` column is
        kept for the submission frame.
    labels : array-like
        One label per training row. An ``(n, 1)`` column vector is flattened
        to ``(n,)``.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    submission : pd.DataFrame
        Columns ``'SK_ID_CURR'`` and ``'TARGET'`` (fold-averaged probability
        of the positive class).
    metrics : pd.DataFrame
        Columns ``'fold'``, ``'train'`` and ``'valid'``: one row per fold plus
        an ``'overall'`` row (mean train AUC, out-of-fold validation AUC).

    Raises
    ------
    ValueError
        If the number of labels differs from the number of training rows.
    """

    # Keep the test ids for the submission frame
    test_ids = test_features['SK_ID_CURR']

    # Remove the ids from both feature sets
    features = features.drop(columns = ['SK_ID_CURR'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])

    # Normalise the labels: accept any array-like and flatten a column vector
    # so the estimator receives a 1-D target.
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    if labels.shape[0] != features.shape[0]:
        raise ValueError(
            'labels has %d rows but features has %d rows'
            % (labels.shape[0], features.shape[0]))

    cat_indices = 'auto'

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    # The parameters are the same for every fold; entries in LIGHTGBM_PARAMS
    # take precedence over the defaults given here.
    fold_params = {**{'random_state': 8888, 'nthread': -1}, **LIGHTGBM_PARAMS}

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the model (named `clf` so it does not shadow this function)
        clf = lgb.LGBMClassifier(**fold_params)

        # Train the model
        # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keywords
        # of older LightGBM releases; newer releases expect the
        # lgb.early_stopping / lgb.log_evaluation callbacks instead. Confirm
        # the installed version before upgrading.
        clf.fit(train_features, train_labels, eval_metric = 'auc',
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'], categorical_feature = cat_indices,
                early_stopping_rounds = 100, verbose = 200)

        # Record the best iteration
        best_iteration = clf.best_iteration_

        # Make predictions; dividing by the fold count turns the sum into a mean
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory
        gc.enable()
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, metrics

In [6]:
# Hyperparameters handed to lgb.LGBMClassifier inside model(); entries here
# override the defaults that model() sets itself.
LIGHTGBM_PARAMS = dict(
    # boosting setup
    boosting_type='goss',
    n_estimators=10000,
    learning_rate=0.005134,
    # tree shape
    num_leaves=54,
    max_depth=10,
    subsample_for_bin=240000,
    # regularisation
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    colsample_bytree=0.508716,
    min_split_gain=0.024766,
    subsample=1,
    is_unbalance=False,
    # logging
    silent=-1,
    verbose=-1,
)

In [7]:
# Run the cross-validated training, then show the per-fold AUC table.
submission, metrics = model(features=train, test_features=test, labels=labels)
print('LightGBM metrics', metrics, sep='\n')

Training Data Shape:  (307506, 657)
Testing Data Shape:  (48744, 657)


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.780177	train's binary_logloss: 0.248548	valid's auc: 0.757979	valid's binary_logloss: 0.250784
[400]	train's auc: 0.803354	train's binary_logloss: 0.236692	valid's auc: 0.769293	valid's binary_logloss: 0.243274
[600]	train's auc: 0.819102	train's binary_logloss: 0.229269	valid's auc: 0.777341	valid's binary_logloss: 0.239534
[800]	train's auc: 0.831547	train's binary_logloss: 0.223574	valid's auc: 0.783028	valid's binary_logloss: 0.237241
[1000]	train's auc: 0.841869	train's binary_logloss: 0.218877	valid's auc: 0.786676	valid's binary_logloss: 0.235787
[1200]	train's auc: 0.851192	train's binary_logloss: 0.214717	valid's auc: 0.789287	valid's binary_logloss: 0.23478
[1400]	train's auc: 0.859779	train's binary_logloss: 0.210919	valid's auc: 0.791141	valid's binary_logloss: 0.234075
[1600]	train's auc: 0.867744	train's binary_logloss: 0.207393	valid's auc: 0.79257	valid's binary_logloss: 0.233529
[1800]	t

  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.77965	train's binary_logloss: 0.248434	valid's auc: 0.762643	valid's binary_logloss: 0.251117
[400]	train's auc: 0.803343	train's binary_logloss: 0.236569	valid's auc: 0.77326	valid's binary_logloss: 0.243528
[600]	train's auc: 0.819443	train's binary_logloss: 0.229068	valid's auc: 0.780479	valid's binary_logloss: 0.23985
[800]	train's auc: 0.831563	train's binary_logloss: 0.223395	valid's auc: 0.78531	valid's binary_logloss: 0.237685
[1000]	train's auc: 0.841744	train's binary_logloss: 0.218711	valid's auc: 0.788647	valid's binary_logloss: 0.236284
[1200]	train's auc: 0.851042	train's binary_logloss: 0.214581	valid's auc: 0.790942	valid's binary_logloss: 0.23538
[1400]	train's auc: 0.859573	train's binary_logloss: 0.210788	valid's auc: 0.792895	valid's binary_logloss: 0.234644
[1600]	train's auc: 0.8675	train's binary_logloss: 0.207253	valid's auc: 0.794331	valid's binary_logloss: 0.234123
[1800]	train'

  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.779876	train's binary_logloss: 0.247991	valid's auc: 0.756889	valid's binary_logloss: 0.252947
[400]	train's auc: 0.803321	train's binary_logloss: 0.236187	valid's auc: 0.768414	valid's binary_logloss: 0.245484
[600]	train's auc: 0.819604	train's binary_logloss: 0.228698	valid's auc: 0.776588	valid's binary_logloss: 0.241766
[800]	train's auc: 0.831764	train's binary_logloss: 0.223059	valid's auc: 0.78185	valid's binary_logloss: 0.239576
[1000]	train's auc: 0.842218	train's binary_logloss: 0.218343	valid's auc: 0.785295	valid's binary_logloss: 0.238168
[1200]	train's auc: 0.851559	train's binary_logloss: 0.214179	valid's auc: 0.787845	valid's binary_logloss: 0.237184
[1400]	train's auc: 0.860244	train's binary_logloss: 0.210366	valid's auc: 0.789567	valid's binary_logloss: 0.236508
[1600]	train's auc: 0.86815	train's binary_logloss: 0.206826	valid's auc: 0.790798	valid's binary_logloss: 0.23602
[1800]	tr

  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.778792	train's binary_logloss: 0.24814	valid's auc: 0.765007	valid's binary_logloss: 0.252606
[400]	train's auc: 0.802636	train's binary_logloss: 0.236403	valid's auc: 0.776865	valid's binary_logloss: 0.244531
[600]	train's auc: 0.818754	train's binary_logloss: 0.228959	valid's auc: 0.78492	valid's binary_logloss: 0.240466
[800]	train's auc: 0.830908	train's binary_logloss: 0.223313	valid's auc: 0.7896	valid's binary_logloss: 0.238154
[1000]	train's auc: 0.841304	train's binary_logloss: 0.218619	valid's auc: 0.79264	valid's binary_logloss: 0.236717
[1200]	train's auc: 0.850594	train's binary_logloss: 0.214475	valid's auc: 0.794666	valid's binary_logloss: 0.235766
[1400]	train's auc: 0.859299	train's binary_logloss: 0.210657	valid's auc: 0.796015	valid's binary_logloss: 0.235144
[1600]	train's auc: 0.867351	train's binary_logloss: 0.207102	valid's auc: 0.796983	valid's binary_logloss: 0.234695
[1800]	trai

  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.780503	train's binary_logloss: 0.247396	valid's auc: 0.75532	valid's binary_logloss: 0.255279
[400]	train's auc: 0.804043	train's binary_logloss: 0.235627	valid's auc: 0.767337	valid's binary_logloss: 0.24767
[600]	train's auc: 0.820112	train's binary_logloss: 0.228213	valid's auc: 0.775289	valid's binary_logloss: 0.24389
[800]	train's auc: 0.83215	train's binary_logloss: 0.222571	valid's auc: 0.780682	valid's binary_logloss: 0.241564
[1000]	train's auc: 0.842442	train's binary_logloss: 0.217885	valid's auc: 0.784111	valid's binary_logloss: 0.240121
[1200]	train's auc: 0.851916	train's binary_logloss: 0.213716	valid's auc: 0.786626	valid's binary_logloss: 0.239111
[1400]	train's auc: 0.860587	train's binary_logloss: 0.209911	valid's auc: 0.788821	valid's binary_logloss: 0.238291
[1600]	train's auc: 0.868347	train's binary_logloss: 0.206412	valid's auc: 0.790179	valid's binary_logloss: 0.237754
[1800]	tra

In [8]:
# Write the fold-averaged test predictions to disk without the row index.
submission.to_csv(path_or_buf='lgb.csv', index=False)