In [1]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import gc
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Directory that holds the input CSV files. It is joined with each file name
# via os.path.join, so the empty string means "use the bare file name".
DATA_DIRECTORY = ''

In [3]:
# Read the three input tables from DATA_DIRECTORY, in the same order as the
# names on the left-hand side.
train, test, labels = (
    pd.read_csv(os.path.join(DATA_DIRECTORY, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'labels.csv')
)

In [4]:
# Keep the test identifiers for the submission file before the ID column is
# removed from the feature tables.
test_ids = test['SK_ID_CURR']

# Drop the identifier column by rebinding the names instead of mutating the
# loaded frames in place.
train = train.drop(columns='SK_ID_CURR')
test = test.drop(columns='SK_ID_CURR')

# DataFrame.to_numpy() returns a 2-D array; a single-column frame becomes an
# (n, 1) column vector, which makes scikit-learn emit a DataConversionWarning
# on every fit. squeeze(axis=1) turns it into a 1-D vector and raises a
# ValueError if the labels file unexpectedly holds more than one column.
labels = labels.to_numpy().squeeze(axis=1)

In [5]:
# Median imputation: the imputer is fitted on the training features only and
# then applied to both the training and the test features.
# NOTE(review): it is fitted on the full training set before cross validation,
# so validation folds contribute to the medians — confirm this is acceptable.
imputer = SimpleImputer(strategy='median').fit(train)
train, test = imputer.transform(train), imputer.transform(test)

In [6]:
# Min-max scaling to the range [0, 1]: the scaler is fitted on the training
# features only and then applied to both the training and the test features.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)
train, test = scaler.transform(train), scaler.transform(test)

In [7]:
def model(features, test_features, labels, test_ids, n_folds = 5):
    """Train LightGBM with K-fold cross validation and predict the test set.

    A fresh ``LGBMClassifier`` is fitted on each fold's training rows, with
    early stopping evaluated on that fold's validation rows. Test predictions
    are the average of the per-fold predicted probabilities of class 1.

    Parameters
    ----------
    features : array-like
        Training features; must expose ``.shape`` and support row selection
        with an integer index array (e.g. a 2-D NumPy array).
    test_features : array-like
        Test features with the same columns as ``features``.
    labels : array-like
        Binary targets, one per row of ``features``. A 1-D vector or an
        ``(n, 1)`` column vector is accepted; the latter is flattened.
    test_ids : array-like
        Identifiers written to the ``SK_ID_CURR`` column of the submission.
    n_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    submission : pandas.DataFrame
        Columns ``SK_ID_CURR`` and ``TARGET`` (fold-averaged probability).
    metrics : pandas.DataFrame
        Columns ``fold``, ``train`` and ``valid`` with one row per fold plus
        an ``overall`` row. The overall validation value is the ROC AUC of the
        out-of-fold predictions; the overall training value is the mean of
        the per-fold training AUCs.
    """
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # An (n, 1) column vector of targets triggers a scikit-learn
    # DataConversionWarning on every fit, so flatten it to 1-D up front.
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    # Create the kfold object
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)

    # Accumulator for the fold-averaged test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Out-of-fold validation predictions, filled in one fold at a time
    out_of_fold = np.zeros(features.shape[0])

    # Per-fold validation and training scores
    valid_scores = []
    train_scores = []

    # Make sure the garbage collector is on for the per-fold clean-up below
    gc.enable()

    for train_indices, valid_indices in k_fold.split(features):

        # Training and validation data for the fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # The estimator is called `clf` so that it does not shadow this function's name
        clf = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                 class_weight = 'balanced', learning_rate = 0.05,
                                 reg_alpha = 0.1, reg_lambda = 0.1,
                                 subsample = 0.8, n_jobs = -1, random_state = 50)

        # The same (train_features, train_labels) objects are passed both as
        # the fit data and as the second evaluation set.
        fit_options = {
            'eval_metric': 'auc',
            'eval_set': [(valid_features, valid_labels), (train_features, train_labels)],
            'eval_names': ['valid', 'train'],
            'categorical_feature': 'auto',
        }
        if hasattr(lgb, 'log_evaluation'):
            # Newer LightGBM releases configure early stopping and logging via
            # callbacks; the `early_stopping_rounds` / `verbose` fit keywords
            # are no longer accepted by LightGBM 4.x.
            fit_options['callbacks'] = [lgb.early_stopping(stopping_rounds = 100),
                                        lgb.log_evaluation(period = 200)]
        else:
            # Older releases without `log_evaluation`: keep the legacy keywords.
            fit_options['early_stopping_rounds'] = 100
            fit_options['verbose'] = 200

        # Train the model
        clf.fit(train_features, train_labels, **fit_options)

        # Record the best iteration found by early stopping
        best_iteration = clf.best_iteration_

        # Each fold contributes an equal share to the averaged test prediction
        test_predictions += clf.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the scores at the best iteration
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Clean up memory before the next fold
        del clf, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Overall validation score over all out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metrics; the mean is computed over the
    # per-fold values before it is appended to the same list
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Row labels for the metrics table: fold numbers followed by 'overall'
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return submission, metrics

In [9]:
# Run the cross-validated training and show the per-fold metrics table.
submission, metrics = model(features=train, test_features=test,
                            labels=labels, test_ids=test_ids)
print('LightGBM metrics', metrics, sep='\n')

Training Data Shape:  (307506, 460)
Testing Data Shape:  (48744, 460)


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.834062	train's binary_logloss: 0.509721	valid's auc: 0.785997	valid's binary_logloss: 0.529633
Early stopping, best iteration is:
[291]	train's auc: 0.852715	train's binary_logloss: 0.488495	valid's auc: 0.787232	valid's binary_logloss: 0.516437


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.833142	train's binary_logloss: 0.510675	valid's auc: 0.788635	valid's binary_logloss: 0.530732
[400]	train's auc: 0.87022	train's binary_logloss: 0.468469	valid's auc: 0.791102	valid's binary_logloss: 0.503449
Early stopping, best iteration is:
[410]	train's auc: 0.871773	train's binary_logloss: 0.466645	valid's auc: 0.791196	valid's binary_logloss: 0.502256


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.834513	train's binary_logloss: 0.509192	valid's auc: 0.782047	valid's binary_logloss: 0.531465
[400]	train's auc: 0.87217	train's binary_logloss: 0.46583	valid's auc: 0.783951	valid's binary_logloss: 0.504387
Early stopping, best iteration is:
[440]	train's auc: 0.878361	train's binary_logloss: 0.458439	valid's auc: 0.784336	valid's binary_logloss: 0.499543


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.833576	train's binary_logloss: 0.510381	valid's auc: 0.789398	valid's binary_logloss: 0.531397
Early stopping, best iteration is:
[279]	train's auc: 0.850105	train's binary_logloss: 0.491796	valid's auc: 0.789683	valid's binary_logloss: 0.519993


  return f(*args, **kwargs)


Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.834678	train's binary_logloss: 0.508898	valid's auc: 0.782478	valid's binary_logloss: 0.531959
[400]	train's auc: 0.871986	train's binary_logloss: 0.465976	valid's auc: 0.783904	valid's binary_logloss: 0.505075
Early stopping, best iteration is:
[368]	train's auc: 0.866897	train's binary_logloss: 0.471852	valid's auc: 0.784085	valid's binary_logloss: 0.508745
LightGBM metrics
      fold     train     valid
0        0  0.852715  0.787232
1        1  0.871773  0.791196
2        2  0.878361  0.784336
3        3  0.850105  0.789683
4        4  0.866897  0.784085
5  overall  0.863970  0.787198


In [None]:
# Write the submission table to lgb.csv without the DataFrame index column.
submission.to_csv(path_or_buf='lgb.csv', index=False)