In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import lightgbm as lgb
import gc
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report

In [None]:
# Training data
app_train = pd.read_csv('../input/application_train.csv')
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
# Testing data features
app_test = pd.read_csv('../input/application_test.csv')
print('Testing data shape: ', app_test.shape)
app_test.head()

In [None]:
# Share of positive labels (TARGET == 1) as a percentage; the mean of the
# boolean mask is the vectorised equivalent of sum(mask) / number of rows.
print('The proportion of label 1 is %.2f' % ((app_train['TARGET'] == 1).mean() * 100), '%')

# Class counts go last: a notebook only renders the value of a cell's final
# expression, so a bare expression placed before the print is silently discarded.
app_train['TARGET'].value_counts()

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints the total number of columns in ``df`` and how many of them have
    at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``. Has two columns,
        'Missing Values' (null count) and 'Percentage' (share of rows that
        are null, rounded to one decimal). Only columns with at least one
        missing value are kept, sorted by descending percentage.
    """
    # Count nulls once and reuse the result for both output columns.
    mis_val = df.isnull().sum()

    # Guard the division: a frame with zero rows has nothing missing, and
    # 0 / 0 would otherwise yield NaN percentages.
    n_rows = df.shape[0]
    mis_val_percent = mis_val * 100 / n_rows if n_rows else mis_val * 0.0

    mis_val_table = pd.DataFrame({'Missing Values': mis_val,
                                  'Percentage': mis_val_percent})

    # Filter on the count rather than the percentage so NaN percentages can
    # never slip through; round(1) keeps a single decimal place.
    mis_val_table = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('Percentage', ascending=False)
        .round(1)
    )

    # Print information
    print('The total dataframe has ' + str(df.shape[1]) + ' columns')
    print('There are ' + str(mis_val_table.shape[0]) + ' columns that have missing values')

    return mis_val_table

In [None]:
missing_values = missing_values_table(app_train)
missing_values.head(20)

In [None]:
app_train.dtypes.value_counts()

In [None]:
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)

In [None]:
app_test.dtypes.value_counts()

In [None]:
# Names of the object-dtype columns of the training frame.
string_columns = app_train.select_dtypes('object').columns

In [None]:
# Discard (rather than encode) the object-dtype columns. The list comes from the
# training frame and is applied to both frames, so the test frame must contain
# the same columns.
# NOTE(review): both names are rebound to the reduced frames, so re-running this
# cell once the columns are gone will presumably raise a KeyError from drop();
# use Restart & Run All instead of re-executing it.
app_train = app_train.drop(columns = string_columns)
app_test = app_test.drop(columns = string_columns)

In [None]:
print(app_train.shape)
print(app_test.shape)

In [None]:
# Correlation of every remaining column with TARGET, sorted ascending
# (most negative first, most positive last). TARGET's correlation with itself
# is part of the result, so it shows up at the tail of the sorted series.
correlations = app_train.corr()['TARGET'].sort_values()

In [None]:
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

In [None]:
def train_with_cv(train_data, test_data, n_folds, seed_varying):
    """Train a LightGBM classifier with K-fold CV and predict the test set.

    One model is fitted per fold. Test predictions and feature importances
    are averaged over the folds; out-of-fold predictions on the training
    rows are used for the overall validation AUC.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns 'SK_ID_CURR' and 'TARGET'; every other
        column is used as a feature.
    test_data : pandas.DataFrame
        Must contain 'SK_ID_CURR'; every other column is used as a feature.
        NOTE(review): the features are converted to arrays, so the test
        columns are assumed to be in the same order as the training
        features — confirm against the caller.
    n_folds : int
        Number of cross-validation splits.
    seed_varying : int
        Offset added to the base seed 50 for both the fold shuffling and
        the model's random state.

    Returns
    -------
    pred_score : pandas.DataFrame
        Columns 'SK_ID_CURR' and 'TARGET' (fold-averaged positive-class
        probability for each test row).
    feature_importances : pandas.DataFrame
        Columns 'feature' and 'importance' (fold-averaged).
    metrics : pandas.DataFrame
        Columns 'fold', 'train', 'valid'. One row per fold plus a final
        'overall' row whose 'valid' value is the AUC of the out-of-fold
        predictions and whose 'train' value is the mean of the fold
        training AUCs.
    """
    test_ids = test_data['SK_ID_CURR']

    train_labels = train_data['TARGET']

    train_features = train_data.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_data.drop(columns = ['SK_ID_CURR'])

    feature_names = list(train_features.columns)
    feature_importance_values = np.zeros(len(feature_names))

    train_features = np.array(train_features)
    test_features = np.array(test_features)

    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50+seed_varying)

    # Fold-averaged test predictions and out-of-fold training predictions
    test_pred = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(train_features.shape[0])

    valid_scores = []
    train_scores = []

    # Loop-invariant, so it is called once instead of on every fold.
    gc.enable()

    for train_indices, valid_indices in k_fold.split(train_features):
        # KFold yields positional indices, so the label Series must be indexed
        # with .iloc; plain [] is label-based and is only correct when the
        # frame happens to have a default RangeIndex.
        x_train, y_train = train_features[train_indices], train_labels.iloc[train_indices]
        x_valid, y_valid = train_features[valid_indices], train_labels.iloc[valid_indices]

        # NOTE(review): LightGBM appears to apply `subsample` only when
        # `subsample_freq` > 0, so this setting may be a no-op — confirm.
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                   class_weight = 'balanced', learning_rate = 0.05,
                                   reg_alpha = 0.1, reg_lambda = 0.1,
                                   subsample = 0.8, n_jobs = -1, random_state = 50+seed_varying)

        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords were
        # dropped from newer LightGBM releases in favour of callbacks — confirm
        # the installed version before upgrading.
        model.fit(x_train, y_train, eval_metric = 'auc',
                  eval_set = [(x_valid, y_valid), (x_train, y_train)],
                  eval_names = ['valid', 'train'],
                  early_stopping_rounds = 100, verbose = -1)

        best_iteration = model.best_iteration_

        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        test_pred += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        out_of_fold[valid_indices] = model.predict_proba(x_valid, num_iteration = best_iteration)[:, 1]

        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Release the per-fold model and data before the next fold
        del model, x_train, y_train, x_valid, y_valid
        gc.collect()

    pred_score = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_pred})
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation AUC is computed on the out-of-fold predictions
    valid_auc = roc_auc_score(train_labels, out_of_fold)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return pred_score, feature_importances, metrics

In [None]:
train_times = 8
n_folds = 5

# One row per seeded repetition: [overall train AUC, overall valid AUC].
metrics_all = np.zeros((train_times, 2))

# Running sums of the test predictions and feature importances across seeds.
submission = None
feat_import = None

for seed_varying in range(train_times):
    sub, fi, metrics = train_with_cv(app_train, app_test, n_folds, seed_varying)

    if submission is None:
        # The first repetition seeds the accumulators
        submission, feat_import = sub, fi
    else:
        submission['TARGET'] += sub['TARGET']
        feat_import['importance'] += fi['importance']

    # Last row of `metrics` is the 'overall' row; columns 1:3 are train/valid
    metrics_all[seed_varying, :] = metrics.iloc[-1, 1:3]

# Turn the accumulated sums into averages over the repetitions
submission['TARGET'] = submission['TARGET'] / train_times
feat_import['importance'] = feat_import['importance'] / train_times

submission.to_csv('lightgbm_version_2.csv', index = False)

In [None]:
metrics_all