In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# Suppress warnings.
# NOTE(review): filterwarnings('ignore') without a category silences every
# Python warning from here on, not only the ones raised by pandas.
import warnings
warnings.filterwarnings('ignore')

# Show which files are available in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
import lightgbm as lgb
import gc
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the training data, report its shape and preview the first rows
train_csv_path = '../input/application_train.csv'
app_train = pd.read_csv(train_csv_path)
print('Training data shape: ', app_train.shape)
app_train.head()

In [None]:
# Load the testing data features, report their shape and preview the first rows
test_csv_path = '../input/application_test.csv'
app_test = pd.read_csv(test_csv_path)
print('Testing data shape: ', app_test.shape)
app_test.head()

In [None]:
# Show the class balance of the label. The counts are printed explicitly:
# only the last expression of a cell is displayed automatically, so a bare
# value_counts() call followed by another statement shows nothing.
target_counts = app_train['TARGET'].value_counts()
print(target_counts)

# Share of rows whose label is 1, taken from the counts computed above
# (0 if the label never occurs)
print('The proportion of label 1 is %.2f' % (target_counts.get(1, 0) / app_train.shape[0] * 100), '%')

In [None]:
# Determine the number and the percentage of missing values in every column of a dataframe
def missing_values_table(df):
    """Summarise the missing values of a dataframe column by column.

    Parameters
    --------
        df (dataframe):
            the dataframe to inspect

    Return
    --------
        mis_val_table (dataframe):
            one row for every column of df that has at least one missing
            value, with the columns 'Missing Values' (count) and
            'Percentage' (share of the rows, rounded to one decimal),
            sorted by 'Percentage' in descending order.
    """
    # Count the missing values once and derive the percentage from that count
    mis_val = df.isnull().sum()
    mis_val_percent = mis_val * 100 / df.shape[0]

    # Put both series side by side, naming the columns directly
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis = 1,
                              keys = ['Missing Values', 'Percentage'])

    # Keep only the columns that actually have missing values, largest share first;
    # round(1) keeps a single decimal
    mis_val_table = mis_val_table[mis_val_table['Percentage'] != 0]
    mis_val_table = mis_val_table.sort_values('Percentage', ascending = False).round(1)

    # Print summary information
    print('The total dataframe has ' + str(df.shape[1]) + ' columns')
    print('There are ' + str(mis_val_table.shape[0]) + ' columns that have missing values')

    return mis_val_table

In [None]:
# Build the missing-value summary of the training data and show its first 20 rows
missing_values = missing_values_table(app_train)
missing_values.head(20)

In [None]:
# Count how many columns of the training data have each dtype
app_train.dtypes.value_counts()

In [None]:
# Number of distinct values in every 'object'-typed column of the training data
object_columns = app_train.select_dtypes('object')
object_columns.nunique()

In [None]:
# Count how many columns of the testing data have each dtype
app_test.dtypes.value_counts()

In [None]:
# Convert the 'object'-typed columns to numeric ones with one-hot encoding

app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
# Align train and test so that both keep only the columns they share.
# The labels are set aside first and re-attached to the training data afterwards.
train_labels = app_train['TARGET']
app_train, app_test = app_train.align(app_test, axis = 1, join = 'inner')
app_train['TARGET'] = train_labels

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
# Handle the anomalous DAYS_EMPLOYED value: flag it in a new boolean column
# and replace it with NaN.
# The column is reassigned instead of calling replace(..., inplace=True) on
# frame['DAYS_EMPLOYED']: an in-place call on a selected column is chained
# assignment and does not write back to the dataframe under pandas
# Copy-on-Write.
# NOTE: run this cell once per freshly loaded dataframe - after the
# replacement no value equals the placeholder any more, so running it again
# would recompute the flag as all False.
DAYS_EMPLOYED_ANOMALY = 365243

for frame in (app_train, app_test):
    frame['DAYS_EMPLOYED_ANOM'] = frame['DAYS_EMPLOYED'] == DAYS_EMPLOYED_ANOMALY
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_ANOMALY: np.nan})

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
# Correlation of every column with TARGET, in ascending order

target_column_corr = app_train.corr()['TARGET']
correlations = target_column_corr.sort_values()

# Print the 30 most correlated features: the 15 at each end of the sorted series
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

In [None]:
# Add the domain-knowledge features

def _with_domain_features(frame):
    """Return a copy of `frame` with the four ratio features appended."""
    enriched = frame.copy()
    enriched['CREDIT_INCOME_PERCENT'] = enriched['AMT_CREDIT'] / enriched['AMT_INCOME_TOTAL']
    enriched['ANNUITY_INCOME_PERCENT'] = enriched['AMT_ANNUITY'] / enriched['AMT_INCOME_TOTAL']
    enriched['CREDIT_TERM'] = enriched['AMT_ANNUITY'] / enriched['AMT_CREDIT']
    enriched['DAYS_EMPLOYED_PERCENT'] = enriched['DAYS_EMPLOYED'] / enriched['DAYS_BIRTH']
    return enriched

# The same four ratios are computed for the training and the testing data
app_train_domain = _with_domain_features(app_train)
app_test_domain = _with_domain_features(app_test)

print('Domain Training Features shape: ', app_train_domain.shape)
print('Domain Testing Features shape: ', app_test_domain.shape)

**Manual Feature Engineering (Part 1): bureau.csv**

分为四步：
1. 将SK_ID_CURR的次数统计出来
2. 将bureau的数值类型特征抽取出来
3. 将bureau的字符类型特征抽取出来
4. 将上述特征加入train和test

In [None]:
# Load the bureau table and preview its first rows
bureau_csv_path = '../input/bureau.csv'
bureau = pd.read_csv(bureau_csv_path)
bureau.head()

In [None]:
# Step 1: count the 'SK_ID_BUREAU' entries of every 'SK_ID_CURR'
bureau_ids_per_client = bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count()
previous_loan_counts = bureau_ids_per_client.rename(columns = {'SK_ID_BUREAU': 'previous_loan_counts'})
previous_loan_counts.head()

## 特别注意：将次数加入到train和test时，需要将缺失值设为0，表示该SK_ID_CURR在bureau中没有出现过

In [None]:
# Step 2: extract the numeric features

def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on
        group_var (string):
            the variable by which to group df
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with the statistics aggregated for
            all numeric columns. Each instance of the grouping variable will have
            the statistics (count, mean, max, min, sum) calculated.
            The first column is the grouping variable; every other column is
            named '<df_name>_<variable>_<statistic>'.

    """
    # Remove id variables other than the grouping variable, i.e. every other
    # column whose name contains 'SK_ID' (the ids of the other files)
    id_columns = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    df = df.drop(columns = id_columns)

    # Numeric columns plus the grouping variable (re-attached in case it is
    # not numeric itself); .copy() so the assignment does not write into a
    # selection of df
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Name every column from its own (variable, statistic) pair. Building the
    # names from columns.levels instead would rely on the level order matching
    # the column order, which a MultiIndex does not guarantee.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    # Turn the grouping variable back into the first regular column
    return agg.reset_index()

# Aggregate the numeric columns of bureau for every SK_ID_CURR and preview the result
bureau_agg = agg_numeric(bureau.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg.head()

In [None]:
# Step 3: extract the categorical (string) features

def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable

    Parameters
    --------
    df : dataframe
        The dataframe to calculate the value counts for.

    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row

    df_name : string
        Variable added to the front of column names to keep track of columns


    Return
    --------
    categorical : dataframe
        A dataframe indexed by `group_var`, with one row for every unique
        value of it and, for each unique category in every categorical
        variable, the columns '<df_name>_<category>_count' and
        '<df_name>_<category>_count_norm'.

    """

    # One-hot encode the categorical ('object') columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var: the sum of a one-hot column is the category
    # count, its mean is the normalized count
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])

    # Name every column from its own (category, statistic) pair instead of
    # relying on the order of columns.levels matching the column order
    stat_labels = {'sum': 'count', 'mean': 'count_norm'}
    categorical.columns = ['%s_%s_%s' % (df_name, var, stat_labels[stat])
                           for var, stat in categorical.columns]

    return categorical

# Count the categories of bureau's categorical columns for every SK_ID_CURR and preview the result
bureau_counts = count_categorical(bureau, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_counts.head()

In [None]:
# Step 4: add the features above to train and test

# 1. Add previous_loan_counts to train and test

def _attach_loan_counts(frame):
    """Left-merge previous_loan_counts onto `frame`; ids without a match get 0."""
    merged = frame.merge(previous_loan_counts, how = 'left', on = 'SK_ID_CURR')
    merged['previous_loan_counts'] = merged['previous_loan_counts'].fillna(0)
    return merged

train = _attach_loan_counts(app_train_domain)
test = _attach_loan_counts(app_test_domain)

In [None]:
# 2. Add bureau_agg to train and test (left merge on the client id)
agg_merge_options = dict(on = 'SK_ID_CURR', how = 'left')
train = train.merge(bureau_agg, **agg_merge_options)
test = test.merge(bureau_agg, **agg_merge_options)

In [None]:
# 3. Add bureau_counts to train and test (left merge on the client id)
counts_merge_options = dict(on = 'SK_ID_CURR', how = 'left')
train = train.merge(bureau_counts, **counts_merge_options)
test = test.merge(bureau_counts, **counts_merge_options)

In [None]:
# Shapes before the alignment
print('Before align train.shape: ', train.shape)
print('Before align test.shape: ', test.shape)

# Set the labels aside, keep only the columns shared by both dataframes
# (this removes the 'TARGET' column), then put the labels back on train
train_labels = train['TARGET']
train, test = train.align(test, axis = 1, join = 'inner')
train['TARGET'] = train_labels

# Shapes after the alignment
print('After align train.shape: ', train.shape)
print('After align test.shape: ', test.shape)

In [None]:
def train_with_cv(train_data, test_data, n_folds, seed_varying):
    """Train LightGBM classifiers with K-fold cross validation.

    Parameters
    --------
        train_data (dataframe):
            training data; must contain the 'SK_ID_CURR' and 'TARGET' columns,
            every other column is used as a feature
        test_data (dataframe):
            testing data; must contain the 'SK_ID_CURR' column,
            every other column is used as a feature
        n_folds (int):
            number of cross validation folds
        seed_varying (int):
            offset added to the base seed 50, used for both the fold split
            and the model

    Return
    --------
        pred_score (dataframe):
            'SK_ID_CURR' of the testing data and 'TARGET', the predicted
            probability of class 1 averaged over the folds
        feature_importances (dataframe):
            'feature' and 'importance', the importance averaged over the folds
        metrics (dataframe):
            'fold', 'train' and 'valid' AUC, one row per fold plus an
            'overall' row (train: mean of the fold train AUCs; valid: AUC of
            the out-of-fold predictions)
    """
    test_ids = test_data['SK_ID_CURR']

    # Labels as a plain array: the fold indices are positions, and indexing a
    # Series with them would look them up as index labels, which is only
    # correct when train_data has a default RangeIndex
    train_labels = np.asarray(train_data['TARGET'])

    train_features = train_data.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_data.drop(columns = ['SK_ID_CURR'])

    feature_names = list(train_features.columns)
    feature_importance_values = np.zeros(len(feature_names))

    # Explicit float matrices: converting a dataframe that mixes bool and
    # numeric columns without a dtype yields an object-dtype array
    train_features = np.asarray(train_features, dtype = np.float64)
    test_features = np.asarray(test_features, dtype = np.float64)

    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50+seed_varying)

    # Test predictions (averaged over the folds) and out-of-fold predictions
    test_pred = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(train_features.shape[0])

    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(train_features):
        x_train, y_train = train_features[train_indices], train_labels[train_indices]
        x_valid, y_valid = train_features[valid_indices], train_labels[valid_indices]

        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when subsample_freq is set to a non-zero value; it is left
        # unset here - confirm whether row subsampling was intended.
        model = lgb.LGBMClassifier(n_estimators=10000, objective = 'binary',
                                   class_weight = 'balanced', learning_rate = 0.05,
                                   reg_alpha = 0.1, reg_lambda = 0.1,
                                   subsample = 0.8, n_jobs = -1, random_state = 50+seed_varying)

        # NOTE(review): the early_stopping_rounds / verbose fit arguments were
        # removed in LightGBM 4.0 in favour of callbacks - confirm the
        # installed version before upgrading.
        model.fit(x_train, y_train, eval_metric = 'auc',
                  eval_set = [(x_valid, y_valid), (x_train, y_train)],
                  eval_names = ['valid', 'train'], categorical_feature = 'auto',
                  early_stopping_rounds = 100, verbose = -1)

        best_iteration = model.best_iteration_

        # Every fold contributes 1 / n_splits to the averaged quantities
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        test_pred += model.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        out_of_fold[valid_indices] = model.predict_proba(x_valid, num_iteration = best_iteration)[:, 1]

        valid_scores.append(model.best_score_['valid']['auc'])
        train_scores.append(model.best_score_['train']['auc'])

        # Release the fold's model and data before the next fold
        gc.enable()
        del model, x_train, y_train, x_valid, y_valid
        gc.collect()

    pred_score = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_pred})
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall scores: AUC of the out-of-fold predictions and mean train AUC
    valid_auc = roc_auc_score(train_labels, out_of_fold)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    return pred_score, feature_importances, metrics

In [None]:
# Repeat the cross-validated training with several seeds, then average the
# predictions, the feature importances and the overall metrics
train_times = 3
n_folds = 5

# One row per training run: overall train and valid AUC
metrics_all = np.zeros((train_times, 2))
for seed_varying in range(train_times):
    print('\n=======================================================')
    print('The ', seed_varying, ' time of train')
    print('\n=======================================================')
    # Remember to change train and test here when using other feature sets
    sub, fi, metrics = train_with_cv(train, test, n_folds, seed_varying)
    if seed_varying == 0:
        submission = sub
        feat_import = fi
    else:
        submission['TARGET'] += sub['TARGET']
        feat_import['importance'] += fi['importance']

    # The last row of metrics is the 'overall' row; select the scores by name
    overall_scores = metrics.loc[metrics.index[-1], ['train', 'valid']]
    metrics_all[seed_varying, :] = overall_scores.astype(float).values

metrics_all_average = metrics_all.mean(axis = 0)
# np.vstack replaces np.row_stack, an alias deprecated in NumPy 2.0
metrics_all = np.vstack([metrics_all, metrics_all_average])
train_time_names = list(range(train_times))
train_time_names.append('Average')
metrics_final = pd.DataFrame({'train_time': train_time_names,
                            'train': metrics_all[:,0],
                            'valid': metrics_all[:,1]})

# Turn the accumulated sums into averages over the training runs
submission['TARGET'] = submission['TARGET'] / train_times
feat_import['importance'] = feat_import['importance'] / train_times

submission.to_csv('lightgbm_version_9.csv', index = False)

In [None]:
# Show the overall train / valid AUC of every training run and their average
metrics_final