# Imports and Read Data
### 读取原始数据 application_train.csv / application_test.csv，处理完之后保存为 processed_train.csv 和 processed_test.csv。

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# modeling 
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# memory management
import gc

# Locate the data folder relative to the notebook file.
# dirname() of a bare file name is '', so PARENT_DIR resolves to the
# current working directory.
FILE_NAME = "vis.ipynb"
notebook_dir = os.path.dirname(FILE_NAME)
PARENT_DIR = os.path.abspath(os.path.join(notebook_dir, "."))

# Read the raw application tables
train_csv_path = PARENT_DIR + '/data/application_train.csv'
test_csv_path = PARENT_DIR + '/data/application_test.csv'
app_train = pd.read_csv(train_csv_path)
app_test = pd.read_csv(test_csv_path)

In [2]:
# Replace the anomalous DAYS_EMPLOYED value 365243 with NaN in both tables.
# The result is assigned back to the column instead of calling
# `replace(..., inplace = True)` on the selected Series: with pandas
# Copy-on-Write the chained in-place call only changes a temporary object
# and leaves the DataFrame untouched.
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

# Generate New Features
## Polynomial Features
### 提取兴趣特征

In [3]:
# Columns that feed the polynomial expansion; the training slice also keeps TARGET
poly_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
poly_features = app_train[poly_source_cols + ['TARGET']]
poly_features_test = app_test[poly_source_cols]

### 简单的缺失值处理

In [4]:
# Median imputation of the polynomial source columns
from sklearn.impute import SimpleImputer

# Set the label aside so that only predictors are imputed
poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns = ['TARGET'])

# Learn the medians on the training columns, then fill both tables with them
imputer = SimpleImputer(strategy = 'median')
poly_features = imputer.fit(poly_features).transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

### 直接构造多项式特征

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial expansion of degree 3
poly_transformer = PolynomialFeatures(degree = 3)

# Fit on the training matrix and expand it in one step,
# then apply the fitted transformer to the test matrix
poly_features = poly_transformer.fit_transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

### 新特征可视化

In [6]:
# Name the expanded columns after the four input features.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out` and fall back to the old method on older releases.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_columns = list(poly_transformer.get_feature_names_out(poly_input_names))
else:
    poly_columns = poly_transformer.get_feature_names(poly_input_names)

# Create a dataframe of the features
poly_features = pd.DataFrame(poly_features, columns = poly_columns)

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET']

In [7]:
# Absolute correlation of every polynomial feature with TARGET, ascending.
# TARGET itself and the '1' column are left out.
p_corrs = poly_corrs.drop(['TARGET', '1']).abs().sort_values()

In [None]:
# Horizontal bar chart: one bar per polynomial feature, bar length = |correlation with TARGET|
plt.figure(figsize = (10, 10))
plt.barh(y=p_corrs.index.astype(str), width=p_corrs.values, height=0.25, left=0)
plt.title('New Feature Correlations with target');

In [None]:
# Density of selected polynomial features, split by target value.
# Each panel gets an explicit legend: the curves are labelled, but current
# seaborn releases do not draw a legend for `label=` unless it is requested,
# which leaves the two curves indistinguishable.
kde_poly_features = ['EXT_SOURCE_2 EXT_SOURCE_3', 'EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3',
                     'EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH', 'EXT_SOURCE_2 DAYS_BIRTH']

fig, axes = plt.subplots(len(kde_poly_features), 1, figsize = (12, 20))

# one panel per feature
for ax, feature in zip(axes, kde_poly_features):

    # plot repaid loans
    sns.kdeplot(poly_features.loc[poly_features['TARGET'] == 0, feature], label = 'target == 0', ax = ax)
    # plot loans that were not repaid
    sns.kdeplot(poly_features.loc[poly_features['TARGET'] == 1, feature], label = 'target == 1', ax = ax)

    # Label the plots
    ax.set_title('Distribution of %s by Target Value' % feature)
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('Density')
    ax.legend()

fig.tight_layout(h_pad = 2.5)

### Merge到原数据集中

In [8]:
# Put test features into a dataframe with the expanded column names.
# `get_feature_names` was removed in scikit-learn 1.2, so prefer
# `get_feature_names_out` and fall back to the old method on older releases.
poly_input_names = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
if hasattr(poly_transformer, 'get_feature_names_out'):
    poly_columns_test = list(poly_transformer.get_feature_names_out(poly_input_names))
else:
    poly_columns_test = poly_transformer.get_feature_names(poly_input_names)
poly_features_test = pd.DataFrame(poly_features_test, columns = poly_columns_test)

# Select the ten features with the largest absolute correlation to TARGET
# (p_corrs is sorted ascending, so they are at the tail).
best_poly_feature_names = p_corrs.tail(10).index.to_list()

# Take explicit copies so that adding columns to these frames later does not
# write into a slice of poly_features / poly_features_test.
best_poly_features = poly_features[best_poly_feature_names].copy()
best_poly_features_test = poly_features_test[best_poly_feature_names].copy()

In [9]:
merge_key = 'SK_ID_CURR'

# Give both polynomial feature frames the applicant id so they can be joined
best_poly_features[merge_key] = app_train[merge_key]
best_poly_features_test[merge_key] = app_test[merge_key]

# Merge polynomial features into the training and testing dataframes
app_train_poly = app_train.merge(best_poly_features, how = 'left', on = merge_key)
app_test_poly = app_test.merge(best_poly_features_test, how = 'left', on = merge_key)

# Keep only the columns that both dataframes share
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Put the target column back on the training dataframe
app_train_poly['TARGET'] = poly_target

# Report the resulting shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape:  ', app_test_poly.shape)

Training data with polynomial features shape:  (307511, 132)
Testing data with polynomial features shape:   (48744, 131)


## Domain Knowledge Features
### 训练集构造特征

In [10]:
# Work on copies so the polynomial-feature frames stay untouched
app_test_domain = app_test_poly.copy()

# Ratio features on the training set; `assign` returns a new frame with the
# four columns appended in the order listed
app_train_domain = app_train_poly.assign(
    CREDIT_INCOME_PERCENT = lambda d: d['AMT_CREDIT'] / d['AMT_INCOME_TOTAL'],
    ANNUITY_INCOME_PERCENT = lambda d: d['AMT_ANNUITY'] / d['AMT_INCOME_TOTAL'],
    ANNUITY_CREDIT_PERCENT = lambda d: d['AMT_ANNUITY'] / d['AMT_CREDIT'],
    DAYS_EMPLOYED_PERCENT = lambda d: d['DAYS_EMPLOYED'] / d['DAYS_BIRTH'],
)

### 测试集构造特征

In [11]:
# Same four ratio features as on the training set, appended in the same order
app_test_domain = app_test_domain.assign(
    CREDIT_INCOME_PERCENT = lambda d: d['AMT_CREDIT'] / d['AMT_INCOME_TOTAL'],
    ANNUITY_INCOME_PERCENT = lambda d: d['AMT_ANNUITY'] / d['AMT_INCOME_TOTAL'],
    ANNUITY_CREDIT_PERCENT = lambda d: d['AMT_ANNUITY'] / d['AMT_CREDIT'],
    DAYS_EMPLOYED_PERCENT = lambda d: d['DAYS_EMPLOYED'] / d['DAYS_BIRTH'],
)

In [12]:
# Print out the new shapes (the test label previously read "polynomial $ domain",
# a typo for "&" that made it inconsistent with the training label)
print('Training data with polynomial & domain features shape: ', app_train_domain.shape)
print('Testing data with polynomial & domain features shape:  ', app_test_domain.shape)

Training data with polynomial & domain features shape:  (307511, 136)
Testing data with polynomial $ domain features shape:   (48744, 135)


### 特征相关性可视化

In [None]:
# Raw inputs, the four ratio features built from them, and the target
domain_corr_cols = ['AMT_CREDIT', 'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
                    'CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'ANNUITY_CREDIT_PERCENT',
                    'DAYS_EMPLOYED_PERCENT', 'TARGET']
domain_features = app_train_domain[domain_corr_cols]

# Correlation of each column with TARGET, ascending
domain_corrs = domain_features.corr()['TARGET'].sort_values()
domain_corrs

In [None]:
# Rank the domain features by absolute correlation with TARGET
d_corrs = domain_corrs.drop(['TARGET']).abs().sort_values()

# Horizontal bar chart: one bar per feature, bar length = |correlation|
plt.figure(figsize = (10, 5))
plt.barh(y=d_corrs.index.astype(str), width=d_corrs.values, height=0.5, left=0)
plt.title('New Feature Correlations with target');

In [None]:
# Density of the four domain ratio features, split by target value.
# Each panel gets an explicit legend: the curves are labelled, but current
# seaborn releases do not draw a legend for `label=` unless it is requested,
# which leaves the two curves indistinguishable.
kde_domain_features = ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT',
                       'ANNUITY_CREDIT_PERCENT', 'DAYS_EMPLOYED_PERCENT']

fig, axes = plt.subplots(len(kde_domain_features), 1, figsize = (12, 20))

# one panel per feature
for ax, feature in zip(axes, kde_domain_features):

    # plot repaid loans
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label = 'target == 0', ax = ax)
    # plot loans that were not repaid
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label = 'target == 1', ax = ax)

    # Label the plots
    ax.set_title('Distribution of %s by Target Value' % feature)
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('Density')
    ax.legend()

fig.tight_layout(h_pad = 2.5)

## Feature Tools

In [None]:
# Independent copy of the test table
app_test_auto = app_test.copy()

# Small sample of the training table, ordered by applicant id.
# `.loc[:1000]` is label-based and inclusive, so after the index reset it
# keeps rows 0..1000, i.e. up to 1001 rows.
app_train_auto = (
    app_train.copy()
    .sort_values('SK_ID_CURR')
    .reset_index(drop = True)
    .loc[:1000]
)

In [None]:
# featuretools is imported here, at first use, rather than with the imports in the first cell
import featuretools as ft

# Create an empty entity set whose id is 'clients'
es = ft.EntitySet(id = 'clients')

# Register the sampled training frame as entity 'app_train', using SK_ID_CURR as its index
# NOTE(review): `entity_from_dataframe` looks like the pre-1.0 featuretools API — confirm the installed version
es = es.entity_from_dataframe(entity_id = 'app_train', dataframe = app_train_auto, index = 'SK_ID_CURR')

### Default primitives

In [None]:
# Allow wider text columns so long cell contents are not truncated
pd.options.display.max_colwidth = 100

# List the primitives featuretools offers and show only the transform ones
primitives = ft.list_primitives()
primitives.loc[primitives['type'] == 'transform']

In [None]:
# Primitives handed to deep feature synthesis: four transform primitives, no aggregation primitives
default_agg_primitives = []
default_trans_primitives =  ["diff", "divide_by_feature", "absolute", "haversine"]

# Run DFS on the 'app_train' entity with the primitives above.
# The call is unpacked into a feature matrix and the list of generated features.
# NOTE(review): `target_entity` looks like the pre-1.0 featuretools keyword — confirm the installed version
feature_matrix, feature_names = ft.dfs(entityset = es, target_entity = 'app_train',
                    trans_primitives = default_trans_primitives,
                    agg_primitives = default_agg_primitives,
                    max_depth = 2, features_only = False, verbose = True)

# Report how many features were generated
print('%d Total Features' % len(feature_names))

In [None]:
# Dimensions of the generated feature matrix
feature_matrix.shape

In [None]:
# Show the last 20 generated features
feature_names[-20:]

## Result datasets with new features

In [13]:
# Feature selection starts from copies of the polynomial + domain feature frames
app_train_nf, app_test_nf = app_train_domain.copy(), app_test_domain.copy()

# Feature Selection
## Remove Collinear Variables

In [16]:
# Absolute value correlation matrix.
# Only numeric/boolean columns are correlated: the frame still holds the raw
# categorical (object) columns at this point, and `DataFrame.corr()` raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
numeric_train = app_train_nf.select_dtypes(include = ['number', 'bool'])
corr_matrix = numeric_train.corr().abs()

# Upper triangle of correlations (diagonal excluded) so every pair appears once.
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype = bool), k=1)
upper = corr_matrix.where(upper_mask)
upper.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,EXT_SOURCE_2 EXT_SOURCE_3^2,EXT_SOURCE_2^2 EXT_SOURCE_3,EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH,EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3,EXT_SOURCE_2 EXT_SOURCE_3,TARGET,CREDIT_INCOME_PERCENT,ANNUITY_INCOME_PERCENT,ANNUITY_CREDIT_PERCENT,DAYS_EMPLOYED_PERCENT
SK_ID_CURR,,0.001129,0.00182,0.000343,0.000433,0.000232,0.000849,0.0015,8.4e-05,0.000973,...,0.001338,0.001527,0.001149,0.001189,0.001658,0.002108,0.001726,0.002351,0.000319,0.000136
CNT_CHILDREN,,,0.012882,0.002145,0.021374,0.001827,0.025573,0.330938,0.061145,0.183395,...,0.039596,0.035572,0.175433,0.073692,0.037726,0.019187,0.016012,0.002296,0.020751,0.009985
AMT_INCOME_TOTAL,,,,0.15687,0.191657,0.15961,0.074796,0.027261,0.013005,0.027805,...,0.008854,0.039285,0.006438,0.03109,0.023917,0.003982,0.108191,0.153033,0.026788,0.010678
AMT_CREDIT,,,,,0.770138,0.986968,0.099738,0.055436,0.091295,0.009621,...,0.09077,0.127608,0.107749,0.141562,0.113471,0.030369,0.651097,0.373921,0.558789,0.0668
AMT_ANNUITY,,,,,,0.775109,0.118429,0.009445,0.053604,0.038514,...,0.083815,0.120899,0.078002,0.123227,0.105109,0.012817,0.393239,0.484624,0.063489,0.041393


In [17]:
# Correlation level above which a column is considered redundant
threshold = 0.99

# A column is dropped when any entry in its upper-triangle column exceeds the threshold
exceeds_threshold = (upper > threshold).any(axis = 0)
to_drop = upper.columns[exceeds_threshold.values].tolist()

print('There are %d columns to remove.' % (len(to_drop)))

There are 16 columns to remove.


In [18]:
# Remove the collinear columns from both datasets
app_train_nf = app_train_nf.drop(to_drop, axis = 1)
app_test_nf = app_test_nf.drop(to_drop, axis = 1)

# Report the shapes after the removal
print('Training shape: ', app_train_nf.shape)
print('Testing shape: ', app_test_nf.shape)

Training shape:  (307511, 120)
Testing shape:  (48744, 119)


In [19]:
# Save the processed datasets next to the raw data.
# The DataFrame index is written as the first, unnamed CSV column.
processed_train_path = PARENT_DIR + '/data/processed_train.csv'
processed_test_path = PARENT_DIR + '/data/processed_test.csv'
app_train_nf.to_csv(processed_train_path)
app_test_nf.to_csv(processed_test_path)

## Remove missing values

In [None]:
# Fraction (0-1) of missing values per training column, largest first
train_null_counts = app_train_nf.isnull().sum()
train_missing = (train_null_counts / len(app_train_nf)).sort_values(ascending = False)
train_missing.head(10)

In [None]:
# Fraction (0-1) of missing values per test column, largest first
test_null_counts = app_test_nf.isnull().sum()
test_missing = (test_null_counts / len(app_test_nf)).sort_values(ascending = False)
test_missing.head(50)

In [None]:
# Keep only the names of columns whose missing fraction exceeds 0.75.
# Both names are rebound from a Series of fractions to an Index of column names.
train_missing = train_missing[train_missing > 0.75].index
test_missing = test_missing[test_missing > 0.75].index

# Columns over the threshold in either dataset
all_missing = list(set(train_missing) | set(test_missing))
print('There are %d columns with more than 75%% missing values' % len(all_missing))

In [None]:
# Drop the mostly-missing columns, then one-hot encode what is left
app_train_nf = pd.get_dummies(app_train_nf.drop(all_missing, axis = 1))
app_test_nf = pd.get_dummies(app_test_nf.drop(all_missing, axis = 1))

## Remove features with lower importance

In [None]:
# Preparation for the importance-based selection.
# Save the labels and ids first: the column alignment below keeps only columns
# shared by both frames, which removes TARGET from the training frame.
train_labels = app_train_nf["TARGET"]
train_ids = app_train_nf['SK_ID_CURR']
test_ids = app_test_nf['SK_ID_CURR']

# Drop the mostly-missing columns and one-hot encode.
# errors = 'ignore' keeps this cell runnable after the previous cell has
# already removed those columns; without it the repeated drop raises KeyError
# whenever all_missing is non-empty.
app_train_nf = pd.get_dummies(app_train_nf.drop(columns = all_missing, errors = 'ignore'))
app_test_nf = pd.get_dummies(app_test_nf.drop(columns = all_missing, errors = 'ignore'))

# Keep only the columns present in both datasets
app_train_nf, app_test_nf = app_train_nf.align(app_test_nf, join = 'inner', axis = 1)

print('Training set full shape: ', app_train_nf.shape)
print('Testing set full shape: ' , app_test_nf.shape)

In [None]:
# The applicant id is an identifier, not a feature: remove it from both datasets
app_train_nf = app_train_nf.drop('SK_ID_CURR', axis = 1)
app_test_nf = app_test_nf.drop('SK_ID_CURR', axis = 1)

In [None]:
# Fill the remaining missing training values with per-column medians.
# The imputer returns a plain array, so the rebuilt DataFrame carries
# default integer column labels instead of the feature names.
imputer2 = SimpleImputer(strategy = 'median')
app_train_nf = pd.DataFrame(imputer2.fit_transform(app_train_nf))

In [None]:
# One accumulator slot per feature column, starting at zero
n_features = app_train_nf.shape[1]
feature_importances = np.zeros(n_features)

# Gradient boosting classifier and its hyperparameters
lgbm_params = dict(objective = 'binary', boosting_type = 'goss',
                   n_estimators = 10000, class_weight = 'balanced')
model = lgb.LGBMClassifier(**lgbm_params)

In [None]:
# Fit the model on two different random train/validation splits and add up
# the feature importances of both runs.
for i in range(2):

    # Split into training and validation set (the seed changes with each run)
    train_features, valid_features, train_y, valid_y = train_test_split(app_train_nf, train_labels, test_size = 0.25, random_state = i)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of `fit` were
    # removed in LightGBM 4.0. `log_evaluation` is called `print_evaluation`
    # in older releases. New callback objects are created for every fit so
    # that no early-stopping state is shared between the two runs.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    model.fit(train_features, train_y,
              eval_set = [(valid_features, valid_y)],
              eval_metric = 'auc',
              callbacks = [lgb.early_stopping(stopping_rounds = 100),
                           log_evaluation(period = 200)])

    # Record the feature importances
    feature_importances += model.feature_importances_

In [None]:
# Turn the accumulated sum into the mean over the two training runs
mean_importances = feature_importances / 2

# One row per feature column, most important first
feature_importances = pd.DataFrame({'feature': app_train_nf.columns.tolist(), 'importance': mean_importances})
feature_importances = feature_importances.sort_values(by = 'importance', ascending = False)

feature_importances.head()

In [None]:
# Features whose averaged importance is exactly zero
is_zero_importance = feature_importances['importance'] == 0.0
zero_features = feature_importances.loc[is_zero_importance, 'feature'].tolist()
print('There are %d features with 0.0 importance' % len(zero_features))

# The frame is sorted by importance descending, so the tail shows the least important rows
feature_importances.tail()

In [None]:
def plot_feature_importances(df, threshold = 0.9):
    """
    Plots the 15 most important features and the cumulative importance of features.
    Prints the number of features needed to exceed `threshold` cumulative importance,
    or a notice when the cumulative importance never exceeds it (for example when
    `threshold` is 1 or larger) instead of failing on an empty selection.

    Parameters
    --------
    df : dataframe
        Dataframe of feature importances. Columns must be feature and importance
    threshold : float, default = 0.9
        Threshold for printing information about cumulative importances

    Return
    --------
    df : dataframe
        Dataframe ordered by feature importances with a normalized column (sums to 1)
        and a cumulative importance column. The previous index is kept as a column
        by `reset_index()`.

    Notes
    --------
    Sets `plt.rcParams['font.size']` to 18, which also affects later figures.

    """

    plt.rcParams['font.size'] = 18

    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()

    # Need to reverse the index to plot most important on top
    bar_positions = list(reversed(list(df.index[:15])))
    ax.barh(bar_positions,
            df['importance_normalized'].head(15),
            align = 'center', edgecolor = 'k')

    # Set the yticks and labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(df['feature'].head(15))

    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()

    # Cumulative importance plot
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance');
    plt.title('Cumulative Feature Importance');
    plt.show();

    # Positions where the cumulative importance is strictly above the threshold.
    # The selection can be empty, so it is checked before taking the first position.
    above_threshold = np.flatnonzero(df['cumulative_importance'].values > threshold)
    if above_threshold.size:
        importance_index = above_threshold[0]
        print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
    else:
        print('Cumulative importance never exceeds %0.2f with %d features' % (threshold, len(df)))

    return df

In [None]:
# Plot the importances and report how many features exceed 0.99 cumulative importance
norm_feature_importances = plot_feature_importances(feature_importances, threshold = 0.99)