In [3]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.model_selection import train_test_split
init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()
import pickle
import gc
import lightgbm as lgb
import time
warnings.filterwarnings('ignore')
%matplotlib inline

In [4]:
train = pd.read_csv('apptrain_bureau_prev.csv')

In [5]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,PREV_PRODUCT_COMBINATION_Cash X-Sell: low,PREV_PRODUCT_COMBINATION_Cash X-Sell: middle,PREV_PRODUCT_COMBINATION_POS household with interest,PREV_PRODUCT_COMBINATION_POS household without interest,PREV_PRODUCT_COMBINATION_POS industry with interest,PREV_PRODUCT_COMBINATION_POS industry without interest,PREV_PRODUCT_COMBINATION_POS mobile with interest,PREV_PRODUCT_COMBINATION_POS mobile without interest,PREV_PRODUCT_COMBINATION_POS other with interest,PREV_PRODUCT_COMBINATION_POS others without interest
0,100002,1,0,1,0,1,0,202500.0,12.915579,10.114579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,100003,0,0,0,0,0,0,270000.0,14.072864,10.482864,...,0.333333,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,1,1,1,0,67500.0,11.81303,8.817298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,100006,0,0,0,0,1,0,135000.0,12.652944,10.298448,...,0.222222,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0
4,100007,0,0,1,0,1,0,121500.0,13.148031,9.992665,...,0.0,0.5,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.0


In [6]:
def encoding_data(df):
    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0
    col_le = []

    # Iterate through the columns
    for col in df:
        if df[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(df[col].unique())) <= 2:
                # Train on the training data
                le.fit(df[col])
                # Transform both training and testing data
                df[col] = le.transform(df[col])
                col_le.append(col)

                # Keep track of how many columns were label encoded
                le_count += 1
    
    print('%d columns were label encoded.' % le_count)
    print('encoded column =', col_le)

In [7]:
encoding_data(train)

0 columns were label encoded.
encoded column = []


In [8]:
# one-hot encoding of categorical variable
train = pd.get_dummies(train)

print('shape: ', train.shape)

shape:  (304526, 386)


In [9]:
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns

In [10]:
missing_values_table(train)

Your selected dataframe has 386 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


# Correlation with Target

In [11]:
# Find correlations with the target and sort
correlations = train.corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(35))
print('\nMost Negative Correlations:\n', correlations.head(35))

Most Positive Correlations:
 BUREAU_CREDIT_TYPE_Microloan                         0.039091
PREV_NAME_YIELD_GROUP_high                           0.039262
PREV_NAME_CONTRACT_TYPE_Revolving loans              0.039823
PREV_CODE_REJECT_REASON_LIMIT                        0.039911
BUREAU_DAYS_CREDIT_ENDDATE                           0.039970
PREV_PRODUCT_COMBINATION_Card Street                 0.040849
EMERGENCYSTATE_MODE_unk                              0.041190
DAYS_REGISTRATION                                    0.042272
OCCUPATION_TYPE_Laborers                             0.043011
PREV_NAME_YIELD_GROUP_XNA                            0.044577
REG_CITY_NOT_LIVE_CITY                               0.044726
DEBT_CREDIT_RATIO                                    0.045864
FLAG_EMP_PHONE                                       0.046220
BUREAU_CREDIT_ACTIVE_Active                          0.048926
NAME_EDUCATION_TYPE_Secondary / secondary special    0.049650
REG_CITY_NOT_WORK_CITY                   

Dari pencarian nilai yang paling positif dan negatif terhadap 'TARGET' diambil sekitar 30 data yang paling berpengaruh terhadap model yang mana paling positif. Dari 30 data kolom tersebut akan dibuat kolom baru dan diseleksi kembali dengan metode Weight of Evidence yang sering digunakan dalam pemodelan kasus credit. Dengan demikian, kolom yang paling baik korelasinya dan hasil WoEnya akan dijadikan sebagai pemodelan dengan jumlah kolom yang akan dipakai sekitar 25 kolom.

In [83]:
data = train[['PREV_NAME_CONTRACT_TYPE_Revolving loans','PREV_CODE_REJECT_REASON_LIMIT','BUREAU_DAYS_CREDIT_ENDDATE','PREV_PRODUCT_COMBINATION_Card Street','EMERGENCYSTATE_MODE_unk','DAYS_REGISTRATION','OCCUPATION_TYPE_Laborers','PREV_NAME_YIELD_GROUP_XNA','REG_CITY_NOT_LIVE_CITY','DEBT_CREDIT_RATIO','FLAG_EMP_PHONE','BUREAU_CREDIT_ACTIVE_Active',
              'NAME_EDUCATION_TYPE_Secondary / secondary special','REG_CITY_NOT_WORK_CITY','DAYS_ID_PUBLISH','PREV_CODE_REJECT_REASON_HC','PREV_CODE_REJECT_REASON_SCOFR','CODE_GENDER','DAYS_LAST_PHONE_CHANGE','NAME_INCOME_TYPE_Working','PREV_NAME_PRODUCT_TYPE_walk-in','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY','DAYS_EMPLOYED',
              'BUREAU_DAYS_ENDDATE_FACT','BUREAU_DAYS_CREDIT_UPDATE','PREV_NAME_CONTRACT_STATUS_Refused','DAYS_BIRTH','BUREAU_DAYS_CREDIT','TARGET']].copy()

In [84]:
data.head()

Unnamed: 0,PREV_NAME_CONTRACT_TYPE_Revolving loans,PREV_CODE_REJECT_REASON_LIMIT,BUREAU_DAYS_CREDIT_ENDDATE,PREV_PRODUCT_COMBINATION_Card Street,EMERGENCYSTATE_MODE_unk,DAYS_REGISTRATION,OCCUPATION_TYPE_Laborers,PREV_NAME_YIELD_GROUP_XNA,REG_CITY_NOT_LIVE_CITY,DEBT_CREDIT_RATIO,FLAG_EMP_PHONE,BUREAU_CREDIT_ACTIVE_Active,NAME_EDUCATION_TYPE_Secondary / secondary special,REG_CITY_NOT_WORK_CITY,DAYS_ID_PUBLISH,PREV_CODE_REJECT_REASON_HC,PREV_CODE_REJECT_REASON_SCOFR,CODE_GENDER,DAYS_LAST_PHONE_CHANGE,NAME_INCOME_TYPE_Working,PREV_NAME_PRODUCT_TYPE_walk-in,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,DAYS_EMPLOYED,BUREAU_DAYS_ENDDATE_FACT,BUREAU_DAYS_CREDIT_UPDATE,PREV_NAME_CONTRACT_STATUS_Refused,DAYS_BIRTH,BUREAU_DAYS_CREDIT,TARGET
0,0.0,0.0,-349.0,0.0,0,-3648.0,1,0.0,0,0.28,1,0.25,1,0,-2120,0.0,0.0,1,-1134.0,1,0.0,2,2,-637.0,-697.5,-499.88,0.0,-9461,-874.0,1
1,0.0,0.0,-544.5,0.0,0,-1186.0,0,0.0,0,0.0,1,0.25,0,0,-291,0.0,0.0,0,-828.0,0,0.0,1,1,-1188.0,-1097.33,-816.0,0.0,-16765,-1400.75,0
2,0.0,0.0,-488.5,0.0,1,-4260.0,1,0.0,0,0.0,1,0.0,1,0,-2531,0.0,0.0,1,-815.0,1,0.0,2,2,-225.0,-532.5,-532.0,0.0,-19046,-867.0,0
3,0.22,0.11,0.0,0.11,1,-9833.0,1,0.44,0,0.0,1,0.0,1,0,-2437,0.0,0.0,0,-617.0,1,0.0,2,2,-3039.0,0.0,0.0,0.11,-19005,0.0,0
4,0.0,0.0,-783.0,0.0,1,-4311.0,0,0.0,0,0.0,1,0.0,1,1,-3458,0.0,0.0,1,-1106.0,1,0.17,2,2,-3038.0,-783.0,-783.0,0.0,-19932,-1149.0,0


# Weight of Evidence

In [85]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304526 entries, 0 to 304525
Data columns (total 30 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   PREV_NAME_CONTRACT_TYPE_Revolving loans            304526 non-null  float64
 1   PREV_CODE_REJECT_REASON_LIMIT                      304526 non-null  float64
 2   BUREAU_DAYS_CREDIT_ENDDATE                         304526 non-null  float64
 3   PREV_PRODUCT_COMBINATION_Card Street               304526 non-null  float64
 4   EMERGENCYSTATE_MODE_unk                            304526 non-null  uint8  
 5   DAYS_REGISTRATION                                  304526 non-null  float64
 6   OCCUPATION_TYPE_Laborers                           304526 non-null  uint8  
 7   PREV_NAME_YIELD_GROUP_XNA                          304526 non-null  float64
 8   REG_CITY_NOT_LIVE_CITY                             304526 non-null  int64 

In [86]:
def calculate_woe_iv(dataset, feature, target):
    lst = []
    for i in range(dataset[feature].nunique()):
        val = list(dataset[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': dataset[dataset[feature] == val].count()[feature],
            'Good': dataset[(dataset[feature] == val) & (dataset[target] == 0)].count()[feature],
            'Bad': dataset[(dataset[feature] == val) & (dataset[target] == 1)].count()[feature]
        }) 
    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    dset = dset.sort_values(by='WoE')
    return dset, iv

In [88]:
data['BUREAU_DAYS_CREDIT']= pd.qcut(data['BUREAU_DAYS_CREDIT '], 10) 

KeyError: 'BUREAU_DAYS_CREDIT '

In [None]:
X = train.drop(['TARGET'], axis=1)
y= train['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
log_reg = LogisticRegression()

In [None]:
log_reg.fit(X_train,y_train)

In [None]:
y_pred = log_reg.predict(X_test)

In [None]:
n_errors = (y_pred != y_test).sum()
# Run Classification Metrics
print("{}: {}".format("Logistic Regression errors",n_errors))
print("Accuracy Score :")
print(accuracy_score(y_test,y_pred))
print("Confusion matrix :")
print(confusion_matrix(y_test, y_pred))
print("Classification Report :")
print(classification_report(y_test,y_pred))
print("ROC AUC score is: ",roc_auc_score(y_test,y_pred))

In [None]:
state = np.random.RandomState(42)

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = state, verbose = 1, n_jobs = -1)

In [None]:
random_forest.fit(X_train,y_train)

In [None]:
y_pred_rf = random_forest.predict(X_test)

In [None]:
n_errors = (y_pred_rf != y_test).sum()
# Run Classification Metrics
print("{}: {}".format("Random Forest errors",n_errors))
print("Accuracy Score :")
print(accuracy_score(y_test,y_pred_rf))
print("Confusion matrix :")
print(confusion_matrix(y_test, y_pred_rf))
print("Classification Report :")
print(classification_report(y_test,y_pred_rf))
print("ROC AUC score is: ",roc_auc_score(y_test,y_pred_rf))