In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys

def return_size(df):
    """Return size of dataframe in gigabytes"""
    return round(sys.getsizeof(df) / 1e9, 2)

def convert_types(df):
    print(f'Original size of data: {return_size(df)} gb.')
    for c in df:
        if df[c].dtype == 'object':
            df[c] = df[c].astype('category')
    print(f'New size of data: {return_size(df)} gb.')
    return df

In [3]:
full = pd.read_csv('clean_full.csv')
full = convert_types(full)
full.head()

Original size of data: 4.13 gb.
New size of data: 3.78 gb.


Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_mean_max,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_mean_min,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_mean_mean,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_mean_max,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_max_min,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_max_mean,CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_max_max,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_max_min,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_max_mean,CC_CLIENT_CC_LOAN_AMT_DRAWINGS_ATM_CURRENT_max_max
0,100002.0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,100003.0,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,100004.0,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,100006.0,29686.5,312682.5,297000.0,135000.0,,,,,,...,,,,,,,,,,
4,100007.0,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [4]:
# missing_values_table

def missing_values_table(df):
    
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum()/len(df)
    
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis = 1)
    
    mis_val_table_ren_columns = mis_val_table.rename(columns = {0: 'Missing Values', 1: '% of Total Values'})
    
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)
    
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n" + 
           "There are " + str(mis_val_table_ren_columns.shape[0]) + "columns that have missing values.")
        
        # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [5]:
missing = missing_values_table(full)
missing.head(10)

Your selected dataframe has 1341 columns.
There are 1286columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
PREVIOUS_RATE_INTEREST_PRIMARY_max,350534,98.4
PREVIOUS_RATE_INTEREST_PRIMARY_min,350534,98.4
PREVIOUS_RATE_INTEREST_PRIVILEGED_min,350534,98.4
PREVIOUS_RATE_INTEREST_PRIVILEGED_mean,350534,98.4
PREVIOUS_RATE_INTEREST_PRIVILEGED_max,350534,98.4
PREVIOUS_RATE_INTEREST_PRIMARY_mean,350534,98.4
CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_max_min,294314,82.6
CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_max_max,294314,82.6
CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_min_min,294314,82.6
CC_CLIENT_CC_LOAN_AMT_PAYMENT_CURRENT_min_mean,294314,82.6


In [6]:
missing = list(missing.index[missing['% of Total Values'] > 90])
len(missing)

6

In [7]:
full = full.drop(columns = missing)

### Functions

In [8]:
def target_corrs(df):
    
    corrs = []
    
    for col in df.columns:
        if col != 'TARGET': # Y variable
            corr = df['TARGET'].corr(df[col])
            corrs.append((col, corr))
            
    corrs = pd.DataFrame(sorted(corrs, key = lambda x: abs(x[1]), reverse = True)).rename(columns = {0: 'variable', 1:'corr_with_target'})
    
    return corrs

In [9]:
#try to find important interaction features
def interact_variable_list(df, target, thresh):
    key_v = corrs_target.head(5).variable.tolist()
    key_variable = df[key_v]
    for a in key_variable.columns:
        for b in key_variable.columns:
            if a != b:
                name = a + '*' + b
                key_variable[name] = key_variable[a] * key_variable[b]
    
    key_variable[target] = df[target]
    intact_variable = target_corrs(key_variable)
    key_intact_variable = intact_variable[abs(intact_variable['corr_with_target']) > thres_interact[thresh]].reset_index().drop(columns = ['index'])
    key_intact_variable = key_intact_variable.groupby('corr_with_target', as_index = False).agg(['first']).reset_index()
    
    columns = ['corr_with_target', 'key_variables']
    key_intact_variable.columns = columns
    key_intact_variable_list = key_intact_variable['key_variables'].tolist()
    
    return key_v, key_intact_variable_list

In [10]:
def add_interact_variable(df):
    
    key_variable = df[key_v]
    for a in key_variable.columns:
        for b in key_variable.columns:
            if a != b:
                name = a + '*' + b
                key_variable[name] = key_variable[a] * key_variable[b]
                
    df[key_intact_variable_list] = key_variable[key_intact_variable_list]
    
    return df

In [11]:
row = 0
def get_numeric(df, target):
    d_type_list = pd.DataFrame((df.iloc[row, ] != 0 & (df.iloc[row, ] != 1))).reset_index().rename(columns = {row : 'mark'})
    numeric_list = d_type_list[d_type_list['mark'] == True]['index'].tolist()
    numeric = df[numeric_list].select_dtypes('number')
    numeric[target] = df[target]
    
    return numeric

In [12]:
def get_log_list(df, target):
    log_list = []
    for col in df.drop(columns = [target]).columns:
        name = col + '_log'
        df[name] = np.log(df[col] + 1)
        corr_pre_log = df[target].corr(df[col])
        corr_post_log = df[target].corr(df[name])
        if corr_post_log > corr_pre_log:
            log_list.append(col)
            
    return log_list

In [13]:
def log_transf(df):
    for col in log_list:
        name = col + '_log'
        df[name] = np.log(abs(df[col]) + 1)
        df = df.drop(columns = [col])
    return df

In [14]:
def get_binary(df, target):
    numeric_list = get_numeric(df, target).drop(columns = [target]).columns.tolist()
    number_list = df.select_dtypes('number').columns
    binary_list = [x for x in number_list if x not in numeric_list]
    
    return binary_list

In [15]:
#can only be used for numeric variables, so, plz do the following with get_numeric() function
thresh_imp = 0

def missing_imp_list(df, target):
    missing_imp_list = []
    for col in df.drop(columns = [target]).columns:
        name = col + '_imp'
        df[name] = df[col]
        a = df[name].values
        m = np.isnan(a)
        mu, sig = df[col].mean(), df[col].std()
        a[m] = np.random.normal(mu, abs(sig), size = m.sum())
        corr_pre_imp = df[target].corr(df[col])
        corr_post_imp = df[target].corr(df[name])
        if corr_post_imp - corr_pre_imp > thresh_imp:
            missing_imp_list.append(col)
            
    return missing_imp_list

In [16]:
def missing_imputing(df):
    for col in missing_imp_lists:
        name = col + '_imp'
        df[name] = df[col]
        a = df[name].values
        m = np.isnan(a)
        mu, sig = df[col].mean(), df[col].std()
        a[m] = np.random.normal(mu, abs(sig), size = m.sum())
        df = df.drop(columns = [col])
        
    return df

### Correlations

In [17]:
corrs = full.corr()

In [18]:
corrs = corrs.sort_values('TARGET', ascending = False)
# Ten most positive correlations
pd.DataFrame(corrs['TARGET'].head(10))

Unnamed: 0,TARGET
TARGET,1.0
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_ATM_CURRENT_mean_sum,0.116795
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_ATM_CURRENT_mean_max,0.102996
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_ATM_CURRENT_mean_mean,0.102917
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_ATM_CURRENT_mean_min,0.102765
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_CURRENT_max_max,0.10109
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_CURRENT_max_mean,0.100961
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_CURRENT_max_sum,0.100923
CC_CLIENT_CC_LOAN_CNT_DRAWINGS_CURRENT_max_min,0.100664
PREVIOUS_DAYS_FIRST_DRAWING_max,0.096222


In [19]:
pd.DataFrame(corrs['TARGET'].dropna().tail(10))

Unnamed: 0,TARGET
EMPLOYED_BIRTH_RATIO,-0.067955
BB_CLIENT_BB_LOAN_MONTHS_BALANCE_count_max,-0.068787
PREVIOUS_CODE_REJECT_REASON_XAP_mean,-0.073929
BUREAU_CREDIT_ACTIVE_Closed_mean,-0.079369
BB_CLIENT_BB_LOAN_MONTHS_BALANCE_count_mean,-0.080193
EXT_SOURCE_1,-0.155317
EXT_SOURCE_2,-0.160472
EXIT_SOURCE_SUM,-0.173322
EXT_SOURCE_3,-0.178919
EXIT_SOURCE_MEAN,-0.222052


#### Collinear Variables

In [20]:
#set the threshold for collinear
threshold = 0.95

above_threshold_vars = {}

for col in corrs:
    above_threshold_vars[col] = list(corrs.index[corrs[col] > threshold])

In [22]:
cols_to_remove = []
cols_seen = []
cols_to_remove_pair = []

# Iterate through columns and correlated columns
for key, value in above_threshold_vars.items():
    # Keep track of columns already examined
    cols_seen.append(key)
    for x in value:
        if x == key:
            next
        else:
            # Only want to remove one in a pair
            if x not in cols_seen:
                cols_to_remove.append(x)
                cols_to_remove_pair.append(key)
            
cols_to_remove = list(set(cols_to_remove))
print('Number of columns to remove: ', len(cols_to_remove))

Number of columns to remove:  523


In [23]:
cols_to_remove[:5]

['YEARS_BUILD_MODE',
 'CC_CLIENT_CC_LOAN_NAME_CONTRACT_STATUS_Completed_mean_max',
 'CC_CLIENT_CC_LOAN_AMT_DRAWINGS_CURRENT_max_sum',
 'CC_CLIENT_CC_LOAN_AMT_DRAWINGS_POS_CURRENT_sum_mean',
 'IN_CLIENT_IN_LOAN_AMT_INSTALMENT_min_max']

In [24]:
full = full.drop(columns = cols_to_remove)

In [25]:
import gc
gc.enable()
del corrs
gc.collect()

21

In [29]:
target_corrs(full[full['TARGET'].notnull()].select_dtypes('number')).head(10)

Unnamed: 0,variable,corr_with_target
0,EXIT_SOURCE_MEAN,-0.222052
1,EXT_SOURCE_3,-0.178919
2,EXIT_SOURCE_SUM,-0.173322
3,EXT_SOURCE_2,-0.160472
4,EXT_SOURCE_1,-0.155317
5,BUREAU_DAYS_CREDIT_mean,0.089728
6,BB_CLIENT_BB_LOAN_MONTHS_BALANCE_count_mean,-0.080193
7,DAYS_BIRTH,0.078239
8,PREVIOUS_NAME_CONTRACT_STATUS_Refused_mean,0.077671
9,BUREAU_DAYS_CREDIT_min,0.075247


In [30]:
full.to_csv('clean_corrs_full.csv')

### Categorical Variable

#### Step 1
* Combine train and test, impute missing as 'unknown'
* get dummies for 'object'
* combine binary items to both train and test

In [31]:
Categorical = pd.concat([full.select_dtypes('category'), full.select_dtypes('object')], axis = 1)
Categorical.head()

Unnamed: 0,CODE_GENDER,EMERGENCYSTATE_MODE,FLAG_OWN_CAR,FLAG_OWN_REALTY,FONDKAPREMONT_MODE,HOUSETYPE_MODE,NAME_CONTRACT_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,NAME_INCOME_TYPE,NAME_TYPE_SUITE,OCCUPATION_TYPE,ORGANIZATION_TYPE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START
0,M,No,N,Y,reg oper account,block of flats,Cash loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Laborers,Business Entity Type 3,"Stone, brick",WEDNESDAY
1,F,No,N,N,reg oper account,block of flats,Cash loans,Higher education,Married,House / apartment,State servant,Family,Core staff,School,Block,MONDAY
2,M,,Y,Y,,,Revolving loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Laborers,Government,,MONDAY
3,F,,N,Y,,,Cash loans,Secondary / secondary special,Civil marriage,House / apartment,Working,Unaccompanied,Laborers,Business Entity Type 3,,WEDNESDAY
4,M,,N,Y,,,Cash loans,Secondary / secondary special,Single / not married,House / apartment,Working,Unaccompanied,Core staff,Religion,,THURSDAY


In [32]:
Categorical.shape

(356255, 16)

In [33]:
Cate_variable_list = Categorical.columns.tolist()
Cate_variable_list

['CODE_GENDER',
 'EMERGENCYSTATE_MODE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'NAME_CONTRACT_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'NAME_INCOME_TYPE',
 'NAME_TYPE_SUITE',
 'OCCUPATION_TYPE',
 'ORGANIZATION_TYPE',
 'WALLSMATERIAL_MODE',
 'WEEKDAY_APPR_PROCESS_START']

In [34]:
for col in Categorical.columns:
    print(col)
    print(len(Categorical[col].unique()), Categorical[col].unique())

CODE_GENDER
3 [M, F, XNA]
Categories (3, object): [M, F, XNA]
EMERGENCYSTATE_MODE
3 [No, NaN, Yes]
Categories (2, object): [No, Yes]
FLAG_OWN_CAR
2 [N, Y]
Categories (2, object): [N, Y]
FLAG_OWN_REALTY
2 [Y, N]
Categories (2, object): [Y, N]
FONDKAPREMONT_MODE
5 [reg oper account, NaN, org spec account, reg oper spec account, not specified]
Categories (4, object): [reg oper account, org spec account, reg oper spec account, not specified]
HOUSETYPE_MODE
4 [block of flats, NaN, terraced house, specific housing]
Categories (3, object): [block of flats, terraced house, specific housing]
NAME_CONTRACT_TYPE
2 [Cash loans, Revolving loans]
Categories (2, object): [Cash loans, Revolving loans]
NAME_EDUCATION_TYPE
5 [Secondary / secondary special, Higher education, Incomplete higher, Lower secondary, Academic degree]
Categories (5, object): [Secondary / secondary special, Higher education, Incomplete higher, Lower secondary, Academic degree]
NAME_FAMILY_STATUS
6 [Single / not married, Married, 

In [35]:
Categorical = pd.get_dummies(Categorical, dummy_na = True)
Categorical.head()

Unnamed: 0,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,CODE_GENDER_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_CAR_nan,...,WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,WEEKDAY_APPR_PROCESS_START_nan
0,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,1,0,1,0,...,0,1,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [36]:
cate_df = pd.DataFrame(Categorical.apply(lambda x: sum(x))).reset_index()
drop_list = cate_df[cate_df[0] == 0]['index'].tolist()
drop_list

['CODE_GENDER_nan',
 'FLAG_OWN_CAR_nan',
 'FLAG_OWN_REALTY_nan',
 'NAME_CONTRACT_TYPE_nan',
 'NAME_EDUCATION_TYPE_nan',
 'NAME_FAMILY_STATUS_nan',
 'NAME_HOUSING_TYPE_nan',
 'NAME_INCOME_TYPE_nan',
 'ORGANIZATION_TYPE_nan',
 'WEEKDAY_APPR_PROCESS_START_nan']

In [37]:
Categorical = Categorical.drop(columns = drop_list)
Categorical['TARGET'] = full['TARGET']
Categorical.shape

(356255, 147)

In [38]:
cate_train = Categorical[Categorical['TARGET'].notnull()]
cate_test = Categorical[Categorical['TARGET'].isnull()]

#### Step 2
* use cate_train to get interaction items for top5 target_related variables
* combine interact items

In [39]:
thres_interact = [0.18, 0.057]

In [40]:
corrs_target = target_corrs(cate_train)
corrs_target.head(10)

Unnamed: 0,variable,corr_with_target
0,NAME_INCOME_TYPE_Working,0.057481
1,NAME_EDUCATION_TYPE_Higher education,-0.056593
2,CODE_GENDER_M,0.054713
3,CODE_GENDER_F,-0.054704
4,NAME_EDUCATION_TYPE_Secondary / secondary special,0.049824
5,NAME_INCOME_TYPE_Pensioner,-0.046209
6,ORGANIZATION_TYPE_XNA,-0.045987
7,OCCUPATION_TYPE_Laborers,0.043019
8,EMERGENCYSTATE_MODE_No,-0.042201
9,EMERGENCYSTATE_MODE_nan,0.041392


In [41]:
cate_intact = corrs_target['variable'][:5].tolist()
cate_intact

['NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Higher education',
 'CODE_GENDER_M',
 'CODE_GENDER_F',
 'NAME_EDUCATION_TYPE_Secondary / secondary special']

In [42]:
key_v, key_intact_variable_list = interact_variable_list(cate_train, 'TARGET', thresh = 1)

In [40]:
key_intact_variable_list

['NAME_INCOME_TYPE_Working',
 'CODE_GENDER_M*NAME_INCOME_TYPE_Working',
 'CODE_GENDER_M*NAME_EDUCATION_TYPE_Secondary / secondary special',
 'CODE_GENDER_M*NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special']

In [43]:
key_intact_variable_list = key_intact_variable_list[1:]
key_intact_variable_list

['CODE_GENDER_M*NAME_INCOME_TYPE_Working',
 'CODE_GENDER_M*NAME_EDUCATION_TYPE_Secondary / secondary special',
 'CODE_GENDER_M*NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special']

In [44]:
cate_train = add_interact_variable(cate_train).drop(columns = ['TARGET'])
cate_test = add_interact_variable(cate_test).drop(columns = ['TARGET'])

In [45]:
cate_train.shape[1] == cate_test.shape[1]

True

In [46]:
train, test = full[full['TARGET'].notnull()].copy(), full[full['TARGET'].isnull()].copy()

In [47]:
train = train.drop(columns = Cate_variable_list)
train[cate_train.columns] = cate_train

In [48]:
test = test.drop(columns = Cate_variable_list)
test[cate_test.columns] = cate_test

In [49]:
gc.enable()
del full, Categorical, cate_train, cate_test
gc.collect()

252

### Check with interaction

In [50]:
corrs_target = target_corrs(train.select_dtypes('number'))
corrs_target.head(10)

Unnamed: 0,variable,corr_with_target
0,EXIT_SOURCE_MEAN,-0.222052
1,EXT_SOURCE_3,-0.178919
2,EXIT_SOURCE_SUM,-0.173322
3,EXT_SOURCE_2,-0.160472
4,EXT_SOURCE_1,-0.155317
5,BUREAU_DAYS_CREDIT_mean,0.089728
6,BB_CLIENT_BB_LOAN_MONTHS_BALANCE_count_mean,-0.080193
7,DAYS_BIRTH,0.078239
8,PREVIOUS_NAME_CONTRACT_STATUS_Refused_mean,0.077671
9,BUREAU_DAYS_CREDIT_min,0.075247


In [57]:
thres_interact = [0.18, 0.057]

In [58]:
key_v, key_intact_variable_list = interact_variable_list(train, 'TARGET', thresh = 0)

In [59]:
key_intact_variable_list

['EXIT_SOURCE_MEAN',
 'EXT_SOURCE_3*EXT_SOURCE_2',
 'EXIT_SOURCE_MEAN*EXT_SOURCE_3',
 'EXT_SOURCE_3*EXIT_SOURCE_SUM',
 'EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM',
 'EXT_SOURCE_2*EXT_SOURCE_3*EXT_SOURCE_1',
 'EXT_SOURCE_3*EXT_SOURCE_1',
 'EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXT_SOURCE_2',
 'EXIT_SOURCE_MEAN*EXT_SOURCE_2',
 'EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM',
 'EXIT_SOURCE_SUM*EXT_SOURCE_3*EXT_SOURCE_2',
 'EXT_SOURCE_2*EXT_SOURCE_3*EXIT_SOURCE_SUM']

In [60]:
key_intact_variable_list = key_intact_variable_list[1:]
key_intact_variable_list 

['EXT_SOURCE_3*EXT_SOURCE_2',
 'EXIT_SOURCE_MEAN*EXT_SOURCE_3',
 'EXT_SOURCE_3*EXIT_SOURCE_SUM',
 'EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM',
 'EXT_SOURCE_2*EXT_SOURCE_3*EXT_SOURCE_1',
 'EXT_SOURCE_3*EXT_SOURCE_1',
 'EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXT_SOURCE_2',
 'EXIT_SOURCE_MEAN*EXT_SOURCE_2',
 'EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM',
 'EXIT_SOURCE_SUM*EXT_SOURCE_3*EXT_SOURCE_2',
 'EXT_SOURCE_2*EXT_SOURCE_3*EXIT_SOURCE_SUM']

In [62]:
train = add_interact_variable(train)

In [63]:
train.shape

(307511, 957)

In [64]:
test = add_interact_variable(test)

In [65]:
test.shape

(48744, 957)

##### -----No further interaction required

### Log transformation

In [67]:
numeric = get_numeric(train, 'TARGET')
numeric.head()

Unnamed: 0,SK_ID_CURR,AMT_ANNUITY,AMT_CREDIT,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,BASEMENTAREA_AVG,CNT_FAM_MEMBERS,COMMONAREA_AVG,DAYS_BIRTH,...,EXIT_SOURCE_MEAN*EXT_SOURCE_3,EXT_SOURCE_3*EXIT_SOURCE_SUM,EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM,EXT_SOURCE_2*EXT_SOURCE_3*EXT_SOURCE_1,EXT_SOURCE_3*EXT_SOURCE_1,EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXT_SOURCE_2,EXIT_SOURCE_MEAN*EXT_SOURCE_2,EXT_SOURCE_3*EXIT_SOURCE_MEAN*EXIT_SOURCE_SUM,EXIT_SOURCE_SUM*EXT_SOURCE_3*EXT_SOURCE_2,EXT_SOURCE_2*EXT_SOURCE_3*EXIT_SOURCE_SUM
0,100002.0,24700.5,406597.5,202500.0,1.0,0.0247,0.0369,1.0,0.0143,-9461.0,...,0.022549,0.067648,0.078525,0.003043,0.011573,0.005929,0.042542,0.010945,0.017788,0.017788
1,100003.0,35698.5,1293502.5,270000.0,0.0,0.0959,0.0529,2.0,0.0605,-16765.0,...,,,0.435723,,,,0.290437,,,
2,100004.0,6750.0,135000.0,67500.0,0.0,,,1.0,,-19046.0,...,0.468921,0.937842,0.826228,,,0.260679,0.357307,0.602788,0.521358,0.521358
3,100006.0,29686.5,312682.5,135000.0,,,,2.0,,-19005.0,...,,,0.423074,,,,0.423074,,,
4,100007.0,21865.5,513000.0,121500.0,0.0,,,1.0,,-19932.0,...,,,0.10416,,,,0.10416,,,


##### is abs a good choice???

In [68]:
unused_field = ['SK_ID_CURR']
numeric = numeric.drop(columns = unused_field)
numeric = abs(numeric)

In [69]:
log_list = get_log_list(numeric, 'TARGET')
print(log_list[:5])
print(len(log_list))

['AMT_ANNUITY', 'AMT_CREDIT', 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION']
138


In [70]:
train = log_transf(train)
test = log_transf(test)

In [71]:
gc.enable()
del numeric
gc.collect()

427

### Missing value imputing

* Impute Missing for binary variables

In [72]:
binary_list = get_binary(train, 'TARGET')
binary_list_test = [x for x in binary_list if x not in 'TARGET']

In [73]:
train[binary_list] = train[binary_list].fillna(0)
test[binary_list_test] = test[binary_list_test].fillna(0)

* Impute Missing for numeric variables

In [74]:
numeric = get_numeric(train, 'TARGET').drop(columns = ['SK_ID_CURR'])
numeric.head()

Unnamed: 0,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,BASEMENTAREA_AVG,CNT_FAM_MEMBERS,COMMONAREA_AVG,DAYS_BIRTH,DAYS_EMPLOYED,ENTRANCES_AVG,EXT_SOURCE_1,...,NAME_CONTRACT_TYPE_Cash loans_log,NAME_EDUCATION_TYPE_Secondary / secondary special_log,NAME_FAMILY_STATUS_Single / not married_log,NAME_INCOME_TYPE_Working_log,NAME_TYPE_SUITE_Unaccompanied_log,OCCUPATION_TYPE_Laborers_log,CODE_GENDER_M*NAME_INCOME_TYPE_Working_log,CODE_GENDER_M*NAME_EDUCATION_TYPE_Secondary / secondary special_log,CODE_GENDER_M*NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special_log,NAME_INCOME_TYPE_Working*NAME_EDUCATION_TYPE_Secondary / secondary special_log
0,202500.0,1.0,0.0247,0.0369,1.0,0.0143,-9461.0,-637.0,0.069,0.083037,...,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359
1,270000.0,0.0,0.0959,0.0529,2.0,0.0605,-16765.0,-1188.0,0.0345,0.311267,...,0.693359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,67500.0,0.0,,,1.0,,-19046.0,-225.0,,,...,0.0,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359,0.693359
3,135000.0,,,,2.0,,-19005.0,-3039.0,,,...,0.693359,0.693359,0.0,0.693359,0.693359,0.693359,0.0,0.0,0.0,0.693359
4,121500.0,0.0,,,1.0,,-19932.0,-3038.0,,,...,0.693359,0.693359,0.693359,0.693359,0.693359,0.0,0.693359,0.693359,0.693359,0.693359


In [75]:
missing_imp_lists = missing_imp_list(numeric, 'TARGET')
print(missing_imp_lists[:5])
print(len(missing_imp_lists))

['APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'CNT_FAM_MEMBERS', 'COMMONAREA_AVG', 'ENTRANCES_AVG']
158


In [76]:
train = missing_imputing(train)
test = missing_imputing(test)

In [77]:
gc.enable()
del numeric
gc.collect()

896

In [78]:
train.shape

(307511, 957)

In [79]:
test.shape

(48744, 957)

### Modeling

* sampling for params tuning
* modeling
* might try rf?

In [80]:
import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [81]:
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3, # Updated from nthread
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error'}

In [82]:
gridParams = {
    'learning_rate': 0.015,
    'n_estimators': 5000,
    'class_weight': 'balanced',
    'objective' : 'binary',
    'subsample' : 0.8,
    'reg_alpha' : 0.01,
    'reg_lambda' : 0.05,
    'random_state': 50
    }

In [83]:
params.update(gridParams)

In [84]:
train_ids = train['SK_ID_CURR']
test_ids = test['SK_ID_CURR']
    
y_train = train['TARGET']
    
x_train = train.drop(columns = ['SK_ID_CURR', 'TARGET'])
x_test = test.drop(columns = ['SK_ID_CURR', 'TARGET'])

In [85]:
feature_names = list(x_train.columns)

x_train = np.array(x_train)
x_test = np.array(x_test)

k_fold = KFold(n_splits = 5, shuffle = False, random_state = 50)

feature_importance_values = np.zeros(len(feature_names))

test_predictions = np.zeros(x_test.shape[0])

out_of_fold = np.zeros(x_train.shape[0])

valid_scores = []
train_scores = []

In [86]:
params

{'boosting_type': 'gbdt',
 'max_depth': -1,
 'objective': 'binary',
 'nthread': 3,
 'num_leaves': 64,
 'learning_rate': 0.015,
 'max_bin': 512,
 'subsample_for_bin': 200,
 'subsample': 0.8,
 'subsample_freq': 1,
 'colsample_bytree': 0.8,
 'reg_alpha': 0.01,
 'reg_lambda': 0.05,
 'min_split_gain': 0.5,
 'min_child_weight': 1,
 'min_child_samples': 5,
 'scale_pos_weight': 1,
 'num_class': 1,
 'metric': 'binary_error',
 'n_estimators': 5000,
 'class_weight': 'balanced',
 'random_state': 50}

In [87]:
for train_indices, valid_indices in k_fold.split(x_train):
        
        # Training data for the fold
    train_features, train_labels = x_train[train_indices], y_train[train_indices]
        # Validation data for the fold
    valid_features, valid_labels = x_train[valid_indices], y_train[valid_indices]
        
        # Create the model
    
    model = lgb.LGBMClassifier(**params)
    model.fit(train_features, train_labels, eval_metric = 'auc',
              eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
              eval_names = ['valid', 'train'],
              early_stopping_rounds = 200, verbose = 1000)
    best_iteration = model.best_iteration_
    
    feature_importance_values += model.feature_importances_ / k_fold.n_splits
    
    test_predictions += model.predict_proba(x_test, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
    
    out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]
    
    valid_score = model.best_score_['valid']['auc']
    train_score = model.best_score_['train']['auc']
    valid_scores.append(valid_score)
    train_scores.append(train_score)
    

Training until validation scores don't improve for 200 rounds.
[1000]	valid's auc: 0.782943	valid's binary_error: 0.233777	train's auc: 0.905848	train's binary_error: 0.202355
Early stopping, best iteration is:
[1115]	valid's auc: 0.783078	valid's binary_error: 0.228054	train's auc: 0.914806	train's binary_error: 0.192986
Training until validation scores don't improve for 200 rounds.
[1000]	valid's auc: 0.788191	valid's binary_error: 0.234122	train's auc: 0.905972	train's binary_error: 0.199773
Early stopping, best iteration is:
[1216]	valid's auc: 0.788266	valid's binary_error: 0.223131	train's auc: 0.922556	train's binary_error: 0.182961
Training until validation scores don't improve for 200 rounds.
[1000]	valid's auc: 0.779791	valid's binary_error: 0.234399	train's auc: 0.905505	train's binary_error: 0.200932
Early stopping, best iteration is:
[1152]	valid's auc: 0.780019	valid's binary_error: 0.226903	train's auc: 0.91735	train's binary_error: 0.189058
Training until validation sco

In [88]:
submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

In [89]:
for i in range(0, submission.shape[0]):
    if submission.iloc[i, 1] > 1:
        submission.iloc[i, 1] = 1

In [90]:
submission['SK_ID_CURR'] = submission['SK_ID_CURR'].astype('int32')

In [91]:
submission.to_csv('sub_lgb_3_10_1.csv', index = False)