In [None]:
import numpy as np
import pandas as pd 
import os
# Blend two submission files on SK_ID_CURR and write the weighted result.
# NOTE(review): `nn_result` is read from lgb.csv and `log_result` from
# nn_log.csv -- the variable names look swapped relative to the file names;
# confirm which model each file actually holds.
nn_result = pd.read_csv('../input/sub-data/lgb.csv')
nn_result = nn_result.rename(columns={'TARGET': 'nn_TARGET'})
log_result = pd.read_csv('../input/sub-data/nn_log.csv')
log_result = log_result.rename(columns={'TARGET': 'log_TARGET'})

# Default (inner) join: only ids present in both files survive.
sub = nn_result.merge(log_result, on='SK_ID_CURR')

# With weights 0 and 1 the blended score is numerically just log_TARGET
# (apart from NaN propagation from nn_TARGET).
sub['TARGET'] = 0 * sub['nn_TARGET'] + 1 * sub['log_TARGET']
sub = sub.drop(columns=['log_TARGET', 'nn_TARGET'])
sub.to_csv('lgb_nn_log.csv', index=False, float_format='%.8f')

In [None]:
import time
import os
import gc
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
# Load data
# `start_time` anchors every later "Time elapsed" message in the notebook.
start_time = time.time()
input_dir = os.path.join(os.pardir, 'input/sub-data')
print('Loading data...')
rows_read = None  # passed to read_csv as nrows; None reads the whole file
# NOTE(review): both "train" and "test" frames are read from submission-style
# files (lgb.csv / lgb_nn_log.csv) -- confirm this is the intended input.
app_train_df, app_test_df = (
    pd.read_csv(os.path.join(input_dir, csv_name), nrows=rows_read)
    for csv_name in ('lgb.csv', 'lgb_nn_log.csv')
)
elapsed_sec = time.time() - start_time
print('Time elapsed %.0f sec'%(elapsed_sec))
print('Pre-processing data...')

In [None]:
# Stack train and test so the pre-processing below is applied to both at once.
# The label column is pulled out of the training frame first.
target = app_train_df.pop('TARGET')
# Number of training rows; used later to split the stacked frame again.
len_train = app_train_df.shape[0]
merged_df = pd.concat((app_train_df, app_test_df))
# Keep the ids aside so they are not treated as a feature.
meta_df = merged_df.pop('SK_ID_CURR')
# Release the source frames now that everything lives in merged_df.
del app_train_df
del app_test_df
gc.collect()

In [None]:
# Encode categoricals: 1-hot
# Every object-dtype column is replaced by 0/1 indicator columns named
# "<feature>_<label>".
categorical_feats = merged_df.columns[merged_df.dtypes == 'object']
print('Using %d prediction variables'%(merged_df.shape[1]))
print('Encoding %d non-numeric columns...'%(len(categorical_feats)))
for feat in categorical_feats:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form is deprecated and does not
    # modify merged_df under pandas copy-on-write.
    merged_df[feat] = merged_df[feat].fillna('MISSING') # populate missing labels
    encoder = LabelBinarizer() # works with text
    new_columns = encoder.fit_transform(merged_df[feat])
    # The encoder's output columns follow encoder.classes_ (sorted labels), not
    # the order in which values first appear in the data, so the column names
    # must come from classes_ to match the indicator they describe.
    labels = encoder.classes_
    if new_columns.shape[1] != len(labels):
        # With at most two classes the encoder emits a single column; for two
        # classes that column flags the second (last) class.
        labels = labels[-1:]
    for i, label in enumerate(labels):
        merged_df[feat + '_' + str(label)] = new_columns[:, i]
    merged_df.drop(feat, axis=1, inplace=True)
print('Now using %d prediction variables'%(merged_df.shape[1]))
print('Time elapsed %.0f sec'%(time.time()-start_time))

In [None]:
# Missing-value bookkeeping: per-column null counts, restricted to columns
# that have at least one null, then expressed as a fraction of all rows.
null_counts = merged_df.isna().sum()
has_nulls = null_counts > 0
null_counts = null_counts[has_nulls]
n_rows = len(merged_df)
null_ratios = null_counts / n_rows

In [None]:
# Remove every column whose null fraction exceeds the threshold and report
# which ones were removed.
null_thresh = .8
too_sparse = null_ratios > null_thresh
null_cols = null_ratios.index[too_sparse]
merged_df = merged_df.drop(columns=null_cols)
if len(null_cols) > 0:
    print('Columns dropped for being over %.2f null:'%(null_thresh))
    # One column name per line.
    print(*null_cols, sep='\n')

In [None]:
# Any null that survived the column drop is replaced by 0.
merged_df = merged_df.fillna(0)

In [None]:
# Scale continuous features.
# Step 1: any column with a value outside [-100, 100] is cast to float64 so
# that it is picked up by the float64 selection below.
for feat in merged_df.columns:
    column = merged_df[feat]
    if column.max() > 100 or column.min() < -100:
        merged_df[feat] = column.astype(np.float64)

# Step 2: standardise each float64 column independently. The scaler is refit
# per column on the stacked train+test rows.
scaler = StandardScaler()
continuous_feats = merged_df.select_dtypes(include=['float64']).columns
print('Scaling %d features...'%(len(continuous_feats)))
for feat in continuous_feats:
    # StandardScaler expects a 2-D (rows, 1) array for a single feature.
    as_column = merged_df[feat].to_numpy().reshape(-1, 1)
    merged_df[feat] = scaler.fit_transform(as_column).ravel()

In [None]:
# Undo the earlier stacking: the first len_train rows are the training rows,
# everything after them is the test set.
train_df = merged_df.iloc[:len_train]
test_df = merged_df.iloc[len_train:]
del merged_df
gc.collect()

elapsed_sec = time.time() - start_time
print('Time elapsed %.0f sec'%(elapsed_sec))
print('Starting training...')

In [None]:
# Training hyper-parameters and data dimensions.
L2c = 4e-4         # weight of the L2 penalty added to the loss
lr0 = 0.02         # initial learning rate
lr_decay = 0.90    # multiplicative learning-rate decay factor
iterations = 101   # number of full passes over the data
NUMB = 10000       # batch size
# Rows and feature count come straight from the training frame.
ROWS, VARS = train_df.shape
NN = ROWS // NUMB  # number of whole batches in the training data

In [None]:
# define the model
import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


def _weights(shape, stddev):
    """Return a float32 variable initialised from a zero-mean truncated normal."""
    return tf.Variable(tf.truncated_normal(shape, mean=0.0, stddev=stddev), dtype=np.float32)


# Graph inputs: labels (one column) and features (VARS columns).
y_ = tf.placeholder(tf.float32, [None, 1])
x  = tf.placeholder(tf.float32, [None, VARS])

# Model: a linear (logistic) term plus one ReLU hidden layer; both feed the
# same output logit.
W      = _weights([VARS, 1], 0.001)
NUML1  = 64  # hidden layer width
W1     = _weights([VARS, NUML1], 0.0001)
W1f    = _weights([NUML1, 1], 0.0001)
linear_part = tf.matmul(x, W)
hidden_part = tf.matmul(tf.nn.relu(tf.matmul(x, W1)), W1f)
logit1 = linear_part + hidden_part
y      = tf.nn.sigmoid(logit1)

In [None]:
# Loss and optimizer.
# Data term: mean squared difference between label and sigmoid output.
residual = y_ - y
loss0 = tf.reduce_mean(residual * residual)
# Regulariser: L2 penalty over all three weight matrices, scaled by L2c.
l2_penalty = tf.nn.l2_loss(W) + tf.nn.l2_loss(W1) + tf.nn.l2_loss(W1f)
loss1 = L2c * l2_penalty
loss = loss0 + loss1

# Learning rate decays by lr_decay every NN optimizer steps.
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(lr0, global_step, NN, lr_decay)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss, global_step=global_step)

# Open the session and initialise every variable defined so far.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [None]:
# Main training loop.
# NOTE(review): the batch sampling and the `train_step` run are commented out
# below, so this loop performs no weight updates and the model is later used
# with its initial random weights. Confirm whether that is intentional before
# re-enabling: the sampling relies on labels being exactly 0 or 1.
y0 = target.to_numpy().astype(np.float32)
x0 = train_df.to_numpy().astype(np.float32)
del train_df
gc.collect()

# The first 80% of the batches are for training; the last 20% are reserved
# for testing.
train_batches = int(NN*0.8)
train_rows = train_batches * NUMB
y0_1 = np.flatnonzero(y0[:train_rows] == 1)  # positive rows in the training part
y0_0 = np.flatnonzero(y0[:train_rows] == 0)  # negative rows in the training part

for i in range(iterations):
    for j in range(train_batches):
        pos_ratio = 0.5
        # Disabled training step (class-balanced batch, then one optimizer step):
        #pos_idx = np.random.choice(y0_1, size=int(np.round(NUMB*pos_ratio)))
        #neg_idx = np.random.choice(y0_0, size=int(np.round(NUMB*(1-pos_ratio))))
        #idx = np.concatenate([pos_idx, neg_idx])
        #fd = {y_: y0[idx].reshape(NUMB,1),x:  x0[idx,:]}
        #_= sess.run( [train_step], feed_dict=fd )
   

In [None]:
# Score the test rows with the current model and write the submission file.
x0 = test_df.to_numpy().astype(np.float32)
# The label placeholder is fed zeros; only `y` is evaluated here.
placeholder_labels = np.zeros([x0.shape[0], 1])
fd = {y_: placeholder_labels, x: x0}
y_pred = sess.run(y, feed_dict=fd)

# Pair each prediction with the id of its test row (ids after the training part).
test_ids = meta_df.iloc[len_train:]
out_df = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': y_pred[:, 0]})
out_df.to_csv('submission.csv', index=False)
print('Time elapsed %.0f sec'%(time.time()-start_time))

In [None]:
# Final blend: combine lgb.csv with submission.csv on SK_ID_CURR.
# NOTE(review): `nn_result` is read from lgb.csv and `log_result` from
# submission.csv -- the variable names do not match the file names; confirm.
# NOTE(review): submission.csv is read from ../input/sub-data, while the
# prediction cell writes 'submission.csv' to the working directory -- verify
# that the intended file is being picked up.
nn_result = pd.read_csv('../input/sub-data/lgb.csv')
nn_result = nn_result.rename(columns={'TARGET': 'nn_TARGET'})
log_result = pd.read_csv('../input/sub-data/submission.csv')
log_result = log_result.rename(columns={'TARGET': 'log_TARGET'})

# Default (inner) join: only ids present in both files survive.
sub = nn_result.merge(log_result, on='SK_ID_CURR')

# Weights 1 and 1.2 are not normalised, so the blended TARGET is a score
# that can exceed 1 rather than a probability.
sub['TARGET'] = 1 * sub['nn_TARGET'] + 1.2 * sub['log_TARGET']
sub = sub.drop(columns=['log_TARGET', 'nn_TARGET'])
sub.to_csv('90sub_lgb_nn_log.csv', index=False, float_format='%.8f')