In [None]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
warnings.filterwarnings('ignore')

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from keras.models import load_model
from keras.initializers import glorot_normal, Zeros, Ones
import keras.backend as K
from keras.optimizers import RMSprop
import tensorflow as tf

In [None]:
# Load the train and test tables from the relative ../input directory.
train = pd.read_csv('../input/train.csv')
test  = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Summary statistics of the training table.
train.describe()

In [None]:
# Every column other than the row identifier and the label is a model input.
non_feature_cols = ('ID_code', 'target')
features = [col for col in train.columns if col not in non_feature_cols]

# Keep identifiers and the label aside; they are not fed to the autoencoder.
test_id_code, train_id_code = test['ID_code'], train['ID_code']
target = train['target']

In [None]:
# Independent copies restricted to the feature columns, so later in-place edits
# cannot reach the frames loaded from disk.
df_train, df_test = train[features].copy(), test[features].copy()
train_target = target.values

# Row counts, used later to split the stacked train+test matrix back apart.
ntrain, ntest = len(df_train), len(df_test)

In [None]:
from scipy.special import erfinv
def hot_encoder(df, columns):
    """Append one-hot indicator columns for each listed column, in place.

    For every column ``c`` in ``columns`` and every distinct value ``v`` found in
    ``df[c]``, a new integer column named ``c + '_oh_' + str(v)`` is added that is
    1 where ``df[c] == v`` and 0 elsewhere. The source columns are kept.

    Args:
        df: DataFrame to extend. It is modified in place.
        columns: Iterable of column names in ``df`` to encode.

    Returns:
        The same ``df`` object, with the indicator columns appended.

    Note:
        Membership is tested with ``==``, so a NaN category yields an all-zero
        indicator column (NaN never compares equal to itself).
    """
    # Collect the distinct values up front, before any indicator column is added.
    one_hot = {c: list(df[c].unique()) for c in columns}
    for c, distinct_values in one_hot.items():
        col_values = df[c].values
        for val in distinct_values:
            # Builtin ``int`` replaces the ``np.int`` alias, which was removed in
            # NumPy 1.24; the resulting dtype is the same.
            df[c + '_oh_' + str(val)] = (col_values == val).astype(int)
    return df

def scale_feat(df):
    """Standardise every column of ``df`` with a freshly fitted StandardScaler.

    Returns a new DataFrame that reuses the input's column labels. The frame is
    rebuilt from the scaler's output, so the input's row index is not carried over.
    """
    standardised = StandardScaler().fit_transform(df)
    return pd.DataFrame(standardised, columns=df.columns)

def rank_gauss(x):
    """Map a 1-D numpy vector to bell-shaped scores based on the rank of each entry.

    Ranks are scaled into the open interval (-1, 1), passed through the inverse
    error function, and the result is shifted to have zero mean.

    Args:
        x: numpy vector.

    Returns:
        Float array of the same length as ``x``.
    """
    N = x.shape[0]
    # Double argsort gives each element's rank (0 .. N-1) in sorted order.
    ranks = x.argsort().argsort()
    scaled = ranks / N
    scaled = scaled - scaled.mean()
    # After doubling, the extremes are +/-(N-1)/N, i.e. strictly inside (-1, 1),
    # so erfinv stays finite.
    scaled = scaled * 2
    gauss = erfinv(scaled)  # np.sqrt(2)*erfinv(scaled) would give unit-variance scores
    return gauss - gauss.mean()

def df_inputSwapNoise(df, p):
    """Apply column-wise "swap noise" to ``df`` in place and return it.

    For every column, ``round(n * p)`` row positions are drawn at random (with
    replacement) and overwritten with values drawn at random (with replacement)
    from that same column's original values. Because both draws are with
    replacement, fewer than ``round(n * p)`` entries may actually change.

    Uses the global ``np.random`` state; seed it beforehand for reproducible noise.

    Args:
        df: DataFrame to corrupt. Its columns are overwritten in place.
        p: Fraction of rows to overwrite per column.

    Returns:
        The same ``df`` object, with noisy columns.
    """
    n = df.shape[0]
    swap_n = int(round(n * p))
    if swap_n == 0:
        # Nothing to swap (empty frame or p too small): leave the data untouched.
        return df
    for col in df.columns:
        original = df[col].values
        # Edit a private copy: ``.values`` may be a (possibly read-only) view of
        # the frame's storage, so writing into it directly is not safe.
        noisy = original.copy()
        swap_idx = np.random.randint(0, n, size=swap_n)
        # Replacements are sampled from the untouched original values.
        noisy[swap_idx] = np.random.choice(original, size=swap_n)
        df[col] = noisy
    return df

In [None]:
print('Transforming data')
excluded_cols = ('ID_code', 'target')
feature_cols = [name for name in df_train.columns if name not in excluded_cols]
keep_cols = list(feature_cols)

# Stack train on top of test so both go through identical preprocessing.
df_all = pd.concat([df_train[keep_cols], df_test[keep_cols]])

# Snapshot the clean data first: df_inputSwapNoise overwrites df_all in place and
# returns that same object.
df_all_org = df_all.copy()
df_all_noise = df_inputSwapNoise(df_all, 0.15)

data_all_org, data_all_noise = df_all_org.values, df_all_noise.values
cols = data_all_org.shape[1]
print('Final data with {} columns'.format(cols))

In [None]:
# Rank-gauss every column with more than three distinct values, first in the clean
# matrix and then in the noisy one. Columns are overwritten in place.
for matrix in (data_all_org, data_all_noise):
    for i in range(cols):
        column = matrix[:, i]
        if np.unique(column).shape[0] > 3:
            matrix[:, i] = rank_gauss(column)

# Split the stacked matrices back into their train and test parts.
train_data_orig, test_data_orig = data_all_org[:ntrain, :], data_all_org[ntrain:, :]
train_data_noise, test_data_noise = data_all_noise[:ntrain, :], data_all_noise[ntrain:, :]

for part in (train_data_orig, test_data_orig, train_data_noise, test_data_noise):
    print(part.shape)

In [None]:
# Rebuild full (train + test) matrices as fresh arrays for autoencoder training.
print('Original data')
all_data = np.concatenate((train_data_orig, test_data_orig), axis=0)
print('Noise data')
all_data_noise = np.concatenate((train_data_noise, test_data_noise), axis=0)

In [None]:
print('Creating neural net')
n_inputs = all_data.shape[1]

model = Sequential()
# Three identical hidden blocks: Dense(1500) -> ReLU -> BatchNormalization.
# Only the first Dense layer declares the input width.
for block_no in range(3):
    first_layer_args = {'input_dim': n_inputs} if block_no == 0 else {}
    model.add(Dense(units=1500, kernel_initializer=glorot_normal(), **first_layer_args))
    model.add(Activation('relu'))
    model.add(BatchNormalization())

# Linear output with one unit per input column: the network reconstructs its input.
model.add(Dense(n_inputs))
model.add(Activation('linear'))

opt = keras.optimizers.Adam(lr=0.0009)
model.compile(optimizer=opt, loss='mse')

In [None]:
print('Training neural net')
# Short schedule for a quick run; raise `epochs` to 120 for the full training.
epochs = 15

# Save the model to keras_dae.h5 whenever the training loss improves.
chck = ModelCheckpoint('keras_dae.h5', monitor='loss', save_best_only=True)
# patience (100) exceeds `epochs` (15), so early stopping cannot fire with these settings.
early_stop = EarlyStopping(monitor='loss', patience=100, verbose=2, min_delta=0)
cb = [early_stop, chck]

# Inputs are the swap-noised rows, targets are the clean rows.
model.fit(all_data_noise, all_data, epochs=epochs, batch_size=128, callbacks=cb, verbose=1)

print('Applying neural net')
# Predictions come from the in-memory model as it stands after the last epoch; the
# checkpoint file is not reloaded here.
train_data_transform = model.predict(train_data_orig)
test_data_transform = model.predict(test_data_orig)
for transformed in (train_data_transform, test_data_transform):
    print(transformed.shape)

In [None]:
# Wrap the autoencoder outputs as DataFrames labelled with the original feature names.
train_data, test_data = (
    pd.DataFrame(data=transformed, columns=features)
    for transformed in (train_data_transform, test_data_transform)
)

In [None]:
# Persist the transformed features (no index column) for reuse.
for frame, out_path in ((train_data, 'train_dae.csv'), (test_data, 'test_dae.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# Preview the autoencoder-transformed training features.
train_data.head()

In [None]:
# LightGBM training parameters, passed to lgb.train in the cross-validation loop.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.335,
    boost_from_average='true',
    boost='gbdt',
    feature_fraction=0.041,
    learning_rate=0.0083,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

In [None]:
# 15-fold stratified cross-validation of LightGBM on the autoencoder features.
# random_state is deliberately not passed: with shuffle=False it never influenced
# the (deterministic) folds, and recent scikit-learn versions raise a ValueError
# when a seed is combined with shuffle=False.
folds = StratifiedKFold(n_splits=15, shuffle=False)
oof = np.zeros(len(train_data))            # out-of-fold predictions for the training rows
predictions = np.zeros(len(test_data))     # test predictions averaged over folds
fold_importances = []                      # one importance frame per fold, concatenated once below
print(train_data.shape)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data.values, target.values)):
    print("Fold {}".format(fold_))
    val_features = train_data.iloc[val_idx][features]
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # Effectively "train until early stopping": the round cap is far above the patience.
    num_round = 1000000
    # NOTE(review): newer LightGBM releases expect logging / early stopping to be passed
    # as callbacks instead of these keyword arguments - confirm against the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_data[features], num_iteration=clf.best_iteration) / folds.n_splits

# Build the importance table with a single concat instead of growing it every fold.
feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
# Submission file: one row per test ID with the fold-averaged prediction.
sub_df = pd.DataFrame({"ID_code": test_id_code, "target": predictions})
sub_df.to_csv("submission1.csv", index=False)


In [None]:
##end