In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this cell (by clicking run or pressing Shift+Enter) lists the files in the input directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Libraries
# Show every column when a DataFrame is displayed.
# The fully qualified key is used on purpose: abbreviated keys are resolved by
# pattern matching, and 'max_columns' is ambiguous on pandas versions that also
# register 'styler.render.max_columns' (it raises OptionError there).
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score, roc_auc_score
import json
import ast
import time
from sklearn import linear_model
from sklearn.feature_selection import RFECV
import eli5
from eli5.sklearn import PermutationImportance
import shap

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import GenericUnivariateSelect, SelectPercentile, SelectKBest, f_classif, mutual_info_classif, RFE
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
from catboost import CatBoostClassifier

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

import chainer
import chainer.links as L
import chainer.functions as F
from chainer import optimizers
from chainer.cuda import to_cpu

import optuna
from optuna.integration import ChainerPruningExtension

In [None]:
# Read the train and test tables from the input directory, then display the
# (rows, columns) shape of the training table.
train, test = pd.read_csv('../input/train.csv'), pd.read_csv('../input/test.csv')
train.shape

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Histogram of the per-column standard deviations, computed over every column
# from position 2 onwards (the first two columns are skipped).
# `kind` is passed by keyword because Series.plot rejects positional arguments
# with a TypeError on pandas >= 1.0.
train[train.columns[2:]].std().plot(kind='hist');
plt.title('Distribution of stds of all columns');

In [None]:
# Histogram of the per-column means, computed over every column from
# position 2 onwards (the first two columns are skipped).
# `kind` is passed by keyword because Series.plot rejects positional arguments
# with a TypeError on pandas >= 1.0.
train[train.columns[2:]].mean().plot(kind='hist');
plt.title('Distribution of means of all columns');

In [None]:
# Missing-value check: evaluates to False when no cell of `train` is null.
train.isnull().values.any()

In [None]:
print('Distributions of first 28 columns')
plt.figure(figsize=(26, 24))
# Columns at positions 2..29 are drawn on a 7x4 grid, one histogram per panel
# (subplot positions are 1-based).
for panel, feature in enumerate(train.columns[2:30], start=1):
    plt.subplot(7, 4, panel)
    plt.hist(train[feature])
    plt.title(feature)

In [None]:
# Number of rows per distinct value of the `target` column.
train['target'].value_counts()

In [None]:
# Absolute pairwise correlations between the columns of `train`, flattened to
# one row per (level_0, level_1) column pair and sorted in ascending order.
abs_corr = train.corr().abs()
corrs = abs_corr.unstack().sort_values(kind="quicksort").reset_index()
# Discard the pairs that compare a column with itself.
is_distinct_pair = corrs['level_0'] != corrs['level_1']
corrs = corrs[is_distinct_pair]
# The ten most strongly correlated pairs sit at the end of the sorted table.
corrs.tail(10)

In [None]:
# Target vector.
y_train = train['target']

# Cross-validation splitters: a single 20-fold stratified split and a
# 20-times-repeated variant of the same split.
n_fold = 20
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
repeated_folds = RepeatedStratifiedKFold(n_splits=n_fold, n_repeats=20, random_state=42)

# Standardise the features: the scaler is fitted on the training features
# (everything except `id` and `target`) and then applied to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train.drop(['id', 'target'], axis=1))
X_test = scaler.transform(test.drop(['id'], axis=1))

gpu = 0

In [None]:
def create_model(trial, mode = "trial", params = None):
    """Build the fully connected network as a ``chainer.Sequential``.

    The network is ``n_fc_layers`` hidden blocks (``L.Linear`` followed by
    ``F.relu``) and a final ``L.Linear`` with a single output unit.

    Args:
        trial: Optuna trial used to sample the architecture when
            ``mode == "trial"``; not used when ``mode == "train"``.
        mode: ``"trial"`` samples ``n_fc_layers`` (integer in [1, 3]) and each
            layer width (log-uniform in [4, 128], truncated to ``int``) from
            ``trial``; ``"train"`` reads the same keys from ``params``.
        params: Mapping holding ``n_fc_layers`` and one ``n_units_l{i}`` entry
            per hidden layer. Required when ``mode == "train"``.

    Returns:
        chainer.Sequential: the assembled network.

    Raises:
        ValueError: if ``mode == "train"`` and ``params`` is ``None``, or if
            ``mode`` is neither ``"trial"`` nor ``"train"``.
    """
    if mode == "trial":
        n_fc_layers = trial.suggest_int('n_fc_layers', 1, 3)
        layer_widths = [
            int(trial.suggest_loguniform('n_units_l{}'.format(i), 4, 128))
            for i in range(n_fc_layers)
        ]
    elif mode == "train":
        if params is None:
            # This used to be `assert("...")`, which asserts a non-empty string
            # and therefore never fires; fail loudly instead of returning None.
            raise ValueError("Not found param error!")
        n_fc_layers = params['n_fc_layers']
        layer_widths = [int(params['n_units_l{}'.format(i)]) for i in range(n_fc_layers)]
    else:
        # An unknown mode previously produced an empty Sequential without any warning.
        raise ValueError("Unknown mode: {!r} (expected 'trial' or 'train')".format(mode))

    # Both modes share the same construction: hidden blocks, then the output layer.
    layers = []
    for n_units in layer_widths:
        layers.append(L.Linear(None, n_units))
        layers.append(F.relu)
    layers.append(L.Linear(None, 1))
    return chainer.Sequential(*layers)

In [None]:
def create_optimizer(trial, model, params=None):
    """Create an optimizer for ``model`` with a weight-decay hook attached.

    Args:
        trial: Optuna trial. When it is not ``None`` the optimizer type
            (``'Adam'`` or ``'MomentumSGD'``), its step size and the weight
            decay are sampled from it.
        model: Link the optimizer is set up on.
        params: Mapping with ``optimizer``, ``weight_decay`` and either
            ``adam_alpha`` or ``momentum_sgd_lr``. Used (and required) only
            when ``trial`` is ``None``.

    Returns:
        The optimizer, already set up on ``model``.

    Raises:
        ValueError: if both ``trial`` and ``params`` are ``None``.
    """
    # Choose the optimizer and its hyperparameters.
    if trial is not None:
        optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'MomentumSGD'])
        if optimizer_name == 'Adam':
            step_size = trial.suggest_loguniform('adam_alpha', 1e-5, 1e-1)
        else:
            step_size = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)
        weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
    else:
        if params is None:
            # This used to be `assert("...")`, which never fires; the function
            # then failed later with an unrelated TypeError on `params[...]`.
            raise ValueError("Not found params error!")
        optimizer_name = params["optimizer"]
        if optimizer_name == 'Adam':
            step_size = params["adam_alpha"]
        else:
            step_size = params["momentum_sgd_lr"]
        weight_decay = params['weight_decay']

    if optimizer_name == 'Adam':
        optimizer = chainer.optimizers.Adam(alpha=step_size)
    else:
        optimizer = chainer.optimizers.MomentumSGD(lr=step_size)

    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    return optimizer

In [None]:
import chainer
from chainer import datasets
class Dataset(chainer.dataset.DatasetMixin):
    """Dataset pairing the i-th entry of ``x`` with the i-th entry of ``y``."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The dataset length is taken from ``x`` only.
        return len(self.x)

    def get_example(self, i):
        """Return the ``(x[i], y[i])`` pair."""
        sample = self.x[i]
        label = self.y[i]
        return sample, label
    
    

In [None]:
from chainer import report 
class MyRegressor(chainer.Chain):
    """Chain that wraps a predictor and scores it with mean squared error."""

    def __init__(self, predictor):
        super().__init__(predictor=predictor)

    def __call__(self, x, y):
        """Return the MSE between ``self.predictor(x)`` and ``y``.

        The loss is also reported under the key ``'loss'`` for this link.
        """
        loss = F.mean_squared_error(self.predictor(x), y)
        report({'loss': loss}, self)
        return loss
    
def objective(trial, X_train, X_valid, y_train, y_valid, max_epoch=100, gpu_id=0):
    """Optuna objective: train one sampled network and return ``1 - ROC AUC``.

    A model and an optimizer are sampled from ``trial``, trained full-batch
    for ``max_epoch`` epochs on the training split, and scored on the
    validation split.

    Args:
        trial: Optuna trial used to sample the model and the optimizer.
        X_train, X_valid: Feature arrays of the training / validation split.
        y_train, y_valid: Target arrays of the training / validation split.
        max_epoch: Number of training epochs (default 100, the value that was
            previously hard-coded).
        gpu_id: Device id the model is moved to (default 0, the value that was
            previously hard-coded).

    Returns:
        float: ``1.0 - roc_auc_score`` on the validation split, so that a
        minimising study maximises the AUC.

    Note:
        No intermediate value is reported to ``trial`` here, so a pruner
        configured on the study has nothing to act on.
    """
    # Instantiate the model and bind the optimizer to it.
    model = MyRegressor(create_model(trial))
    optimizer = create_optimizer(trial, model)

    # Move the model to the GPU and put the data into the matching array module.
    model.to_gpu(gpu_id)
    xp = model.xp
    X_train_, X_valid_ = xp.asarray(X_train), xp.asarray(X_valid)
    y_train_, y_valid_ = xp.asarray(y_train), xp.asarray(y_valid)

    train_set = Dataset(X_train_, y_train_)
    valid_set = Dataset(X_valid_, y_valid_)
    # Full-batch iterators. The validation batch size is the size of the
    # validation set itself (it used to be taken from the global `test` frame).
    train_iter = chainer.iterators.SerialIterator(train_set, len(train_set))
    valid_iter = chainer.iterators.SerialIterator(valid_set, len(valid_set), repeat=False, shuffle=False)
    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=gpu_id)

    trainer = chainer.training.Trainer(updater, (max_epoch, 'epoch'))
    trainer.extend(chainer.training.extensions.Evaluator(valid_iter, model, device=gpu_id))
    log_report_extension = chainer.training.extensions.LogReport(log_name=None)
    trainer.extend(log_report_extension)
    # Run the training.
    trainer.run()

    # Store the last logged entry on the trial as user attributes.
    log_last = log_report_extension.log[-1]
    for key, value in log_last.items():
        trial.set_user_attr(key, value)

    with chainer.using_config('train', False), \
                chainer.using_config('enable_backprop', False):
        y_pred_valid = model.predictor(X_valid_)

    score = roc_auc_score(to_cpu(y_valid_), to_cpu(y_pred_valid.data))

    # Return the final validation value (lower is better).
    return 1.0 - score

In [None]:
from functools import partial
def train_model(X, X_test, y, folds=folds, gpu_id=0, n_trials=100):
    """Tune hyperparameters on the first split, then train one model per remaining split.

    The first split produced by ``folds`` is used only for the Optuna search
    (``n_trials`` trials of ``objective``). Every later split trains a fresh
    model with the best parameters found, scores it on its validation part
    and adds its test-set output to the running prediction.

    Args:
        X: Training features; must support indexing with integer index arrays
            and ``.astype`` (e.g. a NumPy array).
        X_test: Test features (array with ``.astype``).
        y: Training targets; converted with ``np.asarray`` so that the fold
            indices are always applied positionally.
        folds: Splitter exposing ``split(X, y)``.
        gpu_id: Device id used for the per-fold models. The hyperparameter
            search runs ``objective`` with that function's own device default.
        n_trials: Number of Optuna trials for the search (default 100, the
            value that was previously hard-coded).

    Returns:
        tuple: ``(prediction, scores)`` where ``prediction`` has shape
        ``(len(X_test), 1)`` and is the mean test output over the trained
        models, and ``scores`` is the list of per-fold validation ROC AUCs.
    """
    X_test = X_test.astype(np.float32)
    # Work on a plain array: `y[idx][:, None]` on a pandas Series relies on
    # label lookup and on multi-dimensional Series indexing.
    y = np.asarray(y)
    prediction = np.zeros((len(X_test), 1))
    scores = []
    n_models = 0  # number of models whose test output was added to `prediction`
    max_epoch = 100
    study = None
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        X_train, X_valid = X[train_index].astype(np.float32), X[valid_index].astype(np.float32)
        y_train = y[train_index].astype(np.float32)[:, None]
        y_valid = y[valid_index].astype(np.float32)[:, None]

        if fold_n == 0:
            # Hyperparameter search on the first split only.
            study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
            study.optimize(
                partial(objective, X_train=X_train, X_valid=X_valid, y_train=y_train, y_valid=y_valid),
                n_trials=n_trials,
            )
            print('Number of finished trials: ', len(study.trials))
            best_trial = study.best_trial
            print('Best trial: ', )
            print('Params: ')
            for key, value in best_trial.params.items():
                print('{}:{}'.format(key, value))
            print('User attrs: ')
            for key, value in best_trial.user_attrs.items():
                print('{}:{}'.format(key, value))
            continue

        # Train a fresh model with the best hyperparameters on this split.
        best_params = study.best_trial.params
        model = MyRegressor(create_model(trial=None, mode="train", params=best_params))
        optimizer = create_optimizer(trial=None, model=model, params=best_params)
        model.to_gpu(gpu_id)  # honour the argument (it used to be overwritten with 0)
        xp = model.xp
        X_train, X_valid = xp.asarray(X_train), xp.asarray(X_valid)
        y_train, y_valid = xp.asarray(y_train), xp.asarray(y_valid)
        X_test = xp.asarray(X_test)

        train_set = Dataset(X_train, y_train)
        valid_set = Dataset(X_valid, y_valid)
        # Full-batch iterators. The validation batch size is the size of the
        # validation set itself (it used to be taken from the global `test` frame).
        train_iter = chainer.iterators.SerialIterator(train_set, len(train_set))
        valid_iter = chainer.iterators.SerialIterator(valid_set, len(valid_set), repeat=False, shuffle=False)
        updater = chainer.training.StandardUpdater(train_iter, optimizer, device=gpu_id)
        trainer = chainer.training.Trainer(updater, (max_epoch, 'epoch'))
        trainer.extend(chainer.training.extensions.Evaluator(valid_iter, model, device=gpu_id))
        trainer.extend(chainer.training.extensions.LogReport(log_name=None))
        # Run the training.
        trainer.run()

        # Both inference passes run in test mode without building a backprop
        # graph (the test-set pass used to run outside this context).
        with chainer.using_config('train', False), \
                    chainer.using_config('enable_backprop', False):
            y_pred_valid = model.predictor(X_valid)
            y_pred = model.predictor(X_test)

        score = roc_auc_score(to_cpu(y_valid), to_cpu(y_pred_valid.data))
        scores.append(score)
        print('test_score:{:.04f}'.format(score))

        prediction += to_cpu(y_pred.data)
        n_models += 1

    # Average over the models actually trained. Dividing by the global
    # `n_fold - 1` was wrong whenever `folds` yields a different number of
    # splits (e.g. a repeated splitter).
    if n_models:
        prediction /= n_models

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return prediction, scores

In [None]:
# Run the search and the per-fold training with the repeated stratified splitter;
# yields the test predictions and the list of per-fold validation ROC AUC scores.
prediction, scores = train_model(X_train, X_test, y_train, folds=repeated_folds)

In [None]:
# # v9 0.849, v10(RobustScaler) 0.846
# submission = pd.read_csv('../input/sample_submission.csv')
# submission['target'] = prediction_lasso_repeated
# submission.to_csv('lasso_repeat_cv.csv', index=False)

# submission.head()

In [None]:
# Fill the sample submission with the model output and write it to disk.
submission = pd.read_csv('../input/sample_submission.csv')
# `prediction` is a column vector of shape (n, 1); flatten it explicitly so the
# assignment does not depend on how pandas treats 2-D values for one column.
submission['target'] = np.asarray(prediction).ravel()
submission.to_csv('MLP.csv', index=False)

submission.head()