In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
pd.__version__

In [None]:
#audio_utils
import numpy as np

def select_feature_func(feature_name):
    """
    Look up the audio feature extractor registered under ``feature_name``.

    Parameters
    ----------
    feature_name: str
        'aqibsaeed_1' or 'mfcc'

    Return
    ------
    The matching extractor function, or None when the name is not known.
    """
    # Guard clauses: only the selected extractor name is ever resolved.
    if feature_name == 'mfcc':
        return get_feature_mfcc
    if feature_name == 'aqibsaeed_1':
        return get_feature_aqibsaeed_1
    return None

def get_feature_mfcc(X, sr, n_mfcc=13):
    """
    Compute MFCC features of an audio signal.

    Parameters
    ----------
    X:
        audio signal, forwarded to librosa as ``y``
    sr:
        sampling rate, forwarded as ``sr``
    n_mfcc: int
        number of coefficients requested from librosa (default 13)

    Return
    ------
    Whatever ``librosa.feature.mfcc`` returns for these arguments.
    """
    # Function-scope import: librosa is only needed when this extractor runs.
    import librosa
    return librosa.feature.mfcc(y=X, sr=sr, n_mfcc=n_mfcc)

def get_feature_aqibsaeed_1(X, sr, au_path=None):
    """
    Hand-crafted audio feature vector following
    http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/

    The time-averaged MFCC (40 coefficients), chroma, mel-spectrogram,
    spectral-contrast and tonnetz features are concatenated into one vector.

    Parameters
    ----------
    X:
        audio signal (replaced by the loaded signal when au_path is given)
    sr:
        sampling rate of X (replaced by the loaded rate when au_path is given)
    au_path: str, optional
        if not None, the audio is loaded from this path with librosa.load

    Return
    ------
    feature (ndarray): 1-D concatenation [mfccs, chroma, mel, contrast, tonnetz]
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    # The audio must be passed as y=...: recent librosa releases only accept
    # keyword arguments here, and older ones accept the keyword as well.
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sr).T, axis=0)
    feature = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    return feature

def get_feature_aqibsaeed_conv(X, sr, au_path=None):
    """
    Windowed log-mel-spectrogram features following
    http://aqibsaeed.github.io/2016-09-24-urban-sound-classification-part-2/

    The signal is cut into half-overlapping windows of 512 * 40 samples. Every
    window of full length is turned into a 60-band mel spectrogram, converted
    to a log scale and flattened to one row.

    Parameters
    ----------
    X:
        1-D audio signal (replaced by the loaded signal when au_path is given)
    sr:
        sampling rate of X. It is not forwarded to the mel-spectrogram call,
        which therefore uses librosa's default rate (as the original code did).
    au_path: str, optional
        if not None, the audio is loaded from this path with librosa.load,
        like get_feature_aqibsaeed_1 does

    Return
    ------
    log_specgrams (ndarray): n_windows x 1 x n_values, one flattened log-mel
        spectrogram per full-length window. When X is shorter than one window
        the result is an empty array of shape (0,).
    """
    import librosa
    if au_path is not None:
        X, sr = librosa.load(au_path)
    # librosa.logamplitude no longer exists in current librosa; power_to_db is
    # its replacement. Fall back to the old name on very old installations.
    to_db = getattr(librosa, "power_to_db", None) or librosa.logamplitude
    bands = 60
    frames = 41
    window_size = 512 * (frames - 1)
    # Integer step: a float start offset cannot be used as a slice bound.
    step = window_size // 2
    log_specgrams = []
    for start in range(0, len(X), step):
        signal = X[start:start + window_size]
        if len(signal) != window_size:
            # trailing window that is shorter than window_size
            continue
        melspec = librosa.feature.melspectrogram(y=signal, n_mels=bands)
        logspec = to_db(melspec)
        # (bands, n_frames) -> single row of length bands * n_frames
        log_specgrams.append(logspec.T.flatten()[:, np.newaxis].T)
    return np.asarray(log_specgrams)


In [None]:
#cache utils
def name2path(name):
    """
    Flatten a '/'-separated name so it can be used as a single path component:
    every '/' in name is replaced by '-'.
    """
    return "-".join(name.split("/"))


In [None]:
#config utils
# -*- coding:utf-8 -*-

def load_json(path):
    """
    Load a JSON file, supporting full-line comments that start with //.

    Lines whose first non-blank characters are // are dropped before the
    remaining text is parsed. Trailing comments after a value are not handled.

    Parameters
    ----------
    path: str
        path of the JSON file

    Return
    ------
    The parsed JSON value.
    """
    import json
    with open(path) as f:
        kept_rows = [row for row in f if not row.strip().startswith("//")]
    # Rows still carry their own line endings, so they are joined as-is.
    return json.loads("".join(kept_rows))

def get_config_value(config, key, default_value, value_types, required=False, config_name=None):
    """
    Read config[key] with an optional type check.

    Parameters
    ----------
    config: dict
        Config dictionary
    key: str
        Config's key
    default_value:
        Returned when key is absent in config (and not required)
    value_types: Type or List of Types
        if not None, the value must be an instance of at least one of them.
        The check is skipped when the stored value is None.
    required: bool
        if the key is required in config
    config_name: str
        used for debug (prefix of the error messages)

    Return
    ------
    config[key], or default_value when the key is absent

    Raises
    ------
    ValueError
        if a required key is absent, or the value matches none of value_types
    """
    if config_name is not None:
        log_prefix = "[{}] ".format(config_name)
    else:
        log_prefix = ""
    if key not in config:
        if required:
            raise ValueError("{}config={}, key={} is absent but it's required !!!".format(log_prefix, config, key))
        return default_value
    value = config[key]
    # value type check
    if value is not None:
        if value_types is None:
            value_types = []
        elif not isinstance(value_types, list):
            value_types = [value_types]
        # One matching type is enough; an empty list means "no check".
        if len(value_types) > 0 and not isinstance(value, tuple(value_types)):
            raise ValueError("{}config={}, Value type not matched!!! key={}, value={}, value_types={}".format(
                log_prefix, config, key, value, value_types))
    return value


In [None]:
#debug utils
def repr_blobs_shape(blobs):
    """
    Describe the shapes of a sequence of blobs in one short string.

    Every blob contributes its dimensions joined by 'x' (e.g. '2x3'), a None
    blob contributes 'null'; the entries are separated by ','.
    """
    return ','.join(
        'null' if blob is None else 'x'.join(str(dim) for dim in blob.shape)
        for blob in blobs
    )


In [None]:
#log utils
# -*- coding:utf-8 -*-
import os, os.path as osp
import time

def strftime(t = None):
    """
    Format a timestamp as YYYYmmdd-HHMMSS in local time.

    Parameters
    ----------
    t: float, optional
        seconds since the epoch; None means the current time. A value of 0 is
        a valid timestamp (the epoch) and is formatted as such.
    """
    # time.localtime(None) already falls back to the current time, so no
    # truthiness test on t is needed (it used to turn t=0 into "now").
    return time.strftime("%Y%m%d-%H%M%S", time.localtime(t))

#################
# Logging
#################
import logging
# NOTE(review): TimedRotatingFileHandler is not used in the cells visible here.
from logging.handlers import TimedRotatingFileHandler
# Message format of the root logger: timestamp, module.function, message.
logging.basicConfig(format="[ %(asctime)s][%(module)s.%(funcName)s] %(message)s")

# Level that get_logger() applies when the caller passes none.
DEFAULT_LEVEL = logging.INFO
# Directory in which init_fh() creates the log file; None turns file logging off.
DEFAULT_LOGGING_DIR = osp.join("logs", "gcforest")
# File handler shared by all loggers; created on first use by init_fh().
fh = None

def init_fh():
    """
    Create the shared file handler ``fh`` once.

    Nothing happens when the handler already exists or when
    DEFAULT_LOGGING_DIR is None. Otherwise the directory is created if needed
    and a FileHandler writing to <DEFAULT_LOGGING_DIR>/<timestamp>.log is
    stored in the module-level ``fh``.
    """
    global fh
    if fh is not None or DEFAULT_LOGGING_DIR is None:
        return
    if not osp.exists(DEFAULT_LOGGING_DIR):
        os.makedirs(DEFAULT_LOGGING_DIR)
    log_file = osp.join(DEFAULT_LOGGING_DIR, "{}.log".format(strftime()))
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter("[ %(asctime)s][%(module)s.%(funcName)s] %(message)s")
    fh.setFormatter(formatter)

def update_default_level(defalut_level):
    """
    Replace the module-wide DEFAULT_LEVEL, i.e. the level get_logger() applies
    when it is called without an explicit level.

    The parameter keeps its historical spelling ("defalut").
    """
    global DEFAULT_LEVEL
    DEFAULT_LEVEL = defalut_level

def update_default_logging_dir(default_logging_dir):
    """
    Replace the module-wide DEFAULT_LOGGING_DIR read by init_fh().

    Passing None makes init_fh() skip the creation of a file handler. A handler
    that already exists is not touched by this function.
    """
    global DEFAULT_LOGGING_DIR
    DEFAULT_LOGGING_DIR = default_logging_dir

def get_logger(name="gcforest", level=None):
    """
    Return the logger called ``name``, configured for this project.

    Parameters
    ----------
    name: str
        logger name handed to logging.getLogger
    level: int, optional
        logging level; a falsy value selects DEFAULT_LEVEL

    The shared file handler is created on demand (init_fh) and attached to the
    logger when it exists.
    """
    effective_level = level if level else DEFAULT_LEVEL
    logger = logging.getLogger(name)
    logger.setLevel(effective_level)
    init_fh()
    if fh is None:
        return logger
    logger.addHandler(fh)
    return logger


In [None]:
#win utils
# -*- coding:utf-8 -*-
import numpy as np
from joblib import Parallel, delayed

#import get_logger

LOGGER = get_logger('win.win_helper')

def get_windows_channel(X, X_win, des_id, nw, nh, win_x, win_y, stride_x, stride_y):
    """
    Fill one row of X_win: the values found at one fixed (channel, row offset,
    column offset) position of every sliding window.

    X: N x C x H x W
    X_win: nc x (N * nh * nw), written in place (row des_id)
    des_id: int
        flat index of (k, di, dj) in range(X.channel, win_y, win_x), i.e.
        des_id = (k * win_y + di) * win_x + dj
    nw, nh: int
        number of window positions along x / y
    win_x, win_y: int
        window size
    stride_x, stride_y: int
        window stride
    """
    # Decode des_id with integer division: array indices must be ints.
    dj = des_id % win_x
    di = des_id // win_x % win_y
    k = des_id // win_x // win_y
    src = X[:, k, di:di+nh*stride_y:stride_y, dj:dj+nw*stride_x:stride_x].ravel()
    des = X_win[des_id, :]
    np.copyto(des, src)

def get_windows(X, win_x, win_y, stride_x=1, stride_y=1, pad_x=0, pad_y=0):
    """
    Extract all sliding windows of X (the rows of the result are filled in
    parallel by get_windows_channel).

    Arguments:
        X (ndarray): n x c x h x w
        win_x, win_y (int): window size
        stride_x, stride_y (int): window stride
        pad_x, pad_y (int): number of zero columns / rows added on BOTH sides
    Return:
        X_win (ndarray): n x nh x nw x nc (float32), where
            nc = win_y * win_x * c and nh / nw are the numbers of window
            positions along y / x
    """
    assert len(X.shape) == 4
    n, c, h, w = X.shape
    if pad_y > 0:
        X = np.concatenate(( X, np.zeros((n, c, pad_y, w),dtype=X.dtype) ), axis=2)
        X = np.concatenate(( np.zeros((n, c, pad_y, w),dtype=X.dtype), X ), axis=2)
    n, c, h, w = X.shape
    if pad_x > 0:
        X = np.concatenate(( X, np.zeros((n, c, h, pad_x),dtype=X.dtype) ), axis=3)
        X = np.concatenate(( np.zeros((n, c, h, pad_x),dtype=X.dtype), X ), axis=3)
    n, c, h, w = X.shape
    nc = win_y * win_x * c
    # Integer division: nh / nw are used as array dimensions and slice bounds.
    nh = (h - win_y) // stride_y + 1
    nw = (w - win_x) // stride_x + 1
    X_win = np.empty(( nc, n * nh * nw), dtype=np.float32)
    LOGGER.info("get_windows_start: X.shape={}, X_win.shape={}, nw={}, nh={}, c={}, win_x={}, win_y={}, stride_x={}, stride_y={}".format(
                X.shape, X_win.shape, nw, nh, c, win_x, win_y, stride_x, stride_y))
    # Each job writes a different row of X_win, so the threads share the buffer.
    Parallel(n_jobs=-1, backend="threading", verbose=0)(
            delayed(get_windows_channel)(X, X_win, des_id, nw, nh, win_x, win_y, stride_x, stride_y)
            for des_id in range(c * win_x * win_y))
    LOGGER.info("get_windows_end")
    X_win = X_win.transpose((1, 0))
    X_win = X_win.reshape((n, nh, nw, nc))
    return X_win

def calc_accuracy(y_gt, y_pred, tag):
    """
    Log the accuracy of y_pred against the ground truth y_gt.

    The share of positions where both agree is written to LOGGER (INFO) as a
    percentage with two decimals, labelled with ``tag``. Nothing is returned.
    """
    n_correct = np.sum(y_gt==y_pred)
    percent = n_correct*100./len(y_gt)
    LOGGER.info("Accuracy({})={:.2f}%".format(tag, percent))

def win_vote(y_win_predict, n_classes):
    """
    Majority vote over the windows of every sample.

    Parameters
    ----------
    y_win_predict (ndarray): n x n_window
        y_win_predict[i, j] prediction for the ith data of jth window
    n_classes (int):
        minimum length of the per-sample vote histogram

    Return
    ------
    y_pred (ndarray): n, dtype int16
        most frequent class of every row (the smallest class index on ties)
    """
    winners = [np.argmax(np.bincount(y_bag, minlength=n_classes))
               for y_bag in y_win_predict]
    y_pred = np.zeros(len(y_win_predict), dtype=np.int16)
    y_pred[:] = winners
    return y_pred

def win_avg(y_win_proba):
    """ 
     
    Parameters
    ----------
    y_win_proba: n x n_windows x n_classes
    """
    n_classes = y_win_proba.shape[-1]
    y_bag_proba = np.mean(y_win_proba, axis=1)
    y_pred = np.argmax(y_bag_proba, axis=1)
    return y_pred


In [None]:
#metrics
# -*- coding:utf-8 -*-
import numpy as np

#from .win_utils import win_vote, win_avg

def accuracy(y_true, y_pred):
    """
    Fraction (0..1) of positions at which y_true and y_pred are equal.
    """
    n_hits = np.sum(y_true == y_pred)
    n_total = len(y_true)
    return 1.0 * n_hits / n_total

def accuracy_pb(y_true, y_proba):
    """
    Accuracy computed from class probabilities.

    y_true is flattened, y_proba is flattened to (-1, n_classes) where
    n_classes is its last dimension; the predicted label of every row is the
    argmax of its probabilities.
    """
    n_classes = y_proba.shape[-1]
    labels = y_true.reshape(-1)
    predicted = np.argmax(y_proba.reshape((-1, n_classes)), 1)
    n_hits = np.sum(labels == predicted)
    return 1.0 * n_hits / len(labels)

def accuracy_win_vote(y_true, y_proba):
    """
    Accuracy after a majority vote over the windows of every sample.

    Parameters
    ----------
    y_true: n x n_windows
        only the first column is used as the label of a sample
    y_proba: n x n_windows x n_classes
    """
    window_labels = np.argmax(y_proba, axis=2)
    voted = win_vote(window_labels, y_proba.shape[-1])
    sample_labels = y_true[:,0]
    return accuracy(sample_labels, voted)

def accuracy_win_avg(y_true, y_proba):
    """
    Accuracy after averaging the class probabilities over the windows of
    every sample.

    Parameters
    ----------
    y_true: n x n_windows
        only the first column is used as the label of a sample
    y_proba: n x n_windows x n_classes
    """
    averaged_pred = win_avg(y_proba)
    sample_labels = y_true[:,0]
    return accuracy(sample_labels, averaged_pred)


In [None]:
#base_layer
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import os.path as osp
import numpy as np

#from ..utils.log_utils import get_logger
#from ..utils.config_utils import get_config_value

LOGGER = get_logger('gcforest.layers.base_layer')

class BaseLayer(object):
    """
    Common base of the layers: keeps the layer configuration and the data
    cache, and offers helpers to read config values and to probe the cache for
    already computed tops. Subclasses implement fit_transform() / transform().
    """
    def __init__(self, layer_config, data_cache):
        """
        Parameters
        ----------
        layer_config: dict
            must contain the keys "name", "bottoms" and "tops"
        data_cache:
            cache object the layer reads its bottoms from / writes its tops to
        """
        self.layer_config = layer_config
        self.name = layer_config["name"]
        self.bottom_names = layer_config["bottoms"]
        self.top_names = layer_config["tops"]
        self.data_cache = data_cache

    def get_value(self, key, default_value, value_types, required=False, config=None):
        """
        Read ``key`` via get_config_value, using this layer's name in error
        messages.

        config: dict, optional
            dictionary to read from; a falsy value (None or an empty dict)
            selects self.layer_config
        """
        return get_config_value(config or self.layer_config, key, default_value, value_types,
                required=required, config_name=self.name)

    def check_top_cache(self, phases, ti):
        """
        Check if top cache exists

        Parameters
        ---------
        phases: List of str
            e.g. ["train", "test"]
        ti: int
            top index

        Return
        ------
        exist_mask: ndarray of float, one entry per phase
            exist_mask[pi] is 1.0 when tops[ti] was found in the data cache for
            phases[pi] (the cache lookup returned something other than None),
            else 0.0
        """
        top_name = self.top_names[ti]
        exist_mask = np.zeros(len(phases))
        for pi, phase in enumerate(phases):
            top = self.data_cache.get(phase, top_name, ignore_no_exist=True)
            exist_mask[pi] = top is not None
            if top is not None:
                LOGGER.info("[data][{},{}] top cache exists. tops[{}].shape={}".format(self.name, phase, ti, top.shape))
        return exist_mask

    def fit_transform(self, train_config):
        """Fit the layer and compute its tops; must be implemented by subclasses."""
        raise NotImplementedError()

    def transform(self):
        """Compute the tops with an already fitted layer; must be implemented by subclasses."""
        raise NotImplementedError()

    def score(self):
        """Optional hook; does nothing in the base class."""
        pass


In [None]:
#fg_concat_layer
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np

# from .base_layer import BaseLayer
# from ..utils.debug_utils import repr_blobs_shape
# from ..utils.log_utils import get_logger

LOGGER = get_logger('gcforest.layers.fg_concat_layer')

class FGConcatLayer(BaseLayer):
    """
    Layer that concatenates all of its bottoms into a single top.
    """
    def __init__(self, layer_config, data_cache):
        """
        Concat Layer

        Reads the optional int config value "axis" (default -1) and requires
        at least one bottom and exactly one top.
        """
        super(FGConcatLayer, self).__init__(layer_config, data_cache)
        self.axis = self.get_value("axis", -1, int)
        assert len(self.bottom_names) > 0
        assert len(self.top_names) == 1

    def fit_transform(self, train_config):
        """Concatenate the bottoms of every phase in train_config.phases."""
        message = "[data][{}] bottoms={}, tops={}".format(self.name, self.bottom_names, self.top_names)
        LOGGER.info(message)
        self._transform(train_config.phases)

    def transform(self):
        """Concatenate the bottoms of the "test" phase."""
        message = "[data][{}] bottoms={}, tops={}".format(self.name, self.bottom_names, self.top_names)
        LOGGER.info(message)
        self._transform(["test"])

    def _transform(self, phases):
        """
        bottoms:
            for example: n x Ci x w x h

        A phase whose top is already cached is skipped. With axis == -1 every
        bottom is first flattened to n x (-1) and the results are joined along
        axis 1; otherwise the bottoms are joined along self.axis as they are.
        """
        for phase in phases:
            # check top cache
            already_cached = self.check_top_cache([phase], 0)[0]
            if already_cached:
                continue
            bottoms = self.data_cache.gets(phase, self.bottom_names)
            LOGGER.info('[data][{},{}] bottoms.shape={}'.format(self.name, phase, repr_blobs_shape(bottoms)))
            flatten_first = self.axis == -1
            if flatten_first:
                # replace every entry of the list by its 2-D view
                for i, bottom in enumerate(bottoms):
                    n_rows = bottom.shape[0]
                    bottoms[i] = bottom.reshape((n_rows, -1))
            concat_axis = 1 if flatten_first else self.axis
            concat_data = np.concatenate(bottoms, axis=concat_axis)
            LOGGER.info('[data][{},{}] tops[0].shape={}'.format(self.name, phase, concat_data.shape))
            self.data_cache.update(phase, self.top_names[0], concat_data)


In [None]:
#fg pool layer
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
#from tqdm import trange

# from .base_layer import BaseLayer
# from ..utils.debug_utils import repr_blobs_shape
# from ..utils.log_utils import get_logger

LOGGER = get_logger('gcforest.layers.fg_pool_layer')

class FGPoolLayer(BaseLayer):
    """
    Pooling layer: reduces every win_y x win_x tile of a bottom to one value.
    """
    def __init__(self, layer_config, data_cache):
        """
        Pooling Layer (MaxPooling, AveragePooling)

        Config values
        -------------
        win_x, win_y: int, required
            tile size
        pool_method: str, optional
            'avg' (default) or 'max'
        """
        super(FGPoolLayer, self).__init__(layer_config, data_cache)
        self.win_x = self.get_value("win_x", None, int, required=True)
        self.win_y = self.get_value("win_y", None, int, required=True)
        # `basestring` only exists in Python 2; `str` is the Python 3 text type.
        self.pool_method = self.get_value("pool_method", "avg", str)

    def fit_transform(self, train_config):
        """Pool the bottoms of every phase in train_config.phases, skipping cached tops."""
        LOGGER.info("[data][{}] bottoms={}, tops={}".format(self.name, self.bottom_names, self.top_names))
        self._transform(train_config.phases, True)

    def transform(self):
        """Pool the bottoms of the "test" phase, without looking at the top cache."""
        LOGGER.info("[data][{}] bottoms={}, tops={}".format(self.name, self.bottom_names, self.top_names))
        self._transform(["test"], False)

    def _transform(self, phases, check_top_cache):
        """
        Compute tops[ti] from bottoms[ti] for every top and phase.

        phases: List of str
        check_top_cache: bool
            if True, a (phase, top) pair that is already cached is skipped

        Every bottom is unpacked as n x c x h x w; the top has shape
        n x c x ceil(h / win_y) x ceil(w / win_x) (float32). Tiles at the
        bottom / right border may be smaller than the window.

        Raises ValueError for an unknown pool_method.
        """
        for ti, top_name in enumerate(self.top_names):
            LOGGER.info("[progress][{}] ti={}/{}, top_name={}".format(self.name, ti, len(self.top_names), top_name))
            for phase in phases:
                # check top cache
                if check_top_cache and self.check_top_cache([phase], ti)[0]:
                    continue
                X = self.data_cache.get(phase, self.bottom_names[ti])
                LOGGER.info('[data][{},{}] bottoms[{}].shape={}'.format(self.name, phase, ti, X.shape))
                n, c, h, w = X.shape
                win_x, win_y = self.win_x, self.win_y
                # Ceil division with integers: nh / nw are array dimensions
                # and range() bounds, so they must not be floats.
                nh = (h - 1) // win_y + 1
                nw = (w - 1) // win_x + 1
                X_pool = np.empty(( n, c, nh, nw), dtype=np.float32)
                for k in range(c):
                    for di in range(nh):
                        for dj in range(nw):
                            si = di * win_y
                            sj = dj * win_x
                            # one tile of every sample, flattened to n x (tile size)
                            src = X[:, k, si:si+win_y, sj:sj+win_x]
                            src = src.reshape((X.shape[0], -1))
                            if self.pool_method == 'max':
                                X_pool[:, k, di, dj] = np.max(src, axis=1)
                            elif self.pool_method == 'avg':
                                X_pool[:, k, di, dj] = np.mean(src, axis=1)
                            else:
                                raise ValueError('Unkown Pool Method, pool_method={}'.format(self.pool_method))
                LOGGER.info('[data][{},{}] tops[{}].shape={}'.format(self.name, phase, ti, X_pool.shape))
                self.data_cache.update(phase, top_name, X_pool)


In [None]:
#base estimator
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import os, os.path as osp
import numpy as np

# from ..utils.log_utils import get_logger
# from ..utils.cache_utils import name2path

LOGGER = get_logger("gcforest.estimators.base_estimator")

def check_dir(path):
    """
    Make sure the parent directory of ``path`` exists, creating it (and any
    missing ancestors) when it does not.
    """
    parent = osp.abspath(osp.join(path, osp.pardir))
    if osp.exists(parent):
        return
    os.makedirs(parent)

class BaseClassifierWrapper(object):
    """
    Wrapper around an estimator class that adds an optional on-disk model
    cache and batched predict_proba.

    Subclasses that want to use cache_dir must implement
    _load_model_from_disk() and _save_model_to_disk().
    """
    def __init__(self, name, est_class, est_args):
        """
        name: str
            Used for debug and as the filename this model may be saved in the disk
        est_class: class
            estimator class, instantiated as est_class(**est_args)
        est_args: dict
            keyword arguments of the estimator
        """
        self.name = name
        self.est_class = est_class
        self.est_args = est_args
        self.cache_suffix = ".pkl"
        self.est = None

    def _init_estimator(self):
        """
        You can re-implement this function when inheriting from this class
        """
        est = self.est_class(**self.est_args)
        return est

    def fit(self, X, y, cache_dir=None):
        """
        cache_dir(str):
            if not None
                then if there is something in cache_dir, dont have fit the thing all over again
                otherwise, fit it and save to model cache
            if None, the fitted estimator is kept in memory (self.est)
        """
        LOGGER.debug("X_train.shape={}, y_train.shape={}".format(X.shape, y.shape))
        cache_path = self._cache_path(cache_dir)
        # cache
        if self._is_cache_exists(cache_path):
            LOGGER.info("Find estimator from {} . skip process".format(cache_path))
            return
        est = self._init_estimator()
        self._fit(est, X, y)
        if cache_path is not None:
            # saved in disk
            LOGGER.info("Save estimator to {} ...".format(cache_path))
            check_dir(cache_path)
            self._save_model_to_disk(est, cache_path)
        else:
            # keep in memory
            self.est = est

    def predict_proba(self, X, cache_dir=None, batch_size=None):
        """
        Predict class probabilities for X.

        cache_dir(str):
            if not None, the estimator is loaded from the model cache,
            otherwise the estimator kept in memory is used
        batch_size(int):
            rows per prediction batch; a falsy value selects
            _default_predict_batch_size(), and a result of 0 means one
            single call
        """
        LOGGER.debug("X.shape={}".format(X.shape))
        cache_path = self._cache_path(cache_dir)
        # cache
        if cache_path is not None:
            LOGGER.info("Load estimator from {} ...".format(cache_path))
            est = self._load_model_from_disk(cache_path)
            LOGGER.info("done ...")
        else:
            est = self.est
        batch_size = batch_size or self._default_predict_batch_size(est, X)
        if batch_size > 0:
            y_proba = self._batch_predict_proba(est, X, batch_size)
        else:
            y_proba = self._predict_proba(est, X)
        LOGGER.debug("y_proba.shape={}".format(y_proba.shape))
        return y_proba

    def _cache_path(self, cache_dir):
        """Path of this model inside cache_dir, or None when cache_dir is None."""
        if cache_dir is None:
            return None
        return osp.join(cache_dir, name2path(self.name) + self.cache_suffix)

    def _is_cache_exists(self, cache_path):
        """True when cache_path is given and exists on disk."""
        return cache_path is not None and osp.exists(cache_path)

    def _batch_predict_proba(self, est, X, batch_size):
        """
        Predict X in slices of batch_size rows and collect the results in one
        float32 array (n_datas x n_classes). Returns None when X has no rows.

        An estimator with a ``verbose`` attribute is silenced while the
        batches run; the previous value is restored afterwards, also when a
        batch raises.
        """
        LOGGER.debug("X.shape={}, batch_size={}".format(X.shape, batch_size))
        has_verbose = hasattr(est, "verbose")
        if has_verbose:
            verbose_backup = est.verbose
            est.verbose = 0
        try:
            n_datas = X.shape[0]
            y_pred_proba = None
            for j in range(0, n_datas, batch_size):
                LOGGER.info("[progress][batch_size={}] ({}/{})".format(batch_size, j, n_datas))
                y_cur = self._predict_proba(est, X[j:j+batch_size])
                if j == 0:
                    # the output buffer can only be sized once n_classes is known
                    n_classes = y_cur.shape[1]
                    y_pred_proba = np.empty((n_datas, n_classes), dtype=np.float32)
                y_pred_proba[j:j+batch_size,:] = y_cur
        finally:
            if has_verbose:
                est.verbose = verbose_backup
        return y_pred_proba

    def _load_model_from_disk(self, cache_path):
        """Return the estimator stored at cache_path; implemented by subclasses."""
        raise NotImplementedError()

    def _save_model_to_disk(self, est, cache_path):
        """Store est at cache_path; implemented by subclasses."""
        raise NotImplementedError()

    def _default_predict_batch_size(self, est, X):
        """
        You can re-implement this function when inheriting from this class

        Return
        ------
        predict_batch_size (int): default=0
            if = 0,  predict_proba without batches
            if > 0, then predict_proba with batches of that many rows
            (a predict_proba over all rows at once can be inefficient, hence
            the option to batch)
        """
        return 0

    def _fit(self, est, X, y):
        est.fit(X, y)

    def _predict_proba(self, est, X):
        return est.predict_proba(X)


In [None]:
#est utils
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
#from ..utils.log_utils import get_logger

LOGGER = get_logger('gcforest.estimators.est_utils')

def xgb_train(train_config, X_train, y_train, X_test, y_test):
    """
    Train an xgboost model and log its accuracy on the test data.

    Parameters
    ----------
    train_config: dict
        "param" holds the xgboost parameters, "num_round" the number of
        boosting rounds (converted with int())
    X_train, y_train, X_test, y_test:
        training / test data and labels; both sets are put on the watchlist

    The predicted class is the argmax over axis 1 of the booster's prediction.
    Nothing is returned; a Ctrl-C during training is logged and ends the
    function early.
    """
    import xgboost as xgb
    LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    booster_params = train_config["param"]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    n_rounds = int(train_config["num_round"])
    evals = [(dtrain, 'train'), (dtest, 'test')]
    try:
        booster = xgb.train(booster_params, dtrain, n_rounds, evals)
    except KeyboardInterrupt:
        LOGGER.info("Canceld by user's Ctrl-C action")
    else:
        predicted = np.argmax(booster.predict(dtest), axis=1)
        n_hits = np.sum(predicted == y_test)
        acc = 100. * n_hits / len(y_test)
        LOGGER.info("accuracy={}%".format(acc))


In [None]:
#kfold wrapper
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import os, os.path as osp
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

# from ..utils.log_utils import get_logger
# from ..utils.cache_utils import name2path

LOGGER = get_logger("gcforest.estimators.kfold_wrapper")

class KFoldWrapper(object):
    """
    K-Fold Wrapper

    Fits one estimator per fold and produces out-of-fold class probabilities
    for the training data, plus fold-averaged probabilities for test sets.
    """
    def __init__(self, name, n_folds, est_class, est_args, random_state=None):
        """
        Parameters
        ----------
        name (str):
            name of the wrapper; fold k's estimator is named "<name>/<k>"
        n_folds (int):
            Number of folds.
            If n_folds=1, means no K-Fold
        est_class (class):
            Class of estimator
        est_args (dict):
            Arguments of estimator
        random_state (int):
            random_state used for KFolds split and Estimator
        """
        self.name = name
        self.n_folds = n_folds
        self.est_class = est_class
        self.est_args = est_args
        self.random_state = random_state
        self.estimator1d = [None for k in range(self.n_folds)]

    def _init_estimator(self, k):
        """Create the estimator of fold k as est_class(est_name, est_args)."""
        est_args = self.est_args.copy()
        est_name = "{}/{}".format(self.name, k)
        est_args["random_state"] = self.random_state
        return self.est_class(est_name, est_args)

    def fit_transform(self, X, y, y_stratify, cache_dir=None, test_sets=None, eval_metrics=None, keep_model_in_mem=True):
        """
        X (ndarray):
            n x k or n1 x n2 x k
            to support windows_layer, X could have dim >2
        y (ndarray):
            n or n1 x n2
        y_stratify (list):
            used for StratifiedKFold or None means no stratify
        cache_dir (str): optional
            model cache directory; a sub-directory named after this wrapper is used
        test_sets (list): optional
            A list of (prefix, X_test, y_test) pairs.
            predict_proba for X_test will be returned
            use with keep_model_in_mem=False to save mem useage
            y_test could be None, otherwise use eval_metrics for debugging
        eval_metrics (list): optional
            A list of (str, callable functions)
        keep_model_in_mem (bool):
            if True, the fitted fold estimators are kept for predict_proba()

        Return
        ------
        y_probas (list of ndarray):
            y_probas[0] holds the validation-fold probabilities of X,
            y_probas[1 + i] the fold-averaged probabilities of test_sets[i]
        """
        if cache_dir is not None:
            cache_dir = osp.join(cache_dir, name2path(self.name))
        assert 2 <= len(X.shape) <= 3, "X.shape should be n x k or n x n2 x k"
        assert len(X.shape) == len(y.shape) + 1
        # y_stratify is optional, so its length can only be checked when given
        if y_stratify is not None:
            assert X.shape[0] == len(y_stratify)
        test_sets = test_sets if test_sets is not None else []
        eval_metrics = eval_metrics if eval_metrics is not None else []
        # K-Fold split
        n_stratify = X.shape[0]
        if self.n_folds == 1:
            # no K-Fold: train and validate on all samples
            all_idx = np.arange(n_stratify)
            cv = [(all_idx, all_idx)]
        elif y_stratify is None:
            skf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
            cv = [(t, v) for (t, v) in skf.split(range(n_stratify))]
        else:
            skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
            cv = [(t, v) for (t, v) in skf.split(range(n_stratify), y_stratify)]
        # Fit
        y_probas = []
        n_dims = X.shape[-1]
        # Hard-wired switch: when True, each fold trains on its validation
        # part and predicts the rest.
        inverse = False
        for k in range(self.n_folds):
            est = self._init_estimator(k)
            if not inverse:
                train_idx, val_idx = cv[k]
            else:
                val_idx, train_idx = cv[k]
            # fit on k-fold train
            est.fit(X[train_idx].reshape((-1, n_dims)), y[train_idx].reshape(-1), cache_dir=cache_dir)

            # predict on k-fold validation
            y_proba = est.predict_proba(X[val_idx].reshape((-1, n_dims)), cache_dir=cache_dir)
            if len(X.shape) == 3:
                y_proba = y_proba.reshape((len(val_idx), -1, y_proba.shape[-1]))
            self.log_eval_metrics(self.name, y[val_idx], y_proba, eval_metrics, "train_{}".format(k))

            # merging result
            if k == 0:
                if len(X.shape) == 2:
                    y_proba_cv = np.zeros((n_stratify, y_proba.shape[1]), dtype=np.float32)
                else:
                    y_proba_cv = np.zeros((n_stratify, y_proba.shape[1], y_proba.shape[2]), dtype=np.float32)
                y_probas.append(y_proba_cv)
            y_probas[0][val_idx, :] += y_proba
            if keep_model_in_mem:
                self.estimator1d[k] = est

            # test
            for vi, (prefix, X_test, y_test) in enumerate(test_sets):
                y_proba = est.predict_proba(X_test.reshape((-1, n_dims)), cache_dir=cache_dir)
                if len(X.shape) == 3:
                    y_proba = y_proba.reshape((X_test.shape[0], X_test.shape[1], y_proba.shape[-1]))
                if k == 0:
                    y_probas.append(y_proba)
                else:
                    y_probas[vi + 1] += y_proba
        if inverse and self.n_folds > 1:
            y_probas[0] /= (self.n_folds - 1)
        # test predictions were summed over the folds -> average them
        for y_proba in y_probas[1:]:
            y_proba /= self.n_folds
        # log
        self.log_eval_metrics(self.name, y, y_probas[0], eval_metrics, "train_cv")
        for vi, (test_name, X_test, y_test) in enumerate(test_sets):
            if y_test is not None:
                self.log_eval_metrics(self.name, y_test, y_probas[vi + 1], eval_metrics, test_name)
        return y_probas

    def log_eval_metrics(self, est_name, y_true, y_proba, eval_metrics, y_name):
        """
        Log every metric of eval_metrics as a percentage.

        y_true (ndarray): n or n1 x n2
        y_proba (ndarray): n x n_classes or n1 x n2 x n_classes
        eval_metrics (list): (name, callable(y_true, y_proba)) pairs, or None
        """
        if eval_metrics is None:
            return
        for (eval_name, eval_metric) in eval_metrics:
            accuracy = eval_metric(y_true, y_proba)
            LOGGER.info("Accuracy({}.{}.{})={:.2f}%".format(est_name, y_name, eval_name, accuracy * 100.))

    def predict_proba(self, X_test):
        """
        Average the class probabilities of all fold estimators for X_test.

        Uses the estimators stored by fit_transform(keep_model_in_mem=True).

        X_test (ndarray): n x k or n1 x n2 x k
        """
        assert 2 <= len(X_test.shape) <= 3, "X_test.shape should be n x k or n x n2 x k"
        n_dims = X_test.shape[-1]
        for k in range(self.n_folds):
            est = self.estimator1d[k]
            y_proba = est.predict_proba(X_test.reshape((-1, n_dims)), cache_dir=None)
            if len(X_test.shape) == 3:
                y_proba = y_proba.reshape((X_test.shape[0], X_test.shape[1], y_proba.shape[-1]))
            if k == 0:
                y_proba_kfolds = y_proba
            else:
                y_proba_kfolds += y_proba
        y_proba_kfolds /= self.n_folds
        return y_proba_kfolds


In [None]:
#sklearn estimators
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
from sklearn.externals import joblib

# from .base_estimator import BaseClassifierWrapper
# from ..utils.log_utils import get_logger

# Logger used by the estimator wrappers below. All notebook cells share one global
# namespace, so every other cell that assigns LOGGER rebinds this same name.
LOGGER = get_logger('gcforest.estimators.sklearn_estimators')

def forest_predict_batch_size(clf, X):
    """
    Pick a prediction batch size for a forest classifier from the free system memory.

    clf: fitted forest exposing `n_classes_` and `n_estimators`
    X: data to predict on; only `X.shape[0]` (number of rows) is read
    return (int): 0 when a single batch would already cover all rows of X,
        otherwise the number of rows per batch (never less than 10)
    """
    import psutil
    free_memory = psutil.virtual_memory().free
    # Assume at least 2 GB is available, even when the OS reports less.
    if free_memory < 2e9:
        free_memory = int(2e9)
    # NOTE(review): max() makes 8e10 bytes a lower bound on the budget rather than a cap;
    # min() may have been intended -- confirm before changing, it would shrink batches a lot.
    max_mem_size = max(int(free_memory * 0.5), int(8e10))
    # Budgeted bytes per row: 16 bytes for every (class, tree) pair.
    mem_size_1 = clf.n_classes_ * clf.n_estimators * 16
    # Integer ceiling division. Plain `/` yields a float on Python 3, which is not a
    # valid batch size for slicing or range().
    batch_size = int((max_mem_size - 1) // mem_size_1 + 1)
    if batch_size < 10:
        batch_size = 10
    if batch_size >= X.shape[0]:
        return 0
    return batch_size

class SKlearnBaseClassifier(BaseClassifierWrapper):
    """Classifier wrapper whose models are written to and read from disk with joblib."""

    def _load_model_from_disk(self, cache_path):
        """Return the model that joblib loads from cache_path."""
        model = joblib.load(cache_path)
        return model

    def _save_model_to_disk(self, clf, cache_path):
        """Dump clf to cache_path with joblib."""
        joblib.dump(clf, cache_path)

class GCExtraTreesClassifier(SKlearnBaseClassifier):
    """Wrapper that builds a sklearn ExtraTreesClassifier from the given kwargs."""

    def __init__(self, name, kwargs):
        # Imported here, so sklearn.ensemble is only needed once a wrapper is created.
        from sklearn.ensemble import ExtraTreesClassifier
        est_class = ExtraTreesClassifier
        super(GCExtraTreesClassifier, self).__init__(name, est_class, kwargs)

    def _default_predict_batch_size(self, clf, X):
        """Delegate the batch-size choice to forest_predict_batch_size."""
        batch_size = forest_predict_batch_size(clf, X)
        return batch_size

class GCRandomForestClassifier(SKlearnBaseClassifier):
    """Wrapper that builds a sklearn RandomForestClassifier from the given kwargs."""

    def __init__(self, name, kwargs):
        # Imported here, so sklearn.ensemble is only needed once a wrapper is created.
        from sklearn.ensemble import RandomForestClassifier
        est_class = RandomForestClassifier
        super(GCRandomForestClassifier, self).__init__(name, est_class, kwargs)

    def _default_predict_batch_size(self, clf, X):
        """Delegate the batch-size choice to forest_predict_batch_size."""
        batch_size = forest_predict_batch_size(clf, X)
        return batch_size


class GCLR(SKlearnBaseClassifier):
    """Wrapper that builds a sklearn LogisticRegression from the given kwargs."""

    def __init__(self, name, kwargs):
        # Imported here, so sklearn.linear_model is only needed once a wrapper is created.
        from sklearn.linear_model import LogisticRegression
        est_class = LogisticRegression
        super(GCLR, self).__init__(name, est_class, kwargs)


class GCSGDClassifier(SKlearnBaseClassifier):
    """Wrapper that builds a sklearn SGDClassifier from the given kwargs."""

    def __init__(self, name, kwargs):
        # Imported here, so sklearn.linear_model is only needed once a wrapper is created.
        from sklearn.linear_model import SGDClassifier
        est_class = SGDClassifier
        super(GCSGDClassifier, self).__init__(name, est_class, kwargs)


class GCXGBClassifier(SKlearnBaseClassifier):
    """Wrapper that builds an xgboost XGBClassifier from the given kwargs."""

    def __init__(self, name, kwargs):
        import xgboost as xgb
        # Work on a copy so the caller's dict is left untouched.
        xgb_kwargs = kwargs.copy()
        # A "random_state" entry is handed to XGBClassifier under the key "seed".
        if "random_state" in xgb_kwargs:
            xgb_kwargs["seed"] = xgb_kwargs.pop("random_state")
        super(GCXGBClassifier, self).__init__(name, xgb.XGBClassifier, xgb_kwargs)


In [None]:
#estimator init
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
# from .base_estimator import BaseClassifierWrapper
# from .sklearn_estimators import GCSGDClassifier,GCLR, GCExtraTreesClassifier, GCRandomForestClassifier, GCXGBClassifier
# #from .xgb_estimator import GCXGBClassifier
# from .kfold_wrapper import KFoldWrapper

def get_estimator_class(est_type):
    """
    Map an estimator type name to its wrapper class.

    est_type (str): one of "ExtraTreesClassifier", "RandomForestClassifier",
        "LogisticRegression", "SGDClassifier", "XGBClassifier"
    return: the matching GC* wrapper class
    raises ValueError: for any other est_type
    """
    # An if/elif chain keeps the lookup lazy: only the selected wrapper name is resolved.
    if est_type == "ExtraTreesClassifier":
        est_class = GCExtraTreesClassifier
    elif est_type == "RandomForestClassifier":
        est_class = GCRandomForestClassifier
    elif est_type == "LogisticRegression":
        est_class = GCLR
    elif est_type == "SGDClassifier":
        est_class = GCSGDClassifier
    elif est_type == "XGBClassifier":
        est_class = GCXGBClassifier
    else:
        raise ValueError('Unkown Estimator Type, est_type={}'.format(est_type))
    return est_class

def get_estimator(name, est_type, est_args):
    """Instantiate the wrapper class registered for est_type as est_class(name, est_args)."""
    return get_estimator_class(est_type)(name, est_args)

def get_estimator_kfold(name, n_splits, est_type, est_args, random_state=None):
    """
    Build a KFoldWrapper around the wrapper class registered for est_type.

    name, n_splits, est_args and random_state are passed through to KFoldWrapper unchanged.
    """
    wrapped_class = get_estimator_class(est_type)
    kfold_est = KFoldWrapper(name, n_splits, wrapped_class, est_args, random_state=random_state)
    return kfold_est


In [None]:
#fg win layer
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
from sklearn.model_selection import StratifiedKFold

# from .base_layer import BaseLayer
# from ..estimators import get_estimator_kfold
# from ..utils.metrics import accuracy_pb, accuracy_win_vote, accuracy_win_avg
# from ..utils.win_utils import get_windows
# from ..utils.debug_utils import repr_blobs_shape
# from ..utils.log_utils import get_logger

# Logger used by FGWinLayer below. All notebook cells share one global namespace,
# so every other cell that assigns LOGGER rebinds this same name.
LOGGER = get_logger("gcforest.layers.fg_win_layer")

#CV_POLICYS = ["data", "win"]
#CV_POLICYS = ["data"]

class FGWinLayer(BaseLayer):
    """
    Window layer: extracts windows from the bottom data with get_windows, trains one
    k-fold estimator per top on those windows, and stores the per-window class
    probabilities in the data cache under the top names.

    The last bottom is the label array y; all other bottoms are concatenated along
    axis 1 to form the input X.
    """

    def __init__(self, layer_config, data_cache):
        """
        Keys read from layer_config:

        estimators (list, required):
            estimator configs, one per top; each config must contain "n_folds" and
            "type", every other entry is passed on to the estimator
        win_x, win_y (int, required), stride_x, stride_y (int, default=1),
        pad_x, pad_y (int, default=0):
            configs for windows
        n_classes (int, required):
            number of classes; must equal the number of distinct training labels
        """
        super(FGWinLayer, self).__init__(layer_config, data_cache)
        # estimator
        self.est_configs = self.get_value("estimators", None, list, required=True)
        self.win_x = self.get_value("win_x", None, int, required=True)
        self.win_y = self.get_value("win_y", None, int, required=True)
        self.stride_x = self.get_value("stride_x", 1, int)
        self.stride_y = self.get_value("stride_y", 1, int)
        self.pad_x = self.get_value("pad_x", 0, int)
        self.pad_y = self.get_value("pad_y", 0, int)
        self.n_classes = self.get_value("n_classes", None, int, required=True)
        #self.cv_policy = layer_config.get("cv_policy", "data")
        #assert(self.cv_policy in CV_POLICYS)
        # At least one data bottom plus the label bottom.
        assert len(self.bottom_names) >= 2
        assert len(self.est_configs) == len(self.top_names), "Each estimator shoud produce one unique top"
        # self.eval_metrics = [("predict", accuracy_pb), ("vote", accuracy_win_vote), ("avg", accuracy_win_avg)]
        self.eval_metrics = [("predict", accuracy_pb), ("avg", accuracy_win_avg)]
        # Fitted estimators, filled by fit_transform only when models are kept in memory.
        self.estimator1d = [None for ei in range(len(self.est_configs))]

    def _init_estimators(self, ei, random_state):
        """
        Build the k-fold estimator for top `ei`.

        ei (int): estimator index
        random_state (int or None): base seed; None is passed through unchanged
        """
        import zlib
        top_name = self.top_names[ei]
        est_args = self.est_configs[ei].copy()
        est_name ="{}/{}_folds".format(top_name, est_args["n_folds"])
        # n_folds
        n_folds = int(est_args.pop("n_folds"))
        # est_type
        est_type = est_args.pop("type")
        # random_state
        if random_state is not None:
            # Derive a per-estimator seed from the estimator name. crc32 is used instead
            # of hash() because str hashes are salted per process on Python 3, which
            # would make the derived seed differ between runs.
            name_digest = zlib.crc32("[estimator] {}".format(est_name).encode("utf-8")) & 0xffffffff
            random_state = (random_state + name_digest) % 1000000007
        return get_estimator_kfold(est_name, n_folds, est_type, est_args, random_state=random_state)

    def fit_transform(self, train_config):
        """
        Fit one estimator per top and write the resulting probabilities to the data cache.

        train_config: provides phases, random_state, model_cache_dir and keep_model_in_mem.
        The first entry of train_config.phases is treated as the training phase; any
        later phase is used as the "test" set.
        """
        LOGGER.info("[data][{}], bottoms={}, tops={}".format(self.name, self.bottom_names, self.top_names))
        phases = train_config.phases
        X_train_win, y_train_win = None, None
        test_sets = None

        for ti, top_name in enumerate(self.top_names):
            LOGGER.info("[progress][{}] ti={}/{}, top_name={}".format(self.name, ti, len(self.top_names), top_name))
            # check top cache
            if np.all(self.check_top_cache(phases, ti)):
                LOGGER.info("[data][{}] all top cache exists. skip progress".format(self.name))
                continue

            # init X, y, n_classes (done once, on the first top that is not cached)
            if X_train_win is None:
                for pi, phase in enumerate(phases):
                    bottoms = self.data_cache.gets(phase, self.bottom_names)
                    LOGGER.info('[data][{},{}] bottoms.shape={}'.format(self.name, phase, repr_blobs_shape(bottoms)))
                    X, y = np.concatenate(bottoms[:-1], axis=1), bottoms[-1]
                    # n x n_windows x channel
                    X_win = get_windows(X, self.win_x, self.win_y, self.stride_x, self.stride_y, self.pad_x, self.pad_y)
                    # Remember the window grid so the flat predictions can be reshaped back.
                    _, nh, nw, _ = X_win.shape
                    X_win = X_win.reshape((X_win.shape[0], -1, X_win.shape[-1]))
                    # Every window inherits the label of the sample it was cut from.
                    y_win = y[:,np.newaxis].repeat(X_win.shape[1], axis=1)
                    if pi == 0:
                        assert self.n_classes == len(np.unique(y)), \
                                "n_classes={}, len(unique(y))={}".format(self.n_classes, len(np.unique(y)))
                        X_train_win, y_train_win = X_win, y_win
                    else:
                        test_sets = [("test", X_win, y_win), ]

            # fit
            est = self._init_estimators(ti, train_config.random_state)
            y_probas = est.fit_transform(X_train_win, y_train_win, y_train_win[:,0], cache_dir=train_config.model_cache_dir, 
                    test_sets = test_sets, eval_metrics=self.eval_metrics,
                    keep_model_in_mem=train_config.keep_model_in_mem)

            for pi, phase in enumerate(phases):
                # (n, nh*nw, n_classes) -> (n, n_classes, nh, nw)
                y_proba = y_probas[pi].reshape((-1, nh, nw, self.n_classes)).transpose((0, 3, 1, 2))
                LOGGER.info('[data][{},{}] tops[{}].shape={}'.format(self.name, phase, ti, y_proba.shape))
                self.data_cache.update(phase, self.top_names[ti], y_proba)
            if train_config.keep_model_in_mem:
                self.estimator1d[ti] = est
    
    def transform(self):
        """
        Predict the "test" phase with the estimators kept in memory and write the
        per-window probabilities to the data cache.

        raises ValueError: when the estimator of a top was not kept in memory
        """
        phase = "test"
        for ti, top_name in enumerate(self.top_names):
            LOGGER.info("[progress][{}] ti={}/{}, top_name={}".format(self.name, ti, len(self.top_names), top_name))

            est = self.estimator1d[ti]
            if est is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError("model (ti={}, top_name={}) not present, maybe you should set keep_model_in_mem to True".format(
                    ti, top_name))

            # The last bottom holds the labels, which are not needed for prediction.
            bottoms = self.data_cache.gets(phase, self.bottom_names[:-1])
            LOGGER.info('[data][{},{}] bottoms.shape={}'.format(self.name, phase, repr_blobs_shape(bottoms)))
            X = np.concatenate(bottoms, axis=1)
            # n x n_windows x channel
            X_win = get_windows(X, self.win_x, self.win_y, self.stride_x, self.stride_y, self.pad_x, self.pad_y)
            _, nh, nw, _ = X_win.shape
            X_win = X_win.reshape((X_win.shape[0], -1, X_win.shape[-1]))

            y_proba = est.predict_proba(X_win)
            # (n, nh*nw, n_classes) -> (n, n_classes, nh, nw)
            y_proba = y_proba.reshape((-1, nh, nw, self.n_classes)).transpose((0, 3, 1, 2))
            LOGGER.info('[data][{},{}] tops[{}].shape={}'.format(self.name, phase, ti, y_proba.shape))
            self.data_cache.update(phase, self.top_names[ti], y_proba)

    def score(self):
        """Log the "predict" and "avg" metrics of every top for the train and test phases."""
        eval_metrics = [("predict", accuracy_pb), ("avg", accuracy_win_avg)]
        for ti, top_name in enumerate(self.top_names):
            for phase in ["train", "test"]:
                y = self.data_cache.get(phase, self.bottom_names[-1])
                y_proba = self.data_cache.get(phase, top_name)
                # (n, n_classes, nh, nw) -> (n, nh*nw, n_classes)
                y_proba = y_proba.transpose((0,2,3,1))
                y_proba = y_proba.reshape((y_proba.shape[0], -1, y_proba.shape[3]))
                y = y[:,np.newaxis].repeat(y_proba.shape[1], axis=1)
                for eval_name, eval_metric in eval_metrics:
                    acc = eval_metric(y, y_proba)
                    LOGGER.info("Accuracy({}.{}.{})={:.2f}%".format(top_name, phase, eval_name, acc*100))


In [None]:
#layers init
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
# from .base_layer import BaseLayer
# from .fg_concat_layer import FGConcatLayer
# from .fg_pool_layer import FGPoolLayer
# from .fg_win_layer import FGWinLayer

def get_layer_class(layer_type):
    """
    Map a layer type name to its layer class.

    layer_type (str): "FGWinLayer", "FGConcatLayer" or "FGPoolLayer"
    return: the matching layer class
    raises ValueError: for any other layer_type
    """
    if layer_type == "FGWinLayer":
        return FGWinLayer
    if layer_type == "FGConcatLayer":
        return FGConcatLayer
    if layer_type == "FGPoolLayer":
        return FGPoolLayer
    # Format the offending value into the message. Passing it as a second argument to
    # ValueError makes the exception print as a tuple.
    raise ValueError("Unknown Layer Type: {}".format(layer_type))

def get_layer(layer_config, data_cache):
    """
    Instantiate the layer described by layer_config.

    layer_config (dict): config for layer; its "type" entry selects the layer class
        and is removed from the copy that is handed to that class
    data_cache (gcforest.DataCache): DataCache
    """
    # Copy first so the caller's dict keeps its "type" entry.
    config = layer_config.copy()
    layer_class = get_layer_class(config.pop("type"))
    return layer_class(config, data_cache)


In [None]:
#cascade classifier
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demonstrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets.
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the dependencies are installed, which are specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng.
"""
import numpy as np
import os
import os.path as osp
import pickle

# from ..estimators import get_estimator_kfold
# from ..utils.config_utils import get_config_value
# from ..utils.log_utils import get_logger
# from ..utils.metrics import accuracy_pb

# Logger used by the cascade code below. All notebook cells share one global namespace,
# so every other cell that assigns LOGGER rebinds this same name.
LOGGER = get_logger('gcforest.cascade.cascade_classifier')


def check_dir(path):
    """Create the parent directory of `path` (including intermediate ones) if it does not exist."""
    parent_dir = osp.join(path, osp.pardir)
    parent_dir = osp.abspath(parent_dir)
    if osp.exists(parent_dir):
        return
    os.makedirs(parent_dir)


def calc_accuracy(y_true, y_pred, name, prefix=""):
    """
    Compute the percentage of entries where y_true equals y_pred, log it and return it.

    name: label placed inside "Accuracy(...)" in the log line
    prefix (str): text placed in front of the log line
    return: accuracy in percent (0-100)
    """
    n_correct = np.sum(np.asarray(y_true) == y_pred)
    acc = 100. * n_correct / len(y_true)
    LOGGER.info('{}Accuracy({})={:.2f}%'.format(prefix, name, acc))
    return acc


def get_opt_layer_id(acc_list):
    """ Return the index of the highest accuracy in acc_list; the earliest index wins ties """
    negated = -np.asarray(acc_list)
    # mergesort is stable, so equal accuracies keep their original (layer) order.
    order = np.argsort(negated, kind='mergesort')
    return order[0]


class CascadeClassifier(object):
    """
    Cascade of layers, each made of one k-fold estimator per entry of `est_configs`.

    Every layer is trained on the class probabilities produced by the previous layer,
    stacked with the raw feature groups selected by `look_indexs_cycle`.
    """

    def __init__(self, ca_config):
        """
        Parameters (ca_config)
        ----------
        early_stopping_rounds: int (required)
            when > 0, the cascade stops growing once the accuracy has not improved for
            early_stopping_rounds layers; values <= 0 disable early stopping
        max_layers: int [default=0]
            maximum number of cascade layers allowed for experiments, 0 means use early
            stopping to automatically find the layer number
        n_classes: int (required)
            Number of classes
        estimators: list (required)
            List of CVEstimator's config; each config must contain "n_folds" and "type"
        look_indexs_cycle (list 2d): default=None
            specification for layer i, look for the array in look_indexs_cycle[i % len(look_indexs_cycle)]
            default = None <=> [range(n_groups)]
            e.g.
                look_indexs_cycle = [[0,1],[2,3],[0,1,2,3]]
                means layer 1 look for the grained 0,1; layer 2 look for grained 2,3; layer 3 look for every grained, and layer 4 cycles back as layer 1
        random_state: int [default=None]
            base seed from which each estimator's seed is derived
        data_save_rounds: int [default=0]
        data_save_dir: str [default=None]
            each data_save_rounds save the intermediate results in data_save_dir
            if data_save_rounds = 0, then no savings for intermediate results
        """
        self.ca_config = ca_config
        self.early_stopping_rounds = self.get_value("early_stopping_rounds", None, int, required=True)
        self.max_layers = self.get_value("max_layers", 0, int)
        self.n_classes = self.get_value("n_classes", None, int, required=True)
        self.est_configs = self.get_value("estimators", None, list, required=True)
        self.look_indexs_cycle = self.get_value("look_indexs_cycle", None, list)
        self.random_state = self.get_value("random_state", None, int)
        # self.data_save_dir = self.get_value("data_save_dir", None, basestring)
        self.data_save_dir = ca_config.get("data_save_dir", None)
        self.data_save_rounds = self.get_value("data_save_rounds", 0, int)
        if self.data_save_rounds > 0:
            assert self.data_save_dir is not None, "data_save_dir should not be null when data_save_rounds>0"
        self.eval_metrics = [("predict", accuracy_pb)]
        # estimator2d[layer_id][estimator_id] -> fitted estimator (or None once discarded)
        self.estimator2d = {}
        # Number of layers used by transform(); -1 until fit_transform has finished.
        self.opt_layer_num = -1
        # LOGGER.info("\n" + json.dumps(ca_config, sort_keys=True, indent=4, separators=(',', ':')))

    @property
    def n_estimators_1(self):
        """Number of estimators in one layer."""
        return len(self.est_configs)

    def get_value(self, key, default_value, value_types, required=False):
        """Read `key` from the cascade config through get_config_value."""
        return get_config_value(self.ca_config, key, default_value, value_types,
                required=required, config_name="cascade")

    def _set_estimator(self, li, ei, est):
        """Store `est` as estimator `ei` of layer `li`."""
        if li not in self.estimator2d:
            self.estimator2d[li] = {}
        self.estimator2d[li][ei] = est

    def _get_estimator(self, li, ei):
        """Return estimator `ei` of layer `li`, or None when it is not stored."""
        return self.estimator2d.get(li, {}).get(ei, None)

    def _init_estimators(self, li, ei):
        """Build the (unfitted) k-fold estimator `ei` for layer `li` from est_configs[ei]."""
        import zlib
        est_args = self.est_configs[ei].copy()
        est_name = "layer_{} - estimator_{} - {}_folds".format(li, ei, est_args["n_folds"])
        # n_folds
        n_folds = int(est_args.pop("n_folds"))
        # est_type
        est_type = est_args.pop("type")
        # random_state
        if self.random_state is not None:
            # Derive a per-estimator seed from the estimator name. crc32 is used instead
            # of hash() because str hashes are salted per process on Python 3, which
            # would make the derived seed differ between runs.
            name_digest = zlib.crc32("[estimator] {}".format(est_name).encode("utf-8")) & 0xffffffff
            random_state = (self.random_state + name_digest) % 1000000007
        else:
            random_state = None
        return get_estimator_kfold(est_name, n_folds, est_type, est_args, random_state=random_state)

    def _check_look_indexs_cycle(self, X_groups, is_fit):
        """
        Validate look_indexs_cycle against the number of groups and return it.

        When fitting without a configured cycle, every layer looks at all groups.
        raises ValueError: when an entry is empty or refers to a group that does not exist
        """
        # check look_indexs_cycle
        n_groups = len(X_groups)
        if is_fit and self.look_indexs_cycle is None:
            look_indexs_cycle = [list(range(n_groups))]
        else:
            look_indexs_cycle = self.look_indexs_cycle
            for look_indexs in look_indexs_cycle:
                # Test for emptiness first: np.max / np.min raise on an empty sequence.
                if len(look_indexs) == 0 or np.max(look_indexs) >= n_groups or np.min(look_indexs) < 0:
                    raise ValueError("look_indexs doesn't match n_groups!!! look_indexs={}, n_groups={}".format(
                        look_indexs, n_groups))
        if is_fit:
            self.look_indexs_cycle = look_indexs_cycle
        return look_indexs_cycle

    def _check_group_dims(self, X_groups, is_fit):
        """
        Flatten every group to 2-D and stack the groups side by side.

        When fitting, the column range of each group is recorded on self; otherwise
        each group's width is checked against the recorded one.
        return: (group_starts, group_ends, group_dims, X) where X is n_datas x sum(group_dims)
        """
        if is_fit:
            group_starts, group_ends, group_dims = [], [], []
        else:
            group_starts, group_ends, group_dims = self.group_starts, self.group_ends, self.group_dims
        n_datas = X_groups[0].shape[0]
        X = np.zeros((n_datas, 0), dtype=X_groups[0].dtype)
        for i, X_group in enumerate(X_groups):
            assert(X_group.shape[0] == n_datas)
            X_group = X_group.reshape(n_datas, -1)
            if is_fit:
                group_dims.append( X_group.shape[1] )
                group_starts.append(0 if i == 0 else group_ends[i - 1])
                group_ends.append(group_starts[i] + group_dims[i])
            else:
                assert(X_group.shape[1] == group_dims[i])
            X = np.hstack((X, X_group))
        if is_fit:
            self.group_starts, self.group_ends, self.group_dims = group_starts, group_ends, group_dims
        return group_starts, group_ends, group_dims, X

    def fit_transform(self, X_groups_train, y_train, X_groups_test, y_test, stop_by_test=False, train_config=None):
        """
        fit until the accuracy converges in early_stop_rounds
        stop_by_test: (bool)
            When X_test, y_test is validation data that used for determine the opt_layer_id,
            use this option

        return: (layer, X_proba_train, y_train, X_proba_test, y_test) with the probabilities
            of the optimal layer; X_proba_test is None when there is no test data.
            `layer` is the 0-based optimal layer id on early stop and self.max_layers when
            the layer limit is reached. Returns None when interrupted with Ctrl-C.
        """
        if train_config is None:
            try:
                # Notebook / single-namespace usage: take the class from the globals
                # if it has been defined there.
                train_config_cls = GCTrainConfig
            except NameError:
                # Package usage, as in the original gcforest layout.
                from ..config import GCTrainConfig as train_config_cls
            train_config = train_config_cls({})
        data_save_dir = train_config.data_cache.cache_dir or self.data_save_dir

        is_eval_test = "test" in train_config.phases
        if not type(X_groups_train) == list:
            X_groups_train = [X_groups_train]
        if is_eval_test and not type(X_groups_test) == list:
            X_groups_test = [X_groups_test]
        LOGGER.info("X_groups_train.shape={},y_train.shape={},X_groups_test.shape={},y_test.shape={}".format(
            [xr.shape for xr in X_groups_train], y_train.shape,
            [xt.shape for xt in X_groups_test] if is_eval_test else "no_test", y_test.shape if is_eval_test else "no_test"))

        # check look_indexs_cycle
        look_indexs_cycle = self._check_look_indexs_cycle(X_groups_train, True)
        if is_eval_test:
            self._check_look_indexs_cycle(X_groups_test, False)

        # check groups dimension
        group_starts, group_ends, group_dims, X_train = self._check_group_dims(X_groups_train, True)
        if is_eval_test:
            _, _, _, X_test = self._check_group_dims(X_groups_test, False)
        else:
            X_test = np.zeros((0, X_train.shape[1]))
        LOGGER.info("group_dims={}".format(group_dims))
        LOGGER.info("group_starts={}".format(group_starts))
        LOGGER.info("group_ends={}".format(group_ends))
        LOGGER.info("X_train.shape={},X_test.shape={}".format(X_train.shape, X_test.shape))

        n_trains = X_groups_train[0].shape[0]
        n_tests = X_groups_test[0].shape[0] if is_eval_test else 0

        n_classes = self.n_classes
        assert n_classes == len(np.unique(y_train)), "n_classes({}) != len(unique(y)) {}".format(n_classes, np.unique(y_train))
        train_acc_list = []
        test_acc_list = []
        # X_train, y_train, X_test, y_test
        opt_datas = [None, None, None, None]
        try:
            # probability of each cascades's estimators
            X_proba_train = np.zeros((n_trains, n_classes * self.n_estimators_1), dtype=np.float32)
            X_proba_test = np.zeros((n_tests, n_classes * self.n_estimators_1), dtype=np.float32)
            X_cur_train, X_cur_test = None, None
            layer_id = 0
            while True:
                if self.max_layers > 0 and layer_id >= self.max_layers:
                    break
                # Copy previous cascades's probability into current X_cur
                if layer_id == 0:
                    # first layer not have probability distribution
                    X_cur_train = np.zeros((n_trains, 0), dtype=np.float32)
                    X_cur_test = np.zeros((n_tests, 0), dtype=np.float32)
                else:
                    X_cur_train = X_proba_train.copy()
                    X_cur_test = X_proba_test.copy()
                # Stack data that current layer needs in to X_cur
                look_indexs = look_indexs_cycle[layer_id % len(look_indexs_cycle)]
                for i in look_indexs:
                    X_cur_train = np.hstack((X_cur_train, X_train[:, group_starts[i]:group_ends[i]]))
                    X_cur_test = np.hstack((X_cur_test, X_test[:, group_starts[i]:group_ends[i]]))
                LOGGER.info("[layer={}] look_indexs={}, X_cur_train.shape={}, X_cur_test.shape={}".format(
                    layer_id, look_indexs, X_cur_train.shape, X_cur_test.shape))
                # Fit on X_cur, predict to update X_proba
                y_train_proba_li = np.zeros((n_trains, n_classes))
                y_test_proba_li = np.zeros((n_tests, n_classes))
                for ei, est_config in enumerate(self.est_configs):
                    est = self._init_estimators(layer_id, ei)
                    # fit_transform
                    test_sets = [("test", X_cur_test, y_test)] if n_tests > 0 else None
                    y_probas = est.fit_transform(X_cur_train, y_train, y_train,
                            test_sets=test_sets, eval_metrics=self.eval_metrics,
                            keep_model_in_mem=train_config.keep_model_in_mem)
                    # train
                    X_proba_train[:, ei * n_classes: ei * n_classes + n_classes] = y_probas[0]
                    y_train_proba_li += y_probas[0]
                    # test
                    if n_tests > 0:
                        X_proba_test[:, ei * n_classes: ei * n_classes + n_classes] = y_probas[1]
                        y_test_proba_li += y_probas[1]
                    if train_config.keep_model_in_mem:
                        self._set_estimator(layer_id, ei, est)
                y_train_proba_li /= len(self.est_configs)
                train_avg_acc = calc_accuracy(y_train, np.argmax(y_train_proba_li, axis=1), 'layer_{} - train.classifier_average'.format(layer_id))
                train_acc_list.append(train_avg_acc)
                if n_tests > 0:
                    y_test_proba_li /= len(self.est_configs)
                    test_avg_acc = calc_accuracy(y_test, np.argmax(y_test_proba_li, axis=1), 'layer_{} - test.classifier_average'.format(layer_id))
                    test_acc_list.append(test_avg_acc)
                else:
                    test_acc_list.append(0.0)

                opt_layer_id = get_opt_layer_id(test_acc_list if stop_by_test else train_acc_list)
                # set opt_datas
                if opt_layer_id == layer_id:
                    # Snapshot the probabilities: X_proba_train / X_proba_test are overwritten
                    # in place by later layers, so keeping references would silently turn
                    # the "optimal" data into the data of the last layer trained.
                    opt_datas = [X_proba_train.copy(), y_train, X_proba_test.copy() if n_tests > 0 else None, y_test]
                # early stop
                if self.early_stopping_rounds > 0 and layer_id - opt_layer_id >= self.early_stopping_rounds:
                    # log and save final result (opt layer)
                    LOGGER.info("[Result][Optimal Level Detected] opt_layer_num={}, accuracy_train={:.2f}%, accuracy_test={:.2f}%".format(
                        opt_layer_id + 1, train_acc_list[opt_layer_id], test_acc_list[opt_layer_id]))
                    if data_save_dir is not None:
                        self.save_data( data_save_dir, opt_layer_id, *opt_datas)
                    # remove unused model
                    if train_config.keep_model_in_mem:
                        for li in range(opt_layer_id + 1, layer_id + 1):
                            for ei, est_config in enumerate(self.est_configs):
                                self._set_estimator(li, ei, None)
                    self.opt_layer_num = opt_layer_id + 1
                    return opt_layer_id, opt_datas[0], opt_datas[1], opt_datas[2], opt_datas[3]
                # save opt data if needed
                if self.data_save_rounds > 0 and (layer_id + 1) % self.data_save_rounds == 0:
                    self.save_data(data_save_dir, layer_id, *opt_datas)
                # inc layer_id
                layer_id += 1
            LOGGER.info("[Result][Reach Max Layer] opt_layer_num={}, accuracy_train={:.2f}%, accuracy_test={:.2f}%".format(
                opt_layer_id + 1, train_acc_list[opt_layer_id], test_acc_list[opt_layer_id]))
            if data_save_dir is not None:
                self.save_data(data_save_dir, self.max_layers - 1, *opt_datas)
            # When the layer limit is hit, all max_layers layers are kept for transform().
            self.opt_layer_num = self.max_layers
            return self.max_layers, opt_datas[0], opt_datas[1], opt_datas[2], opt_datas[3]
        except KeyboardInterrupt:
            # Deliberate best-effort: Ctrl-C stops training without a traceback.
            # Nothing is returned in that case, so record why.
            LOGGER.info("[Result][Interrupted] training stopped by KeyboardInterrupt, no result returned")
            return None

    def transform(self, X_groups_test):
        """
        Run the fitted layers (the first opt_layer_num of them) on new data.

        return (ndarray): n x (n_estimators_1 * n_classes) probabilities of the last layer used
        raises ValueError: when a required estimator was not kept in memory
        """
        if not type(X_groups_test) == list:
            X_groups_test = [X_groups_test]
        LOGGER.info("X_groups_test.shape={}".format([xt.shape for xt in X_groups_test]))
        # check look_indexs_cycle
        look_indexs_cycle = self._check_look_indexs_cycle(X_groups_test, False)
        # check group_dims
        group_starts, group_ends, group_dims, X_test = self._check_group_dims(X_groups_test, False)
        LOGGER.info("group_dims={}".format(group_dims))
        LOGGER.info("X_test.shape={}".format(X_test.shape))

        n_tests = X_groups_test[0].shape[0]
        n_classes = self.n_classes

        # probability of each cascades's estimators
        X_proba_test = np.zeros((X_test.shape[0], n_classes * self.n_estimators_1), dtype=np.float32)
        X_cur_test = None
        for layer_id in range(self.opt_layer_num):
            # Copy previous cascades's probability into current X_cur
            if layer_id == 0:
                # first layer not have probability distribution
                X_cur_test = np.zeros((n_tests, 0), dtype=np.float32)
            else:
                X_cur_test = X_proba_test.copy()
            # Stack data that current layer needs in to X_cur
            look_indexs = look_indexs_cycle[layer_id % len(look_indexs_cycle)]
            for i in look_indexs:
                X_cur_test = np.hstack((X_cur_test, X_test[:, group_starts[i]:group_ends[i]]))
            LOGGER.info("[layer={}] look_indexs={}, X_cur_test.shape={}".format(
                layer_id, look_indexs, X_cur_test.shape))
            for ei, est_config in enumerate(self.est_configs):
                est = self._get_estimator(layer_id, ei)
                if est is None:
                    raise ValueError("model (li={}, ei={}) not present, maybe you should set keep_model_in_mem to True".format(
                        layer_id, ei))
                y_probas = est.predict_proba(X_cur_test)
                X_proba_test[:, ei * n_classes:ei * n_classes + n_classes] = y_probas
        return X_proba_test

    def predict_proba(self, X):
        """Return n x n_classes probabilities: the mean over the last layer's estimators."""
        # n x (n_est*n_classes)
        y_proba = self.transform(X)
        # n x n_est x n_classes
        y_proba = y_proba.reshape((y_proba.shape[0], self.n_estimators_1, self.n_classes))
        y_proba = y_proba.mean(axis=1)
        return y_proba

    def save_data(self, data_save_dir, layer_id, X_train, y_train, X_test, y_test):
        """
        Pickle {"X": ..., "y": ...} for the train and test phases to
        data_save_dir/layer_<layer_id>-<phase>.pkl.

        A phase whose X is None (e.g. no test data was given) is skipped.
        """
        for pi, phase in enumerate(["train", "test"]):
            data = {"X": X_train, "y": y_train} if pi == 0 else {"X": X_test, "y": y_test}
            if data["X"] is None:
                # Nothing to write; reading .shape on None below would crash.
                LOGGER.info("No {} data for layer {}, skip saving".format(phase, layer_id))
                continue
            data_path = osp.join(data_save_dir, "layer_{}-{}.pkl".format(layer_id, phase))
            check_dir(data_path)
            LOGGER.info("Saving Data in {} ... X.shape={}, y.shape={}".format(data_path, data["X"].shape, data["y"].shape))
            with open(data_path, "wb") as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)


In [None]:
#data cache
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import os, os.path as osp
import numpy as np

# from .utils.log_utils import get_logger
# from .utils.cache_utils import name2path

# Logger used by the data-cache code below. get_logger is not imported in this cell (the
# package-relative import above is commented out), so an earlier cell has to define it.
# NOTE(review): several cells assign the same global name LOGGER; in a notebook the most
# recently executed assignment is the one every function sees.
LOGGER = get_logger("gcforest.data_cache")

def check_dir(path):
    """
    Make sure the parent directory of ``path`` exists, creating it (and any missing
    ancestors) when necessary.

    Arguments
    ---------
    path (str): path of a file that is about to be written

    Raises
    ------
    OSError: if the directory cannot be created and does not exist afterwards
    """
    d = osp.abspath(osp.join(path, osp.pardir))
    try:
        os.makedirs(d)
    except OSError:
        # Either the directory already existed or another process created it first;
        # both are fine. Anything else (permissions, a file in the way, ...) is re-raised.
        if not osp.isdir(d):
            raise

def data_disk_path(cache_dir, phase, data_name):
    """
    Build the on-disk location of a cached array:
    ``<cache_dir>/<phase>/<name2path(data_name)>.npy``
    """
    file_name = name2path(data_name) + ".npy"
    return osp.join(cache_dir, phase, file_name)

class DataCache(object):
    """
    Store for named numpy arrays, kept separately for the "train" and "test" phases.

    Arrays can be held in memory (``self.datas``) and/or written as ``.npy`` files below
    ``cache_dir``; ``config["keep_in_mem"]`` and ``config["cache_in_disk"]`` select this per
    data name, each with a "default" entry as fallback.
    """
    # String types accepted as data names. `basestring` only exists on Python 2; on Python 3
    # looking it up raises NameError, so fall back to `str` there.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self, config):
        """
        config (dict): may contain "cache_dir", "keep_in_mem" and "cache_in_disk".
            Missing "keep_in_mem" / "cache_in_disk" entries are written into this dict
            with the defaults {"default": 1} / {"default": 0}.
        """
        self.config = config
        self.cache_dir = config.get("cache_dir", None)
        if self.config.get("keep_in_mem") is None:
            self.config["keep_in_mem"] = {"default": 1}
        if self.config.get("cache_in_disk") is None:
            self.config["cache_in_disk"] = {"default": 0}
        self.datas = {"train": {}, "test": {}}

    def keep_in_mem(self, phase, data_name):
        """
        determine if the data for (phase, data_name) should be kept in RAM
        if config["keep_in_mem"][data_name] exist, then use it, otherwise use the default value of config["keep_in_mem"]
        (phase is accepted for symmetry but not used in the lookup)
        """
        return self.config["keep_in_mem"].get(data_name, self.config["keep_in_mem"]["default"])

    def cache_in_disk(self, phase, data_name):
        """
        determine if the data for (phase, data_name) should be written to disk
        if config["cache_in_disk"][data_name] exist, then use it, otherwise use default value of config["cache_in_disk"]
        (phase is accepted for symmetry but not used in the lookup)
        """
        return self.config["cache_in_disk"].get(data_name, self.config["cache_in_disk"]["default"])

    def is_exist(self, phase, data_name):
        """
        check whether data_name is available in memory or cached on disk

        The return value is meant to be used for its truthiness only:
        True if the data is in memory, the file path if it is only on disk,
        False if no cache_dir is set, None if the file does not exist.
        """
        data_mem = self.datas[phase].get(data_name, None)
        if data_mem is not None:
            return True
        if self.cache_dir is None:
            return False
        data_path = data_disk_path(self.cache_dir, phase, data_name)
        if osp.exists(data_path):
            return data_path
        return None

    def gets(self, phase, data_names, ignore_no_exist=False):
        """
        get several datas at once; returns a list in the order of data_names (a list)
        """
        assert isinstance(data_names, list)
        return [self.get(phase, data_name, ignore_no_exist=ignore_no_exist) for data_name in data_names]

    def get(self, phase, data_name, ignore_no_exist=False):
        """
        get data according to data_name

        Arguments
        ---------
        phase (str): train or test
        data_name (str): name for tops/bottoms
        ignore_no_exist (bool): if True, when no data found, return None, otherwise raise ValueError
        """
        assert isinstance(data_name, self._string_types), "data_name={}, type(data_name)={}".format(data_name, type(data_name))
        # return data if data in memory
        data_mem = self.datas[phase].get(data_name, None)
        if data_mem is not None:
            return data_mem
        # load data from disk
        if self.cache_dir is None:
            if ignore_no_exist:
                return None
            raise ValueError("Cache base unset, can't load data ({}->{}) from disk".format(phase, data_name))
        data_path = data_disk_path(self.cache_dir, phase, data_name)
        if not osp.exists(data_path):
            if ignore_no_exist:
                return None
            raise ValueError("Data path not exist, can't load data ({}->{}) from disk: {}".format(phase, data_name, data_path))
        return np.load(data_path)

    def updates(self, phase, data_names, datas):
        """
        update several datas at once; datas[i] is stored under data_names[i]
        """
        assert isinstance(data_names, list)
        for i, data_name in enumerate(data_names):
            self.update(phase, data_name, datas[i])

    def update(self, phase, data_name, data):
        """
        update (phase, data_name) data in cache

        The array is stored in memory and/or saved to disk depending on keep_in_mem /
        cache_in_disk. Raises ValueError if it should go to disk but cache_dir is unset.
        """
        assert isinstance(data, np.ndarray), "data(type={}) is not a np.ndarray!!!".format(type(data))
        if self.keep_in_mem(phase, data_name):
            self.datas[phase][data_name] = data
        if self.cache_in_disk(phase, data_name):
            if self.cache_dir is None:
                raise ValueError("Cache base unset, can't Save data ({}->{}) to disk".format(phase, data_name))
            data_path = data_disk_path(self.cache_dir, phase, data_name)
            LOGGER.info("Updating data ({}->{}, shape={}) in disk: {}".format(phase, data_name, data.shape, data_path))
            check_dir(data_path)
            np.save(data_path, data)

    def reset(self, phase, X, y):
        """
        drop everything held in memory for phase, then store X and y (each only if not None)
        """
        self.datas[phase].clear()
        if X is not None:
            self.update(phase, "X", X)
        if y is not None:
            self.update(phase, "y", y)


In [None]:
#exp utils
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
from scipy.sparse import issparse

#from .utils.log_utils import get_logger

# Logger used by the experiment helpers below. get_logger is not imported in this cell (the
# package-relative import above is commented out), so an earlier cell has to define it.
LOGGER = get_logger('gcforest.exp_utils')

def load_model_config(model_path, log_name=None):
    """
    Load a model configuration file and optionally log it.

    Arguments
    ---------
    model_path (str): path of the JSON configuration file
    log_name (str): if given, the pretty-printed configuration is written to the logger of that name

    Returns
    -------
    the parsed configuration
    """
    import json
    try:
        # Works only when this code lives inside the original package layout.
        from .utils.config_utils import load_json
    except (ImportError, ValueError, SystemError):
        # Notebook / flat-module context: there is no parent package to import from.
        # Use a load_json defined at module level if there is one.
        load_json = globals().get("load_json")
    if load_json is None:
        # NOTE(review): the package's load_json may accept more than strict JSON; this
        # fallback parses strict JSON only - confirm that is sufficient for your config files.
        with open(model_path) as f:
            config = json.load(f)
    else:
        config = load_json(model_path)
    if log_name is not None:
        logger = get_logger(log_name)
        logger.info(log_name)
        logger.info("\n" + json.dumps(config, sort_keys=True, indent=4, separators=(',', ':')))
    return config


def concat_datas(datas):
    """
    Flatten every array of a list to 2-D (n_samples x -1) and join them column-wise.

    Anything that is not a list is returned unchanged. The list passed in is left
    untouched: the flattened arrays are collected in a new list.
    """
    if not isinstance(datas, list):
        return datas
    flattened = [data.reshape((data.shape[0], -1)) for data in datas]
    return np.concatenate(flattened, axis=1)

def data_norm(X_train, X_test):
    """
    Standardize X_train and X_test IN PLACE with the per-feature mean and standard
    deviation of X_train.

    Features that are constant in X_train (std == 0) are only centered, not scaled, so
    they do not turn into NaN/inf.

    Returns
    -------
    (X_mean, X_std): the statistics computed from X_train (X_std is returned as computed,
    zeros included)
    """
    X_mean = np.mean(X_train, axis=0)
    X_std = np.std(X_train, axis=0)
    # divide constant features by 1 instead of 0
    scale = np.where(X_std == 0, 1, X_std)
    X_train -= X_mean
    X_train /= scale
    X_test -= X_mean
    X_test /= scale
    return X_mean, X_std

def append_origin(X, X_origin):
    """
    Flatten X and X_origin to 2-D (n_samples x -1) and place them side by side, X first.
    """
    flat_X = X.reshape((X.shape[0], -1))
    flat_origin = X_origin.reshape((X_origin.shape[0], -1))
    return np.hstack((flat_X, flat_origin))

def prec_ets(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    ExtraTrees

    Fit an ExtraTreesClassifier with n_trees trees on the training data, predict the
    test data and log the accuracy. Dense inputs are flattened to 2-D first.

    Returns
    -------
    (clf, y_pred): the fitted classifier and its predictions for X_test
    """
    from sklearn.ensemble import ExtraTreesClassifier

    def _flatten(X):
        # sparse matrices are passed through untouched
        return X if issparse(X) else X.reshape((X.shape[0], -1))

    X_train = _flatten(X_train)
    X_test = _flatten(X_test)
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    n_correct = np.sum(y_pred == y_test)
    prec = float(n_correct) / len(y_test)
    LOGGER.info('prec_ets{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred

def prec_rf(n_trees, X_train, y_train, X_test, y_test, random_state=None):
    """
    RandomForest

    Fit a RandomForestClassifier with n_trees trees on the training data, predict the
    test data and log the accuracy. Dense inputs are flattened to 2-D first.

    Arguments
    ---------
    random_state: forwarded to the classifier (same meaning as in prec_ets); the default
        None keeps the previous, unseeded behaviour

    Returns
    -------
    (clf, y_pred): the fitted classifier and its predictions for X_test
    """
    from sklearn.ensemble import RandomForestClassifier
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=None, n_jobs=-1, verbose=1, random_state=random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_rf{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred

def xgb_eval_accuracy(y_pred_proba, y_true):
    """
    Evaluation callback returning the NEGATED accuracy under the name 'accuracy'.

    y_pred_proba: per-class scores, one row per sample (the arg-max is the prediction)
    y_true (DMatrix): object whose get_label() yields the true labels
    """
    predicted = np.argmax(y_pred_proba, axis=1)
    labels = y_true.get_label()
    n_hits = np.sum(predicted == labels)
    accuracy = float(n_hits) / len(predicted)
    return 'accuracy', -accuracy

def prec_xgb(n_trees, max_depth, X_train, y_train, X_test, y_test, learning_rate=0.1):
    """
    XGBoost

    Fit an XGBClassifier (n_trees rounds, given max_depth and learning_rate) on the
    training data, using the test data as evaluation set, then predict the test data and
    log the accuracy. Dense inputs are flattened to 2-D first; sparse inputs are passed
    through unchanged, as in the other prec_* helpers.

    Returns
    -------
    (clf, y_pred): the fitted classifier and its predictions for X_test
    """
    import xgboost as xgb
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: n_trees={},X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        n_trees, X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = xgb.XGBClassifier(n_estimators=n_trees, max_depth=max_depth, objective='multi:softprob',
            seed=0, silent=True, nthread=-1, learning_rate=learning_rate)
    # NOTE(review): the evaluation set is the test set itself, so the logged metric is not
    # an independent validation score.
    eval_set = [(X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=eval_set, eval_metric="merror")
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_xgb_{}={:.6f}%'.format(n_trees, prec*100.0))
    return clf, y_pred

def prec_log(X_train, y_train, X_test, y_test):
    """
    LogisticRegression

    Fit a LogisticRegression (solver 'sag') on the training data, predict the test data
    and log the accuracy. Dense inputs are flattened to 2-D first; sparse inputs are
    passed through unchanged.

    Returns
    -------
    (clf, y_pred): the fitted classifier and its predictions for X_test
    """
    from sklearn.linear_model import LogisticRegression
    # Reshape exactly once and only for dense input (a second, unguarded reshape would
    # bypass the sparse check).
    if not issparse(X_train):
        X_train = X_train.reshape((X_train.shape[0], -1))
    if not issparse(X_test):
        X_test = X_test.reshape((X_test.shape[0], -1))
    LOGGER.info('start predict: X_train.shape={},y_train.shape={},X_test.shape={},y_test.shape={}'.format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape))
    clf = LogisticRegression(solver='sag', n_jobs=-1, verbose=1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    prec = float(np.sum(y_pred == y_test)) / len(y_test)
    LOGGER.info('prec_log={:.6f}%'.format(prec*100.0))
    return clf, y_pred

def plot_forest_all_proba(y_proba_all, y_gt):
    """
    Show, as a heat map, the probability each entry of y_proba_all assigns to the true
    class of every sample (one row per entry of y_proba_all, one column per sample).

    y_proba_all: indexable by 0..len-1; each item is indexed with (sample, class)
    y_gt: true class index of every sample
    """
    from matplotlib import pylab
    n_samples = len(y_gt)
    n_rows = len(y_proba_all)
    pylab.clf()
    mat = np.zeros((n_rows, n_samples))
    LOGGER.info('mat.shape={}'.format(mat.shape))
    sample_idx = range(n_samples)
    for row in range(n_rows):
        # pick, for every sample, the probability of its ground-truth class
        mat[row, :] = y_proba_all[row][(sample_idx, y_gt)]
    pylab.matshow(mat, fignum=False, cmap='Blues', vmin=0, vmax=1.0)
    pylab.grid(False)
    pylab.show()

def plot_confusion_matrix(cm, label_list, title='Confusion matrix', cmap=None):
    """
    Plot a row-normalized confusion matrix, save it as 'test.jpg' and show it.

    Arguments
    ---------
    cm: 2-D confusion matrix (rows: true class, columns: predicted class); not modified
    label_list: tick labels, used for both axes
    title (str): figure title
    cmap: matplotlib colormap; 'Blues' is used when None
    """
    from matplotlib import pylab
    # np.array always copies, so the caller's matrix is never normalized in place
    cm = np.array(cm, dtype=np.float32)
    row_sums = cm.sum(axis=1, keepdims=True)
    # classes without any sample keep an all-zero row instead of becoming NaN
    row_sums[row_sums == 0] = 1
    cm = cm / row_sums
    pylab.clf()
    pylab.matshow(cm, fignum=False, cmap='Blues' if cmap is None else cmap, vmin=0, vmax=1.0)
    ax = pylab.axes()
    ax.set_xticks(range(len(label_list)))
    ax.set_xticklabels(label_list, rotation='vertical')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_yticks(range(len(label_list)))
    ax.set_yticklabels(label_list)
    pylab.title(title)
    pylab.colorbar()
    pylab.xlabel('Predicted class')
    pylab.ylabel('True class')
    pylab.grid(False)
    pylab.savefig('test.jpg')
    pylab.show()


In [None]:
#fgnet
# -*- coding:utf-8 -*-
"""
Description: A python 2.7 implementation of gcForest proposed in [1]. A demo implementation of gcForest library as well as some demo client scripts to demostrate how to use the code. The implementation is flexible enough for modifying the model or
fit your own datasets. 
Reference: [1] Z.-H. Zhou and J. Feng. Deep Forest: Towards an Alternative to Deep Neural Networks. In IJCAI-2017.  (https://arxiv.org/abs/1702.08835v2 )
Requirements: This package is developed with Python 2.7, please make sure all the demendencies are installed, which is specified in requirements.txt
ATTN: This package is free for academic usage. You can run it at your own risk. For other purposes, please contact Prof. Zhi-Hua Zhou(zhouzh@lamda.nju.edu.cn)
ATTN2: This package was developed by Mr.Ji Feng(fengj@lamda.nju.edu.cn). The readme file and demo roughly explains how to use the codes. For any problem concerning the codes, please feel free to contact Mr.Feng. 
"""
import numpy as np
import os, os.path as osp
import json

# from .layers import get_layer
# from .utils.log_utils import get_logger

# Logger used by FGNet below. get_logger is not imported in this cell (the package-relative
# import above is commented out), so an earlier cell has to define it.
LOGGER = get_logger("gcforest.gcnet")

class FGNet(object):
    """
    GCForest : FineGrained Components

    A sequence of layers built from net_config["layers"]. Layers exchange named arrays
    ("bottoms" are read, "tops" are produced) through a shared data cache.
    """
    def __init__(self, net_config, data_cache):
        """
        net_config (dict): must contain "layers"; may contain "inputs", "outputs" and "model_cache"
        data_cache: cache handed to every layer and used to read the outputs

        Raises ValueError if the configuration is inconsistent (see check_net_config).
        """
        self.data_cache = data_cache
        self.inputs = net_config.get("inputs", [])
        # check_net_config reads self.inputs, so it has to run after the assignment above
        self.check_net_config(net_config)
        self.outputs = net_config.get("outputs", [])

        # layers
        self.layers = []
        self.name2layer = {}
        model_disk_base = net_config.get("model_cache", {}).get("disk_base", None)
        for layer_config in net_config["layers"]:
            layer = get_layer(layer_config, self.data_cache)
            layer.model_disk_base = model_disk_base
            self.layers.append(layer)
            self.name2layer[layer.name] = layer

    def fit_transform(self, X_train, y_train, X_test, y_test, train_config):
        """
        Put the data into the data cache and run fit_transform of every layer, in order.

        The test data is only stored when "test" is in train_config.phases.
        Nothing is returned; results are read with get_outputs.
        """
        LOGGER.info("X_train.shape={}, y_train.shape={}, X_test.shape={}, y_test.shape={}".format(
            X_train.shape, y_train.shape, None if X_test is None else X_test.shape, None if y_test is None else y_test.shape))
        self.data_cache.reset("train", X_train, y_train)
        if "test" in train_config.phases:
            self.data_cache.reset("test", X_test, y_test)
        for layer in self.layers:
            layer.fit_transform(train_config)

    @staticmethod
    def concat_datas(datas):
        """
        Flatten every array of a list to 2-D (n_samples x -1) and join them column-wise.
        Anything that is not a list is returned unchanged; the list passed in is not modified.
        """
        if not isinstance(datas, list):
            return datas
        flattened = [data.reshape((data.shape[0], -1)) for data in datas]
        return np.concatenate(flattened, axis=1)

    def transform(self, X_test):
        """
        Store X_test as the "test" input, run transform of every layer and return the
        "test" outputs (a list, one entry per name in self.outputs).
        """
        LOGGER.info("X_test.shape={}".format(X_test.shape))
        self.data_cache.reset("test", X_test, None)
        for layer in self.layers:
            layer.transform()
        return self.get_outputs("test")

    def score(self):
        """
        Call score() on every layer, in order.
        """
        for layer in self.layers:
            layer.score()

    def get_outputs(self, phase):
        """
        Return the arrays named in self.outputs for the given phase, as a list.
        """
        return self.data_cache.gets(phase, self.outputs)

    def save_outputs(self, phase, save_y=True, save_path=None):
        """
        Pickle the output arrays of a phase (plus "y" when save_y is True) as one dict.

        save_path defaults to <data_cache.cache_dir>/<phase>/outputs.pkl; if neither a
        path nor a cache_dir is available an error is logged and nothing is written.
        """
        if save_path is None:
            if self.data_cache.cache_dir is None:
                LOGGER.error("save path is None and data_cache.cache_dir is None!!! don't know where to save")
                return
            save_path = osp.join(self.data_cache.cache_dir, phase, "outputs.pkl")
        import pickle
        info = ""
        data_names = list(self.outputs)
        if save_y:
            data_names.append("y")
        datas = {}
        for data_name in data_names:
            datas[data_name] = self.data_cache.get(phase, data_name)
            info = "{},{}->{}".format(info, data_name, datas[data_name].shape)
        LOGGER.info("outputs.shape={}".format(info))
        LOGGER.info("Saving Outputs in {} ".format(save_path))
        with open(save_path, "wb") as f:
            pickle.dump(datas, f, pickle.HIGHEST_PROTOCOL)

    def check_net_config(self, net_config):
        """
        check net_config

        Raises ValueError when
          - two layers share a name,
          - a bottom is neither "X", "y", a declared input nor a top of an earlier layer,
          - two layers produce the same top,
          - an output name is not produced by anything.
        Only warns about unusual top names. Layers without tops are skipped by the
        naming check.
        """
        top2layer = {}
        name2layer = {}
        for li, layer_config in enumerate(net_config["layers"]):
            layer_name = layer_config["name"]
            if layer_name in name2layer:
                raise ValueError("layer name duplicate. layer_name={}, config1={}, config2={}".format(
                    layer_name, name2layer[layer_name], layer_config))
            name2layer[layer_name] = layer_config

            for bottom in layer_config["bottoms"]:
                if bottom != "X" and bottom != "y" and bottom not in self.inputs and bottom not in top2layer:
                    raise ValueError("li={}, layer_config={}, bottom({}) doesn't be produced by other layers".format(
                        li, layer_config, bottom))
            for top in layer_config["tops"]:
                if top in top2layer:
                    raise ValueError("top duplicate. layer({}) and layer({}) have same top blob: {}".format(
                        top2layer[top], layer_config["name"], top))
                top2layer[top] = layer_config["name"]

        outputs = net_config.get("outputs", [])
        if len(outputs) == 0:
            LOGGER.warn("outputs list is empty!!!")
        for output in outputs:
            if output == "X" or output == "y" or output in self.inputs or output in top2layer:
                continue
            raise ValueError("output data name not exist: output={}".format(output))

        for layer_config in net_config["layers"]:
            tops = layer_config["tops"]
            if len(tops) > 1:
                for top_name in tops:
                    if not top_name.startswith(layer_config["name"]):
                        LOGGER.warn("top_name is suggested to startswith layer_name: layer_config={}".format(layer_config))
            elif len(tops) == 1:
                top = tops[0]
                if top != layer_config["name"]:
                    LOGGER.warn("layer_name != top_name, You should check to make sure this is what you want!!! layer_config={}".format(layer_config))


In [None]:
#config 
#from .data_cache import DataCache


class GCTrainConfig(object):
    """
    Training options read from a plain dict, plus the DataCache built from its
    "data_cache" entry.
    """
    def __init__(self, train_config):
        """
        train_config (dict): optional keys "keep_model_in_mem" (default True),
            "random_state" (default 0), "model_cache_dir" (blank strings become None),
            "data_cache" (config dict for DataCache) and "phases" (default ["train", "test"])
        """
        self.keep_model_in_mem = train_config.get("keep_model_in_mem", True)
        self.random_state = train_config.get("random_state", 0)
        self.model_cache_dir = strip(train_config.get("model_cache_dir", None))
        self.data_cache = DataCache(train_config.get("data_cache", {}))
        # Own copy of the list: later edits of self.phases must never reach back into the
        # dict the caller passed in.
        self.phases = list(train_config.get("phases", ["train", "test"]))

        # "X" and "y" stay in memory and off disk unless the data_cache config says otherwise
        for data_name in ("X", "y"):
            if data_name not in self.data_cache.config["keep_in_mem"]:
                self.data_cache.config["keep_in_mem"][data_name] = True
            if data_name not in self.data_cache.config["cache_in_disk"]:
                self.data_cache.config["cache_in_disk"][data_name] = False


def strip(s):
    """
    Trim surrounding whitespace from s; None and strings that are empty after trimming
    both yield None.
    """
    if s is None:
        return None
    trimmed = s.strip()
    return trimmed if len(trimmed) != 0 else None


In [None]:
#gcforest
import numpy as np

# from .cascade.cascade_classifier import CascadeClassifier
# from .config import GCTrainConfig
# from .fgnet import FGNet
# from .utils.log_utils import get_logger

# Logger for the GCForest cell. get_logger is not imported in this cell (the package-relative
# import above is commented out), so an earlier cell has to define it.
LOGGER = get_logger("gcforest.gcforest")


class GCForest(object):
    """
    Front end combining an optional fine-grained net (config["net"] -> self.fg) with an
    optional cascade classifier (config["cascade"] -> self.ca).
    """
    def __init__(self, config):
        """
        config (dict): optional keys "train" (options for GCTrainConfig), "net" and "cascade"
        """
        self.config = config
        self.train_config = GCTrainConfig(config.get("train", {}))
        if "net" in self.config:
            self.fg = FGNet(self.config["net"], self.train_config.data_cache)
        else:
            self.fg = None
        if "cascade" in self.config:
            self.ca = CascadeClassifier(self.config["cascade"])
        else:
            self.ca = None

    def fit_transform(self, X_train, y_train, X_test=None, y_test=None, train_config=None):
        """
        Fit the configured components and return the transformed data.

        When X_test or y_test is missing, the "test" phase is skipped for this call only.

        Returns
        -------
        X_train if there is no test data, otherwise (X_train, X_test)
        """
        import copy
        train_config = train_config or self.train_config
        if X_test is None or y_test is None:
            if "test" in train_config.phases:
                # Work on a shallow copy: removing "test" from the shared config would
                # silently disable the test phase for every later call as well.
                train_config = copy.copy(train_config)
                train_config.phases = [phase for phase in train_config.phases if phase != "test"]
            X_test, y_test = None, None
        if self.fg is not None:
            self.fg.fit_transform(X_train, y_train, X_test, y_test, train_config)
            X_train = self.fg.get_outputs("train")
            if "test" in train_config.phases:
                X_test = self.fg.get_outputs("test")
        if self.ca is not None:
            _, X_train, _, X_test, _, = self.ca.fit_transform(X_train, y_train, X_test, y_test, train_config=train_config)

        if X_test is None:
            return X_train
        else:
            return X_train, X_test

    def transform(self, X):
        """
        return:
            if only finegrained is provided: return the result of Finegrained
            if cascade is provided: return N x (n_trees in each layer * n_classes)
        """
        if self.fg is not None:
            X = self.fg.transform(X)
        if self.ca is None:
            # no cascade configured: the fine-grained result (or X itself) is the output
            return X
        y_proba = self.ca.transform(X)
        return y_proba

    def predict_proba(self, X):
        """
        Class probabilities from the cascade, applied to the fine-grained features when a
        net is configured. Requires a cascade (self.ca).
        """
        if self.fg is not None:
            X = self.fg.transform(X)
        y_proba = self.ca.predict_proba(X)
        return y_proba

    def predict(self, X):
        """
        Index of the most probable class for every sample.
        """
        y_proba = self.predict_proba(X)
        y_pred = np.argmax(y_proba, axis=1)
        return y_pred

    def set_data_cache_dir(self, path):
        """
        Set the directory used by the data cache for on-disk arrays.
        """
        self.train_config.data_cache.cache_dir = path

    def set_keep_data_in_mem(self, flag):
        """
        flag (bool):
            if flag is 0, data will not be kept in memory.
            this is for the situation when memory is the bottleneck
        """
        self.train_config.data_cache.config["keep_in_mem"]["default"] = flag

    def set_keep_model_in_mem(self, flag):
        """
        flag (bool):
            if flag is 0, model will not be kept in memory.
            this is for the situation when memory is the bottleneck
        """
        self.train_config.keep_model_in_mem = flag


In [None]:
import argparse
import numpy as np
import sys
from keras.datasets import mnist
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def get_toy_config():
    """
    Build a small demo configuration: a two-class cascade (at most 20 layers, early
    stopping after 3 rounds) whose layers hold one XGBoost, one random-forest and one
    extra-trees estimator, each with 5 folds and 10 trees.
    """
    estimators = [
        {"n_folds": 5, "type": "XGBClassifier", "n_estimators": 10, "max_depth": 5,
         "objective": "binary:logistic", "silent": True, "nthread": -1, "learning_rate": 0.1},
        {"n_folds": 5, "type": "RandomForestClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1},
        {"n_folds": 5, "type": "ExtraTreesClassifier", "n_estimators": 10, "max_depth": None, "n_jobs": -1},
    ]
    cascade = {
        "random_state": 0,
        "max_layers": 20,
        "early_stopping_rounds": 3,
        "n_classes": 2,
        "estimators": estimators,
    }
    return {"cascade": cascade}
  
  
# Build the toy cascade configuration once; the next cell passes it to GCForest.
config = get_toy_config()


In [None]:
# Not named `gc`: a later cell runs `import gc` and calls gc.collect(), which would
# silently replace a model stored under that name (and re-running this cell afterwards
# would break those gc.collect() calls).
gcforest_model = GCForest(config=config)

In [None]:
import glob
import gc
import pandas as pd
import numpy as np
import lightgbm as lgb
import datetime
from functools import partial
from itertools import product
from multiprocessing import Pool, cpu_count
from contextlib import closing

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import lightgbm as lgb
# import xgboost as xgb
# from catboost import CatBoostRegressor

In [None]:
# Raw competition tables, plus a separate price file from another input dataset.
# NOTE(review): Isin.csv and Customer.csv are read again in later cells, which replaces
# the isin_df / customer_df loaded here.
trades_df = pd.read_csv('../input/DSG2018-qualifiers/Trade.csv')
submission_df = pd.read_csv('../input/DSG2018-qualifiers/Challenge_20180423.csv')
isin_df = pd.read_csv('../input/DSG2018-qualifiers/Isin.csv')
customer_df = pd.read_csv('../input/DSG2018-qualifiers/Customer.csv')
market_df = pd.read_csv('../input/DSG2018-qualifiers/Market.csv')
price_df = pd.read_csv("../input/lastdayprice/PSYPredictedV2.csv")

In [None]:
# The unnamed first CSV column (presumably a saved index - confirm) is reused as the week key column.
price_df.rename(columns={"Unnamed: 0":"WeekDateKey"}, inplace=True)

In [None]:
# Every row of price_df gets the single date 2018-04-23, replacing the renamed column's values.
price_df['WeekDateKey'] = datetime.datetime.strptime('20180423','%Y%m%d')
price_df.head()

In [None]:
# Parse the trade date and order the trades chronologically.
trades_df = trades_df.rename({'TradeDateKey': 'DateKey'}, axis=1)
trades_df.DateKey = pd.to_datetime(trades_df.DateKey, format='%Y%m%d')
trades_df = trades_df.sort_values(['DateKey'])

In [None]:
# Subtracting dayofweek days (Monday == 0) maps every date to the Monday of its week.
trades_df['WeekDateKey'] = trades_df.DateKey - pd.to_timedelta(trades_df.DateKey.dt.dayofweek, 'd')
# Keep one row per (customer, bond, side, week, interest flag).
trades_df = trades_df.drop_duplicates(['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey', 'CustomerInterest'])

In [None]:
# Only the rows flagged CustomerInterest == 1 are used by the following cells.
trades_df1 = trades_df[trades_df.CustomerInterest == 1]

In [None]:
trades_df.shape

In [None]:
# Copy every interest row 1, 4, 12, 16 and 24 weeks into the future. The copies contain
# only the four key columns (no CustomerInterest).
ngs = []
for i in [1, 4, 12, 16, 24]:
    tmp = trades_df1[['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey']].copy()
    tmp.WeekDateKey += pd.to_timedelta(7 * i, 'd')
    ngs.append(tmp)

In [None]:
# Stack the shifted copies, keep only weeks before 2018-04-23 and de-duplicate on the key columns.
negative_sampling = pd.concat(ngs).reset_index(drop=True)
negative_sampling = negative_sampling[negative_sampling.WeekDateKey < '2018-04-23']
negative_sampling = negative_sampling.drop_duplicates(['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey'])
negative_sampling.shape

In [None]:
# Append the real interest rows. The shifted copies have no CustomerInterest column, so
# the concat leaves that column missing (NaN) for them.
negative_sampling = pd.concat([
    negative_sampling,
    trades_df1[['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey', 'CustomerInterest']]
])

In [None]:
reord1 = negative_sampling[negative_sampling.duplicated(['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey'], keep=False)
    &
    (negative_sampling.CustomerInterest == 1)
]
reord1['CustomerInterest'] = 1

reord0 = negative_sampling[~negative_sampling.duplicated(['CustomerIdx', 'IsinIdx', 'BuySell', 'WeekDateKey'], keep=False)
    &
    (negative_sampling.CustomerInterest == 1)
]
reord0['CustomerInterest'] = 0

In [None]:
reordered = pd.concat([reord0, reord1])

In [None]:
trades_df = reordered
del reordered
gc.collect()

In [None]:
# Make sure the week key of the training sample is a datetime column.
trades_df.WeekDateKey = pd.to_datetime(trades_df.WeekDateKey)

# The submission's DateKey is used directly as its week key (no week snapping is applied).
submission_df = submission_df.rename({'DateKey': 'WeekDateKey'}, axis=1)
submission_df.WeekDateKey = pd.to_datetime(submission_df.WeekDateKey, format='%Y%m%d')

In [None]:
market_df = market_df.rename({'DateKey': 'WeekDateKey'}, axis=1)
market_df.WeekDateKey = pd.to_datetime(market_df.WeekDateKey, format='%Y%m%d')
# Snap every market date to the Monday of its week - the same rule used for the trades'
# WeekDateKey. (Subtracting `dayofweek-1` days lands on Tuesday, and such keys can never
# match the Monday keys in the later merge on ['WeekDateKey', 'IsinIdx'].)
market_df.WeekDateKey = market_df.WeekDateKey - pd.to_timedelta(market_df.WeekDateKey.dt.dayofweek, 'd')
# drop=True: without it the old row index becomes an "index" column, gets averaged by the
# groupby below and ends up as a meaningless model feature.
market_df = pd.concat([market_df, price_df]).reset_index(drop=True)
market_df.Price = np.log(market_df.Price, dtype=np.float32)
# One row per (week, bond): the mean of the remaining columns.
market_df = market_df.groupby(['WeekDateKey', 'IsinIdx']).mean().reset_index()

In [None]:
# Bond master data: missing values become the string "missing", the two date columns are
# converted to proleptic ordinals, text columns (and TickerIdx) to pandas categories.
# NOTE(review): fillna runs before the date parsing, so a missing date would reach
# pd.to_datetime as the string "missing" - confirm these two columns have no gaps.
isin_df = pd.read_csv("../input/DSG2018-qualifiers/Isin.csv")
isin_df.fillna("missing", inplace=True)
isin_df = isin_df.rename({'Region': 'BondRegion'}, axis=1)
for col in ['ActualMaturityDateKey', 'IssueDateKey']:
    isin_df[col] = pd.to_datetime(isin_df[col], format='%Y%m%d')
    isin_df[col] = isin_df[col].apply(lambda x: x.toordinal())

categorical_isin = list(isin_df.dtypes[isin_df.dtypes == 'object'].index) + ['TickerIdx']
for col in categorical_isin:
    isin_df[col] =isin_df[col].astype('category')


In [None]:
# Customer master data, treated the same way; 'Region' is renamed on both tables so the
# two region columns stay distinguishable.
customer_df = pd.read_csv("../input/DSG2018-qualifiers/Customer.csv")
customer_df.fillna("missing", inplace=True)
customer_df = customer_df.rename({'Region': 'CustomerRegion'}, axis=1)
categorical_customer = list(customer_df.dtypes[customer_df.dtypes == 'object'].index)
for col in categorical_customer:
    customer_df[col] = customer_df[col].astype('category')

# All categorical column names, customer columns first.
categorical_features = categorical_customer + categorical_isin

In [None]:
trades_df = trades_df.sort_values('WeekDateKey')

In [None]:
import pandas as pd
pd.__version__

In [None]:
# Training sample followed by the submission rows, with a fresh 0..n-1 index.
alldata_df = pd.concat([trades_df, submission_df], sort=False).reset_index(drop=True)

In [None]:
del trades_df
gc.collect()

In [None]:
# Attach the bond attributes to the weekly market rows and add two calendar features:
# Order_dom is the week-of-month slot of the date (days 1-7 -> 1, 8-14 -> 2, ...),
# Order_month the calendar month.
market_df = pd.merge(market_df, isin_df, on='IsinIdx', how='left')
market_df['Order_dom'] = market_df['WeekDateKey'].apply(lambda d: (d.day-1) // 7 + 1)
market_df['Order_month'] = market_df['WeekDateKey'].dt.month

In [None]:
def rolling_handler(x, feature, rolling=-1):
    """
    Apply the rolling aggregate named `feature` (e.g. 'sum', 'mean') to x.

    rolling is the window length; -1 means "as long as x", i.e. every value aggregates
    everything up to and including itself. min_periods is always 1.
    """
    window = len(x) if rolling == -1 else rolling
    roller = x.rolling(window, 1)
    aggregate = getattr(roller, feature)
    return aggregate()

In [None]:
# Aggregates computed over the price history of every group.
features = ['sum', 'min', 'max', 'mean']

# For every (IsinIdx, <column>) grouping, add one column per aggregate holding the
# running value of log-price inside the group, in WeekDateKey order.
for groupby in categorical_isin + ['Order_dom', 'Order_month']:
    groupby = ['IsinIdx'] + [groupby]
    suffix = '_&_'.join(groupby)
    market_df.sort_values(groupby + ['WeekDateKey'], inplace=True)
    start_time = datetime.datetime.now()

    def worker_func(grouped, feature):
        """Apply the running `feature` aggregate to every group of `grouped`."""
        return grouped.apply(lambda x: rolling_handler(x, feature))

    grouped = market_df[groupby + ['Price']].groupby(groupby, sort=False).Price
    partial_worker_func = partial(worker_func, grouped)

    # One task per aggregate, so more workers than aggregates (or than CPUs) only cost
    # start-up time and memory.
    with closing(Pool(min(len(features), cpu_count()))) as p:
        ret_list = p.map(partial_worker_func, features)
        p.terminate()

    for ret, feature in zip(ret_list, features):
        name = '{}_{}_by_{}'.format('Price', feature, suffix)
        market_df[name] = ret

    del ret_list, grouped
    gc.collect()
    # elapsed minutes for this grouping
    print(suffix, (datetime.datetime.now() - start_time).total_seconds() / 60)

In [None]:
# Drop every market row that has a missing value in any column.
market_df = market_df.dropna()

In [None]:
market_df.shape

In [None]:
# Left merges: every sample row is kept; rows without a matching customer or
# (week, bond) market entry get missing values in the merged columns.
alldata_df = pd.merge(alldata_df, customer_df, on='CustomerIdx', how='left')
alldata_df = pd.merge(alldata_df, market_df, on=['WeekDateKey', 'IsinIdx'], how='left')

In [None]:
del isin_df, customer_df, market_df
gc.collect()

In [None]:
# Flip the sign of the (log) price for Sell rows.
alldata_df.loc[alldata_df.BuySell == 'Sell', 'Price'] *= -1

In [None]:
# Encode the side as 0/1 and treat it as a categorical feature.
# NOTE(review): extend() mutates categorical_features, so re-running this cell appends
# 'BuySell' again.
alldata_df['BuySell'] = alldata_df['BuySell'].map({'Buy': 0, 'Sell': 1})
categorical_features.extend(['BuySell'])

In [None]:
# Numeric features: every column that is neither categorical, nor the target, nor an id/key.
base_features = alldata_df.columns.difference(
    categorical_features + ['CustomerInterest', 'IsinIdx', 'CustomerIdx', 'WeekDateKey', 'PredictionIdx']
).tolist()
base_features

In [None]:
train_df = alldata_df[(alldata_df.WeekDateKey >= '2017-01-01') & (alldata_df.WeekDateKey < '2018-04-09')]
valid_df = alldata_df[(alldata_df.WeekDateKey >= '2018-04-09') & (alldata_df.WeekDateKey < '2018-04-23')]
test_df = alldata_df[alldata_df.WeekDateKey >= '2018-04-23']

In [None]:
if "TickerIdx" in categorical_features:
    categorical_features.remove("TickerIdx")
for c in categorical_features:
    print(c)
    lbl = LabelEncoder()
    lbl.fit(list(train_df[c].values) + list(valid_df[c].values) + list(test_df[c].values))
    train_df[c] = lbl.transform(list(train_df[c].values))
    valid_df[c] = lbl.transform(list(valid_df[c].values))
    test_df[c] = lbl.transform(list(test_df[c].values))

In [None]:
# Feature matrices (numeric columns first, then categorical ones) and targets per split.
X_train = train_df[base_features + categorical_features]
X_valid = valid_df[base_features + categorical_features]
X_test = test_df[base_features + categorical_features]
y_train = train_df['CustomerInterest']
y_valid = valid_df['CustomerInterest']
y_test = test_df['CustomerInterest']
# Row identifiers needed for the submission file.
PredictionIdx = test_df['PredictionIdx']

In [None]:
# LightGBM settings: binary classification evaluated with AUC.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': .01,
    'num_leaves': 32,
    'max_depth': 12,
    'feature_fraction': 0.35,
    'bagging_fraction': 0.9,
    'bagging_freq': 2,
}

In [None]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)

In [None]:
# Fixed 500 boosting rounds; both sets are only monitored (no early stopping is configured).
gbm = lgb.train(params, lgb_train, categorical_feature=categorical_features, valid_sets=[lgb_train, lgb_valid], valid_names=['train','valid'], num_boost_round=500)

In [None]:
preds = gbm.predict(X_test)

In [None]:
# Write the submission: one prediction per PredictionIdx, without the DataFrame index.
pd.DataFrame({'PredictionIdx': PredictionIdx, 'CustomerInterest': preds}).to_csv('lgb_price_ma_2.csv', index=None)

In [None]:
import os
os.listdir("../working/")