In [1]:
from sklearn.datasets import load_boston, load_iris
from sklearn.model_selection import train_test_split
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell presumably only runs on older releases — confirm the pinned scikit-learn version.
data = load_boston()

In [8]:
# Print the dataset's bundled description text (attribute list, instance count, ...).
print(data.DESCR)


.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [11]:
# Hold out 20% of the rows as the final test set.
train_X, test_X, train_y, test_y = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
# Carve a validation set out of what is left: 25% of the remaining 80% = 20% of all rows.
train_X, validate_X, train_y, validate_y = train_test_split(
    train_X,
    train_y,
    test_size=0.25,
    random_state=42,
)

In [2]:
import lightgbm as lgb

In [None]:
from sklearn.preprocessing import StandardScaler

In [19]:
# Column indices 3 and 8 (CHAS and RAD in the attribute order printed above) are
# declared categorical for both datasets.
training_data = lgb.Dataset(
    data=train_X,
    label=train_y.squeeze(),
    categorical_feature=[3, 8],
)
# Despite its name, `testing_data` wraps the validation split, not the held-out test split.
testing_data = lgb.Dataset(
    data=validate_X,
    label=validate_y.squeeze(),
    categorical_feature=[3, 8],
)
store = {}

params = {
    'objective': 'regression',
    'num_iterations': 300,
    'learning_rate': 0.3,
    'num_leaves': 100000,
    'metric': 'l2',
}
# Train while reporting l2 on both sets; stop once the validation score has not
# improved for 100 rounds.
rt = lgb.train(
    params=params,
    train_set=training_data,
    valid_sets=[training_data, testing_data],
    early_stopping_rounds=100,
)




[1]	training's l2: 52.0229	valid_1's l2: 49.8547
Training until validation scores don't improve for 100 rounds
[2]	training's l2: 32.5724	valid_1's l2: 34.7255
[3]	training's l2: 22.7636	valid_1's l2: 27.7439
[4]	training's l2: 17.135	valid_1's l2: 22.4719
[5]	training's l2: 14.1091	valid_1's l2: 20.065
[6]	training's l2: 12.2844	valid_1's l2: 18.7452
[7]	training's l2: 10.4471	valid_1's l2: 16.6091
[8]	training's l2: 8.99057	valid_1's l2: 15.4185
[9]	training's l2: 8.37676	valid_1's l2: 14.9135
[10]	training's l2: 7.66913	valid_1's l2: 14.8714
[11]	training's l2: 7.23623	valid_1's l2: 14.5292
[12]	training's l2: 6.836	valid_1's l2: 14.2767
[13]	training's l2: 6.51385	valid_1's l2: 14.1769
[14]	training's l2: 6.11447	valid_1's l2: 13.5908
[15]	training's l2: 5.69775	valid_1's l2: 13.2734
[16]	training's l2: 5.50663	valid_1's l2: 13.1714
[17]	training's l2: 5.18788	valid_1's l2: 12.9715
[18]	training's l2: 4.95285	valid_1's l2: 13.1308
[19]	training's l2: 4.61744	valid_1's l2: 13.7021
[

TypeError: predict() missing 1 required positional argument: 'data'

In [35]:
# Row of the full dataset to inspect.
k = 233
# Ask the booster for the contribution breakdown of the raw score of row k
# (the slice keeps the input two-dimensional).
contribute = rt.predict(
    data.data[k:k + 1, :],
    raw_score=True,
    pred_contrib=True,
)
# Displayed as the cell result: total of the contributions, their shape, and the tree count.
(contribute.sum(), contribute.shape, rt.num_trees())



(43.910947255937934, (1, 14), 17)

In [36]:
# Row of the full dataset to inspect.
k = 233
# Ask the booster which leaf row k falls into in each tree.
leaf = rt.predict(
    data.data[k:k + 1, :],
    raw_score=True,
    pred_leaf=True,
)


In [44]:
# Look up, tree by tree, the output value of the leaf that row k landed in.
leaves_output = [
    rt.get_leaf_output(tree_id=tree_idx, leaf_id=leaf_idx)
    for tree_idx, leaf_idx in enumerate(leaf.ravel())
]
print(leaves_output)
# The leaf outputs summed over all trees, printed next to the booster's own raw
# prediction for the same row so the two can be compared.
print(sum(leaves_output))
print(rt.predict(data.data[k:k + 1, :], raw_score=True))

[29.236942049717694, 4.766395582471575, 3.2633113114730175, 2.0712735360318963, 1.8391251010554177, 1.1139755263924598, 1.1210216611357673, 0.7847151651978492, 0.4324425732959872, -0.3656373962759972, -0.3357709317539747, 0.38303897086530925, -0.3439648206345737, 0.31538343007930303, -0.27129329634564264, 0.21849834139530475, -0.31850954816346205]
43.91094725593792
[43.91094726]


In [2]:
import lightgbm as lgb
import numpy as np
import itertools
class StackingModel:
    """Empty placeholder class: it defines no attributes or methods."""

class LGBStacking:
    """Stack of LightGBM boosters trained layer by layer.

    Every layer after the first is trained on the feature matrix of the previous
    layer concatenated with that layer's per-tree leaf outputs (see
    ``get_intermediate_representation``), so the feature matrix grows with depth.

    Recognised ``stacking_params`` keys (all optional):
        maximum_layers: upper bound on the number of layers trained (default 5).
        early_stopping_round: number of consecutive non-improving layers that
            triggers early stopping (default 2).
        min_early_stopping_layers: early stopping is only considered once more
            than this many layers have been trained (default 2).
    """

    def __init__(self, lgb_params, stacking_params=None):
        """Store the LightGBM parameters and the stacking configuration.

        Args:
            lgb_params: parameter dict handed unchanged to ``lgb.train`` for every layer.
            stacking_params: optional dict of stacking options (see class docstring).
        """
        self.stacking_params = stacking_params if stacking_params is not None else {}
        self.lgb_params = lgb_params
        # Populated by train(); defined here so predict() can fail with a clear message.
        self.layers = []
        self.performance = []

    @staticmethod
    def get_intermediate_representation(rt, X):
        """Map every sample to the output value of the leaf it reaches in each tree.

        Args:
            rt: trained booster exposing ``predict(X, pred_leaf=True)`` and
                ``get_leaf_output(tree_id=..., leaf_id=...)``.
            X: feature matrix with a ``shape`` attribute; one row per sample.

        Returns:
            ``np.float32`` array with one row per sample and one column per tree
            column returned by ``rt.predict(X, pred_leaf=True)``.
        """
        n_samples = X.shape[0]
        pred_leaves = np.asarray(rt.predict(X, pred_leaf=True))
        if pred_leaves.ndim != 2 and n_samples > 0:
            # Normalise a flattened result (single row or single tree) to 2-D.
            pred_leaves = pred_leaves.reshape(n_samples, -1)
        if pred_leaves.ndim != 2:
            # Nothing to index (no rows); fall back to the booster's tree count.
            return np.zeros((n_samples, rt.num_trees()), dtype=np.float32)

        # Size the result from the leaf matrix actually returned, so the column
        # count always matches what predict() produced.
        n_trees = pred_leaves.shape[1]
        representation = np.zeros((n_samples, n_trees), dtype=np.float32)
        for tree_id in range(n_trees):
            # Query each distinct leaf once per tree instead of once per sample.
            leaf_ids, inverse = np.unique(pred_leaves[:, tree_id], return_inverse=True)
            leaf_values = np.array(
                [rt.get_leaf_output(tree_id=tree_id, leaf_id=int(leaf_id)) for leaf_id in leaf_ids],
                dtype=np.float32,
            )
            representation[:, tree_id] = leaf_values[inverse]
        return representation

    def eval(self, rt, X, y, metric='L2'):
        """Score booster ``rt`` on ``(X, y)``.

        Args:
            rt: trained booster exposing ``predict(data=...)``.
            X: feature matrix passed to ``rt.predict``.
            y: targets; flattened before comparison so a column vector is not
                broadcast against the 1-D predictions.
            metric: only ``'L2'`` (mean squared error, case-insensitive) is supported.

        Returns:
            The mean squared error as a float.

        Raises:
            ValueError: if ``metric`` is unsupported or the number of predictions
                differs from the number of targets.
        """
        if str(metric).upper() != 'L2':
            raise ValueError("Unsupported metric: {!r} (only 'L2' is implemented)".format(metric))
        y_pred = np.asarray(rt.predict(data=X)).reshape(-1)
        y_true = np.asarray(y).reshape(-1)
        if y_pred.shape != y_true.shape:
            raise ValueError(
                "Prediction/target size mismatch: {} vs {}".format(y_pred.size, y_true.size)
            )
        return np.mean(np.square(y_pred - y_true))

    def train(self, X, y, validate_size=0.25, random_state=None, shuffle=True,  categorical_feature='auto'):
        """Train the stack, keeping only the layers up to the best validation loss.

        ``(X, y)`` is split into a training part and a validation part. Layers are
        added until ``maximum_layers`` is reached or the validation loss has failed
        to improve for ``early_stopping_round`` consecutive layers (checked only once
        more than ``min_early_stopping_layers`` layers exist). In both cases
        ``self.layers`` is cut back to the layer with the lowest validation loss.

        Args:
            X: feature matrix.
            y: regression targets.
            validate_size: ``test_size`` forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.
            shuffle: forwarded to ``train_test_split``.
            categorical_feature: forwarded to every ``lgb.Dataset``.

        Returns:
            None. Results are stored on ``self.layers`` and ``self.performance``
            (validation loss of every layer that was trained, including discarded ones).
        """
        train_X, test_X, train_y, test_y = train_test_split(
            X, y, test_size=validate_size, random_state=random_state, shuffle=shuffle
        )
        train_label = np.ravel(train_y)
        test_label = np.ravel(test_y)

        max_layers = self.stacking_params.get('maximum_layers', 5)
        min_layers = self.stacking_params.get('min_early_stopping_layers', 2)
        patience = self.stacking_params.get('early_stopping_round', 2)

        self.layers = []
        self.performance = []
        min_loss = float('inf')
        best_round = 0
        stale_rounds = 0
        for layer_idx in range(max_layers):
            if self.layers:
                # Widen both splits with the previous layer's leaf outputs.
                previous = self.layers[-1]
                train_X = np.hstack(
                    (train_X, LGBStacking.get_intermediate_representation(previous, train_X))
                )
                test_X = np.hstack(
                    (test_X, LGBStacking.get_intermediate_representation(previous, test_X))
                )
            training_data = lgb.Dataset(
                data=train_X, label=train_label, categorical_feature=categorical_feature
            )
            testing_data = lgb.Dataset(
                data=test_X, label=test_label, categorical_feature=categorical_feature
            )
            rt = lgb.train(
                params=self.lgb_params,
                train_set=training_data,
                valid_sets=[training_data, testing_data],
            )
            self.layers.append(rt)

            loss = self.eval(rt, test_X, test_y)
            self.performance.append(loss)
            if loss < min_loss:
                stale_rounds = 0
                min_loss = loss
                best_round = layer_idx
                continue

            stale_rounds += 1
            # The minimum is compared against the number of layers, not the stale
            # counter; comparing it to the counter made the default settings
            # (2 and 2) impossible to satisfy.
            if len(self.layers) > min_layers and stale_rounds >= patience:
                break

        # Drop layers trained after the best one, whether or not we stopped early.
        self.layers = self.layers[:best_round + 1]
        return

    def predict(self, X):
        """Predict with the trained stack.

        ``X`` is pushed through every layer but the last, gaining that layer's
        leaf-output columns each time; the last layer produces the prediction.

        Args:
            X: feature matrix with the same columns as the one given to ``train``.

        Returns:
            Whatever the last layer's ``predict`` returns for the widened matrix.

        Raises:
            RuntimeError: if no layer has been trained yet.
        """
        layers = getattr(self, 'layers', None)
        if not layers:
            raise RuntimeError("LGBStacking has no trained layers; call train() first")
        *hidden_layers, output_layer = layers
        for rt in hidden_layers:
            X = np.hstack((X, LGBStacking.get_intermediate_representation(rt, X)))
        return output_layer.predict(X)

In [5]:
# LightGBM settings shared by every layer of the stack.
params = {
    'objective': 'regression',
    'num_iterations': 300,
    'learning_rate': 0.1,
    'num_leaves': 1000,
    'metric': 'l2',
}
stacking = LGBStacking(lgb_params=params)
# Same 80/20 split (and seed) as the earlier experiment.
train_X, test_X, train_y, test_y = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [6]:
# Fit the stack on the training split.
stacking.train(X=train_X, y=train_y)
# Mean squared error of the stacked model on the held-out test split.
print(
    np.mean(
        np.square(stacking.predict(test_X) - test_y)
    )
)


[1]	training's l2: 71.8998	valid_1's l2: 83.2917
[2]	training's l2: 61.6804	valid_1's l2: 74.6162
[3]	training's l2: 53.1023	valid_1's l2: 66.6323
[4]	training's l2: 46.1637	valid_1's l2: 60.0965
[5]	training's l2: 40.325	valid_1's l2: 55.12
[6]	training's l2: 35.4066	valid_1's l2: 50.1646
[7]	training's l2: 31.5398	valid_1's l2: 46.8128
[8]	training's l2: 28.1115	valid_1's l2: 43.2399
[9]	training's l2: 25.1459	valid_1's l2: 39.7494
[10]	training's l2: 22.8672	valid_1's l2: 36.4931
[11]	training's l2: 21.0206	valid_1's l2: 33.945
[12]	training's l2: 19.2772	valid_1's l2: 31.9091
[13]	training's l2: 17.8933	valid_1's l2: 29.8747
[14]	training's l2: 16.4811	valid_1's l2: 28.0929
[15]	training's l2: 15.3204	valid_1's l2: 26.6456
[16]	training's l2: 14.4379	valid_1's l2: 25.4303
[17]	training's l2: 13.6055	valid_1's l2: 24.3157
[18]	training's l2: 12.9111	valid_1's l2: 23.4678
[19]	training's l2: 12.2994	valid_1's l2: 22.8089
[20]	training's l2: 11.7124	valid_1's l2: 21.9733
[21]	training