# LightGBM with embeddings

In [2]:
import os 
import numpy as np
import pandas as pd

import lightgbm as lgb

import keras 
from keras.layers import *
from keras.optimizers import *
from keras.models import Model

from sklearn.metrics import f1_score 
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [65]:
DIR  = "data/"
SEED = 1881

if not os.path.isdir("models/"):
    os.makedirs("models")
    
print(os.listdir(DIR))

['dowload_data_here.txt', 'submission_format.csv', 'test_labels.csv', 'test_values.csv', 'train_labels.csv', 'train_values.csv']


In [66]:
train_x = pd.read_csv(DIR+"train_values.csv")
train_y = pd.read_csv(DIR+"train_labels.csv")
test_x  = pd.read_csv(DIR+"test_values.csv")
sub_csv = pd.read_csv(DIR+"submission_format.csv")

In [67]:
geo1 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_1_id"], test_x["geo_level_1_id"]])))
geo2 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_2_id"], test_x["geo_level_2_id"]])))
geo3 = np.array(pd.get_dummies(pd.concat([train_x["geo_level_3_id"], test_x["geo_level_3_id"]])))

In [68]:
def NET():
    inp = Input((geo3.shape[1],))
    i1 = Dense(16, name="intermediate")(inp)
    x2 = Dense(geo2.shape[1], activation='sigmoid')(i1)
    x1 = Dense(geo1.shape[1], activation='sigmoid')(i1)

    model = Model(inp, [x2,x1])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

In [None]:
# Alternate deeper model did not give better results
def NET():
    inp = Input((geo3.shape[1],))
    x = Dense(128, activation='relu')(inp)
    x = BatchNormalization()(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dense(32, activation='relu')(x)
    x = BatchNormalization()(x)
    x2 = Dense(geo2.shape[1], activation='sigmoid', name="output_2")(x)
    x1 = Dense(geo1.shape[1], activation='sigmoid', name="output_1")(x)

    model = Model(inp, [x2,x1])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model

In [69]:
model = NET()
model.fit(geo3, [geo2, geo1], batch_size=32, epochs=10, verbose=2)
model.save("geo_embed.h5")

Epoch 1/10
10859/10859 - 358s - loss: 0.0932 - dense_2_loss: 0.0172 - dense_3_loss: 0.0760 - 358s/epoch - 33ms/step
Epoch 2/10
10859/10859 - 357s - loss: 0.0072 - dense_2_loss: 0.0031 - dense_3_loss: 0.0041 - 357s/epoch - 33ms/step
Epoch 3/10
10859/10859 - 354s - loss: 0.0025 - dense_2_loss: 0.0016 - dense_3_loss: 9.3608e-04 - 354s/epoch - 33ms/step
Epoch 4/10
10859/10859 - 361s - loss: 0.0010 - dense_2_loss: 6.7001e-04 - dense_3_loss: 3.5547e-04 - 361s/epoch - 33ms/step
Epoch 5/10
10859/10859 - 358s - loss: 4.9443e-04 - dense_2_loss: 3.4887e-04 - dense_3_loss: 1.4556e-04 - 358s/epoch - 33ms/step
Epoch 6/10
10859/10859 - 353s - loss: 2.8026e-04 - dense_2_loss: 2.2095e-04 - dense_3_loss: 5.9311e-05 - 353s/epoch - 33ms/step
Epoch 7/10
10859/10859 - 359s - loss: 1.8151e-04 - dense_2_loss: 1.5691e-04 - dense_3_loss: 2.4607e-05 - 359s/epoch - 33ms/step
Epoch 8/10
10859/10859 - 356s - loss: 1.3003e-04 - dense_2_loss: 1.1958e-04 - dense_3_loss: 1.0442e-05 - 356s/epoch - 33ms/step
Epoch 9/10
1

In [70]:
model = NET()

In [71]:
from keras import backend as K

get_int_layer_output = K.function([model.layers[0].input],
                                  [model.layers[1].output])

In [72]:
out = []
for dat in geo3[:260601]:
    layer_output = get_int_layer_output([np.reshape(dat, (1, geo3.shape[1]))])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

train_data = pd.get_dummies(train_x.copy())
train_data = train_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
train_data = train_data.assign(geo_feat1=out[:,0],
                               geo_feat2=out[:,1],
                               geo_feat3=out[:,2],  
                               geo_feat4=out[:,3],
                               geo_feat5=out[:,4],    
                               geo_feat6=out[:,5],
                               geo_feat7=out[:,6],
                               geo_feat8=out[:,7],
                               geo_feat9=out[:,8],
                               geo_feat10=out[:,9],
                               geo_feat11=out[:,10],
                               geo_feat12=out[:,11],
                               geo_feat13=out[:,12],
                               geo_feat14=out[:,13],
                               geo_feat15=out[:,14],           
                               geo_feat16=out[:,15])

In [74]:
out = []
for dat in geo3[260601:]:
    layer_output = get_int_layer_output([np.reshape(dat, (1, geo3.shape[1]))])[0]
    out.append(layer_output)

out = np.array(out)
out = np.squeeze(out)

test_data = pd.get_dummies(test_x.copy())
test_data = test_data.drop(['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'], axis=1)
test_data = test_data.assign(geo_feat1=out[:,0],
                            geo_feat2=out[:,1],
                            geo_feat3=out[:,2],  
                            geo_feat4=out[:,3],
                            geo_feat5=out[:,4],    
                            geo_feat6=out[:,5],
                            geo_feat7=out[:,6],
                            geo_feat8=out[:,7],
                            geo_feat9=out[:,8],
                            geo_feat10=out[:,9],
                            geo_feat11=out[:,10],
                            geo_feat12=out[:,11],
                            geo_feat13=out[:,12],
                            geo_feat14=out[:,13],
                            geo_feat15=out[:,14],           
                            geo_feat16=out[:,15])

In [75]:
def threshold_arr(array):
    new_arr = []
    for ix, val in enumerate(array):
        loc = np.array(val).argmax(axis=0)
        k = list(np.zeros((len(val))))
        k[loc]=1
        new_arr.append(k)
        
    return np.array(new_arr)

In [76]:
y = np.array(train_y["damage_grade"])-1

df = train_data.drop(["building_id"], axis=1)
x = np.array(df)

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for ix, (train_index, test_index) in enumerate(kf.split(x, y)):
    lgb_params = {
        "objective" : "multiclass",
        "num_class":3,
        "metric" : "multi_error",
        "boosting": 'gbdt',
        "max_depth" : -1,
        "num_leaves" : 30,
        "learning_rate" : 0.1,
        "feature_fraction" : 0.5,
        "min_sum_hessian_in_leaf" : 0.1,
        "max_bin":8192,
        "verbosity" : 1,
        "num_threads":6,
        "seed": SEED
    }

    x_train, x_val, y_train, y_val= x[train_index], x[test_index], y[train_index], y[test_index]

    train_data = lgb.Dataset(x_train, label=y_train)
    val_data   = lgb.Dataset(x_val, label=y_val)

    lgb_clf = lgb.train(lgb_params,
                        train_data,
                        20000,
                        valid_sets = [val_data],
                        early_stopping_rounds=3000,
                        verbose_eval = 1000)

    y_pred = lgb_clf.predict(x_val)
    print("F1-MICRO SCORE: ", f1_score(np.array(pd.get_dummies(y_val)), threshold_arr(y_pred), average='micro'))
    lgb_clf.save_model(f'models/model{ix}.txt')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122908
[LightGBM] [Info] Number of data points in the train set: 208480, number of used features: 79
[LightGBM] [Info] Start training from score -2.339173
[LightGBM] [Info] Start training from score -0.564028
[LightGBM] [Info] Start training from score -1.094582
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.25696
[2000]	valid_0's multi_error: 0.253909
[3000]	valid_0's multi_error: 0.254485
[4000]	valid_0's multi_error: 0.255406
[5000]	valid_0's multi_error: 0.257497
Early stopping, best iteration is:
[2091]	valid_0's multi_error: 0.253353
F1-MICRO SCORE:  0.7466472247270773




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122803
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 80
[LightGBM] [Info] Start training from score -2.339128
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094586
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.259094
[2000]	valid_0's multi_error: 0.255814
[3000]	valid_0's multi_error: 0.255065
[4000]	valid_0's multi_error: 0.256216
[5000]	valid_0's multi_error: 0.258001
[6000]	valid_0's multi_error: 0.259823
Early stopping, best iteration is:
[3182]	valid_0's multi_error: 0.254375
F1-MICRO SCORE:  0.7456254796623177




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122967
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 79
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564024
[LightGBM] [Info] Start training from score -1.094586
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.255468
[2000]	valid_0's multi_error: 0.252379
[3000]	valid_0's multi_error: 0.253837
[4000]	valid_0's multi_error: 0.25447
Early stopping, best iteration is:
[1361]	valid_0's multi_error: 0.251727
F1-MICRO SCORE:  0.7482732156561781




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122909
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 80
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094572
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.256581
[2000]	valid_0's multi_error: 0.253454
[3000]	valid_0's multi_error: 0.253185
[4000]	valid_0's multi_error: 0.254624
[5000]	valid_0's multi_error: 0.255602
Early stopping, best iteration is:
[2735]	valid_0's multi_error: 0.252456
F1-MICRO SCORE:  0.747544128933231




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122953
[LightGBM] [Info] Number of data points in the train set: 208481, number of used features: 79
[LightGBM] [Info] Start training from score -2.339178
[LightGBM] [Info] Start training from score -0.564032
[LightGBM] [Info] Start training from score -1.094572
Training until validation scores don't improve for 3000 rounds
[1000]	valid_0's multi_error: 0.257118
[2000]	valid_0's multi_error: 0.253665
[3000]	valid_0's multi_error: 0.254029
[4000]	valid_0's multi_error: 0.255142
[5000]	valid_0's multi_error: 0.257022
Early stopping, best iteration is:
[2815]	valid_0's multi_error: 0.253089
F1-MICRO SCORE:  0.7469109746738296


In [None]:
# Load all LightGB Models and concatenate.
models = []
for i in range(5):
    model = lgb.Booster(model_file=f'models/model{i}.txt')

    y_pred = model.predict(x)
    score  = f1_score(np.array(pd.get_dummies(y)), threshold_arr(y_pred), average='micro')
    print("F1-MICRO SCORE: ", score)
    models.append(model)

F1-MICRO SCORE:  0.8178978591793585
F1-MICRO SCORE:  0.8413091277470156
F1-MICRO SCORE:  0.7971535028645323
F1-MICRO SCORE:  0.8322953480608286
F1-MICRO SCORE:  0.8345862064995914


In [None]:
def ensemble(models, x):
    # Ensemble K-Fold CV models with adding all confidence score by class.
    y_preds = []
    
    for model in models:
        y_pred = model.predict(x)
        y_preds.append(y_pred)
        
    init_y_pred = y_preds[0]
    for ypred in y_preds[1:]:
        init_y_pred += ypred
        
    y_pred = threshold_arr(init_y_pred)
    
    return y_pred

In [None]:
test_x.shape

(86868, 39)

In [None]:
df = test_data.drop(["building_id"], axis=1)
x = np.array(df)

In [None]:
y_pred = ensemble(models, x)
y_pred = y_pred.argmax(axis=1)+1
sub_csv["damage_grade"] = y_pred
sub_csv.to_csv("submission.csv", index=False)