# Part 2 - Model training

### Train and Validation

First we want to encode our labels to be between [0, n-1]

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

y = train_final_gd['target'].to_array()

classes = sorted(np.unique(y))

# Build classes with labels from [0, n-1]
lbl = LabelEncoder()
y = lbl.fit_transform(y)

Set our class weights and build a multi-weighted cross-entropy loss function to train our XGBoost model

In [44]:
from functools import partial

# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
class_weights = {c: 1 for c in classes}
class_weights.update({c:2 for c in [64, 15]})

func_loss = partial(xgb_multi_weighted_logloss, 
                        classes=classes, 
                        class_weights=class_weights)

Preprocess our columns to fill `nan` values with zeros

In [45]:
cols = [i for i in test_final_gd.columns if i not in ['object_id','target']]
for col in cols:
    train_final_gd[col] = train_final_gd[col].fillna(0).astype('float32')

for col in cols:
    test_final_gd[col] = test_final_gd[col].fillna(0).astype('float32')

Perform a stratified split of our training dataset into 90% training and 10% validation datasets. 

In [46]:
X = train_final_gd[cols].as_matrix()
Xt = test_final_gd[cols].as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)

Set the parameters for XGBoost to build our ensemble of trees   

In [47]:
gpu_params = {
            'objective': 'multi:softprob', 
            'tree_method': 'hist', 
            'nthread': 16, 
            'num_class':14,
            'max_depth': 7, 
            'silent':1,
            'subsample':0.7,
            'colsample_bytree': 0.7,
            "objective": "multi:softprob",
            "tree_method": "gpu_hist"
}

Build DMatrix objects with our train, validation, and test datasets 

In [48]:
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_test, label=y_test)
dtest = xgb.DMatrix(data=Xt)

Train our XGBoost model

In [49]:
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

clf = xgb.train(gpu_params, 
                dtrain=dtrain,
                num_boost_round=60,
                evals=watchlist,
                feval=func_loss,
                early_stopping_rounds=10,
                verbose_eval=1000)

yp = clf.predict(dvalid)

gpu_loss = multi_weighted_logloss(y_test, yp, classes, class_weights)

ysub = clf.predict(dtest)

line = 'validation loss %.4f'%gpu_loss
print(colored(line,'green'))

[0]	eval-merror:0.341401	train-merror:0.275096	eval-wloss:2.05758	train-wloss:1.88726
Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.

Will train until train-wloss hasn't improved in 10 rounds.
[59]	eval-merror:0.275159	train-merror:0.000849	eval-wloss:1.36472	train-wloss:0.089109
[32mvalidation loss 1.3647[0m
