# (Top 9th) LightGBM & XGBoost Ensemble Modeling
## This is a LightGBM and XGBoost ensemble modeling notebook. This model reached 9th place. I would appreciate an upvote!
## I also shared a [Basic and Informative EDA Notebook](https://www.kaggle.com/werooring/basic-eda-for-everyone). It is easy enough that even beginners can follow it.
- [reference notebook](https://www.kaggle.com/xiaozhouwang/2nd-place-lightgbm-solution)

In [None]:
import numpy as np
import pandas as pd

# Directory holding the competition CSV files
data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

# Read the three CSVs the same way, using the 'id' column as the index
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Stack train and test rows into one frame (fresh 0..n-1 index) so feature
# engineering is applied to both at once; the target column is not a feature
all_data = pd.concat([train, test], ignore_index=True).drop(columns='target')

# Names of the original features, captured before any engineered columns are added
all_features = list(all_data.columns)

## Feature Engineering

### Add the number of missing values per row as a new feature

In [None]:
# New feature: per-row count of cells equal to -1 (the value counted as "missing" here)
all_data['num_missing'] = all_data.eq(-1).sum(axis=1)

# Keep the original features whose name contains neither 'cat' (nominal)
# nor 'calc', then add the new num_missing feature to the list
remaining_features = [
    col for col in all_features
    if not ('cat' in col or 'calc' in col)
] + ['num_missing']

### Apply One-Hot Encoding to nominal features

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Nominal features are the ones with 'cat' in their name
cat_features = [name for name in all_features if 'cat' in name]

# Fit the encoder on the nominal columns of train+test and encode them in one step
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data.loc[:, cat_features])

### Create a new feature `mix_ind` that combines the values of all `ind` features

In [None]:
# Features with 'ind' in their name
ind_features = [name for name in all_features if 'ind' in name]

# Concatenate the string form of every ind feature, each followed by '_',
# into one combined key per row ('v1_v2_..._vn_'). The key is accumulated in
# a local Series and attached to all_data once at the end.
mix_ind = None
for col in ind_features:
    token = all_data[col].astype(str)+'_'
    mix_ind = token if mix_ind is None else mix_ind + token

if mix_ind is not None:
    all_data['mix_ind'] = mix_ind

### Create new features: the occurrence count of each value of the nominal features

In [None]:
# For every nominal feature (and the combined 'mix_ind' key) add a
# '<feature>_count' column holding how many rows of all_data share the
# row's value, and remember the names of the new columns.
cat_count_features = []
for col in cat_features+['mix_ind']:
    # value -> number of rows having that value
    val_counts_dic = all_data[col].value_counts().to_dict()
    # Series.map performs the per-row lookup in vectorized form instead of
    # calling a Python lambda once per row
    all_data[f'{col}_count'] = all_data[col].map(val_counts_dic)
    cat_count_features.append(f'{col}_count')

### Final dataset

In [None]:
from scipy import sparse

# Features left out of the final dataset
drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

# Kept original features plus the count features, minus drop_features
selected_features = remaining_features + cat_count_features
all_data_remaining = all_data[selected_features].drop(columns=drop_features)

# Place the one-hot encoded nominal features to the right of the remaining
# features and keep the result in CSR format
all_data_sprs = sparse.hstack(
    [sparse.csr_matrix(all_data_remaining), encoded_cat_matrix],
    format='csr',
)

In [None]:
# Number of training rows; all_data was built as train rows followed by test rows
num_train = len(train)

# Undo the concatenation: the first num_train rows are train, the rest are test
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]

# Target values of the training rows
y = train['target'].values

## Evaluation Metrics

In [None]:
def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The score is the Gini coefficient obtained when ``y_true`` is ordered by
    ``y_pred``, divided by the Gini coefficient obtained when ``y_true`` is
    ordered by itself (a perfect ranking). ``y_pred`` is used only through
    its sort order, so its scale does not matter.

    Args:
        y_true: 1-D array-like of actual target values.
        y_pred: array-like of predicted scores, same shape as ``y_true``.

    Returns:
        ``G_pred / G_true`` as a NumPy float.

    Raises:
        AssertionError: if the two inputs do not have the same shape.
    """
    # Coerce to ndarrays so that plain lists work and pandas Series are
    # indexed by position rather than by index label below
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Actual and predicted values must have the same size
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]  # Number of data points
    L_mid = np.linspace(1 / n_samples, 1, n_samples)  # Diagonal values

    # 1) Gini coefficient for the predicted values
    pred_order = y_true[y_pred.argsort()]  # y_true ordered by ascending y_pred
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)  # Lorenz curve
    G_pred = np.sum(L_mid - L_pred)

    # 2) Gini coefficient when the prediction is perfect
    true_order = np.sort(y_true)  # y_true ordered by itself
    L_true = np.cumsum(true_order) / np.sum(true_order)  # Lorenz curve
    G_true = np.sum(L_mid - L_true)

    # Normalized Gini coefficient
    return G_pred / G_true

In [None]:
def gini_lgb(preds, dtrain):
    """Evaluation callback in the form passed as ``feval`` to ``lgbm.train``.

    Args:
        preds: predicted values for the rows of ``dtrain``.
        dtrain: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``('gini', score, True)``: the metric name, the normalized Gini
        coefficient, and a flag saying that higher values are better.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score, True

In [None]:
def gini_xgb(preds, dtrain):
    """Evaluation callback in the form passed as ``feval`` to ``xgb.train``.

    Args:
        preds: predicted values for the rows of ``dtrain``.
        dtrain: data matrix object exposing ``get_label()``.

    Returns:
        Tuple ``('gini', score)``: the metric name and the normalized Gini
        coefficient.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return 'gini', score

## Modeling

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold cross-validator; rows are shuffled with a fixed seed
folds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1991,
)

### LightGBM

In [None]:
# Parameter dictionary handed to lgbm.train
# NOTE(review): the long decimal values look like the output of a
# hyper-parameter search that is not part of this notebook — confirm
max_params_lgb = {
    'bagging_fraction': 0.8043696643500143,
    'feature_fraction': 0.6829323879981047,
    'lambda_l1': 0.9264555612104627,
    'lambda_l2': 0.9774233689434216,
    'min_child_samples': 10,
    'min_child_weight': 125.68433948868649,
    'num_leaves': 28,
    'objective': 'binary',
    'learning_rate': 0.01,
    'bagging_freq': 1,
    'verbosity': 0,
    'random_state': 1991,
}

In [None]:
import lightgbm as lgbm

# Out-of-fold (OOF) prediction buffers:
#   oof_val_preds_lgb  - one slot per training row, filled by the fold in
#                        which that row is in the validation split
#   oof_test_preds_lgb - one slot per test row, average over the fold models
oof_val_preds_lgb = np.zeros(X.shape[0])
oof_test_preds_lgb = np.zeros(X_test.shape[0])

# Train one model per fold, then predict the test rows and the held-out rows
# NOTE(review): early_stopping_rounds / verbose_eval as keyword arguments of
# lgbm.train presumably require lightgbm < 4 — confirm the installed version
# NOTE(review): max_params_lgb sets no 'metric', so LightGBM's default metric
# for the objective is presumably evaluated next to 'gini' — confirm which
# metric actually triggers early stopping
for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    fold_no = idx + 1

    # Banner separating the log output of each fold
    print('#'*40, f'Fold {fold_no} out of {folds.n_splits}', '#'*40)

    # Rows of this fold's training and validation splits
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    # Wrap both splits as LightGBM datasets
    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid)

    # Train with gini_lgb evaluated on the validation split; up to 1500
    # boosting rounds, early stopping set to 150 rounds, log every 100 rounds
    lgb_model = lgbm.train(
        params=max_params_lgb,
        train_set=dtrain,
        num_boost_round=1500,
        valid_sets=dvalid,
        feval=gini_lgb,
        early_stopping_rounds=150,
        verbose_eval=100,
    )

    # Iteration at which the model performed best
    best_iter = lgb_model.best_iteration

    # Each fold contributes 1/n_splits of the final test prediction
    test_fold_preds = lgb_model.predict(X_test, num_iteration=best_iter)
    oof_test_preds_lgb += test_fold_preds/folds.n_splits

    # Held-out predictions for this fold's validation rows
    oof_val_preds_lgb[valid_idx] += lgb_model.predict(X_valid, num_iteration=best_iter)

    # Normalized Gini coefficient of this fold's held-out predictions
    gini_score = eval_gini(y_valid, oof_val_preds_lgb[valid_idx])
    print(f'Fold {fold_no} gini score: {gini_score}\n')

### XGBoost

In [None]:
# Parameter dictionary handed to xgb.train
# NOTE(review): the long decimal values look like the output of a
# hyper-parameter search that is not part of this notebook — confirm
max_params_xgb = {
    'colsample_bytree': 0.8927325521002059,
    'gamma': 9.766883037651555,
    'max_depth': 7,
    'min_child_weight': 6.0577898395058085,
    'reg_alpha': 8.136089122187865,
    'reg_lambda': 1.385119327658532,
    'scale_pos_weight': 1.5142072116395773,
    'subsample': 0.717425859940308,
    'objective': 'binary:logistic',
    'learning_rate': 0.05,
    'random_state': 1991,
}

In [None]:
import xgboost as xgb

# Out-of-fold (OOF) prediction buffers:
#   oof_val_preds_xgb  - one slot per training row, filled by the fold in
#                        which that row is in the validation split
#   oof_test_preds_xgb - one slot per test row, average over the fold models
oof_val_preds_xgb = np.zeros(X.shape[0])
oof_test_preds_xgb = np.zeros(X_test.shape[0])

# The test matrix is the same for every fold, so build it once up front
dtest = xgb.DMatrix(X_test)

# Train one model per fold, then predict the test rows and the held-out rows
# NOTE(review): feval / ntree_limit / best_ntree_limit presumably require
# xgboost < 2.0 — confirm the installed version
for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    # Banner separating the log output of each fold
    print('#'*40, f'Fold {idx+1} out of {folds.n_splits}', '#'*40)

    # Rows of this fold's training and validation splits
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    # Wrap both splits as XGBoost DMatrix objects
    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)

    # Both splits are evaluated during training, with 'valid' listed last
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Train with gini_xgb as evaluation function (higher is better); up to
    # 1000 boosting rounds, early stopping set to 150 rounds, log every 100
    xgb_model = xgb.train(params=max_params_xgb,
                          dtrain=dtrain,
                          num_boost_round=1000,
                          evals=watchlist,
                          maximize=True,
                          feval=gini_xgb,
                          early_stopping_rounds=150,
                          verbose_eval=100)

    # Boosting iteration (zero-based) at which the model performed best
    best_iter = xgb_model.best_iteration
    # ntree_limit expects a number of trees, not a zero-based round index:
    # passing best_iteration would drop the best round (and 0 would mean
    # "use all trees"), so use the tree count XGBoost records for it
    best_ntree_limit = xgb_model.best_ntree_limit

    # Each fold contributes 1/n_splits of the final test prediction
    oof_test_preds_xgb += xgb_model.predict(dtest,
                                            ntree_limit=best_ntree_limit)/folds.n_splits
    # Held-out predictions for this fold's validation rows
    oof_val_preds_xgb[valid_idx] += xgb_model.predict(dvalid, ntree_limit=best_ntree_limit)

    # Normalized Gini coefficient of this fold's held-out predictions
    gini_score = eval_gini(y_valid, oof_val_preds_xgb[valid_idx])
    print(f'Fold {idx+1} gini score: {gini_score}\n')

### OOF Valid Gini Score

In [None]:
# Normalized Gini coefficient of the LightGBM OOF predictions over all training rows
lgb_oof_gini = eval_gini(y, oof_val_preds_lgb)
print('LightGBM OOF Gini Score:', lgb_oof_gini)

In [None]:
# Normalized Gini coefficient of the XGBoost OOF predictions over all training rows
xgb_oof_gini = eval_gini(y, oof_val_preds_xgb)
print('XGBoost OOF Gini Score:', xgb_oof_gini)

## Ensemble and Submission

In [None]:
# Blend the fold-averaged test predictions: 60% LightGBM, 40% XGBoost
lgb_weight, xgb_weight = 0.6, 0.4
oof_test_preds = oof_test_preds_lgb * lgb_weight + oof_test_preds_xgb * xgb_weight

# Store the blended predictions as the target column and write the submission file
submission['target'] = oof_test_preds
submission.to_csv('submission.csv')