GBDT models seem to work well in this competition, so why not ensemble them? The models are combined with a rank ensemble, which typically boosts the AUC score.

This notebook is heavily based on 

- https://www.kaggle.com/lgreig/simple-lgbm-baseline
- https://www.kaggle.com/jsylas/riiid-lgbm-starter

Please upvote these notebooks too.

# Config

In [None]:
# Global settings shared by every cell below.
# SEED seeds the models, EARLY_STOP is their early-stopping patience and
# VERBOSE is the logging period handed to the training calls.
SEED, EARLY_STOP, VERBOSE = 42, 40, 1000
# Rows of the prepared train frame before this position are not used for modelling.
START_IDX = 80_000_000
# Fraction of the modelling rows held out for validation.
TEST_SIZE = 0.2
# Ensemble weights, in this order: LGB, XGB, CatB
WEIGHTS = [0.8, 0.1, 0.1]

# Preprocess

In [None]:
# Most of this code is adapted from the kernels linked at the top of the notebook
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.model_selection import train_test_split
import operator

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('fivethirtyeight')

import riiideducation
import dask.dataframe as dd
import  pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Competition environment; later cells pull the test batches from it.
env = riiideducation.make_env()

# Read only the columns that are needed, with compact dtypes.
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=[1, 2, 3, 4, 7, 8, 9],
    dtype={
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean',
    },
)

# Keep the rows whose content_type_id is 0 and order them by timestamp.
train = train[train.content_type_id == 0].sort_values('timestamp')

# Both helper columns have served their purpose.
train = train.drop(columns=['timestamp', 'content_type_id'])

# Mean correctness per content_id.
results_c = train.groupby('content_id')['answered_correctly'].agg(['mean'])
results_c.columns = ["answered_correctly_content"]

# Mean correctness and number of correct answers per user_id.
results_u = train.groupby('user_id')['answered_correctly'].agg(['mean', 'sum'])
results_u.columns = ["answered_correctly_user", 'sum']

In [None]:
# Read the question metadata (columns 0, 1, 3 and 4 of questions.csv).
# NOTE(review): bundle_id was previously read as int8, which cannot hold ids
# above 127. It is assumed to span the same id range as question_id and is
# therefore read as int16 as well -- confirm against the data.
questions_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 1, 3, 4],
    dtype={
        'question_id': 'int16',
        'part': 'int8',
        'bundle_id': 'int16',
        'tags': 'str',
    },
)

# Split the space-separated tag string into one column per tag.
tag = questions_df["tags"].str.split(" ", n=10, expand=True)
# Name the columns tags1, tags2, ... for however many the split produced,
# instead of assuming there are exactly six.
tag.columns = [f'tags{i}' for i in range(1, tag.shape[1] + 1)]
# Tags become numbers; anything unparsable or missing becomes NaN.
tag = tag.apply(pd.to_numeric, errors='coerce')

questions_df = pd.concat([questions_df, tag], axis=1)

In [None]:
# Model only on the rows of `train` from START_IDX onwards.
# Take an explicit copy: filling a column in place on a slice of `train` is a
# chained assignment, which is not guaranteed to write to X (and is a no-op
# under pandas copy-on-write) and may also modify `train` itself.
X = train.iloc[START_IDX:, :].copy()
X['prior_question_had_explanation'] = X['prior_question_had_explanation'].fillna(False)

# Attach the per-user and per-content aggregates and the question metadata.
X = pd.merge(X, results_u, on=['user_id'], how="left")
X = pd.merge(X, results_c, on=['content_id'], how="left")
X = pd.merge(X, questions_df, left_on='content_id', right_on='question_id', how='left')

# Drop rows without a real label, then order by user.
X = X[X.answered_correctly != -1]
X = X.sort_values(['user_id'])

# Separate the target (kept as a one-column frame) from the features.
Y = X[["answered_correctly"]]
X = X.drop(["answered_correctly"], axis=1)

In [None]:
# Report how many records and users remain in X compared with the full train frame.
print(
    'Training data shrinked from {:,} records ({:,} users) to {:,} records ({:,} users).'.format(
        len(train),
        train['user_id'].nunique(),
        len(X),
        X['user_id'].nunique(),
    )
)

# Train test split

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the explanation flag; the fitted encoder stays available as lb_make.
lb_make = LabelEncoder()
lb_make.fit(X["prior_question_had_explanation"])
X["prior_question_had_explanation_enc"] = lb_make.transform(X["prior_question_had_explanation"])

# Keep only the model features, then replace the remaining gaps with 0.5.
X = X.loc[:, [
    'answered_correctly_user', 'answered_correctly_content', 'sum',
    'bundle_id', 'part', 'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
    'tags1', 'tags2', 'tags3',
]].fillna(0.5)

# Unshuffled hold-out: the last TEST_SIZE share of the rows becomes the validation set.
Xt, Xv, Yt, Yv = train_test_split(
    X, Y,
    test_size=TEST_SIZE,
    random_state=SEED,
    shuffle=False,
)

# Models

In [None]:
# =============================
# LGB
# =============================
lgb_params = dict(
    n_estimators=24000,
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    max_depth=7,
    learning_rate=0.08,
    subsample=0.72,
    subsample_freq=4,
    feature_fraction=0.7,
    lambda_l1=1,
    lambda_l2=1,
    seed=SEED,
    early_stopping_rounds=EARLY_STOP,
)


def fit_lgb(params, Xt, Yt, Xv, Yv):
    """Train a LightGBM booster and print its validation AUC.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        Xt, Yt: training features and labels.
        Xv, Yv: validation features and labels.

    Returns:
        Tuple ``(model, fi)``: the trained booster and a DataFrame with one
        row per feature (columns ``features`` and gain-based ``importance``).
    """
    train_set = lgb.Dataset(Xt, Yt)
    valid_set = lgb.Dataset(Xv, Yv)

    # Both sets are monitored during training.
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, valid_set],
        verbose_eval=VERBOSE,
    )

    # Validation AUC from the booster's predictions on Xv.
    auc = roc_auc_score(Yv, booster.predict(Xv))
    print(f"AUC = {auc}")

    importance_table = pd.DataFrame({
        'features': Xt.columns.values.tolist(),
        'importance': booster.feature_importance(importance_type="gain"),
    })
    return booster, importance_table

In [None]:
# =============================
# XGB 
# =============================
xgb_params = {
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.08,
    'max_depth': 7,
    'subsample': 1,
    'min_child_weight': 4,
    'gamma': 0.24,
    'alpha': 0,
    'lambda': 1,
    'seed': SEED,
    'n_estimators': 24000
}


def fit_xgb(params, Xt, Yt, Xv, Yv):
    """Train an XGBoost classifier and print its validation AUC.

    Args:
        params: keyword arguments for ``xgb.XGBClassifier``.
        Xt, Yt: training features and labels.
        Xv, Yv: validation features and labels (also used for early stopping).

    Returns:
        Tuple ``(model, fi)``: the fitted classifier and a DataFrame with one
        row per feature (columns ``features`` and ``importance``). The
        importance is the feature's share of the total gain, and 0 for
        features the booster never split on.
    """
    # model
    model = xgb.XGBClassifier(**params)
    model.fit(Xt, Yt, eval_set=[(Xv, Yv)], early_stopping_rounds=EARLY_STOP, verbose=VERBOSE)

    # predict: AUC needs a continuous score. ``predict`` returns hard 0/1
    # labels, which collapses the ranking, so use the positive-class probability.
    val_pred = model.predict_proba(Xv)[:, 1]

    # CV score
    score = roc_auc_score(Yv, val_pred)
    print(f"AUC = {score}")

    # feature importance, normalised to sum to 1 over the features that were used
    gain = model.get_booster().get_score(importance_type='gain')
    total_gain = sum(gain.values())
    fi = pd.DataFrame({'features': Xt.columns.values.tolist()})
    if total_gain:
        # Features absent from `gain` were never used by the booster.
        fi['importance'] = fi['features'].map(gain).fillna(0.0) / total_gain
    else:
        fi['importance'] = 0.0

    return model, fi

In [None]:
# =============================
# catb 
# =============================
catb_params = { 
    'task_type': "CPU",
    'learning_rate': 0.08, 
    'iterations': 24000,
    'colsample_bylevel': 0.7,
    'random_seed': SEED,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'use_best_model': True,
    'early_stopping_rounds': EARLY_STOP
}


def fit_catb(params, Xt, Yt, Xv, Yv):
    """Train a CatBoost classifier and print its validation AUC.

    Args:
        params: keyword arguments for ``CatBoostClassifier``.
        Xt, Yt: training features and labels.
        Xv, Yv: validation features and labels (passed as ``eval_set``).

    Returns:
        Tuple ``(model, fi)``: the fitted classifier and a DataFrame with one
        row per feature (columns ``features`` and ``importance``).
    """
    # model
    model = CatBoostClassifier(**params)
    model.fit(Xt, Yt, eval_set=(Xv, Yv), verbose=VERBOSE)

    # feature importance
    fi = pd.DataFrame()
    fi['features'] = Xt.columns.values.tolist()
    fi['importance'] = model.get_feature_importance()

    # predict: AUC needs a continuous score. ``predict`` returns class labels
    # here, so use the positive-class probability instead.
    val_pred = model.predict_proba(Xv)[:, 1]

    # CV score
    score = roc_auc_score(Yv, val_pred)
    print(f"AUC = {score}")

    return model, fi

# Model fitting

## LGB

In [None]:
# Train LightGBM on the training split, validating on the hold-out split.
lgb_model, fi = fit_lgb(params=lgb_params, Xt=Xt, Yt=Yt, Xv=Xv, Yv=Yv)

In [None]:
# Horizontal bars, most important feature first.
sns.barplot(
    data=fi.sort_values('importance', ascending=False),
    x='importance',
    y='features',
)

## XGB

In [None]:
# Train XGBoost on the training split, validating on the hold-out split.
xgb_model, fi = fit_xgb(params=xgb_params, Xt=Xt, Yt=Yt, Xv=Xv, Yv=Yv)

In [None]:
# Horizontal bars, most important feature first.
sns.barplot(
    data=fi.sort_values('importance', ascending=False),
    x='importance',
    y='features',
)

## CatBoost

In [None]:
# Train CatBoost on the training split, validating on the hold-out split.
catb_model, fi = fit_catb(params=catb_params, Xt=Xt, Yt=Yt, Xv=Xv, Yv=Yv)

In [None]:
# Horizontal bars, most important feature first.
sns.barplot(
    data=fi.sort_values('importance', ascending=False),
    x='importance',
    y='features',
)

OK, so why does LGB outperform the other two?

# Prediction

In [None]:
# rank ensemble for better AUC score
# The columns are created in the same order as WEIGHTS (LGB, XGB, CatB) so
# that each weight is applied to the model it was meant for.
# XGBoost and CatBoost contribute their positive-class probability: `predict`
# on those classifiers returns hard class labels, which cannot be ranked.
preds_df = pd.DataFrame()
preds_df['lgb'] = lgb_model.predict(Xv)
preds_df['xgb'] = xgb_model.predict_proba(Xv)[:, 1]
preds_df['catb'] = catb_model.predict_proba(Xv)[:, 1]

y_true = np.array(Yv)
val_pred = np.zeros(len(y_true))
for weight, name in zip(WEIGHTS, preds_df.columns.values.tolist()):
    # Percentile rank puts every model's scores on a common scale before blending.
    preds_df[name] = preds_df[name].rank(pct=True)
    val_pred += weight * preds_df[name].values

print('Validation score (AUC) = {}'.format(roc_auc_score(y_true, val_pred)))

In [None]:
# Dry run of the prediction path on the example test file.
test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')

# Encode the explanation flag directly as False -> 0 / True -> 1, treating
# missing as False as the training cell does. Refitting the LabelEncoder on
# test data would make the codes depend on which values (including NaN)
# happen to be present.
# NOTE(review): this equals the training encoding provided the training data
# contained both False and True -- confirm.
test["prior_question_had_explanation_enc"] = (
    test["prior_question_had_explanation"].fillna(False).astype(bool).astype('int8')
)
test = pd.merge(test, results_u, on=['user_id'], how="left")
test = pd.merge(test, results_c, on=['content_id'], how="left")
test = pd.merge(test, questions_df, left_on='content_id', right_on='question_id', how='left')
test.fillna(0.5, inplace=True)

# rank ensemble for better AUC score
test_ = test[['answered_correctly_user', 'answered_correctly_content', 'sum',
       'bundle_id','part','prior_question_elapsed_time','prior_question_had_explanation_enc',
       'tags1','tags2','tags3']]
# Columns follow the order of WEIGHTS (LGB, XGB, CatB). XGBoost and CatBoost
# contribute probabilities, because `predict` on those classifiers returns
# hard class labels.
preds_df = pd.DataFrame()
preds_df['lgb'] = lgb_model.predict(test_)
preds_df['xgb'] = xgb_model.predict_proba(test_)[:, 1]
preds_df['catb'] = catb_model.predict_proba(test_)[:, 1]

y_pred = np.zeros(test_.shape[0])
for weight, name in zip(WEIGHTS, preds_df.columns.values.tolist()):
    preds_df[name] = preds_df[name].rank(pct=True)
    y_pred += weight * preds_df[name].values

test['answered_correctly'] = pd.Series(y_pred).rank(pct=True)

# Per-content and per-user aggregates, computed from `train` with the same
# expressions as in the preprocessing cell.
results_c = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean'])
results_c.columns = ["answered_correctly_content"]

results_u = train[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'sum'])
results_u.columns = ["answered_correctly_user", 'sum']

In [None]:
# Feature columns, in the order used to train the models.
feature_cols = ['answered_correctly_user', 'answered_correctly_content', 'sum',
                'bundle_id', 'part', 'prior_question_elapsed_time',
                'prior_question_had_explanation_enc',
                'tags1', 'tags2', 'tags3']

iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.merge(test_df, results_u, on=['user_id'], how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'], how="left")
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')

    # Users / contents with no aggregate get neutral defaults. Assign the
    # result back instead of filling in place on a column selection, which is
    # a chained assignment and not guaranteed to update test_df.
    test_df['answered_correctly_user'] = test_df['answered_correctly_user'].fillna(0.5)
    test_df['answered_correctly_content'] = test_df['answered_correctly_content'].fillna(0.5)
    test_df['sum'] = test_df['sum'].fillna(0)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)

    # Fixed False -> 0 / True -> 1 encoding. Refitting the LabelEncoder on
    # every batch would map True to 0 whenever a batch contains no False rows.
    # NOTE(review): this equals the training encoding provided the training
    # data contained both False and True -- confirm.
    test_df["prior_question_had_explanation_enc"] = (
        test_df['prior_question_had_explanation'].astype(bool).astype('int8')
    )

    # The training features had their remaining gaps filled with 0.5; apply
    # the same fill here so the models see the same representation.
    test_ = test_df[feature_cols].fillna(0.5)

    # One score vector per model, in the order of WEIGHTS (LGB, XGB, CatB).
    # XGBoost and CatBoost contribute their positive-class probability,
    # because `predict` on those classifiers returns hard class labels.
    model_preds = [
        lgb_model.predict(test_),
        xgb_model.predict_proba(test_)[:, 1],
        catb_model.predict_proba(test_)[:, 1],
    ]

    # Blend the probabilities directly. A percentile rank computed inside one
    # batch is not comparable with the ranks of any other batch, so submitting
    # per-batch ranks would distort the ordering across batches.
    y_pred = np.zeros(test_.shape[0])
    for weight, pred in zip(WEIGHTS, model_preds):
        y_pred += weight * pred

    test_df['answered_correctly'] = y_pred
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])