This notebook is an ensemble of three simple models: LightGBM, XGBoost, and CatBoost.
Almost all parameters in these models are default values.
To save memory, only the first 30M rows of train.csv are read. Update: 30M → 80M rows.

Explanation of unique features
*     'user_lot' : Length of time per user (the span between the user's first and last timestamps). In fact, this feature may not be important.
*     'incorrect_answer_var' : The variance of how often each incorrect choice is selected for a question. Indicates whether the question is prone to misleading answers.
*     'user_part_lec' : The number of lectures the user has taken on that part.
*     'all_tag_mean' : Mean correct-answer rate over the tags included in the question.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import datatable as dt
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import riiideducation
import random
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

In [None]:
def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of ``df`` to the smallest dtype that can hold their values.

    The columns are reassigned on ``df`` itself and the same object is returned.
    Memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be narrowed.
    use_float16 : bool, default False
        When True, float columns whose value range fits are stored as float16;
        otherwise float32 is the smallest float dtype used.

    Returns
    -------
    pandas.DataFrame
        ``df`` (the same object that was passed in).

    Notes
    -----
    * object / string columns become ``category``.
    * signed integers become the smallest of int8/int16/int32/int64 whose
      (inclusive) range contains the column's min and max.
    * unsigned integers become the smallest of uint8/uint16/uint32/uint64.
    * floats are chosen by value range only, so narrowing can lose precision.
    * datetime, categorical, bool and any other dtype are left untouched.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_datetime64_any_dtype(col_type) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype("category")
            continue

        # Remaining extension dtypes (nullable ints/booleans, ...) cannot be
        # safely cast to plain numpy dtypes when they hold missing values.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in "iu":
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN here; every comparison is then False
            # and the column keeps its current dtype.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == "f":
            c_min = df[col].min()
            c_max = df[col].max()
            half = np.finfo(np.float16)
            single = np.finfo(np.float32)
            # All-NaN or infinite columns fail both range checks and end up float64.
            if use_float16 and c_min > half.min and c_max < half.max:
                df[col] = df[col].astype(np.float16)
            elif c_min > single.min and c_max < single.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # bool, timedelta, complex, ...: nothing to gain, leave as is.

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))

    return df

In [None]:
def qtag_mean(df,qtag):
    """
    Mean of ``answered_correctly`` per value of the ``qtag_<qtag>`` column.

    Returns a frame with two columns: ``qtag`` (the tag value cast to int16)
    and ``qtag_<qtag>_mean`` (the mean of ``answered_correctly`` for that value).
    """
    tag_col = f'qtag_{qtag}'
    per_tag = df[[tag_col, 'answered_correctly']].groupby([tag_col])
    rates = per_tag.agg(['mean']).reset_index()
    # Flatten the (column, aggregation) header into plain names.
    rates.columns = ['qtag', f'qtag_{qtag}_mean']
    return rates.astype({'qtag': 'int16'})

In [None]:
def preprocess(df,qdf,ldf):
    """
    Build the per-user / per-content / per-part / per-task / per-tag aggregate tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log. Columns read here: ``timestamp``, ``user_id``,
        ``content_id``, ``task_container_id``, ``user_answer``,
        ``answered_correctly`` and ``prior_question_had_explanation``.
        Rows with ``answered_correctly == -1`` are treated as lecture events;
        all other rows are treated as answered questions.
    qdf : pandas.DataFrame
        Question metadata joined on ``content_id``; must provide ``part`` and
        ``qtag_1`` .. ``qtag_6``.
    ldf : pandas.DataFrame
        Lecture metadata joined on ``content_id``; must provide ``part``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(user_cal, content_cal, part_cal, task_cal, user_part_lec, qtag_merge)``

        * ``user_cal``: ``user_id``, ``user_mean``, ``user_count``, ``user_std``,
          ``user_sem``, ``explanation_rate``, ``user_lot``
        * ``content_cal``: ``content_id``, ``content_mean``, ``content_count``,
          ``incorrect_answer_var``
        * ``part_cal``: ``part``, ``part_mean``
        * ``task_cal``: ``task_container_id``, ``task_count``, ``task_std``
        * ``user_part_lec``: ``part``, ``user_id``, ``user_part_lec``
        * ``qtag_merge``: ``qtag``, ``tag_mean``
    """

    # Lecture events: count how many each user has per part.
    lec = pd.merge(df[df["answered_correctly"]==-1], ldf, on=['content_id'], how="left")

    user_part_lec = lec[['part','user_id','content_id']].groupby(['part','user_id']).agg(['count']).reset_index()
    user_part_lec.columns = ['part','user_id','user_part_lec']

    del lec

    # Everything below works on answered questions only.
    df = df[df["answered_correctly"]!=-1]

    # incorrect_answer_var: for each question, count how often each wrong
    # choice was picked, scale the counts by the most-picked wrong choice, and
    # take the variance of those scaled counts.
    incorrect_answer_count = df[df["answered_correctly"]==0][['content_id','user_answer','answered_correctly']].groupby(['content_id','user_answer']).agg(['count']).reset_index()
    incorrect_answer_count.columns = ['content_id','user_answer','incorrect_answer_count']

    incorrect_answer_cmax = incorrect_answer_count[['content_id','incorrect_answer_count']].groupby(['content_id']).agg(['max']).reset_index()
    incorrect_answer_cmax.columns = ['content_id','incorrect_answer_cmax']
    incorrect_answer_count = pd.merge(incorrect_answer_count,incorrect_answer_cmax, on=['content_id'], how="left")
    incorrect_answer_count['scaled_IAC'] = incorrect_answer_count['incorrect_answer_count']/incorrect_answer_count['incorrect_answer_cmax']

    incorrect_answer_var = incorrect_answer_count[['content_id','scaled_IAC']].groupby(['content_id']).agg(['var']).reset_index()
    incorrect_answer_var.columns = ['content_id','incorrect_answer_var']
    del incorrect_answer_count,incorrect_answer_cmax
    df = df.drop(['user_answer'], axis=1)

    # Number of questions per user whose prior question had an explanation
    # (missing counts as "no"). Computed on a standalone Series so the frame is
    # never modified through a chained in-place call.
    had_explanation = df['prior_question_had_explanation'].fillna(False).astype(np.int8)
    explanation_count = had_explanation.groupby(df['user_id']).sum().reset_index()
    explanation_count.columns = ['user_id', 'explanation_count']
    del had_explanation

    # user_lot: span between each user's first and last timestamp.
    user_time = df[['timestamp','user_id']].groupby(['user_id']).agg(['min','max']).reset_index()
    user_time.columns = ['user_id', 'min' ,'max']
    user_time['user_lot'] = user_time['max'] - user_time['min']
    user_time = user_time.drop(['max', 'min'], axis=1)

    df = df.drop(['timestamp','prior_question_had_explanation'], axis=1)

    user_cal = df[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean', 'count', 'std', 'sem']).reset_index()
    user_cal.columns = ['user_id','user_mean', 'user_count', 'user_std', 'user_sem']
    user_cal = reduce_mem_usage(user_cal, use_float16=True)

    content_cal = df[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean', 'count']).reset_index()
    content_cal.columns = ['content_id','content_mean', 'content_count']
    content_cal = reduce_mem_usage(content_cal, use_float16=True)

    task_cal = df[['task_container_id','answered_correctly']].groupby(['task_container_id']).agg(['count', 'std']).reset_index()
    task_cal.columns = ['task_container_id', 'task_count', 'task_std']
    task_cal = reduce_mem_usage(task_cal, use_float16=True)

    df = df.drop(['task_container_id'], axis=1)

    # Bring in part and the qtag_1..qtag_6 columns for the remaining aggregates.
    df = pd.merge(df,qdf,on="content_id",how="left")

    part_cal = df[['part','answered_correctly']].groupby(['part']).agg(['mean']).reset_index()
    part_cal.columns = ['part','part_mean']
    part_cal = reduce_mem_usage(part_cal, use_float16=True)

    # tag_mean: for every tag id, average the correct-answer rate observed for
    # that tag across all six tag positions.
    # NOTE(review): 188 is presumably the size of the tag vocabulary (ids 0..187) - confirm.
    qtag_merge = pd.DataFrame({'qtag': range(188)})
    qtag_mean_cols = []
    for position in range(1, 7):
        qtag_merge = pd.merge(qtag_merge, qtag_mean(df, position), on="qtag", how="left")
        qtag_mean_cols.append(f'qtag_{position}_mean')
    # Select by name so that all six position columns take part in the mean.
    qtag_merge['tag_mean'] = qtag_merge[qtag_mean_cols].mean(axis=1)
    qtag_merge = qtag_merge.drop(qtag_mean_cols, axis=1)

    user_cal = pd.merge(user_cal, explanation_count, on=['user_id'], how="left")
    user_cal['explanation_rate'] = user_cal['explanation_count']/user_cal['user_count']
    user_cal = user_cal.drop(['explanation_count'], axis=1)
    user_cal = pd.merge(user_cal, user_time, on=['user_id'], how="left")
    content_cal = pd.merge(content_cal, incorrect_answer_var, on=['content_id'], how="left")

    return user_cal, content_cal, part_cal, task_cal, user_part_lec, qtag_merge

In [None]:
# Column -> dtype mapping for the training log.
# 'row_id' ('int64') and 'content_type_id' ('int8') are intentionally left out.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float16',
    prior_question_had_explanation='boolean',
)

In [None]:
# Read the first 80M rows of train.csv, keeping the columns at positions 1-3 and 5-9.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    nrows=80_000_000,
    usecols=[1, 2, 3, 5, 6, 7, 8, 9],
    dtype=dict(
        timestamp='int64',
        user_id='int32',
        content_id='int16',
        task_container_id='int16',
        user_answer='int8',
        answered_correctly='int8',
        prior_question_elapsed_time='float32',
        prior_question_had_explanation='boolean',
    ),
)

In [None]:
# Question metadata: keep columns 0, 3 and 4 and key the table on content_id.
questions = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv', usecols=[0,3,4])
    .rename(columns={'question_id': 'content_id'})
)

# Split the space-separated tag string into one column per position: qtag_1, qtag_2, ...
tag = questions["tags"].str.split(" ", n = 10, expand = True)
tag.columns = ['qtag_' + f'{position+1}' for position in tag.columns]

questions = pd.merge(questions, tag, left_index=True, right_index=True).drop(['tags'], axis=1)
del tag

questions = reduce_mem_usage(questions, use_float16=True)

In [None]:
# Lecture metadata, keyed on content_id so it can be joined to the interaction log.
lectures = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
    .rename(columns={'lecture_id': 'content_id'})
)

In [None]:
# Order the log by timestamp, then set aside each user's last 25 rows;
# `features` keeps everything that precedes them.
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop = True)
train = train_df.groupby('user_id').tail(25)
in_last_rows = train_df.index.isin(train.index)
features = train_df[~in_last_rows]
del train, in_last_rows

In [None]:
# Aggregate statistics are computed from the earlier history only.
features = features.drop(columns=['prior_question_elapsed_time'])
(
    user_cal,
    content_cal,
    part_cal,
    task_cal,
    user_part_lec,
    qtag_merge,
) = preprocess(features, questions, lectures)
del features

In [None]:
# Modelling rows: each user's last 25 rows, lecture rows removed, question metadata attached.
train = (
    train_df.groupby('user_id').tail(25)
    .loc[lambda rows: rows["answered_correctly"] != -1]
    .merge(questions, on="content_id", how="left")
)

In [None]:
# 'all_tag_mean': look up tag_mean for each of the six tag slots, then average them per row.
tag_mean_cols = []
for tag in range(1,7):
    tag_col = f'qtag_{tag}'
    train[tag_col] = pd.to_numeric(train[tag_col], errors='coerce')
    train = (
        pd.merge(train, qtag_merge, left_on=tag_col, right_on='qtag', how="left")
        .drop(columns=['qtag', tag_col])
        .rename(columns={'tag_mean': f'tag_mean{tag}'})
    )
    tag_mean_cols.append(f'tag_mean{tag}')

train['all_tag_mean'] = train[tag_mean_cols].mean(axis=1)
train = train.drop(tag_mean_cols, axis=1)

In [None]:
# Attach every aggregate table to the modelling rows (left joins keep all rows).
train = (
    train
    .merge(user_cal, on=['user_id'], how="left")
    .merge(content_cal, on=['content_id'], how="left")
    .merge(part_cal, on=['part'], how="left")
    .merge(task_cal, on=['task_container_id'], how="left")
    .merge(user_part_lec, on=['user_id','part'], how="left")
)

In [None]:
# Which columns still contain missing values before imputation?
train.isna().any()

In [None]:
# Impute missing values. Each column is reassigned explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and is
# not guaranteed to write back to `train`.
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(False).astype('int8')

# Statistics that are undefined for unseen keys -> column median.
median_filled_cols = [
    'prior_question_elapsed_time',
    'user_mean', 'user_std', 'user_sem',
    'content_mean',
    'task_std',
    'part_mean',
    'explanation_rate',
]
for col in median_filled_cols:
    train[col] = train[col].fillna(train[col].median())

# Counts and history-derived quantities -> 0 when there is no history.
zero_filled_cols = [
    'user_count', 'content_count', 'task_count',
    'user_part_lec', 'user_lot', 'incorrect_answer_var',
]
for col in zero_filled_cols:
    train[col] = train[col].fillna(0)

train['all_tag_mean'] = train['all_tag_mean'].fillna(train['all_tag_mean'].mean(skipna=True))

In [None]:
# Re-check for missing values after imputation.
train.isna().any()

In [None]:
# Model inputs, in the column order passed to all three models.
select_features = [
    # per-user statistics
    'user_mean', 'user_count', 'user_std', 'user_sem',
    # per-content statistics
    'content_mean', 'content_count',
    # per-task-container statistics
    'task_count', 'task_std',
    # per-part statistic
    'part_mean',
    # raw columns of the current row
    'prior_question_elapsed_time', 'prior_question_had_explanation',
    # engineered features
    'explanation_rate', 'user_lot', 'incorrect_answer_var',
    'user_part_lec', 'all_tag_mean',
]

# Label column.
target = 'answered_correctly'

In [None]:
# Validation set `y`: each user's last 6 modelling rows. Training set `x`: all remaining rows.
y = train.groupby('user_id').tail(6)
in_validation = train.index.isin(y.index)
x = train[~in_validation]
del train, in_validation

In [None]:
# CatBoost: fit on x, monitor AUC on y, stop after 10 rounds without improvement.
cat_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    task_type='GPU',
    grow_policy='Lossguide',
    iterations=10000,
)

cat_train = Pool(x[select_features], label=x[target])
cat_valid = Pool(y[select_features], label=y[target])

cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(
    cat_train,
    eval_set=cat_valid,
    use_best_model=True,
    early_stopping_rounds=10,
    verbose_eval=50,
)
del cat_train, cat_valid

In [None]:
# XGBoost: fit on x, monitor AUC on y, stop after 10 rounds without improvement.
xgb_params = dict(
    eval_metric='auc',
    tree_method='gpu_hist',
    n_estimators=5000,
)

xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(
    x[select_features],
    x[target],
    eval_set=[(y[select_features], y[target])],
    early_stopping_rounds=10,
    verbose=50,
)

In [None]:
# LightGBM: fit on x, report AUC on both sets, stop after 10 rounds without improvement.
lgb_params = dict(objective='binary', metric='auc')

lgb_train = lgb.Dataset(x[select_features], x[target])
lgb_eval = lgb.Dataset(y[select_features], y[target], reference=lgb_train)

lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=10,
    verbose_eval=50,
)

In [None]:
# Plot the feature importances of the trained LightGBM model.
lgb.plot_importance(lgb_model)

In [None]:
# Predict the validation set once per model and reuse the arrays for both the
# individual scores and the equally weighted blend.
cat_valid_pred = cat_model.predict_proba(y[select_features].values)[:,1]
xgb_valid_pred = xgb_model.predict_proba(y[select_features])[:,1]
lgb_valid_pred = lgb_model.predict(y[select_features])

print('cat_auc:', roc_auc_score(y[target], cat_valid_pred))

print('xgb_auc:', roc_auc_score(y[target], xgb_valid_pred))

print('lgb_auc:', roc_auc_score(y[target], lgb_valid_pred))

print('ensemble_auc:', roc_auc_score(y[target], np.average([
    cat_valid_pred,
    xgb_valid_pred,
    lgb_valid_pred
    ], axis=0)))

In [None]:
# Drop the training/validation frames; the cells below do not use them.
del x,y

In [None]:
# Rebuild the aggregate tables from the whole loaded log for use at inference time.
(
    user_cal,
    content_cal,
    part_cal,
    task_cal,
    user_part_lec,
    qtag_merge,
) = preprocess(train_df, questions, lectures)

In [None]:
# Create the competition environment and the iterator that yields
# (test_df, sample_prediction_df) batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Inference: for every batch, rebuild the training features, impute, and submit
# the equally weighted average of the three models' probabilities.
for (test_df, sample_prediction_df) in iter_test:

    # Attach the aggregate tables (left joins keep every test row, in order).
    test_df = pd.merge(test_df,user_cal, on=['user_id'], how="left")
    test_df = pd.merge(test_df,content_cal, on=['content_id'], how="left")
    test_df = pd.merge(test_df,task_cal, on=['task_container_id'], how="left")
    test_df = pd.merge(test_df,questions, on=['content_id'], how="left")
    test_df = pd.merge(test_df,part_cal, on=['part'], how="left")
    test_df = pd.merge(test_df,user_part_lec, on=['user_id','part'], how="left")

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('int8')

    # 'all_tag_mean': look up tag_mean for each of the six tag slots, then average per row.
    tag_mean_cols = []
    for tag in range(1,7):
        tag_col = f'qtag_{tag}'
        test_df[tag_col] = pd.to_numeric(test_df[tag_col], errors='coerce')
        test_df = pd.merge(test_df,qtag_merge,left_on=tag_col,right_on='qtag',how="left").drop(columns=['qtag', tag_col])
        test_df = test_df.rename(columns={'tag_mean': f'tag_mean{tag}'})
        tag_mean_cols.append(f'tag_mean{tag}')

    test_df['all_tag_mean'] = test_df[tag_mean_cols].mean(axis=1)
    test_df = test_df.drop(tag_mean_cols, axis=1)

    # Impute by explicit reassignment: fillna(..., inplace=True) on a column
    # selection is chained assignment and is not guaranteed to write back.
    # NOTE(review): medians are taken over the current batch, so a value stays
    # NaN when the whole batch is missing it, and 'all_tag_mean' uses the median
    # here but the mean at training time - confirm this is intended.
    median_filled_cols = [
        'prior_question_elapsed_time',
        'user_mean', 'user_std', 'user_sem',
        'content_mean',
        'task_std',
        'part_mean',
        'explanation_rate',
        'all_tag_mean',
    ]
    for col in median_filled_cols:
        test_df[col] = test_df[col].fillna(test_df[col].median())

    zero_filled_cols = [
        'user_count', 'content_count', 'task_count',
        'user_part_lec', 'user_lot', 'incorrect_answer_var',
    ]
    for col in zero_filled_cols:
        test_df[col] = test_df[col].fillna(0)

    # Slice the model inputs once and share them between the three models.
    batch_features = test_df[select_features]
    test_df['answered_correctly'] = np.average([
        cat_model.predict_proba(batch_features.values)[:,1],
        xgb_model.predict_proba(batch_features)[:,1],
        lgb_model.predict(batch_features)
    ], axis=0)
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])