In [None]:
# Install the datatable 0.11.0 wheel from the local input directory.
# All pip output (stdout and stderr) is redirected to /dev/null.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1


In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import lightgbm as lgb
from collections import defaultdict
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
import riiideducation
import xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import roc_auc_score



In [None]:
# Columns read from train.csv, mapped to the dtype each one is cast to after loading.
data_types_dict = dict(
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)

# Name of the label column.
target = 'answered_correctly'

In [None]:
# Read only the columns named in data_types_dict with datatable, then convert to pandas.
train_df = dt.fread(
    '../input/riiid-test-answer-prediction/train.csv',
    columns=set(data_types_dict.keys()),
).to_pandas()

# Drop rows whose label is -1 (presumably non-question rows -- confirm against
# the dataset description) and renumber the index.
train_df = train_df[train_df[target] != -1].reset_index(drop=True)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on a temporary under
# pandas Copy-on-Write, and any NaN left behind would become True in the
# astype('bool') cast below.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False)
)
train_df = train_df.astype(data_types_dict)

In [None]:
# Running per-user accuracy built only from earlier rows of the same user.
# `prev_answer` is the label shifted down by one row within each user, so a
# user's first row holds NaN.
prev_answer = train_df.groupby('user_id')[target].shift()
prev_by_user = prev_answer.groupby(train_df['user_id'])

# cumsum() = running total of the shifted labels; cumcount() = number of
# earlier rows for that user (0 on the first row, where the ratio is NaN).
train_df['user_correctness'] = prev_by_user.cumsum() / prev_by_user.cumcount()

In [None]:
# Per-user and per-content totals of the label column over the current train_df:
# 'sum' is the label total, 'count' the number of rows in the group.
stat_names = ['sum', 'count']
user_agg = train_df.groupby('user_id')[target].agg(stat_names)
content_agg = train_df.groupby('content_id')[target].agg(stat_names)

In [None]:
# Keep only the last 24 rows of every user (in current row order), then renumber the index.
recent_rows = train_df.groupby('user_id').tail(24)
train_df = recent_rows.reset_index(drop=True)

In [None]:
# Question metadata: only CSV columns 0 and 3 are read (presumably
# `question_id` and `part`, matching the dtype mapping -- confirm against the file).
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 3],
    dtype={'question_id': 'int16', 'part': 'int8'},
)

# Left-join the metadata onto the training rows, then drop the duplicate join key.
train_df = (
    train_df
    .merge(questions_df, left_on='content_id', right_on='question_id', how='left')
    .drop(columns=['question_id'])
)

In [None]:
# Per-content features looked up from content_agg by each row's content_id:
# the number of rows seen for that content and its mean label.
content_accuracy = content_agg['sum'] / content_agg['count']
row_content_ids = train_df['content_id']
train_df['content_count'] = row_content_ids.map(content_agg['count']).astype('int32')
train_df['content_avg'] = row_content_ids.map(content_accuracy)

In [None]:
# Fit a label encoder on the explanation flag and store the encoded values as
# a separate feature column. The fitted encoder is kept in `lencoder` so the
# same mapping can be applied to other data.
lencoder = LabelEncoder()
explanation_flags = train_df['prior_question_had_explanation']
lencoder.fit(explanation_flags)
train_df['prior_question_had_explanation_enc'] = lencoder.transform(explanation_flags)


In [None]:
# Hold out the last 6 rows of every user for validation and remove exactly
# those rows (by index label) from the training frame.
valid_df = train_df.groupby('user_id').tail(6)
train_df = train_df.drop(index=valid_df.index)

In [None]:
# Preview the first five training rows after feature engineering.
train_df.head(n=5)

In [None]:
# Model input columns, in the order they are passed to XGBoost.
features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
    'user_correctness',
    'part',
    'content_count',
    'content_avg',
]

# Parameters for XGBoost
params = {
    'max_depth': 7,
    # 'max_leaves': 2**4,
    'alpha': 0.1,
    # 'lambda': 0.2,
    # The key must be spelled exactly: a trailing space ('min_child_weight ')
    # makes XGBoost treat it as an unknown parameter and the value is not applied.
    'min_child_weight': 2,
    'subsample': 0.7,
    'tree_method': 'gpu_hist',
    'learning_rate': 0.1,  # default = 0.3
    'colsample_bytree': 0.7,
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'grow_policy': 'lossguide',
    # NOTE: `n_estimators` is a scikit-learn-wrapper argument. The native
    # `xgboost.train` API does not read it from this dict; the number of
    # boosting rounds has to be passed to it as `num_boost_round`.
    'n_estimators': 800,
}


In [None]:
# `xgboost.train` takes the number of boosting rounds as its own argument
# (default 10) and does not read `n_estimators` from the parameter dict.
# Pull the configured value out of `params` so that many rounds are actually
# trained, and pass the remaining entries as booster parameters.
booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}
num_boost_round = params.get('n_estimators', 10)

train_matrix = xgboost.DMatrix(data=train_df[features], label=train_df[target])
test_matrix = xgboost.DMatrix(data=valid_df[features])
xgb = xgboost.train(booster_params, dtrain=train_matrix, num_boost_round=num_boost_round)

# Score the held-out rows and report ROC AUC against their labels.
predicts = xgb.predict(test_matrix)
roc = roc_auc_score(valid_df[target].astype('int32'), predicts)
print('ROC for XGBoost model')
print(roc)


In [None]:
# Gain-based importance per feature. The value is not the cell's last
# expression, so it is computed but not displayed.
gain_by_feature = xgb.get_score(importance_type='gain')

# Plot feature importance using plot_importance's default importance type.
plot_importance(xgb)

In [None]:
# Alias the trained booster under the name used by the inference loop.
model = xgb

In [None]:
def _as_default_dict(series, dtype):
    """Cast `series` to `dtype` and return it as a defaultdict(int) keyed by its index."""
    return series.astype(dtype).to_dict(into=defaultdict(int))


# Running counters used at inference time; unseen keys default to 0.
# NOTE(review): the per-user totals are cast to int16 before conversion --
# confirm no user exceeds 32767 rows.
user_sum_dict = _as_default_dict(user_agg['sum'], 'int16')
user_count_dict = _as_default_dict(user_agg['count'], 'int16')
content_sum_dict = _as_default_dict(content_agg['sum'], 'int32')
content_count_dict = _as_default_dict(content_agg['count'], 'int32')

In [None]:
# Create the competition environment. If creation fails and an `env` from an
# earlier run of this cell is still in the namespace, keep using it
# (presumably make_env() may only be called once per kernel -- TODO confirm).
# Otherwise re-raise, so the real error is reported instead of a later
# NameError on `env`.
try:
    env = riiideducation.make_env()
except Exception:
    if 'env' not in globals():
        raise
iter_test = env.iter_test()

# Previous test batch, kept so its labels can be attached once the next batch arrives.
prior_test_df = None

In [None]:

# Inference loop. For every batch served by the environment:
#   1. attach the previous batch's revealed labels and fold them into the
#      running per-user / per-content counters,
#   2. build the same feature columns the model was trained on,
#   3. predict and submit.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # NOTE(review): eval() executes whatever expression the string holds.
        # The value comes from the competition API; consider
        # ast.literal_eval if the source is not fully trusted.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        # Rows labelled -1 carry no answer and are excluded from the counters.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values

        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    # Keep the unfiltered batch: the labels delivered with the next batch are
    # assigned to it row by row.
    prior_test_df = test_df.copy()

    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    test_df["prior_question_had_explanation_enc"] = lencoder.transform(test_df["prior_question_had_explanation"])

    # int32 buffers for every counter: the running per-user totals keep
    # growing during inference and would not fit in int16 once a user passes
    # 32767 answers.
    n_rows = len(test_df)
    user_sum = np.zeros(n_rows, dtype=np.int32)
    user_count = np.zeros(n_rows, dtype=np.int32)
    content_sum = np.zeros(n_rows, dtype=np.int32)
    content_count = np.zeros(n_rows, dtype=np.int32)

    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]

    # Unseen users/contents have 0/0 here, which yields NaN; the resulting
    # values are unchanged, only NumPy's runtime warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        user_correctness = user_sum / user_count
        content_avg = content_sum / content_count

    test_df['user_correctness'] = user_correctness
    test_df['content_count'] = content_count
    test_df['content_avg'] = content_avg

    test_mat = xgboost.DMatrix(data=test_df[features])
    test_df[target] = model.predict(test_mat)
    env.predict(test_df[['row_id', target]])