In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
import ast
from collections import defaultdict

import datatable as dt
import lightgbm as lgb
import numpy as np
import pandas as pd
import riiideducation
import torch
from matplotlib import pyplot as plt


# Silence NumPy floating-point warnings for division by zero ('divide') and for
# invalid operations such as 0/0 ('invalid'). Other error categories keep their defaults.
np.seterr(divide = 'ignore', invalid = 'ignore')

In [None]:
# Column name -> dtype for the subset of train.csv columns used by this notebook.
data_types_dict = dict(
    user_id='int32',
    timestamp='int64',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)
# Name of the label column.
target = 'answered_correctly'

In [None]:
# Read only the columns named in data_types_dict with datatable, then hand the
# result over to pandas.
train_csv_path = '../input/riiid-test-answer-prediction/train.csv'
wanted_columns = set(data_types_dict)
train_df = dt.fread(train_csv_path, columns = wanted_columns).to_pandas()


In [None]:
#train_df = train_df.groupby('user_id').tail(24).reset_index(drop = True)

In [None]:
# Quick structural summary of the freshly loaded training frame.
separator = '*' * 50
na_per_column = train_df.isna().sum()

print('Training dataset detailed information')
print(separator)
print('Columns:', train_df.columns)
print(separator)
print('Shape:', train_df.shape)
print(separator)
# The label promises per-column counts, so report them per column; the grand
# total (which is what used to be printed under this label) follows separately.
print('NA values in each column:', na_per_column.to_dict())
print('Total NA values:', int(na_per_column.sum()))
print(separator)

In [None]:
# Keep only the rows whose target is not -1.
train_df = train_df[train_df[target] != -1].reset_index(drop = True)

# Replace missing "had explanation" flags with False.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the chained in-place form is deprecated and silently does
# nothing under pandas Copy-on-Write, which would leave NaN in the column.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)

# Apply the compact dtypes declared in data_types_dict.
train_df = train_df.astype(data_types_dict)


In [None]:
# Average of the non-missing elapsed times; reused later when filling test batches.
observed_elapsed = train_df['prior_question_elapsed_time'].dropna().values
prior_question_elapsed_time_mean = observed_elapsed.mean()

# Despite its name, this column holds the elapsed time itself, with missing
# values replaced by the average computed above.
train_df['prior_question_elapsed_time_mean'] = (
    train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
)

In [None]:
# Display (rows, columns) of the training frame after cleaning.
train_df.shape

In [None]:
# Shift the target down by one row within each user so that a row only sees
# the answers that precede it (the first row of every user becomes NaN).
train_df['lag'] = train_df.groupby('user_id')[target].shift()

# Running sum of the shifted answers and running row counter (0, 1, 2, ...) per user.
lag_by_user = train_df.groupby('user_id')['lag']
cum = pd.DataFrame({
    'cumsum': lag_by_user.cumsum(),
    'cumcount': lag_by_user.cumcount(),
})

# Share of correct answers among the user's earlier rows.
train_df['user_correctness'] = cum['cumsum'].div(cum['cumcount'])

# The helper column is no longer needed.
train_df.drop(columns = ['lag'], inplace = True)



In [None]:
# Per-key totals of the target column:
#   user_agg    - indexed by user_id:    'sum' = correct answers, 'count' = answered rows
#   content_agg - indexed by content_id: 'sum' = correct answers, 'count' = answered rows
user_agg, content_agg = (
    train_df.groupby(group_key)[target].agg(['sum', 'count'])
    for group_key in ('user_id', 'content_id')
)

In [None]:
#train_df['Accuracy'] = train_df['user_id'].map(user_agg['sum']/user_agg['count'])#每个用户回答问题的准确率
# Running per-user tallies taken *before* the current row:
#   Accuracy_sum  - number of correct answers the user gave on earlier rows
#   Questions_num - number of earlier rows of the user
# Mapping the whole-history totals from user_agg onto every row (the previous
# approach) put the current row's own label and all future answers into the
# features, i.e. target leakage. It also disagreed with the inference loop,
# where these columns are filled from counters that only contain answers seen
# so far. user_agg itself is left untouched; it still seeds those counters.
# Assumes each user's rows are in chronological order, the same assumption the
# shift-based user_correctness feature makes.
# The int32 cast keeps the running sum from being limited by the int8 label dtype.
answers_by_user = train_df[target].astype('int32').groupby(train_df['user_id'])
train_df['Accuracy_sum'] = answers_by_user.cumsum() - train_df[target]
train_df['Questions_num'] = answers_by_user.cumcount()

In [None]:
# Keep only the last 500 rows of each user, then renumber the index from 0.
train_df = train_df.groupby('user_id').tail(500).reset_index(drop = True)

In [None]:
#train = train_df.groupby('user_id').tail(24).reset_index(drop = True)
#valid_df = valid_df.groupby('user_id').tail(24).reset_index(drop = True)

In [None]:
# Load the columns at positions 0 and 3 of questions.csv (expected to be
# question_id and part, per the dtype mapping below).
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    dtype = {'question_id': 'int16', 'part': 'int8'},
    usecols = [0, 3],
)

# Left-join the question metadata onto the interactions, then drop the
# duplicated join key that the merge brings along.
train_df = train_df.merge(
    questions_df,
    how = 'left',
    left_on = 'content_id',
    right_on = 'question_id',
)
train_df.drop(columns = ['question_id'], inplace = True)


In [None]:
# Number of answered rows recorded in content_agg for each row's content_id.
answers_per_content = train_df['content_id'].map(content_agg['count'])
train_df['content_count'] = answers_per_content.astype('int32')

# Overwrite the raw content id with that content's share of correct answers.
# This must come after content_count, which still needs the raw id.
content_accuracy = content_agg['sum'] / content_agg['count']
train_df['content_id'] = train_df['content_id'].map(content_accuracy)

In [None]:
# Store the explanation flag as int8 so it can be summed.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('int8')

# Per-user one-row shift of the flag; the first row of every user becomes NaN.
# The helper column 'lag' is consumed and removed in the following cell.
explanation_by_user = train_df.groupby('user_id')['prior_question_had_explanation']
train_df['lag'] = explanation_by_user.shift()


In [None]:
#train_df['lag'] = train_df.groupby('user_id')['prior_question_had_explanation'].shift()#用户是否看到上一个问题的答案，第一个题目为null。通常前几个都为false，因为那是测试。
# Running sum of the shifted explanation flag and running row counter per user.
cum = train_df.groupby('user_id')['lag'].agg(['cumsum', 'cumcount'])
# Share of earlier rows on which the flag was set, and the raw running count.
train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']
train_df['explanation_cumsum'] = cum['cumsum']

train_df.drop(columns=['lag'], inplace=True)

# The first row of every user has no history (NaN); treat it as 0.
# The filled result is assigned back rather than using fillna(..., inplace=True)
# on a column selection: the chained in-place form is deprecated and is a no-op
# under pandas Copy-on-Write, which would leave NaN behind for the integer cast.
train_df['explanation_mean'] = train_df['explanation_mean'].fillna(0).astype('float16')
train_df['explanation_cumsum'] = train_df['explanation_cumsum'].fillna(0).astype('int16')



In [None]:
# Per-user totals of the explanation flag ('sum') and number of rows ('count'),
# stored as int16. Unlike the running cumsum/cumcount features, this is one
# row per user covering all of that user's rows currently in train_df.
explanation_agg = (
    train_df.groupby('user_id')['prior_question_had_explanation']
    .agg(['sum', 'count'])
    .astype('int16')
)


In [None]:
# Largest timestamp per user; seeds the inference loop's "last seen" lookup.
max_timestamp_u = train_df[['user_id','timestamp']].groupby(['user_id']).agg(['max']).reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']

# lagtime: difference between a row's timestamp and the same user's previous
# row (same unit as 'timestamp', milliseconds according to the hour conversion
# below). The first row of every user has no predecessor and gets 0.
# Fixes relative to the earlier version:
#   * the filled values are assigned back instead of fillna(..., inplace=True)
#     on a column selection (deprecated; a no-op under pandas Copy-on-Write);
#   * int64 instead of int32: a gap longer than ~24.8 days (2**31 ms) does not
#     fit in int32 and was silently corrupted by the cast.
previous_timestamp = train_df.groupby('user_id')['timestamp'].shift()
train_df['lagtime'] = (train_df['timestamp'] - previous_timestamp).fillna(0).astype('int64')

# Per-user average lag, broadcast back onto every row of that user.
lagtime_agg = train_df.groupby('user_id')['lagtime'].agg(['mean'])
train_df['lagtime_mean'] = train_df['user_id'].map(lagtime_agg['mean']).astype('int64')

# Convert the timestamp to whole hours. int32 leaves headroom: int16 tops out
# at 32767 hours (about 3.7 years).
train_df['timestamp'] = (train_df['timestamp'] / (1000*3600)).astype('int32')


提取验证集

In [None]:
# Hold out the last 125 rows of every user for validation and remove them from
# the training frame. With at most 500 rows kept per user earlier, that is
# 125/500 = 25% for users with the full 500 rows; users with 125 rows or fewer
# end up entirely in the validation set.
valid_df = train_df.groupby('user_id').tail(125)
train_df.drop(valid_df.index, inplace = True)

In [None]:
# Display (rows, columns) of the training and validation splits.
train_df.shape,valid_df.shape

训练

In [None]:

# Model inputs, in the column order passed to CatBoost.
features = [
    'timestamp',
    'lagtime',
    'lagtime_mean',
    'Accuracy_sum',
    'Questions_num',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'prior_question_elapsed_time_mean',
    'part',
    'content_count',
    'content_id',
    'explanation_mean',
    'explanation_cumsum',
]


# Keyword arguments forwarded to CatBoostClassifier.
params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    # Train on the GPU when torch reports a CUDA device, otherwise on the CPU.
    task_type='GPU' if torch.cuda.is_available() else 'CPU',
    grow_policy='Lossguide',
    iterations=5000,
    learning_rate=0.04,
    random_seed=0,
    l2_leaf_reg=0.1,
    depth=15,
    max_leaves=10,
    border_count=128,
    # Forwarded as the 'verbose' option; see the CatBoost documentation for its exact meaning.
    verbose=50,
)

In [None]:
from catboost import CatBoostClassifier, Pool

# Wrap the feature columns and labels of each split in a CatBoost Pool.
train_features, train_labels = train_df[features], train_df[target]
valid_features, valid_labels = valid_df[features], valid_df[target]

train_set = Pool(train_features, label = train_labels)
val_set = Pool(valid_features, label = valid_labels)

In [None]:
# Build the classifier from `params`.
model = CatBoostClassifier(**params)

# Train on the training pool, evaluating on the validation pool;
# use_best_model=True is passed through to CatBoost's fit().
fit_options = {'eval_set': val_set, 'use_best_model': True}
model.fit(train_set, **fit_options)

推理接口：使用 riiideducation 提供的 API 逐批获取测试数据并提交预测，这一点与国内的比赛有很大的不同。

In [None]:
def _to_counter(series):
    """Return `series` as a defaultdict(int) keyed by its index; missing keys read as 0."""
    return series.to_dict(defaultdict(int))


# Running counters that the inference loop reads and updates.
# The earlier int16/int32 narrowing casts were dropped: they save nothing once
# the values live in a Python dict, and they silently wrap around when a value
# does not fit.
user_sum_dict = _to_counter(user_agg['sum'])
user_count_dict = _to_counter(user_agg['count'])
content_sum_dict = _to_counter(content_agg['sum'])
content_count_dict = _to_counter(content_agg['count'])

explanation_sum_dict = _to_counter(explanation_agg['sum'])
explanation_count_dict = _to_counter(explanation_agg['count'])

# Mean lag per user, truncated to an integer. int64 rather than int32: a mean
# gap above 2**31 ms (about 24.8 days) does not fit in int32.
lagtime_mean_dict = _to_counter(lagtime_agg['mean'].astype('int64'))
# Nested plain dict: {'max_time_stamp': {user_id: latest timestamp}}.
max_timestamp_u_dict = max_timestamp_u.set_index('user_id').to_dict()

In [None]:
# Create the competition environment. When this cell is re-run in a live
# kernel, make_env() raises (presumably because it may only be called once per
# session - confirm against the riiideducation docs); in that case keep using
# the env object that already exists. If no env exists at all, the failure is
# real and is re-raised instead of surfacing later as a confusing NameError.
try:
    env = riiideducation.make_env()
except Exception as exc:
    if 'env' not in globals():
        raise
    print(f'make_env() failed ({exc!r}); reusing the existing env')
iter_test = env.iter_test()
# Copy of the previous test batch; its labels arrive together with the next batch.
prior_test_df = None

In [None]:
# Inference loop: for every batch delivered by the competition API
#   1. fold the now-revealed labels of the previous batch into the running counters,
#   2. rebuild the training-time features for the current batch,
#   3. predict and submit.
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The labels of the previous batch are delivered as the text of a list
        # in the first row of the current batch. Parse it with
        # ast.literal_eval, which only accepts literals, instead of eval(),
        # which would execute arbitrary code.
        prior_test_df[target] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop = True)

        for user_id, content_id, answered_correctly in zip(
            prior_test_df['user_id'].values,
            prior_test_df['content_id'].values,
            prior_test_df[target].values,
        ):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1

    # Keep the unfiltered batch: the label list of the next batch lines up with it.
    prior_test_df = test_df.copy()

    # Only rows with content_type_id == 0 are scored.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    test_df = pd.merge(test_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')
    # Fill gaps with the mean computed on the training data.
    test_df['prior_question_elapsed_time_mean'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)

    # Per-row feature buffers. The user counters are int32 (int16 wraps at
    # 32767) and the lag buffers int64 (a gap above 2**31 ms, about 24.8 days,
    # does not fit in int32).
    n_rows = len(test_df)
    user_sum = np.zeros(n_rows, dtype = np.int32)
    user_count = np.zeros(n_rows, dtype = np.int32)
    content_sum = np.zeros(n_rows, dtype = np.int32)
    content_count = np.zeros(n_rows, dtype = np.int32)
    explanation_sum = np.zeros(n_rows, dtype = np.int32)
    explanation_count = np.zeros(n_rows, dtype = np.int32)

    lagtime = np.zeros(n_rows, dtype = np.int64)
    lagtime_mean = np.zeros(n_rows, dtype = np.int64)

    last_seen = max_timestamp_u_dict['max_time_stamp']
    batch_rows = zip(test_df['user_id'].values, test_df['content_id'].values, test_df['timestamp'].values)
    for i, (user_id, content_id, timestamp) in enumerate(batch_rows):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        explanation_sum[i] = explanation_sum_dict[user_id]
        explanation_count[i] = explanation_count_dict[user_id]

        if user_id in last_seen:
            lagtime[i] = timestamp - last_seen[user_id]
        else:
            # First time this user is seen: there is no previous row, so the
            # lag stays 0. Seed the mean lag with 0 as well; storing the
            # absolute timestamp here (as before) mixed a point in time into
            # what is supposed to be an average gap.
            lagtime_mean_dict[user_id] = 0
        last_seen[user_id] = timestamp
        # Midpoint of the stored mean lag and the current lag (truncated to int).
        lagtime_mean[i] = (lagtime_mean_dict[user_id] + lagtime[i]) / 2

    # 0/0 yields NaN for users or contents without history (warnings are
    # silenced by the np.seterr call at the top of the notebook).
    test_df['user_correctness'] = user_sum / user_count
    test_df['Accuracy_sum'] = user_sum
    test_df['Questions_num'] = user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    # Training fills a missing explanation_mean with 0, so do the same here
    # instead of letting 0/0 produce NaN for users without history.
    test_df['explanation_mean'] = np.divide(
        explanation_sum,
        explanation_count,
        out = np.zeros(n_rows, dtype = np.float64),
        where = explanation_count != 0,
    )
    test_df['explanation_cumsum'] = explanation_sum
    test_df["lagtime"] = lagtime
    test_df["lagtime_mean"] = lagtime_mean
    # Timestamp in whole hours; int32 because int16 tops out at 32767 hours.
    test_df['timestamp'] = (test_df['timestamp'] / (1000*3600)).astype('int32')

    # Probability of the positive class, submitted together with the row ids.
    test_df[target] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df[['row_id', target]])