In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import riiideducation
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import skew
from collections import defaultdict, Counter
# import cudf

import os
# Print the full path of every file found below /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Raise both pandas display limits to 100 so long/wide frames are shown in full.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 100)

# Debug switch: when True, later cells work on a small slice of `train`
# and the inference cell writes extra diagnostics.
is_test = False


In [None]:
%%time
# Competition CSVs, read in the same order as they are unpacked below.
smp, ex_test, questions, lectures = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv',
        '/kaggle/input/riiid-test-answer-prediction/example_test.csv',
        '/kaggle/input/riiid-test-answer-prediction/questions.csv',
        '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
    )
)

# The training table comes from a pickled DataFrame in a separate dataset.
# NOTE(review): unpickling executes arbitrary code - only load pickles from a trusted source.
train = pd.read_pickle('/kaggle/input/riiid-data/train.pkl')
# Optional cap for quicker experiments (disabled):
# train = train.loc[:3e+07]

In [None]:
# Debug mode keeps only the rows up to and including index label 10000
# (label-based `.loc` slices are inclusive); otherwise `train` is left as is.
train = train.loc[:10000] if is_test else train

# Preprocessing

In [None]:
%%time

# Training data: keep question rows only (content_type_id == 0), then attach question metadata.
train = train[train['content_type_id'] == 0]
questions_ = questions.copy()
questions_['content_type_id'] = 0
questions_.rename(columns={'question_id': 'content_id'}, inplace=True)
questions_.fillna('-1', inplace=True)  # original author's note: there is exactly one NA
# Only the first whitespace-separated tag of each question is kept, as an int.
questions_['tags'] = questions_['tags'].apply(lambda x: int(x.split()[0]))
train = train.merge(questions_, how='left', on=['content_type_id', 'content_id'])


def _group_stats(frame, key, spec):
    """Aggregate ``frame`` per ``key`` with ``spec``.

    Returns a DataFrame indexed by the group key whose columns are numbered
    0..n-1 in the order the aggregations appear in ``spec``.
    """
    grouped = frame.groupby(key).agg(spec)
    return pd.DataFrame(np.array(grouped), index=grouped.index)


def _stats_lookup(table):
    """Build ``key -> statistics vector`` from a table produced by ``_group_stats``.

    Unseen keys start as a zero *ndarray*. A plain list default would be
    extended (not summed) by the ``dic[k] += ndarray`` updates performed in
    the inference cell, so statistics of new keys would never accumulate.
    """
    width = table.shape[1]
    return defaultdict(lambda: np.zeros(width),
                       zip(table.index, table.to_numpy(copy=True)))


def _wide_spec(key):
    """Nine aggregations: count, answer sum/std/skew, elapsed-time sum/min/max/std/skew."""
    return {key: len,
            'answered_correctly': [np.sum, np.std, skew],
            'prior_question_elapsed_time': [np.sum, np.min, np.max, np.std, skew]}


def _attach_wide_features(frame, key, prefix, table, detailed=True):
    """Add ``<prefix>_*`` columns to ``frame`` by looking up ``frame[key]`` in ``table``.

    With ``detailed=False`` only ``<prefix>_answer_mean`` and ``<prefix>_cnt``
    are added. Column creation order is kept stable because the model's
    feature order is derived from ``train.columns``.
    """
    def pick(pos):
        # Vectorised lookup; the table has one row per distinct key.
        return frame[key].map(table[pos])

    count = pick(0)
    frame[prefix + '_answer_mean'] = pick(1) / count
    if detailed:
        frame[prefix + '_answer_std'] = pick(2)
        frame[prefix + '_answer_skew'] = pick(3)
        frame[prefix + '_time_mean'] = pick(4) / count
        frame[prefix + '_time_min'] = pick(5)
        frame[prefix + '_time_max'] = pick(6)
        frame[prefix + '_time_std'] = pick(7)
        frame[prefix + '_time_skew'] = pick(8)
    frame[prefix + '_cnt'] = count


# Per-key statistics. They are computed BEFORE the zero-fill below, so missing
# elapsed times are still NaN while aggregating.
# NOTE(review): scipy's `skew` is called with its defaults here, so a group that
# contains a NaN elapsed time presumably gets a NaN skew (zero-filled at the end
# of this cell) - confirm whether that is intended.

# user_id -> [count, answered_correctly sum, prior_question_elapsed_time sum]
user_stats = _group_stats(train, 'user_id', {'user_id': len,
                                             'answered_correctly': np.sum,
                                             'prior_question_elapsed_time': np.sum})
userid_dic = _stats_lookup(user_stats)

# content_id / tags / part -> nine-element vector described in `_wide_spec`.
content_stats = _group_stats(train, 'content_id', _wide_spec('content_id'))
contentid_dic = _stats_lookup(content_stats)

tag_stats = _group_stats(train, 'tags', _wide_spec('tags'))
tag_dic = _stats_lookup(tag_stats)

part_stats = _group_stats(train, 'part', _wide_spec('part'))
part_dic = _stats_lookup(part_stats)


# NA
train.fillna(0, inplace=True)

# Data type
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].astype(np.int8)

# Attach the aggregates to every row.
# NOTE(review): each aggregate is computed from the same rows it is attached to,
# so a row's own `answered_correctly` contributes to its `*_answer_mean` features.
user_count = train['user_id'].map(user_stats[0])
train['userid_answer_mean'] = train['user_id'].map(user_stats[1]) / user_count
train['userid_time_mean'] = train['user_id'].map(user_stats[2]) / user_count
train['userid_cnt'] = user_count

_attach_wide_features(train, 'content_id', 'contentid', content_stats)
_attach_wide_features(train, 'tags', 'tag', tag_stats)
# For `part` only the answer mean and the count are used as features.
_attach_wide_features(train, 'part', 'part', part_stats, detailed=False)

print(train.isnull().sum()/len(train))


# Zero-fill NaNs left in the answer statistics / introduced by the merge.
train.fillna(0, inplace=True)

In [None]:
if is_test:
    # Debug run: the first 7000 rows train the model, the remainder validates it.
    train, val = train[:7000], train[7000:]
else:
    # Full run: draw 13M rows at random (fixed seed); the first 10M train, the last 3M validate.
    sampled = train.sample(n=13000000, random_state=123)
    train, val = sampled[:10000000], sampled[10000000:]
    del sampled

# Modeling

In [None]:
import gc
import lightgbm as lgb

from sklearn.model_selection import train_test_split

In [None]:
# Columns excluded from the feature set.
col_del = ['row_id', 'user_answer', 'answered_correctly', 'user_id', 'content_type_id']

# Target column name.
y = 'answered_correctly'

# Every remaining column of `train`, in its original order, is a model feature.
X = train.columns[~train.columns.isin(col_del)]
X

In [None]:
# Binary classifier scored by AUC; learning rate and boosting type are LightGBM's defaults.
params = {'objective': 'binary',
          'metric': 'auc',
          'learning_rate': 0.1, #default
          "boosting_type": "gbdt", #default
          # GPU training (disabled):
          # 'device': 'gpu',
          # 'gpu_platform_id': 0,
          # 'gpu_device_id': 0,
          'seed': 42
         }

# No column is treated as categorical.
lgb_train = lgb.Dataset(train[X], train[y], categorical_feature = [])
lgb_eval = lgb.Dataset(val[X], val[y], categorical_feature = [])

# `early_stopping_rounds=` and `verbose_eval=` were removed from `lgb.train` in
# LightGBM 4.0. The callbacks below express the same settings (stop after 8
# rounds without improvement, log every 50 rounds) and also exist in older
# releases; `print_evaluation` is the pre-3.3 name of `log_evaluation`.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    num_boost_round=10000,
    callbacks=[lgb.early_stopping(8), _log_evaluation(50)],
)

In [None]:
# Draw the trained model's feature importances and render the figure.
importance_axes = lgb.plot_importance(model)
plt.show()

In [None]:
import ast  # only used in this cell, to parse the answer list supplied with each test group


# Layout of the statistics vectors kept in the dictionaries:
#   userid_dic[k]                       -> [count, answer sum, elapsed-time sum]
#   contentid_dic / tag_dic / part_dic  -> [count, answer sum, answer std, answer skew,
#                                           time sum, time min, time max, time std, time skew]
# std and skew are never updated at inference time; they keep their training values.


def _answer_sum_spec(key, wide):
    """Aggregations that yield only the per-key sum of `answered_correctly` (zeros elsewhere)."""
    if wide:
        return {key: lambda x: 0,
                'answered_correctly': [np.sum, lambda x: 0, lambda x: 0],
                'prior_question_elapsed_time': [lambda x: 0] * 5}
    return {key: lambda x: 0,
            'answered_correctly': np.sum,
            'prior_question_elapsed_time': lambda x: 0}


def _exposure_spec(key, wide):
    """Aggregations that yield the per-key row count and elapsed-time figures (zero answers)."""
    if wide:
        return {key: len,
                'answered_correctly': [lambda x: 0] * 3,
                'prior_question_elapsed_time': [np.sum, np.min, np.max]}
    return {key: len,
            'answered_correctly': lambda x: 0,
            'prior_question_elapsed_time': np.sum}


def _fold_into(stats, frame, key, spec, clamp_time_range=False):
    """Aggregate `frame` per `key` with `spec` and merge the result into `stats`.

    With `clamp_time_range=True` the first five positions are summed and
    positions 5 and 6 are combined with min / max; otherwise the whole vector
    is summed.
    """
    grouped = frame.groupby(key).agg(spec)
    for group_key, delta in zip(grouped.index, np.array(grouped)):
        # Rebuild as a float ndarray first: if the stored default is a plain
        # list, `+=` with an ndarray would extend the list instead of adding
        # element-wise, and unseen keys would never accumulate statistics.
        merged = np.array(stats[group_key], dtype=float)
        if clamp_time_range:
            merged[:5] += delta[:5]
            merged[5] = min(merged[5], delta[5])
            merged[6] = max(merged[6], delta[6])
        else:
            merged += delta
        stats[group_key] = merged


def _stat(frame, key, stats, pos):
    """Return, for every row of `frame`, element `pos` of `stats[frame[key]]`."""
    return frame[key].apply(lambda k: stats[k][pos])


def _add_features(frame, include_part_stats):
    """Attach the model features to `frame` and register its rows in the dictionaries.

    For each key the answer mean is read BEFORE this group's counts are folded
    in, while the time/count features are read AFTER.
    `include_part_stats` additionally writes the `part_*` std/skew/time columns.
    """
    # user_id
    frame['userid_answer_mean'] = _stat(frame, 'user_id', userid_dic, 1) / _stat(frame, 'user_id', userid_dic, 0)
    _fold_into(userid_dic, frame, 'user_id', _exposure_spec('user_id', wide=False))
    frame['userid_time_mean'] = _stat(frame, 'user_id', userid_dic, 2) / _stat(frame, 'user_id', userid_dic, 0)
    frame['userid_cnt'] = _stat(frame, 'user_id', userid_dic, 0)

    # content_id, tag, part
    for key, prefix, stats, detailed in (('content_id', 'contentid', contentid_dic, True),
                                         ('tags', 'tag', tag_dic, True),
                                         ('part', 'part', part_dic, include_part_stats)):
        frame[prefix + '_answer_mean'] = _stat(frame, key, stats, 1) / _stat(frame, key, stats, 0)
        _fold_into(stats, frame, key, _exposure_spec(key, wide=True), clamp_time_range=True)
        if detailed:
            frame[prefix + '_answer_std'] = _stat(frame, key, stats, 2)
            frame[prefix + '_answer_skew'] = _stat(frame, key, stats, 3)
            frame[prefix + '_time_mean'] = _stat(frame, key, stats, 4) / _stat(frame, key, stats, 0)
            frame[prefix + '_time_min'] = _stat(frame, key, stats, 5)
            frame[prefix + '_time_max'] = _stat(frame, key, stats, 6)
            frame[prefix + '_time_std'] = _stat(frame, key, stats, 7)
            frame[prefix + '_time_skew'] = _stat(frame, key, stats, 8)
        frame[prefix + '_cnt'] = _stat(frame, key, stats, 0)


env = riiideducation.make_env()
iter_test = env.iter_test()
i = 0              # index of the debug CSV dump (only advanced when is_test is True)
prior_test = None  # previous group's question rows, kept until their answers arrive

for (test, sample_prediction_df) in iter_test:
    test['answered_correctly'] = 0

    if prior_test is not None:
        # The first row of each group carries the previous group's answers as a
        # list literal. `ast.literal_eval` parses it without executing code
        # (the original used `eval` on this externally supplied string).
        prior_answers = np.array(ast.literal_eval(test.iloc[0]['prior_group_answers_correct']))
        prior_answers = prior_answers[prior_answers != -1]  # drop the -1 entries
        prior_test['answered_correctly'] = prior_answers

    # Filtering & merging: same steps as applied to the training data.
    test = test[test['content_type_id'] == 0]
    test = test.merge(questions_, how='left', on=['content_type_id', 'content_id'])

    # NA
    # NOTE(review): elapsed times are zero-filled BEFORE this group is aggregated,
    # unlike in the training cell, so a missing value can pull `*_time_min` to 0.
    test.fillna(0, inplace=True)

    # Data type
    test['prior_question_had_explanation'] = test['prior_question_had_explanation'].astype(np.int8)

    if prior_test is not None:
        # Dictionary update: add the answer sums of the previous group.
        _fold_into(userid_dic, prior_test, 'user_id', _answer_sum_spec('user_id', wide=False))
        for key, stats in (('content_id', contentid_dic), ('tags', tag_dic), ('part', part_dic)):
            _fold_into(stats, prior_test, key, _answer_sum_spec(key, wide=True))

    # Features, plus dictionary update for everything except the answer sums.
    _add_features(test, include_part_stats=is_test)

    print(test.isnull().sum()/len(test))

    # Zero-fill NaNs left in the answer statistics / introduced by the merge.
    test.fillna(0, inplace=True)
    if is_test:
        test.to_csv("test{}.csv".format(i))

    test['answered_correctly'] = model.predict(test[X])
    env.predict(test.loc[test['content_type_id'] == 0, ['row_id', 'answered_correctly']])
    prior_test = test.copy()

    if is_test:
        i += 1
        print(userid_dic)