I want to convey two things in this notebook.
## 1. You don't have to be hesitant about using loops.
People say "avoid loops!".
But I think it's not a bad idea to use loops for this competition.
Because:
* We have to run inference in small batches through the Time-series API.
* Loops have very small overhead for each batch.
* Loops are more flexible.
* Loops are not even that slow: 3 features are extracted within 10 minutes for the 100M-row train data, as you can see below.

## 2. Future information should not be used.
The Time-series API doesn't allow us to use information from the future.
So we should not use it; in particular, user statistics computed from future data make things very bad.

In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from collections import deque
from tqdm.notebook import tqdm
import lightgbm as lgb
import pickle

## setting
CV files are generated by [this notebook](https://www.kaggle.com/its7171/cv-strategy)

In [2]:
# Pickled train / validation splits read by the loading cell below.
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
# NOTE(review): `debug` and `validaten_flg` are not referenced by any cell visible
# in this notebook; confirm whether they are still needed before relying on them.
debug = False
validaten_flg = False

## feature engineering

In [3]:
def add_user_feats1(df, user_tag_lag1, user_answer_lag1):
    """Encode each user's previous (tag_sum, answer) pair as a single integer.

    Rows are visited in order. For every row the feature is computed from the
    state stored for that user *before* the row itself is recorded, so a row
    never sees its own answer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'answered_correctly' and 'tag_sum'.
    user_tag_lag1, user_answer_lag1 : dict-like
        user_id -> tag_sum / answer of that user's most recent row.
        Both are updated in place so the state carries over to the next call.

    Returns
    -------
    pandas.DataFrame
        One int32 column, 'user_prev_tag_lag1', with one value per row of ``df``.
    """
    # Offset that separates "previous answer == 0" codes from "== 1" codes.
    # NOTE(review): presumably tag_sum codes run 0..283 - confirm against the data.
    answer_stride = 283 + 1
    # Code emitted when the user has no previous row (equals 2 * answer_stride).
    no_history_code = 568

    encoded_prev = np.zeros(len(df), dtype=np.int32)
    rows = df[['user_id', 'answered_correctly', 'tag_sum']].values

    for idx, (user, answer, tag) in enumerate(tqdm(rows)):
        if user not in user_tag_lag1:
            encoded_prev[idx] = no_history_code
        else:
            encoded_prev[idx] = user_tag_lag1[user] + user_answer_lag1[user] * answer_stride

        # Record the current row only after its feature has been emitted.
        user_tag_lag1[user] = tag
        user_answer_lag1[user] = answer

    return pd.DataFrame({'user_prev_tag_lag1': encoded_prev})


In [4]:
def add_user_feats2(df, last_time_u_dict, last_time_u_lag_dict, last_time_u_lag2_dict, last_time_u_correct_dict, last_time_u_incorrect_dict):
    """Append per-user elapsed-time features to ``df``, one row at a time.

    For every row, three differences are computed against the state stored
    *before* the row is recorded, so no row sees its own timestamp or answer:

    * ``time_diff``           - timestamp minus the user's previous timestamp
    * ``time_diff_correct``   - timestamp minus the user's last correct answer
    * ``time_diff_incorrect`` - timestamp minus the user's last incorrect answer

    A row that carries the same timestamp as the user's previous row inherits
    the three values of the row directly above it. This assumes such rows are
    adjacent in ``df`` (NOTE(review): confirm the input ordering guarantees it).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'timestamp' and 'answered_correctly'.
    last_time_u_dict, last_time_u_correct_dict, last_time_u_incorrect_dict : mapping
        user_id -> timestamp. They are indexed directly, so a missing user
        must default to 0 (e.g. ``defaultdict(int)``).
    last_time_u_lag_dict, last_time_u_lag2_dict : mapping
        user_id -> the user's second / third most recent distinct timestamp.
        Maintained here but not used for any returned feature.

    All five mappings are mutated in place so that state carries over to the
    next call (train first, then validation).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the three int64 feature columns appended.
    """
    td = np.zeros(len(df), dtype=np.int64)
    td_correct = np.zeros(len(df), dtype=np.int64)
    td_incorrect = np.zeros(len(df), dtype=np.int64)

    for cnt, row in enumerate(tqdm(df[['user_id','timestamp','answered_correctly']].values)):
        user, timestamp, answer = row[0], row[1], row[2]

        # Must be evaluated BEFORE the lookup below: indexing a defaultdict
        # inserts the key, after which every user looks "already seen".
        seen_before = user in last_time_u_dict
        prev_time = last_time_u_dict[user]
        same_timestamp = timestamp == prev_time

        td[cnt] = timestamp - prev_time
        td_correct[cnt] = timestamp - last_time_u_correct_dict[user]
        td_incorrect[cnt] = timestamp - last_time_u_incorrect_dict[user]

        # Same timestamp as the user's previous row: reuse the values of the row
        # directly above. Two guards keep this from reading the wrong row:
        #  * seen_before - a brand-new user whose first timestamp is 0 matches
        #    the default "previous time" of 0; the row above then belongs to a
        #    different user and must not be copied.
        #  * cnt > 0     - index -1 would wrap around to the last, still
        #    unwritten slot of the output arrays.
        if same_timestamp and seen_before and cnt > 0:
            td[cnt] = td[cnt-1]
            td_correct[cnt] = td_correct[cnt-1]
            td_incorrect[cnt] = td_incorrect[cnt-1]

        if not same_timestamp:
            # Shift the timestamp history: lag -> lag2 first, then last -> lag.
            if user in last_time_u_lag_dict:
                last_time_u_lag2_dict[user] = last_time_u_lag_dict[user]
            # Always true for a defaultdict, because the lookup above inserted the key.
            if user in last_time_u_dict:
                last_time_u_lag_dict[user] = prev_time

        last_time_u_dict[user] = timestamp
        if answer == 1:
            last_time_u_correct_dict[user] = timestamp
        else:
            last_time_u_incorrect_dict[user] = timestamp

    # Reuse df's index so the column-wise concat lines rows up even when df
    # does not carry a default 0..n-1 index.
    user_feats_df = pd.DataFrame(
        {'time_diff': td, 'time_diff_correct': td_correct, 'time_diff_incorrect': td_incorrect},
        index=df.index,
    )
    user_feats_df = pd.concat([df, user_feats_df], axis = 1)
    return user_feats_df


In [5]:
def add_user_feats3(df, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict):
    """Cumulative per-user answer statistics, overall and for the row's part.

    For each row the features are read from the counters *before* the row is
    added to them:

    * 'answered_correctly_sum_u' / 'count_u' - totals over all seven parts
    * 'part_answered_correctly_sum_u' / 'part_count_u' - totals for the row's part

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'answered_correctly' and 'part'.
    partN_answered_correctly_sum_u_dict, partN_count_u_dict : mapping
        user_id -> running sum of answers / number of rows for part N (1..7).
        They are indexed directly, so missing users must default to 0
        (e.g. ``defaultdict(int)``). All are updated in place.

    Returns
    -------
    pandas.DataFrame
        Four int32 columns, one row per row of ``df``. Rows whose 'part' is not
        one of 1..7 keep 0 for the two part columns and update no counter.
    """
    # part number -> (running sum of answers, running row count)
    counters_by_part = {
        1: (part1_answered_correctly_sum_u_dict, part1_count_u_dict),
        2: (part2_answered_correctly_sum_u_dict, part2_count_u_dict),
        3: (part3_answered_correctly_sum_u_dict, part3_count_u_dict),
        4: (part4_answered_correctly_sum_u_dict, part4_count_u_dict),
        5: (part5_answered_correctly_sum_u_dict, part5_count_u_dict),
        6: (part6_answered_correctly_sum_u_dict, part6_count_u_dict),
        7: (part7_answered_correctly_sum_u_dict, part7_count_u_dict),
    }

    n_rows = len(df)
    total_correct = np.zeros(n_rows, dtype=np.int32)
    total_seen = np.zeros(n_rows, dtype=np.int32)
    part_correct = np.zeros(n_rows, dtype=np.int32)
    part_seen = np.zeros(n_rows, dtype=np.int32)

    rows = df[['user_id', 'answered_correctly', 'part']].values
    for idx, (user, answer, part) in enumerate(tqdm(rows)):
        # Overall totals are the sum of the seven per-part counters.
        total_correct[idx] = sum(correct[user] for correct, _ in counters_by_part.values())
        total_seen[idx] = sum(seen[user] for _, seen in counters_by_part.values())

        counters = counters_by_part.get(part)
        if counters is None:
            continue

        correct, seen = counters
        part_correct[idx] = correct[user]
        part_seen[idx] = seen[user]
        # Only now fold the current row into the running state.
        correct[user] += answer
        seen[user] += 1

    return pd.DataFrame({
        'answered_correctly_sum_u': total_correct,
        'count_u': total_seen,
        'part_answered_correctly_sum_u': part_correct,
        'part_count_u': part_seen,
    })


In [6]:
def add_user_feats4(df, all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg, part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg):
    """Rolling mean of each user's recent answers, overall and per part.

    For each row the mean is taken over the answers stored *before* the row,
    and only afterwards is the row's own answer pushed into the window:

    * 'last_60'      - mean of the user's last (up to) 60 answers, any part
    * 'part_last_30' - mean of the user's last (up to) 30 answers in the row's part

    A user with no stored window yet gets 0 for that feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'answered_correctly' and 'part'.
    all_last60_avg : mapping
        user_id -> window of recent answers across all parts.
    partN_last30_avg : mapping
        user_id -> window of recent answers within part N (1..7).
        Windows are created here as ``deque(maxlen=...)`` on a user's first
        row; all mappings are updated in place.

    Returns
    -------
    pandas.DataFrame
        Two float32 columns, one row per row of ``df``. Rows whose 'part' is not
        one of 1..7 keep 0 for 'part_last_30' and touch no per-part window.
    """
    def mean_then_push(history, user, answer, window):
        # Return the mean of the stored window (0 if the user has none yet),
        # then record the current answer in that window.
        if user in history:
            recent = history[user]
            mean = sum(recent) / len(recent)
            recent.append(answer)
        else:
            mean = 0
            recent = deque(maxlen = window)
            recent.append(answer)
            history[user] = recent
        return mean

    history_by_part = {
        1: part1_last30_avg,
        2: part2_last30_avg,
        3: part3_last30_avg,
        4: part4_last30_avg,
        5: part5_last30_avg,
        6: part6_last30_avg,
        7: part7_last30_avg,
    }

    n_rows = len(df)
    recent_part = np.zeros(n_rows, dtype=np.float32)
    recent_all = np.zeros(n_rows, dtype=np.float32)

    rows = df[['user_id', 'answered_correctly', 'part']].values
    for idx, (user, answer, part) in enumerate(tqdm(rows)):
        recent_all[idx] = mean_then_push(all_last60_avg, user, answer, 60)

        part_history = history_by_part.get(part)
        # `is not None` on purpose: an empty mapping is falsy but still valid.
        if part_history is not None:
            recent_part[idx] = mean_then_push(part_history, user, answer, 30)

    return pd.DataFrame({'last_60': recent_all, 'part_last_30': recent_part})

In [7]:
# Keep only the columns the feature loops read, to limit memory use.
feld_needed = ['timestamp','user_id','content_id','answered_correctly']
train = pd.read_pickle(train_pickle)[feld_needed]
valid = pd.read_pickle(valid_pickle)[feld_needed]

# Drop rows whose answered_correctly is -1 (presumably non-question rows - confirm)
# and rebuild a 0..n-1 index; add_user_feats2 concatenates column-wise on the index.
train = train.loc[train.answered_correctly != -1].reset_index(drop = True)
valid = valid.loc[valid.answered_correctly != -1].reset_index(drop = True)

_=gc.collect()

In [8]:

# Initialize the per-user state dictionaries.
# Every dict is keyed by user_id and is mutated in place by the add_user_feats*
# functions, first during the train pass and then during the valid pass.
#user_prev_t_dict = defaultdict(object)

# Previous tag_sum / previous answer per user (add_user_feats1).
user_tag_lag1 = defaultdict(int)
user_answer_lag1 = defaultdict(int)
# Deeper lags are currently disabled.
#user_tag_lag2 = defaultdict(int)
#user_answer_lag2 = defaultdict(int)
#user_tag_lag3 = defaultdict(int)
#user_answer_lag3 = defaultdict(int)
#user_tag_lag4 = defaultdict(int)
#user_answer_lag4 = defaultdict(int)
#user_tag_lag5 = defaultdict(int)
#user_answer_lag5 = defaultdict(int)

# Running sum of answers and running row count per user, one pair per part 1..7
# (add_user_feats3).
part1_answered_correctly_sum_u_dict = defaultdict(int)
part1_count_u_dict = defaultdict(int)
part2_answered_correctly_sum_u_dict = defaultdict(int)
part2_count_u_dict = defaultdict(int)
part3_answered_correctly_sum_u_dict = defaultdict(int)
part3_count_u_dict = defaultdict(int)
part4_answered_correctly_sum_u_dict = defaultdict(int)
part4_count_u_dict = defaultdict(int)
part5_answered_correctly_sum_u_dict = defaultdict(int)
part5_count_u_dict = defaultdict(int)
part6_answered_correctly_sum_u_dict = defaultdict(int)
part6_count_u_dict = defaultdict(int)
part7_answered_correctly_sum_u_dict = defaultdict(int)
part7_count_u_dict = defaultdict(int)

# Most recent timestamps per user (add_user_feats2). The int default of 0 is what
# a user's first row is compared against.
last_time_u_dict = defaultdict(int)
last_time_u_lag_dict = defaultdict(int)
last_time_u_lag2_dict = defaultdict(int)
last_time_u_correct_dict = defaultdict(int)
last_time_u_incorrect_dict = defaultdict(int)

# Windows of recent answers per user (add_user_feats4). That function tests
# membership and stores its own deque for a new user, so the `list` default
# factory is never triggered by it.
#all_last10_avg = defaultdict(list)
#all_last20_avg = defaultdict(list)
#all_last30_avg = defaultdict(list)
all_last60_avg = defaultdict(list)
part1_last30_avg = defaultdict(list)
part2_last30_avg = defaultdict(list)
part3_last30_avg = defaultdict(list)
part4_last30_avg = defaultdict(list)
part5_last30_avg = defaultdict(list)
part6_last30_avg = defaultdict(list)
part7_last30_avg = defaultdict(list)

In [9]:

# Elapsed-time features. Train is processed before valid on purpose: the same
# dictionaries are passed to both calls, so the valid rows continue from the
# state left behind by the train rows.
train = add_user_feats2(train, last_time_u_dict, last_time_u_lag_dict, last_time_u_lag2_dict, last_time_u_correct_dict, last_time_u_incorrect_dict)
valid = add_user_feats2(valid, last_time_u_dict, last_time_u_lag_dict, last_time_u_lag2_dict, last_time_u_correct_dict, last_time_u_incorrect_dict)

_=gc.collect()



# Persist the three feature columns, one CSV per split.
train[['time_diff', 'time_diff_correct', 'time_diff_incorrect']].to_csv('train_time_diff.csv',index = False)
valid[['time_diff', 'time_diff_correct', 'time_diff_incorrect']].to_csv('valid_time_diff.csv',index = False)

# The remaining feature loops do not read the timestamp or the saved columns,
# so drop them to free memory.
train.drop(columns = ['timestamp','time_diff', 'time_diff_correct', 'time_diff_incorrect'], inplace = True)
valid.drop(columns = ['timestamp','time_diff', 'time_diff_correct', 'time_diff_incorrect'], inplace = True)

_=gc.collect()

HBox(children=(FloatProgress(value=0.0, max=96817414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453886.0), HTML(value='')))




In [10]:
# Persist the per-user timestamp state built by add_user_feats2, then release
# each dictionary. `with` guarantees the file handle is closed (and the data
# flushed) even if pickle.dump raises part-way through.
with open("last_time_u_dict.pkl", "wb") as f:
    pickle.dump(last_time_u_dict, f)

del last_time_u_dict

with open("last_time_u_lag_dict.pkl", "wb") as f:
    pickle.dump(last_time_u_lag_dict, f)

del last_time_u_lag_dict

with open("last_time_u_lag2_dict.pkl", "wb") as f:
    pickle.dump(last_time_u_lag2_dict, f)

del last_time_u_lag2_dict

with open("last_time_u_correct_dict.pkl", "wb") as f:
    pickle.dump(last_time_u_correct_dict, f)

del last_time_u_correct_dict

with open("last_time_u_incorrect_dict.pkl", "wb") as f:
    pickle.dump(last_time_u_incorrect_dict, f)

del last_time_u_incorrect_dict


_=gc.collect()

In [11]:
# Question metadata; only 'content_id', 'part' and 'tag_sum' are used below.
question = pd.read_csv('../input/features/question3.csv')
# Replace tag_sum by the integer codes returned by pd.factorize, so that
# add_user_feats1 can combine it arithmetically with the previous answer.
question['tag_sum'] = pd.factorize(question.tag_sum)[0]

# Left join keeps every interaction row; a content_id missing from `question`
# would get NaN for 'part' and 'tag_sum'.
train = train.merge(question[['content_id','part','tag_sum']], on = 'content_id', how = 'left')
valid = valid.merge(question[['content_id','part','tag_sum']], on = 'content_id', how = 'left')

In [12]:
# Rolling-mean features over each user's recent answers. The same window
# dictionaries are passed to both calls, so valid continues from the train state.
train_near_past = add_user_feats4(train, all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg, part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg)
valid_near_past = add_user_feats4(valid, all_last60_avg, part1_last30_avg, part2_last30_avg, part3_last30_avg, part4_last30_avg, part5_last30_avg, part6_last30_avg, part7_last30_avg)

_=gc.collect()
# Persist the features, one CSV per split.
train_near_past.to_csv('train_near_past.csv', index = False)
valid_near_past.to_csv('valid_near_past.csv', index = False)

# The feature frames are on disk now; release them.
del(train_near_past)
del(valid_near_past)
_=gc.collect()

HBox(children=(FloatProgress(value=0.0, max=96817414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453886.0), HTML(value='')))




In [13]:
# Cumulative per-user answer statistics (overall and per part). The same counter
# dictionaries are passed to both calls, so valid continues from the train state.
train_ascu = add_user_feats3(train, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict)
valid_ascu = add_user_feats3(valid, part1_answered_correctly_sum_u_dict, part1_count_u_dict, part2_answered_correctly_sum_u_dict, part2_count_u_dict, part3_answered_correctly_sum_u_dict, part3_count_u_dict, part4_answered_correctly_sum_u_dict, part4_count_u_dict, part5_answered_correctly_sum_u_dict, part5_count_u_dict, part6_answered_correctly_sum_u_dict, part6_count_u_dict, part7_answered_correctly_sum_u_dict, part7_count_u_dict)

_=gc.collect()
# Persist the features, one CSV per split.
train_ascu.to_csv('train_ascu.csv',index = False)
valid_ascu.to_csv('valid_ascu.csv',index = False)

# 'part' and 'content_id' are not read by the remaining feature loop
# (add_user_feats1 uses user_id, answered_correctly and tag_sum), so drop them.
train.drop(columns = ['part','content_id'], inplace = True)
valid.drop(columns = ['part','content_id'], inplace = True)


# The feature frames are on disk now; release them.
del(train_ascu)
del(valid_ascu)
_=gc.collect()

HBox(children=(FloatProgress(value=0.0, max=96817414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453886.0), HTML(value='')))




In [14]:


# Persist the per-part counters and the recent-answer windows, one pickle file
# per dictionary, named "<variable name>.pkl". A single loop replaces the
# repeated open/dump/close stanzas, and `with` guarantees each file handle is
# closed (and the data flushed) even if pickle.dump raises part-way through.
for file_stem, state in (
    ("part1_answered_correctly_sum_u_dict", part1_answered_correctly_sum_u_dict),
    ("part1_count_u_dict", part1_count_u_dict),
    ("part2_answered_correctly_sum_u_dict", part2_answered_correctly_sum_u_dict),
    ("part2_count_u_dict", part2_count_u_dict),
    ("part3_answered_correctly_sum_u_dict", part3_answered_correctly_sum_u_dict),
    ("part3_count_u_dict", part3_count_u_dict),
    ("part4_answered_correctly_sum_u_dict", part4_answered_correctly_sum_u_dict),
    ("part4_count_u_dict", part4_count_u_dict),
    ("part5_answered_correctly_sum_u_dict", part5_answered_correctly_sum_u_dict),
    ("part5_count_u_dict", part5_count_u_dict),
    ("part6_answered_correctly_sum_u_dict", part6_answered_correctly_sum_u_dict),
    ("part6_count_u_dict", part6_count_u_dict),
    ("part7_answered_correctly_sum_u_dict", part7_answered_correctly_sum_u_dict),
    ("part7_count_u_dict", part7_count_u_dict),
    ("part1_last30_avg", part1_last30_avg),
    ("part2_last30_avg", part2_last30_avg),
    ("part3_last30_avg", part3_last30_avg),
    ("part4_last30_avg", part4_last30_avg),
    ("part5_last30_avg", part5_last30_avg),
    ("part6_last30_avg", part6_last30_avg),
    ("part7_last30_avg", part7_last30_avg),
    ("all_last60_avg", all_last60_avg),
):
    with open(file_stem + ".pkl", "wb") as f:
        pickle.dump(state, f)



In [15]:
# Previous-question feature (previous tag_sum combined with previous answer).
# The same two dictionaries are passed to both calls, so valid continues from
# the train state.
train_user_prev_q_a = add_user_feats1(train, user_tag_lag1, user_answer_lag1)
valid_user_prev_q_a = add_user_feats1(valid, user_tag_lag1, user_answer_lag1)

_=gc.collect()
# Persist the feature, one CSV per split.
train_user_prev_q_a.to_csv('train_user_prev_q_a.csv',index = False)
valid_user_prev_q_a.to_csv('valid_user_prev_q_a.csv',index = False)

# This is the last feature loop: release the feature frames and the
# interaction frames themselves.
del(train_user_prev_q_a)
del(valid_user_prev_q_a)
del(train)
del(valid)
_=gc.collect()

HBox(children=(FloatProgress(value=0.0, max=96817414.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453886.0), HTML(value='')))




In [16]:
import pickle

# Persist the previous-question state built by add_user_feats1. `with`
# guarantees each file handle is closed (and the data flushed) even if
# pickle.dump raises part-way through.
with open("user_tag_lag1.pkl", "wb") as f:
    pickle.dump(user_tag_lag1, f)

with open("user_answer_lag1.pkl", "wb") as f:
    pickle.dump(user_answer_lag1, f)


_=gc.collect()

Have fun with loops! :)