In [None]:
%load_ext line_profiler

In [None]:
import pandas
import numpy
import os
import random

# Seed Python's and NumPy's global random generators so random sampling is repeatable.
random.seed(4649893)
numpy.random.seed(1192296)

# Allow up to 250 characters per line when pandas prints frames.
pandas.set_option('display.width', 250)

# Locations of the competition CSV files.
input_path = '/kaggle/input/riiid-test-answer-prediction/'
train_path = os.path.join(input_path, 'train.csv')
questions_path = os.path.join(input_path, 'questions.csv')
lectures_path = os.path.join(input_path, 'lectures.csv')

working_path = '/kaggle/working/'

# Column -> dtype map for the training CSV. The keys double as the `usecols`
# list in read_train_csv, so the commented-out columns are not loaded at all.
train_dtypes = {
    # 'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    # 'task_container_id': 'int16',
    # 'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    # 'prior_question_had_explanation': 'boolean',
}

# Column -> dtype map for questions.csv ('tags' is read as a string and split later).
questions_dtypes = {
    'question_id': 'int16',
    'bundle_id': 'int16',
    'correct_answer': 'int8',
    'part': 'int8',
    'tags': 'str',
}

# Column -> dtype map for lectures.csv ('type_of' is read as a string and encoded later).
lectures_dtypes = {
    'lecture_id': 'int16',
    'part': 'int8',
    'tag': 'int16',
    'type_of': 'str',
}

def read_train_csv(nrows=None, chunksize=None):
    """Read the training data from a CSV sorted by ('timestamp', 'user_id').

    If the sorted copy does not exist yet it is created once from `train_path`
    (restricted to the columns of `train_dtypes`) and written next to the
    expected location.

    Args:
        nrows: forwarded to `pandas.read_csv`; None reads every row.
        chunksize: forwarded to `pandas.read_csv`; when given, the return
            value is a chunk iterator instead of a single DataFrame.

    Returns:
        Whatever `pandas.read_csv` returns for the given `nrows`/`chunksize`.
    """
    sorted_csv = os.path.join('/kaggle/input/riiid-submission-8', 'train_sorted.csv')
    wanted_columns = train_dtypes.keys()

    # Build the sorted cache on first use.
    if not os.path.exists(sorted_csv):
        raw = pandas.read_csv(train_path, dtype=train_dtypes, usecols=wanted_columns)
        ordered = raw.sort_values(['timestamp', 'user_id'])
        ordered.to_csv(sorted_csv, index=False)

    reader_options = dict(
        dtype=train_dtypes,
        usecols=wanted_columns,
        nrows=nrows,
        chunksize=chunksize,
    )
    return pandas.read_csv(sorted_csv, **reader_options)

def read_questions_csv():
    """Load questions.csv indexed by 'question_id' with the tags split into columns.

    The whitespace-separated 'tags' string is expanded into int16 columns
    named 'q_tag_0', 'q_tag_1', ...; positions without a tag hold -1.

    Returns:
        DataFrame with the remaining question columns followed by the
        'q_tag_*' columns.
    """
    questions = pandas.read_csv(
        questions_path,
        index_col='question_id',
        usecols=questions_dtypes.keys(),
        dtype=questions_dtypes,
    )

    # One column per tag position; missing positions become -1 before the int cast.
    tag_columns = questions['tags'].str.split(expand=True)
    tag_columns = tag_columns.fillna(-1).add_prefix('q_tag_').astype(numpy.int16)

    without_tags = questions.drop('tags', axis=1)
    return pandas.concat([without_tags, tag_columns], axis=1)

def read_lectures_csv():
    """Load lectures.csv indexed by 'lecture_id' with 'type_of' encoded as uint8.

    Returns:
        DataFrame whose 'type_of' column holds 0..3 for 'concept',
        'solving question', 'intention' and 'starter' respectively.
    """
    lectures = pandas.read_csv(
        lectures_path,
        index_col='lecture_id',
        usecols=lectures_dtypes.keys(),
        dtype=lectures_dtypes,
    )

    type_codes = {'concept': 0, 'solving question': 1, 'intention': 2, 'starter': 3}
    encoded = lectures['type_of'].map(type_codes)
    lectures['type_of'] = encoded.astype(numpy.uint8)
    return lectures

def join_df(dfs, how, on):
    """Join a sequence of frames left to right with `DataFrame.join`.

    Args:
        dfs: non-empty sequence; the first element may be a DataFrame or
            anything `pandas.DataFrame(...)` accepts (e.g. a named Series).
        how: join type forwarded to every `join` call.
        on: join key forwarded to every `join` call.

    Returns:
        The first element itself when it is a DataFrame and there is nothing
        to join, otherwise the accumulated join result.
    """
    head = dfs[0]
    joined = head if isinstance(head, pandas.DataFrame) else pandas.DataFrame(head)

    for position in range(1, len(dfs)):
        joined = joined.join(dfs[position], how=how, on=on)

    return joined

def left_join(a, b, on):
    """Left-join `b` onto `a`, first trimming `b` to the keys present in `a`.

    Args:
        a: left DataFrame containing the key column(s) named by `on`.
        b: right DataFrame (or something `pandas.DataFrame(...)` accepts)
            whose index holds the join keys.
        on: a column name of `a`, or a list of column names matched
            positionally against the index levels of `b`.

    Returns:
        `a.join(b, on=on)` computed against the trimmed right-hand frame.
    """
    right = b if isinstance(b, pandas.DataFrame) else pandas.DataFrame(b)

    if not isinstance(on, list):
        # `keys` is picked up by the query expression through the @-syntax.
        keys = a[on].to_numpy()
        right = right.query(f'index in @keys')
        return a.join(right, on=on)

    # One filter pass per key column, matched against the same-position index level.
    for i in range(len(on)):
        keys = a[on[i]].to_numpy()
        o2 = right.index.names[i]
        right = right.query(f'{o2} in @keys')

    return a.join(right, on=on)

def calc_basic_info(df, on, prefix):
    """Aggregate answer counts and elapsed-time sums per group.

    Args:
        df: frame with 'answered_correctly' and 'prior_question_elapsed_time'.
        on: grouping key(s), also used as the join key between the partial results.
        prefix: text prepended (with an underscore) to every output column.

    Returns:
        Outer join of three aggregations: all rows ('count', 'time_sum'),
        rows answered correctly ('n_true', 'time_sum_true') and rows answered
        incorrectly ('n_false', 'time_sum_false'), all carrying the prefix.
    """
    overall = df.groupby(on).agg({
        'answered_correctly': ['count'],
        'prior_question_elapsed_time': ['sum'],
    })
    overall = overall.set_axis(['count', 'time_sum'], axis=1)

    time_stats = {'prior_question_elapsed_time': ['count', 'sum']}

    correct = df.query('answered_correctly == 1').groupby(on).agg(time_stats)
    correct = correct.set_axis(['n', 'time_sum'], axis=1).add_suffix('_true')

    wrong = df.query('answered_correctly == 0').groupby(on).agg(time_stats)
    wrong = wrong.set_axis(['n', 'time_sum'], axis=1).add_suffix('_false')

    combined = join_df([overall, correct, wrong], how='outer', on=on)
    return combined.add_prefix(f'{prefix}_')

In [None]:
import numba

@numba.jitclass([
    ('user_n_answers', numba.uint16[:]),
    ('user_n_corrects', numba.uint16[:]),
    ('user_n_q_parts', numba.uint16[:, :]),
    ('user_n_q_tags', numba.uint16[:, :]),
])
class UserQCount:
    """Per-user running counters of answered questions.

    State is indexed by the dense user index: total answers, correct answers,
    answers per question part (7 slots) and answers per question tag (188 slots).
    All counters are stored as uint16.
    """

    def __init__(self, max_n_users):
        """Allocate zeroed counters for `max_n_users` users."""
        self.user_n_answers = numpy.zeros(max_n_users, dtype=numpy.uint16)
        self.user_n_corrects = numpy.zeros(max_n_users, dtype=numpy.uint16)
        self.user_n_q_parts = numpy.zeros((max_n_users, 7), dtype=numpy.uint16)
        self.user_n_q_tags = numpy.zeros((max_n_users, 188), dtype=numpy.uint16)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, answered_correctly, q_part, q_tag_0, q_tag_1, q_tag_2, q_tag_3, q_tag_4, q_tag_5, train):
        """Emit the counters seen *before* each question row, optionally updating them.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the counter arrays.
            user_ids, is_questions, answered_correctly, q_part, q_tag_0..q_tag_5:
                per-row input arrays of equal length.
            train: when true, each question row is added to the counters after
                its features have been written.

        Returns:
            Dict of 1-D uint16 arrays: 'user_n_answers', 'user_n_corrects',
            'user_n_q_part' and 'user_n_q_tag_0'..'user_n_q_tag_5'.
        """
        result_user_n_answers = numpy.empty(result_n_rows, dtype=numpy.uint16)
        result_user_n_corrects = numpy.empty(result_n_rows, dtype=numpy.uint16)
        result_user_n_q_part = numpy.empty(result_n_rows, dtype=numpy.uint16)
        # Zero-initialised so tag slots outside 0..187 (e.g. the -1 filler) report 0.
        result_user_n_q_tags = numpy.zeros((result_n_rows, 6), dtype=numpy.uint16)

        result_index = 0

        for i in range(user_ids.shape[0]):
            if is_questions[i]:
                user_index = user_indices[user_ids[i]]

                # Features: counter values before this row is counted.
                result_user_n_answers[result_index] = self.user_n_answers[user_index]
                result_user_n_corrects[result_index] = self.user_n_corrects[user_index]
                # Parts are looked up at `part - 1`, i.e. the 7 slots cover parts 1..7.
                result_user_n_q_part[result_index] = self.user_n_q_parts[user_index, int(q_part[i] - 1)]

                if 0 <= q_tag_0[i] < 188:
                    result_user_n_q_tags[result_index, 0] = self.user_n_q_tags[user_index, int(q_tag_0[i])]
                if 0 <= q_tag_1[i] < 188:
                    result_user_n_q_tags[result_index, 1] = self.user_n_q_tags[user_index, int(q_tag_1[i])]
                if 0 <= q_tag_2[i] < 188:
                    result_user_n_q_tags[result_index, 2] = self.user_n_q_tags[user_index, int(q_tag_2[i])]
                if 0 <= q_tag_3[i] < 188:
                    result_user_n_q_tags[result_index, 3] = self.user_n_q_tags[user_index, int(q_tag_3[i])]
                if 0 <= q_tag_4[i] < 188:
                    result_user_n_q_tags[result_index, 4] = self.user_n_q_tags[user_index, int(q_tag_4[i])]
                if 0 <= q_tag_5[i] < 188:
                    result_user_n_q_tags[result_index, 5] = self.user_n_q_tags[user_index, int(q_tag_5[i])]

                result_index += 1

                # State update happens only in training mode, after the features were read.
                if train:
                    self.user_n_answers[user_index] += 1
                    self.user_n_corrects[user_index] += answered_correctly[i]
                    self.user_n_q_parts[user_index, int(q_part[i] - 1)] += 1

                    if 0 <= q_tag_0[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_0[i])] += 1
                    if 0 <= q_tag_1[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_1[i])] += 1
                    if 0 <= q_tag_2[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_2[i])] += 1
                    if 0 <= q_tag_3[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_3[i])] += 1
                    if 0 <= q_tag_4[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_4[i])] += 1
                    if 0 <= q_tag_5[i] < 188:
                        self.user_n_q_tags[user_index, int(q_tag_5[i])] += 1

        # NOTE(review): the .astype() calls copy each column slice; presumably this is
        # needed so every dict value has the same contiguous array type under numba -
        # confirm before removing them.
        return {
            'user_n_answers': result_user_n_answers,
            'user_n_corrects': result_user_n_corrects,
            'user_n_q_part': result_user_n_q_part,
            'user_n_q_tag_0': result_user_n_q_tags[:, 0].astype(numpy.uint16),
            'user_n_q_tag_1': result_user_n_q_tags[:, 1].astype(numpy.uint16),
            'user_n_q_tag_2': result_user_n_q_tags[:, 2].astype(numpy.uint16),
            'user_n_q_tag_3': result_user_n_q_tags[:, 3].astype(numpy.uint16),
            'user_n_q_tag_4': result_user_n_q_tags[:, 4].astype(numpy.uint16),
            'user_n_q_tag_5': result_user_n_q_tags[:, 5].astype(numpy.uint16),
        }
    
@numba.jitclass([
    ('user_n_lectures', numba.uint16[:]),
    ('user_n_l_parts', numba.uint16[:, :]),
    ('user_n_l_tags', numba.uint16[:, :]),
    ('user_n_l_types', numba.uint16[:, :]),
])
class UserLCount:
    """Per-user running counters of watched lectures.

    State is indexed by the dense user index: total lectures, lectures per
    part (7 slots), per tag (188 slots) and per lecture type (4 slots).
    """

    def __init__(self, max_n_users):
        """Allocate zeroed counters for `max_n_users` users."""
        self.user_n_lectures = numpy.zeros(max_n_users, dtype=numpy.uint16)
        self.user_n_l_parts = numpy.zeros((max_n_users, 7), dtype=numpy.uint16)
        self.user_n_l_tags = numpy.zeros((max_n_users, 188), dtype=numpy.uint16)
        self.user_n_l_types = numpy.zeros((max_n_users, 4), dtype=numpy.uint16)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, q_part, l_part, type_of, l_tag, q_tag_0, q_tag_1, q_tag_2, q_tag_3, q_tag_4, q_tag_5, train):
        """Emit lecture counters for every question row; count lecture rows when training.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the counter arrays.
            user_ids, is_questions: per-row inputs of equal length.
            q_part, q_tag_0..q_tag_5: question attributes, read on question rows.
            l_part, type_of, l_tag: lecture attributes, read on lecture rows.
            train: when true, lecture rows are added to the owning user's counters.

        Returns:
            Dict of 1-D uint16 arrays: 'user_n_lectures', 'user_n_l_part',
            'user_n_l_tag_0'..'user_n_l_tag_5' and 'user_n_l_type_0'..'user_n_l_type_3'.
        """
        result_user_n_lectures = numpy.empty(result_n_rows, dtype=numpy.uint16)
        result_user_n_l_part = numpy.empty(result_n_rows, dtype=numpy.uint16)
        # Zero-initialised so tag slots outside 0..187 report 0.
        result_user_n_l_tags = numpy.zeros((result_n_rows, 6), dtype=numpy.uint16)
        result_user_n_l_types = numpy.empty((result_n_rows, 4), dtype=numpy.uint16)

        result_index = 0

        for i in range(user_ids.shape[0]):
            # Resolve the user for EVERY row. Previously this was done only on
            # question rows, so a lecture row updated the counters of whichever
            # user owned the last question row seen before it.
            user_index = user_indices[user_ids[i]]

            if is_questions[i]:
                result_user_n_lectures[result_index] = self.user_n_lectures[user_index]
                # Lectures watched in the same part as the current question.
                result_user_n_l_part[result_index] = self.user_n_l_parts[user_index, int(q_part[i] - 1)]
                result_user_n_l_types[result_index, :] = self.user_n_l_types[user_index, :]

                # Lectures watched for each of the question's tags.
                if 0 <= q_tag_0[i] < 188:
                    result_user_n_l_tags[result_index, 0] = self.user_n_l_tags[user_index, int(q_tag_0[i])]
                if 0 <= q_tag_1[i] < 188:
                    result_user_n_l_tags[result_index, 1] = self.user_n_l_tags[user_index, int(q_tag_1[i])]
                if 0 <= q_tag_2[i] < 188:
                    result_user_n_l_tags[result_index, 2] = self.user_n_l_tags[user_index, int(q_tag_2[i])]
                if 0 <= q_tag_3[i] < 188:
                    result_user_n_l_tags[result_index, 3] = self.user_n_l_tags[user_index, int(q_tag_3[i])]
                if 0 <= q_tag_4[i] < 188:
                    result_user_n_l_tags[result_index, 4] = self.user_n_l_tags[user_index, int(q_tag_4[i])]
                if 0 <= q_tag_5[i] < 188:
                    result_user_n_l_tags[result_index, 5] = self.user_n_l_tags[user_index, int(q_tag_5[i])]

                result_index += 1
            else:
                # Lecture row: only mutates state, and only in training mode.
                if train:
                    self.user_n_lectures[user_index] += 1
                    self.user_n_l_parts[user_index, int(l_part[i] - 1)] += 1
                    self.user_n_l_types[user_index, int(type_of[i])] += 1
                    self.user_n_l_tags[user_index, int(l_tag[i])] += 1

        # NOTE(review): the .astype() calls copy each column slice; presumably needed so
        # every dict value has the same contiguous array type under numba - confirm.
        return {
            'user_n_lectures': result_user_n_lectures,
            'user_n_l_part': result_user_n_l_part,
            'user_n_l_tag_0': result_user_n_l_tags[:, 0].astype(numpy.uint16),
            'user_n_l_tag_1': result_user_n_l_tags[:, 1].astype(numpy.uint16),
            'user_n_l_tag_2': result_user_n_l_tags[:, 2].astype(numpy.uint16),
            'user_n_l_tag_3': result_user_n_l_tags[:, 3].astype(numpy.uint16),
            'user_n_l_tag_4': result_user_n_l_tags[:, 4].astype(numpy.uint16),
            'user_n_l_tag_5': result_user_n_l_tags[:, 5].astype(numpy.uint16),
            'user_n_l_type_0': result_user_n_l_types[:, 0].astype(numpy.uint16),
            'user_n_l_type_1': result_user_n_l_types[:, 1].astype(numpy.uint16),
            'user_n_l_type_2': result_user_n_l_types[:, 2].astype(numpy.uint16),
            'user_n_l_type_3': result_user_n_l_types[:, 3].astype(numpy.uint16),
        }
    
@numba.jitclass([
    ('user_prev_question_timestamp', numba.int64[:]),
    ('user_prev_lecture_timestamp', numba.int64[:]),
    ('user_prev_question_timestamp_nonzero', numba.int64[:]),
])
class UserInterval:
    """Per-user timestamps of the most recent question and lecture.

    All stored timestamps start at 0, so before a user's first recorded event
    the reported interval equals the row's own timestamp.
    """

    def __init__(self, max_n_users):
        """Allocate zeroed timestamp slots for `max_n_users` users."""
        self.user_prev_question_timestamp = numpy.zeros(max_n_users, dtype=numpy.int64)
        self.user_prev_lecture_timestamp = numpy.zeros(max_n_users, dtype=numpy.int64)
        self.user_prev_question_timestamp_nonzero = numpy.zeros(max_n_users, dtype=numpy.int64)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, timestamps, train):
        """Emit time since the user's previous question/lecture for each question row.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the state arrays.
            user_ids, is_questions, timestamps: per-row inputs of equal length.
            train: when true, the stored timestamps are updated after each row.

        Returns:
            Dict of 1-D int64 arrays: 'user_interval_last_question',
            'user_interval_last_question_nonzero' and 'user_interval_last_lecture'.
        """
        result_interval_last_question = numpy.empty(result_n_rows, dtype=numpy.int64)
        result_interval_last_lecture = numpy.empty(result_n_rows, dtype=numpy.int64)
        result_interval_last_question_nonzero = numpy.empty(result_n_rows, dtype=numpy.int64)

        result_index = 0

        for i in range(user_ids.shape[0]):
            # Resolve the user for EVERY row. Previously this was done only on
            # question rows, so a lecture row stamped the lecture time onto the
            # user of the last question row seen before it.
            user_index = user_indices[user_ids[i]]

            if is_questions[i]:
                result_interval_last_question[result_index] = timestamps[i] - self.user_prev_question_timestamp[user_index]
                result_interval_last_lecture[result_index] = timestamps[i] - self.user_prev_lecture_timestamp[user_index]
                result_interval_last_question_nonzero[result_index] = timestamps[i] - self.user_prev_question_timestamp_nonzero[user_index]
                result_index += 1

            if train:
                if is_questions[i]:
                    self.user_prev_question_timestamp[user_index] = timestamps[i]
                    # NOTE(review): the stored value starts at 0 and is only replaced when
                    # it is GREATER than the current timestamp, so for non-negative
                    # timestamps this branch never runs and the "_nonzero" interval equals
                    # the raw timestamp. The intended rule is not clear from this file, so
                    # the behaviour is left unchanged - confirm and fix.
                    if self.user_prev_question_timestamp_nonzero[user_index] > timestamps[i]:
                        self.user_prev_question_timestamp_nonzero[user_index] = timestamps[i]
                else:
                    self.user_prev_lecture_timestamp[user_index] = timestamps[i]

        return {
            'user_interval_last_question': result_interval_last_question,
            'user_interval_last_question_nonzero': result_interval_last_question_nonzero,
            'user_interval_last_lecture': result_interval_last_lecture,
        }
    
@numba.jitclass([
    ('rate', numba.float32[:]),
    ('user_moving_accuracy', numba.float32[:, :]),
])
class UserMovingAverage:
    """Per-user exponentially weighted accuracy at three smoothing rates.

    Each answer updates the stored value as
    `acc = acc * (1 - 1/rate) + answered_correctly * (1/rate)` for rate in
    (1, 10, 100); with rate 1 the value is simply the last answer. Every slot
    starts at 0.5.
    """

    def __init__(self, max_n_users):
        """Allocate one accuracy slot per user and rate, initialised to 0.5."""
        self.rate = numpy.array([1, 10, 100], dtype=numpy.float32)
        self.user_moving_accuracy = numpy.full((max_n_users, len(self.rate)), 0.5, dtype=numpy.float32)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, answered_correctly, train):
        """Emit the moving accuracies seen before each question row.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the state array.
            user_ids, is_questions, answered_correctly: per-row inputs of equal length.
            train: when true, the averages are updated with the row's answer
                after its features have been written.

        Returns:
            Dict of 1-D float32 arrays keyed 'user_accuracy_exponential_mean_1',
            '..._10' and '..._100'.
        """
        result_user_moving_accuracy = numpy.empty((result_n_rows, len(self.rate)), dtype=numpy.float32)
        result_index = 0

        for i in range(user_ids.shape[0]):
            if is_questions[i]:
                user_index = user_indices[user_ids[i]]

                # Copy the current averages out (only the index j is used here).
                for j, rate in enumerate(self.rate):
                    result_user_moving_accuracy[result_index, j] = self.user_moving_accuracy[user_index, j]

                result_index += 1

                if train:
                    # Decay the old value, then blend in the new answer.
                    for j, rate in enumerate(self.rate):
                        self.user_moving_accuracy[user_index, j] *= (1 - (1 / rate))
                        self.user_moving_accuracy[user_index, j] += answered_correctly[i] * (1 / rate)

        return {
            'user_accuracy_exponential_mean_1': result_user_moving_accuracy[:, 0].astype(numpy.float32),
            'user_accuracy_exponential_mean_10': result_user_moving_accuracy[:, 1].astype(numpy.float32),
            'user_accuracy_exponential_mean_100': result_user_moving_accuracy[:, 2].astype(numpy.float32),
        }

@numba.jitclass([
    ('user_correct_memory', numba.float32[:, :]),
    ('user_timestamp_memory', numba.float32[:, :]),
])
class UserPredictTrend:
    """Per-user linear trend of correctness over time from the last 100 answers.

    Two 100-slot buffers per user hold the answer outcome and its timestamp;
    unused slots are NaN. A straight line is fitted through the stored points
    and evaluated at the current row's timestamp.
    """

    def __init__(self, max_n_users):
        """Allocate NaN-filled 100-slot buffers for `max_n_users` users."""
        self.user_correct_memory = numpy.full((max_n_users, 100), numpy.nan, dtype=numpy.float32)
        self.user_timestamp_memory = numpy.full((max_n_users, 100), numpy.nan, dtype=numpy.float32)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, answered_correctly, timestamps, train):
        """Emit the fitted prediction and slope for each question row.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the buffers.
            user_ids, is_questions, answered_correctly, timestamps: per-row
                inputs of equal length.
            train: when true, the row is appended to the user's buffers after
                its features have been written.

        Returns:
            Dict with float32 arrays 'user_accuracy_predict' (line evaluated at
            the row timestamp) and 'user_accuracy_trend' (slope). With fewer
            than two stored answers these are 0.65 and 0.0.
        """
        result_user_accuracy_predict = numpy.empty(result_n_rows, dtype=numpy.float32)
        result_user_accuracy_trend = numpy.empty(result_n_rows, dtype=numpy.float32)
        result_index = 0

        for i in range(user_ids.shape[0]):
            if is_questions[i]:
                user_index = user_indices[user_ids[i]]

                # Number of filled (non-NaN) slots for this user.
                samples = numpy.count_nonzero(numpy.isfinite(self.user_correct_memory[user_index]))

                if samples >= 2:
                    x = self.user_timestamp_memory[user_index]
                    y = self.user_correct_memory[user_index]

                    # Sums for the least-squares line; nansum skips empty slots.
                    coeff1 = numpy.nansum(x * y)
                    coeff2 = numpy.nansum(x)
                    coeff3 = numpy.nansum(y)
                    coeff4 = numpy.nansum(x ** 2)

                    # slope = (Sxy - Sx*Sy/n) / (Sxx - Sx^2/n); 1e-10 guards a zero denominator.
                    # NOTE(review): timestamps are held as float32 and squared here, which
                    # presumably loses precision for large values - confirm accuracy is acceptable.
                    a = (coeff1 - (coeff2 * coeff3) / samples) / (coeff4 - (coeff2**2) / samples + 1e-10)
                    # intercept = mean(y - slope * x)
                    b = numpy.nansum(y - a * x) / samples

                    result_user_accuracy_predict[result_index] = timestamps[i] * a + b
                    result_user_accuracy_trend[result_index] = a
                else:
                    # Not enough history for a fit: fixed fallback values.
                    result_user_accuracy_predict[result_index] = 0.65
                    result_user_accuracy_trend[result_index] = 0.0

                result_index += 1

                if train:
                    # Shift the buffers left by one and store the newest answer in the last slot.
                    self.user_correct_memory[user_index] = numpy.roll(self.user_correct_memory[user_index], -1)
                    self.user_timestamp_memory[user_index] = numpy.roll(self.user_timestamp_memory[user_index], -1)
                    self.user_correct_memory[user_index, -1] = answered_correctly[i]
                    self.user_timestamp_memory[user_index, -1] = timestamps[i]

        return {
            'user_accuracy_predict': result_user_accuracy_predict,
            'user_accuracy_trend': result_user_accuracy_trend,
        }
    
@numba.jitclass([
    ('user_question_history', numba.int16[:, :]),
])
class UserAttempts:
    """Per-user buffer of the last 100 answered content ids (-1 marks an empty slot)."""

    def __init__(self, max_n_users):
        """Allocate a 100-slot history filled with -1 for `max_n_users` users."""
        self.user_question_history = numpy.full((max_n_users, 100), -1, dtype=numpy.int16)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, content_ids, train):
        """Count how often each question's content id occurs in the user's history.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the history array.
            user_ids, is_questions, content_ids: per-row inputs of equal length.
            train: when true, the row's content id is pushed into the history
                after its feature has been written.

        Returns:
            Dict with the uint16 array 'user_question_n_attempts'. Only the 100
            most recent stored questions are visible to the count.
        """
        result_user_question_n_attempts = numpy.empty(result_n_rows, dtype=numpy.uint16)
        result_index = 0

        for i in range(user_ids.shape[0]):
            if is_questions[i]:
                user_index = user_indices[user_ids[i]]
                result_user_question_n_attempts[result_index] = numpy.count_nonzero(self.user_question_history[user_index] == content_ids[i])
                result_index += 1

                if train:
                    # Shift the history right by one and store the newest id in slot 0.
                    self.user_question_history[user_index] = numpy.roll(self.user_question_history[user_index], 1)
                    self.user_question_history[user_index, 0] = content_ids[i]

        return {
            'user_question_n_attempts': result_user_question_n_attempts,
        }
    
@numba.jitclass([
    ('user_n_answers', numba.uint16[:]),
    ('user_n_corrects', numba.uint16[:]),
    ('user_sum_true_time', numba.float32[:]),
    ('user_sum_false_time', numba.float32[:]),
])
class UserMeanPrevTime:
    """Per-user mean of `prior_question_elapsed_time`, overall and by outcome.

    Only rows with a known (non-NaN) elapsed time are accumulated, so
    `user_n_answers` / `user_n_corrects` count timed answers and match the
    sums they are divided into.
    """

    def __init__(self, max_n_users):
        """Allocate zeroed counters and time sums for `max_n_users` users."""
        self.user_n_answers = numpy.zeros(max_n_users, dtype=numpy.uint16)
        self.user_n_corrects = numpy.zeros(max_n_users, dtype=numpy.uint16)
        self.user_sum_true_time = numpy.zeros(max_n_users, dtype=numpy.float32)
        self.user_sum_false_time = numpy.zeros(max_n_users, dtype=numpy.float32)

    def calc(self, result_n_rows, user_indices, user_ids, is_questions, answered_correctly, prior_question_elapsed_time, train):
        """Emit the mean elapsed times seen before each question row.

        Args:
            result_n_rows: number of output rows to allocate; one output row is
                written per row where `is_questions` is true.
            user_indices: mapping from user id to row index in the state arrays.
            user_ids, is_questions, answered_correctly,
            prior_question_elapsed_time: per-row inputs of equal length.
            train: when true, the row is accumulated after its features have
                been written.

        Returns:
            Dict of float32 arrays: 'user_mean_time' (all timed answers),
            'user_mean_time_true' (timed correct answers) and
            'user_mean_time_false' (timed incorrect answers). A mean over zero
            answers is 0 thanks to the 1e-10 term in the denominator.
        """
        result_user_mean_time = numpy.empty(result_n_rows, dtype=numpy.float32)
        result_user_mean_time_true = numpy.empty(result_n_rows, dtype=numpy.float32)
        result_user_mean_time_false = numpy.empty(result_n_rows, dtype=numpy.float32)
        result_index = 0

        for i in range(user_ids.shape[0]):
            if is_questions[i]:
                user_index = user_indices[user_ids[i]]

                sum_true = self.user_sum_true_time[user_index]
                sum_false = self.user_sum_false_time[user_index]
                # Work in floats so the subtraction below cannot wrap in unsigned arithmetic.
                n_all = float(self.user_n_answers[user_index])
                n_true = float(self.user_n_corrects[user_index])
                n_false = n_all - n_true

                result_user_mean_time[result_index] = (sum_true + sum_false) / (n_all + 1e-10)
                # Each per-outcome mean is divided by the number of answers with that
                # outcome (previously both were divided by the total answer count).
                result_user_mean_time_true[result_index] = sum_true / (n_true + 1e-10)
                result_user_mean_time_false[result_index] = sum_false / (n_false + 1e-10)

                result_index += 1

                if train:
                    elapsed = prior_question_elapsed_time[i]
                    # A missing elapsed time arrives as NaN; adding it would turn the
                    # running sum into NaN permanently, so such rows are skipped.
                    if not numpy.isnan(elapsed):
                        self.user_n_answers[user_index] += 1
                        if answered_correctly[i]:
                            self.user_n_corrects[user_index] += 1
                            self.user_sum_true_time[user_index] += elapsed
                        else:
                            self.user_sum_false_time[user_index] += elapsed

        return {
            'user_mean_time': result_user_mean_time,
            'user_mean_time_true': result_user_mean_time_true,
            'user_mean_time_false': result_user_mean_time_false,
        }
    
class Statistics:
    """Turns raw interaction rows into a float32 feature matrix for question rows.

    Holds the question/lecture lookup tables, per-question aggregate
    statistics (`append_content`) and the per-user running state objects
    (`reset_user`) that `process` reads and, in training mode, updates.
    """

    def __init__(self):
        """Load the question and lecture tables and create empty per-user state."""
        self.questions_df = read_questions_csv()
        self.lectures_df = read_lectures_csv()
        # Feature names excluded from the matrix returned by `process`.
        self.drop_features = []

        self.reset_user()

    def append_content(self, nrows=None, chunksize=None):
        """Compute per-question accuracy/attempt counts and per-bundle sizes.

        Args:
            nrows: forwarded to `read_train_csv`; None reads every row.
            chunksize: forwarded to `read_train_csv`; None reads everything in
                a single frame.

        Sets `self.bundle_count` (questions per bundle) and
        `self.question_info` (columns 'q_accuracy', 'q_attempt' indexed by
        content id).
        """
        chunks = read_train_csv(nrows=nrows, chunksize=chunksize)
        # Without a chunksize pandas returns one DataFrame; iterating that would
        # yield column labels, so wrap it to keep the loop below uniform.
        if isinstance(chunks, pandas.DataFrame):
            chunks = [chunks]

        df_qs = []

        for df in chunks:
            df = df.loc[:, ['user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time']]

            # Only question rows (content_type_id == 0) feed the statistics below;
            # lecture rows were previously collected too but never used.
            df_q = df.iloc[df['content_type_id'].to_numpy() == 0]
            df_q = df_q.drop(columns='content_type_id')
            df_q = df_q.join(self.questions_df, on='content_id')

            df_qs.append(df_q)

        df_q = pandas.concat(df_qs)

        self.bundle_count = self.questions_df['bundle_id'].value_counts().rename('bundle_count')

        self.question_info = join_df([
            df_q.groupby('content_id')
                .agg({
                    'answered_correctly': ['mean', 'count'],
                })
                .set_axis(['q_accuracy', 'q_attempt'], axis=1)
                .astype({'q_accuracy': 'float32', 'q_attempt': 'uint32'}),
        ], how='outer', on='content_id')

    def reset_user(self):
        """Discard all per-user state and start with empty counters."""
        max_n_users = 500000
        # Capacity of every per-user array; checked in `process`.
        self.max_n_users = max_n_users
        self.user_q_count = UserQCount(max_n_users)
        self.user_l_count = UserLCount(max_n_users)
        self.user_interval = UserInterval(max_n_users)
        self.user_moving_average = UserMovingAverage(max_n_users)
        self.user_predict_trend = UserPredictTrend(max_n_users)
        self.user_attempts = UserAttempts(max_n_users)
        self.user_mean_time = UserMeanPrevTime(max_n_users)
        # user id -> dense row index into the per-user arrays.
        self.user_index = numba.typed.Dict.empty(key_type=numba.uint32, value_type=numba.uint32)

    @staticmethod
    @numba.njit
    def add_users(user_indices, user_ids):
        """Assign the next free dense index to every user id not seen before."""
        for i in range(user_ids.shape[0]):
            if user_ids[i] not in user_indices:
                user_indices[user_ids[i]] = len(user_indices)

    def process(self, df, train):
        """Build the feature matrix for the question rows of `df`.

        Args:
            df: interaction rows with at least 'timestamp', 'user_id',
                'content_id', 'content_type_id' and
                'prior_question_elapsed_time'; 'answered_correctly' is
                optional and treated as all zeros when absent.
            train: when true the per-user state is updated with these rows.

        Returns:
            float32 array with one row per question row of `df` and one column
            per entry of `self.feature_names` (also refreshed here, together
            with `self.categorical_features`).

        Raises:
            ValueError: if more distinct users were seen than the per-user
                arrays can hold.
        """
        df = left_join(df, self.questions_df, on='content_id').rename(columns={'part': 'q_part'})
        df = left_join(df, self.lectures_df, on='content_id').rename(columns={'part': 'l_part'})
        df = left_join(df, self.bundle_count, on='bundle_id')

        dfs = {column: df[column].to_numpy() for column in df.columns}

        # Use this instance's own helper instead of a module-level `statistics` variable.
        self.add_users(self.user_index, dfs['user_id'])

        # The compiled state classes index fixed-size arrays by the dense user
        # index, so refuse to continue once the capacity is exceeded.
        if len(self.user_index) > self.max_n_users:
            raise ValueError(
                f'number of distinct users ({len(self.user_index)}) exceeds capacity ({self.max_n_users})'
            )

        is_questions = dfs['content_type_id'] == 0

        if 'answered_correctly' not in dfs:
            dfs['answered_correctly'] = numpy.zeros_like(dfs['user_id'])

        result_n_rows = numpy.count_nonzero(is_questions)
        # Insertion order of `result` defines the feature column order.
        result = {}
        result.update(self.user_q_count.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['answered_correctly'], dfs['q_part'], dfs['q_tag_0'], dfs['q_tag_1'], dfs['q_tag_2'], dfs['q_tag_3'], dfs['q_tag_4'], dfs['q_tag_5'], train))
        result.update(self.user_l_count.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['q_part'], dfs['l_part'], dfs['type_of'], dfs['tag'], dfs['q_tag_0'], dfs['q_tag_1'], dfs['q_tag_2'], dfs['q_tag_3'], dfs['q_tag_4'], dfs['q_tag_5'], train))
        result.update(self.user_interval.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['timestamp'], train))
        result.update(self.user_moving_average.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['answered_correctly'], train))
        result.update(self.user_predict_trend.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['answered_correctly'], dfs['timestamp'], train))
        result.update(self.user_attempts.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['content_id'], train))
        result.update(self.user_mean_time.calc(result_n_rows, self.user_index, dfs['user_id'], is_questions, dfs['answered_correctly'], dfs['prior_question_elapsed_time'], train))

        # 1e-10 keeps the ratio defined (as 0) for users without any answers.
        result['user_accuracy'] = result['user_n_corrects'] / (result['user_n_answers'] + 1e-10)

        df_q = df.iloc[is_questions, :]
        df_q = left_join(df_q, self.question_info, on='content_id')
        # Timestamp divided by the number of milliseconds in an hour.
        result['timestamp_in_hour'] = (df_q['timestamp'].to_numpy() / (60 * 60 * 1000))
        result['question_accuracy'] = df_q['q_accuracy'].to_numpy()
        result['question_n_attempt'] = df_q['q_attempt'].to_numpy()
        result['question_bundle_count'] = df_q['bundle_count'].to_numpy()
        result['part'] = df_q['q_part'].to_numpy() - 1
        result['is_reading_question'] = df_q['q_part'].to_numpy() >= 5

        for tag_slot in range(6):
            result[f'q_tag_{tag_slot}'] = df_q[f'q_tag_{tag_slot}'].to_numpy()

        result['prior_question_elapsed_time'] = df_q['prior_question_elapsed_time'].to_numpy()

        result['accuracy_diff_u_q'] = result['user_accuracy'] - result['question_accuracy']
        result['user_first_attempt'] = result['user_n_answers'] == 0

        dropped = set(self.drop_features)
        self.feature_names = [name for name in result.keys() if name not in dropped]
        # Fixed listing order so the attribute is identical from run to run.
        categorical_candidates = ['part', 'user_first_attempt', 'q_tag_0', 'q_tag_1', 'q_tag_2', 'q_tag_3', 'q_tag_4', 'q_tag_5']
        self.categorical_features = [name for name in categorical_candidates if name not in dropped]

        return numpy.stack([result[f] for f in self.feature_names], axis=1).astype(numpy.float32)

    def apply_train(self, df):
        """Return (features, targets) for the question rows of `df`, updating user state."""
        return self.process(df, train=True), df.loc[df['content_type_id'].to_numpy() == 0, 'answered_correctly'].to_numpy()

    def apply_test(self, df):
        """Return features for the question rows of `df` without touching user state."""
        return self.process(df, train=False)
    
def sample_dataset(x, t, sample):
    """Randomly pick `sample` aligned rows from `x` and `t` without replacement.

    Args:
        x: array whose first axis indexes rows.
        t: array with the same number of rows as `x`.
        sample: desired number of rows (truncated with `int`).

    Returns:
        `(x, t)` unchanged when `sample` is not smaller than the row count,
        otherwise the same randomly chosen rows of both arrays, in the order
        produced by `random.sample`.
    """
    assert(x.shape[0] == t.shape[0])
    n_rows = int(x.shape[0])

    # Nothing to thin out: hand back the inputs themselves.
    if not sample < n_rows:
        return x, t

    chosen = random.sample(range(n_rows), int(sample))
    return x[chosen], t[chosen]
    
def statistics_make_train(statistics, nrows=None, chunksize=None, sample=None):
    """Build the training feature matrix and targets by streaming the training CSV.

    The per-user state of `statistics` is reset first and then updated chunk
    by chunk, so features of later rows see the earlier rows.

    Args:
        statistics: object providing `reset_user()` and `apply_train(df)`.
        nrows: forwarded to `read_train_csv`; None reads every row.
        chunksize: forwarded to `read_train_csv`; None processes the whole
            file as a single chunk.
        sample: optional target number of rows. Each chunk is thinned to about
            1.5x its proportional share and the concatenation is sampled down
            to `sample` rows at the end.

    Returns:
        Tuple `(x, t)` of the feature matrix and the matching targets.
    """
    xs = []
    ts = []

    if sample is not None:
        # Without `nrows` the file size is taken as 1e8 rows for the rate estimate.
        total_rows = 1e8 if nrows is None else nrows
        sample_rate = sample / total_rows

    statistics.reset_user()

    chunks = read_train_csv(nrows=nrows, chunksize=chunksize)
    # Without a chunksize pandas returns one DataFrame; enumerating that would
    # yield column labels instead of frames, so treat it as a single chunk.
    if isinstance(chunks, pandas.DataFrame):
        chunks = [chunks]

    for i, df in enumerate(chunks):
        print(f'chunk {i}.')
        x, t = statistics.apply_train(df)

        if sample is not None:
            # Oversample by 1.5x per chunk; the exact size is enforced after concatenation.
            x, t = sample_dataset(x, t, int(x.shape[0] * sample_rate * 1.5))

        xs.append(x)
        ts.append(t)

    x = numpy.concatenate(xs, axis=0)
    t = numpy.concatenate(ts, axis=0)

    if sample is not None:
        x, t = sample_dataset(x, t, int(sample))

    return x, t

# Create the feature builder and compute its per-question statistics from the first 1e5 rows.
statistics = Statistics()
statistics.append_content(nrows=1e5, chunksize=1e7)

# Line-profile Statistics.process while building features for 1e5 rows; the report is
# also written to lprun_statistics_process.txt via -T.
%lprun -f Statistics.process -T lprun_statistics_process.txt statistics_make_train(statistics, nrows=1e5, chunksize=1e6)

In [None]:
# Show the feature column names set by the most recent Statistics.process call.
statistics.feature_names

In [None]:
import lightgbm
import sklearn.model_selection
import sklearn.inspection
import sklearn.ensemble
import random

def make_validation(statistics, nrows=None, chunksize=None, train_rows=1e6, test_rows=1e6):
    """Build a random, disjoint train/test split of the generated feature rows.

    Args:
        statistics: feature builder passed on to `statistics_make_train`.
        nrows, chunksize: forwarded to `statistics_make_train`.
        train_rows: requested number of training rows.
        test_rows: requested number of test rows.

    Returns:
        Tuple `(train_x, test_x, train_t, test_t)`. When fewer rows are
        available than requested, every available row is used and the split
        keeps the requested train/test proportion instead of raising.
    """
    requested_rows = train_rows + test_rows
    x, t = statistics_make_train(statistics, nrows=nrows, chunksize=chunksize, sample=requested_rows)
    assert(x.shape[0] == t.shape[0])
    available_rows = int(x.shape[0])

    n_train = int(train_rows)
    n_total = int(requested_rows)
    if n_total > available_rows:
        # random.sample raises ValueError when asked for more items than exist,
        # so shrink both parts proportionally to what is actually available.
        n_train = int(available_rows * train_rows / requested_rows)
        n_total = available_rows

    index = random.sample(range(available_rows), n_total)
    train_index = index[:n_train]
    test_index = index[n_train:]

    return x[train_index], x[test_index], t[train_index], t[test_index]

def benchmark_classifier(classifier, statistics, nrows=None, chunksize=None, train_rows=1e6, test_rows=1e6):
    """Fit `classifier` on a random split and report AUC plus feature importances.

    Args:
        classifier: estimator whose `fit` accepts `feature_name` and
            `categorical_feature` and which provides `predict_proba`.
        statistics: feature builder used to create the data.
        nrows, chunksize, train_rows, test_rows: forwarded to `make_validation`.

    Returns:
        DataFrame of features with their model importance and normalised
        permutation importance (computed on the training split), sorted by the
        latter in descending order. The same table and the scores are printed.
    """
    train_x, test_x, train_t, test_t = make_validation(
        statistics, nrows, chunksize, train_rows, test_rows,
    )

    print(f'train_x.shape = {train_x.shape} ({train_x.nbytes / (1024**2) : .2f} MB)')
    print(f'train_t.shape = {train_t.shape} ({train_t.nbytes / (1024**2) : .2f} MB)')
    print(f'test_x.shape  = {test_x.shape} ({test_x.nbytes / (1024**2) : .2f} MB)')
    print(f'test_t.shape  = {test_t.shape} ({test_t.nbytes / (1024**2) : .2f} MB)')

    classifier.fit(
        train_x,
        train_t,
        feature_name=statistics.feature_names,
        categorical_feature=statistics.categorical_features,
    )

    # Probability of the positive class on each split.
    train_proba = classifier.predict_proba(train_x)[:, 1]
    train_score = sklearn.metrics.roc_auc_score(train_t, train_proba)
    test_proba = classifier.predict_proba(test_x)[:, 1]
    test_score = sklearn.metrics.roc_auc_score(test_t, test_proba)

    print(f'Train score : {train_score}')
    print(f'Test score : {test_score}')

    importances = calc_feature_importances(classifier, statistics)

    permutation = sklearn.inspection.permutation_importance(classifier, train_x, train_t, scoring='roc_auc')
    permutation_importances = permutation.importances_mean

    # Express each permutation importance as a percentage of their total.
    normalised = 100 * permutation_importances / sum(permutation_importances)
    importances['permutation_importance'] = normalised
    importances = importances.sort_values('permutation_importance', ascending=False)

    print(importances)
    return importances

def train_classifier(classifier, statistics, nrows=None, chunksize=None):
    """Fit `classifier` in place on the full (unsampled) generated training set.

    Args:
        classifier: estimator with a `fit(x, t)` method.
        statistics: feature builder used to create the data.
        nrows, chunksize: forwarded to `statistics_make_train`.
    """
    dataset = statistics_make_train(statistics, nrows=nrows, chunksize=chunksize)
    x, t = dataset

    print(f'train_x.shape = {x.shape} ({x.nbytes / (1024**2) : .2f} MB)')
    print(f'train_t.shape = {t.shape} ({t.nbytes / (1024**2) : .2f} MB)')

    # Feature names / categorical hints are intentionally not passed here.
    classifier.fit(x, t)

def calc_feature_importances(classifier, statistics):
    """Tabulate the classifier's built-in feature importances as percentages.

    Args:
        classifier: Fitted estimator exposing `feature_importances_`.
        statistics: Object whose `feature_names` lines up with those importances.

    Returns:
        A DataFrame with columns `feature_name` and `feature_importance`, the
        latter being each feature's share of the total importance times 100.
    """
    importance = classifier.feature_importances_
    percent_of_total = importance / sum(importance) * 100

    columns = {
        'feature_name': statistics.feature_names,
        'feature_importance': percent_of_total,
    }
    return pandas.DataFrame(data=columns)

# Row limit forwarded as `nrows` to every call below; None means no limit is passed on.
use_rows = None
# NOTE(review): `append_content` is defined outside this cell -- presumably it
# builds the per-content statistics from the training data; confirm.
statistics.append_content(nrows=use_rows, chunksize=1e7)

# Single LightGBM model, used only for the two benchmark runs below.
# NOTE(review): the `silent` keyword is not accepted by newer LightGBM
# releases -- verify against the installed version.
classifier = lightgbm.LGBMClassifier(silent=False, class_weight='balanced', objective='binary')

# First benchmark: the returned table is sorted by permutation importance, largest first.
importances = benchmark_classifier(classifier, statistics, nrows=use_rows, chunksize=5e6, train_rows=1e6, test_rows=1e6)
# Column 0 of that table is `feature_name`, so this marks every feature ranked
# below the top 8 to be dropped. The 8 is a hard-coded cut-off.
statistics.drop_features = list(importances.iloc[8:, 0])

# Second benchmark, run after `drop_features` has been set; its return value is discarded.
benchmark_classifier(classifier, statistics, nrows=use_rows, chunksize=5e6, train_rows=1e6, test_rows=1e6)

# Final model: a bag of 5 LightGBM classifiers, each fitted on a 50% sample.
# This rebinds `classifier`, which the submission cell uses afterwards.
# NOTE(review): newer scikit-learn releases renamed `base_estimator` to
# `estimator` -- verify against the installed version.
classifier = sklearn.ensemble.BaggingClassifier(
    base_estimator=lightgbm.LGBMClassifier(silent=False, class_weight='balanced', objective='binary'),
    n_estimators=5,
    max_samples=0.5,
)
train_classifier(classifier, statistics, nrows=use_rows, chunksize=1e7)    

# feature_importances = calc_feature_importances(classifier, statistics).sort_values('feature_importance', ascending=False)
# print(feature_importances)

In [None]:
# import lightgbm
# import optuna.integration.lightgbm
# import sklearn.inspection
# import sklearn.metrics

# params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     'seed': 375645910,
#     'is_unbalance': True,
# }

# def make_validation(statistics, nrows=None, chunksize=None, train_rows=1e6, test_rows=1e6):
#     total_rows = train_rows + test_rows
#     x, t = statistics_make_train(statistics, nrows=nrows, chunksize=chunksize, sample=total_rows)
#     assert(x.shape[0] == t.shape[0])
#     total_rows = int(x.shape[0])
    
#     index = random.sample(range(total_rows), int(train_rows + test_rows))
#     train_index = index[:int(train_rows)]
#     test_index = index[int(train_rows):]
    
#     return x[train_index], x[test_index], t[train_index], t[test_index]

# def feature_selection(statistics, nrows=None, chunksize=None, train_rows=1e6, test_rows=1e6):
#     train_x, test_x, train_t, test_t = make_validation(statistics, nrows, chunksize, train_rows, test_rows)
    
#     print(f'train_x.shape = {train_x.shape} ({train_x.nbytes / (1024**2) : .2f} MB)')
#     print(f'train_t.shape = {train_t.shape} ({train_t.nbytes / (1024**2) : .2f} MB)')
#     print(f'test_x.shape  = {test_x.shape} ({test_x.nbytes / (1024**2) : .2f} MB)')
#     print(f'test_t.shape  = {test_t.shape} ({test_t.nbytes / (1024**2) : .2f} MB)')
    
#     classifier = lightgbm.LGBMClassifier(silent=False, class_weight='balanced', objective='binary')
#     classifier.fit(train_x, train_t, feature_name=statistics.feature_names, categorical_feature=statistics.categorical_features)
    
#     train_score = sklearn.metrics.roc_auc_score(train_t, classifier.predict_proba(train_x)[:, 1])
#     test_score = sklearn.metrics.roc_auc_score(test_t, classifier.predict_proba(test_x)[:, 1])
#     print(f'Train score : {train_score}')
#     print(f'Test score : {test_score}')
    
#     feature_importances = classifier.feature_importances_
#     permutation_importances = sklearn.inspection.permutation_importance(classifier, train_x, train_t, scoring='roc_auc').importances_mean
    
#     feature_importances_rate = 100 * feature_importances / sum(feature_importances)
#     permutation_importances_rate = 100 * permutation_importances / sum(permutation_importances)
    
#     importances = pandas.DataFrame(index=statistics.feature_names, data={
#         'feature_importance': feature_importances_rate,
#         'permutation_importance': permutation_importances_rate
#     }).sort_values('permutation_importance', ascending=False)
    
#     print(importances)
    
#     statistics.drop_features = importances.index[8:]
    
# def model_optimize(statistics, nrows=None, chunksize=None, train_rows=1e6, test_rows=1e6):
#     train_x, test_x, train_t, test_t = make_validation(statistics, nrows, chunksize, train_rows, test_rows)
    
#     print(f'train_x.shape = {train_x.shape} ({train_x.nbytes / (1024**2) : .2f} MB)')
#     print(f'train_t.shape = {train_t.shape} ({train_t.nbytes / (1024**2) : .2f} MB)')
#     print(f'test_x.shape  = {test_x.shape} ({test_x.nbytes / (1024**2) : .2f} MB)')
#     print(f'test_t.shape  = {test_t.shape} ({test_t.nbytes / (1024**2) : .2f} MB)')
    
#     train = lightgbm.Dataset(train_x, train_t, feature_name=statistics.feature_names, categorical_feature=statistics.categorical_features, free_raw_data=False)
#     test = lightgbm.Dataset(test_x, test_t, feature_name=statistics.feature_names, categorical_feature=statistics.categorical_features, reference=train, free_raw_data=False)
    
#     classifier = optuna.integration.lightgbm.train(params, train_set=train, valid_sets=test, early_stopping_rounds=10, verbose_eval=1000)
    
#     train_score = sklearn.metrics.roc_auc_score(train_t, classifier.predict(train_x))
#     test_score = sklearn.metrics.roc_auc_score(test_t, classifier.predict(test_x))
    
#     print(f'Train score : {train_score}')
#     print(f'Test score : {test_score}')
    
#     print(f'best_params = {classifier.params}')
    
#     return classifier.params

# def create_classifier(statistics, params, nrows=None, chunksize=None):
#     x, t = statistics_make_train(statistics, nrows=nrows, chunksize=chunksize)
    
#     print(f'train_x.shape = {x.shape} ({x.nbytes / (1024**2) : .2f} MB)')
#     print(f'train_t.shape = {t.shape} ({t.nbytes / (1024**2) : .2f} MB)')
    
#     train = lightgbm.Dataset(x, t, feature_name=statistics.feature_names, categorical_feature=statistics.categorical_features)
    
#     classifier = lightgbm.train(params, train_set=train, verbose_eval=100)
#     return classifier

# use_rows = None
# statistics.append_content(nrows=use_rows, chunksize=1e7)
# statistics.drop_features = []

# feature_selection(statistics, nrows=use_rows, chunksize=5e6, train_rows=1e6, test_rows=1e6)
# best_params = model_optimize(statistics, nrows=use_rows, chunksize=5e6, train_rows=1e6, test_rows=1e6)
# classifier = create_classifier(statistics, best_params, nrows=use_rows, chunksize=5e6)

In [None]:
# lightgbm.create_tree_digraph(classifier, show_info=['split_gain', 'internal_count'])

In [None]:
import riiideducation

def submission(statistics, classifier):
    """Run the Kaggle inference loop, feeding revealed answers back into `statistics`.

    For every group yielded by the competition environment:

    1. The answers to the *previous* group are read from the first row of the
       current group's `prior_group_answers_correct` column, attached to the
       saved copy of the previous group, and buffered.
    2. On every 10th group (starting with the second group overall) the
       buffered, labelled groups are passed to
       `statistics.process(..., train=True)`.
    3. Rows with `content_type_id == 0` are turned into features by
       `statistics.apply_test`, scored with `classifier.predict_proba`, and
       column 1 of the result is submitted as `answered_correctly`.

    Args:
        statistics: Object exposing `process(df, train=True)` and `apply_test(df)`.
        classifier: Fitted estimator exposing `predict_proba`.

    Returns:
        None. Predictions are handed to the environment via `env.predict`.

    Raises:
        ValueError, SyntaxError: If `prior_group_answers_correct` does not hold
            a plain Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    env = riiideducation.make_env()

    prev_test_df = None     # previous group, still waiting for its labels
    labelled_groups = []    # labelled groups not yet handed to statistics.process
    counter = 0

    for (test_df, sample_prediction_df) in env.iter_test():
        if prev_test_df is not None:
            # The field is a string-encoded list coming from the test feed.
            # literal_eval parses it without executing arbitrary expressions,
            # which a bare eval() would do.
            prior_answered_correctly = ast.literal_eval(test_df['prior_group_answers_correct'].iat[0])
            if prior_answered_correctly:
                prev_test_df['answered_correctly'] = prior_answered_correctly
                labelled_groups.append(
                    prev_test_df.drop(columns=['prior_group_responses', 'prior_group_answers_correct'])
                )

            # Flush on every 10th group, but only when something was actually
            # labelled; otherwise there is no frame to pass to process().
            if counter % 10 == 0 and labelled_groups:
                statistics.process(pandas.concat(labelled_groups), train=True)
                labelled_groups = []
            counter += 1

        # Keep the full group (all content types) so the answer list revealed
        # with the next group can be attached row for row.
        prev_test_df = test_df.copy()

        # Explicit copy: a column is added below, and writing into a filtered
        # slice would be a chained assignment.
        questions_df = test_df.iloc[test_df['content_type_id'].to_numpy() == 0, :].copy()

        test_x = statistics.apply_test(questions_df)
        prediction = classifier.predict_proba(test_x)
        questions_df['answered_correctly'] = prediction[:, 1]

        submission_df = questions_df.loc[:, ['row_id', 'answered_correctly']]
        env.predict(submission_df)

# Profiling variant of the call below; uses the line_profiler extension loaded
# in the first cell and writes the report to lprun_submission.txt.
# %lprun -f submission -T lprun_submission.txt submission(statistics, classifier)
# Run the inference loop with the global `statistics` and the `classifier`
# bound by the training cell.
submission(statistics, classifier)