In [None]:
import pandas as pd
import numpy as np
import gc

# Passed to train_load(debug=...) to limit the number of rows read; the final
# cell only writes the pickle when this is False.
DEBUG = True
# Name of the label column that features_engineering aggregates per user.
TARGET = "answered_correctly"


In [None]:
def train_load(debug=True, path="../input/riiid-test-answer-prediction/train.csv"):
    """Read the leading rows of the training CSV into a compactly typed frame.

    Parameters
    ----------
    debug : bool, default True
        When true only the first 10**4 rows are read, otherwise the first
        10**7 rows.
    path : str or path-like
        Location of the CSV file.  Defaults to the path this notebook has
        always read from.

    Returns
    -------
    pandas.DataFrame
        The parsed rows without the ``row_id`` column.  Missing
        ``prior_question_elapsed_time`` values are replaced by 0 and the
        column is cast to ``int32`` (any fractional part is truncated).

    Side effects
    ------------
    Writes the ``DataFrame.info()`` summary of the result to stdout.
    """
    rows = 10 ** 4 if debug else 10 ** 7
    train = pd.read_csv(path,
                        low_memory=False,
                        nrows=rows,
                        dtype={'row_id': 'int64',
                               'timestamp': 'int64',
                               'user_id': 'int32',
                               'content_id': 'int16',
                               'content_type_id': 'int8',
                               'task_container_id': 'int16',
                               'user_answer': 'int8',
                               'answered_correctly': 'int8',
                               'prior_question_elapsed_time': 'float32',
                               # nullable dtype: the column contains missing values
                               'prior_question_had_explanation': 'boolean',
                               }
                        ).drop(columns="row_id")

    # A missing elapsed time is encoded as 0 so the column can be stored as an integer.
    train["prior_question_elapsed_time"] = train["prior_question_elapsed_time"].fillna(0).astype("int32")

    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() used to emit a stray "None" line after the summary.
    train.info()
    return train

def question(path="../input/riiid-test-answer-prediction/questions.csv"):
    """Read the questions CSV and print a summary of it.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.  Defaults to the path this notebook has
        always read from.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    questions = pd.read_csv(path)
    # info() writes the report itself and returns None, so it must not be
    # wrapped in print() (that printed an extra "None").
    questions.info()
    return questions


def lecture(path="../input/riiid-test-answer-prediction/lectures.csv"):
    """Read the lectures CSV and print a summary of it.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.  Defaults to the path this notebook has
        always read from.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    lectures = pd.read_csv(path)
    # info() writes the report itself and returns None, so it must not be
    # wrapped in print() (that printed an extra "None").
    lectures.info()
    return lectures

In [None]:
def features_engineering(data, target=None):
    """Add per-user answer statistics to ``data``.

    Three columns are written, each holding the same value on every row of a
    given ``user_id``:

    * ``count``          - number of rows with a non-negative label,
    * ``answered_num``   - sum of those non-negative labels,
    * ``correct_ratio``  - ``answered_num / count`` (NaN when ``count`` is 0).

    Rows with a negative label are excluded from the statistics: the dataset
    description states that lecture rows carry ``answered_correctly == -1``,
    to be read as null.  Counting them as attempts and adding ``-1`` to the
    sum, as the plain ``count``/``sum`` aggregation did, understates every
    ratio for users who watched lectures.

    NOTE(review): the statistics span each user's whole history in ``data``,
    including the row's own label - confirm this is intended before feeding
    the columns to a model as features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id`` and the label column.  Modified in place.
    target : str, optional
        Name of the label column; defaults to the module-level ``TARGET``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns added.
    """
    if target is None:
        target = TARGET

    label = data[target]
    answered = label >= 0  # False for the -1 placeholder (and for missing labels)
    # Group positionally on the raw user ids so the result lines up row by row.
    users = data["user_id"].to_numpy()

    data["count"] = answered.astype("int64").groupby(users).transform("sum")
    # Zero out the placeholder before summing; widen first so large groups
    # cannot overflow the narrow storage dtype.
    data["answered_num"] = (
        label.where(answered, 0).astype("int64").groupby(users).transform("sum")
    )
    data["correct_ratio"] = data["answered_num"] / data["count"]

    return data


def questions_preprocess(data):
    """Reduce the questions table to the columns used for joining.

    Drops ``bundle_id``, ``correct_answer`` and ``tags``, renames
    ``question_id`` to ``content_id`` and stores it as ``int16``.  The result
    is a new frame with a fresh 0..n-1 index; ``data`` itself is left as is.
    """
    slimmed = (
        data.reset_index(drop=True)
        .drop(columns=["bundle_id", "correct_answer"])
        .drop(columns="tags")
        .rename(columns={"question_id": "content_id"})
    )
    return slimmed.astype({"content_id": "int16"})


def lectures_preprocess(data):
    """Prepare the lectures table for joining on ``content_id``.

    ``type_of`` is replaced by its integer category codes, ``lecture_id`` is
    renamed to ``content_id`` and stored as ``int16``.  The result is a new
    frame with a fresh 0..n-1 index; ``data`` itself is left as is.
    """
    return (
        data.reset_index(drop=True)
        .assign(type_of=lambda frame: frame["type_of"].astype("category").cat.codes)
        .rename(columns={"lecture_id": "content_id"})
        .astype({"content_id": "int16"})
    )


def concat(data):
    """Attach question and lecture metadata to the interaction rows.

    Rows whose ``content_type_id`` is 0 are left-joined with the preprocessed
    questions table, rows whose ``content_type_id`` is 1 with the preprocessed
    lectures table, both on ``content_id``.  Rows with any other
    ``content_type_id`` are not part of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``content_type_id`` and ``content_id``.  Not modified.

    Returns
    -------
    pandas.DataFrame
        The joined rows in the same relative order as in ``data`` and with a
        fresh 0..n-1 index.  Columns that exist in only one of the metadata
        tables are missing (NaN) on rows of the other kind.
    """
    questions = questions_preprocess(question())
    lectures = lectures_preprocess(lecture())

    # Every merge renumbers its result from 0.  Stacking the two results
    # as-is therefore yields duplicate index labels and puts all lecture rows
    # after all question rows, so remember each row's position in the input
    # and restore that order at the end.
    order_col = "__row_order__"
    position = np.arange(len(data))
    is_question = data["content_type_id"] == 0
    is_lecture = data["content_type_id"] == 1

    q_data = (
        data.loc[is_question, :]
        .assign(**{order_col: position[is_question.to_numpy()]})
        .merge(questions, on="content_id", how="left")
    )
    l_data = (
        data.loc[is_lecture, :]
        .assign(**{order_col: position[is_lecture.to_numpy()]})
        .merge(lectures, on="content_id", how="left")
    )
    # No `del data` here: the caller still references the frame, so dropping
    # the local name would not release any memory.

    combined = pd.concat([q_data, l_data], ignore_index=True)

    # The pieces are copied into `combined`; release them before the sort
    # below allocates another full-size frame.
    del q_data
    del l_data
    gc.collect()

    # mergesort is stable, so rows duplicated by a non-unique metadata key
    # stay next to each other.
    combined = (
        combined.sort_values(order_col, kind="mergesort")
        .drop(columns=order_col)
        .reset_index(drop=True)
    )

    return combined

In [None]:
# Load the training rows; DEBUG decides how many rows train_load reads.
train = train_load(debug=DEBUG)
# Bare expression: the notebook renders the frame as this cell's output.
train

In [None]:
# Add the per-user count / answered_num / correct_ratio columns.
# features_engineering writes them into the frame it is given and returns it.
train = features_engineering(train)
train

In [None]:
# Join question and lecture metadata onto the interaction rows by content_id.
# NOTE(review): `train` is overwritten with its own transformation, so
# re-executing only this cell appears to merge the metadata a second time -
# re-run from the loading cell instead.
train = concat(train)
train

In [None]:
# Persist the prepared frame only for full (non-debug) runs; the relative
# path means the file lands in the current working directory.
if not DEBUG:
    train.to_pickle("train_v1.pkl")