# About RiiiD!
![](https://www.riiid.co/assets/about_image_3@2x.png)
RiiiD! is a Korea-based AI research company. Their goal is, in their own words, "Inviting AI Researchers to Solve the World's Biggest Challenges in AI Education". Their website can be found at https://www.riiid.co/en/about.

# All available files

In [None]:
import plotly.express as px
import gc
import pickle

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree lazily and print the full path of every file.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Directory holding the competition files; later cells append file names to it.
root = '/kaggle/input/riiid-test-answer-prediction/'

# train.csv
train.csv is a very large file. 
It is a very "long" file with only 10 columns, but 101,230,332 rows.

In [None]:
%%time

# Explicit per-column dtypes for the training file.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Only the first 10 million rows are read for the exploration below.
df_train = pd.read_csv(root + 'train.csv', dtype=train_dtypes, nrows=10**7)
df_train.head(10)

In [None]:
"""
Describe does not yield any especially useful info about train.csv.
"""
# Show pandas' summary statistics for the loaded training rows.
df_train.describe()

In [None]:
"""
content_type_id denotes if the contents are questions or lectures.
The pie shows that 98.1% of the data in train.csv are questions (0), only 1.94% are lectures.
"""
# Name both output columns explicitly. A bare value_counts().reset_index()
# labels its columns differently on pandas 1.x ('index', '<col>') and on
# pandas 2.x ('<col>', 'count'), so hard-coded labels break on upgrade.
df = (
    df_train['content_type_id']
    .value_counts()
    .rename_axis('content_type_id')
    .reset_index(name='count')
)

# One slice per content type, sized by the number of rows of that type.
fig = px.pie(df, values='count', names='content_type_id')
fig.show()

In [None]:
"""
user_answer is the answer a user gave to the question.
0,1,2,3: I assume this means which option a user choose.
-1: if content_type is lecture.
"""
# Name both output columns explicitly. A bare value_counts().reset_index()
# labels its columns differently on pandas 1.x ('index', '<col>') and on
# pandas 2.x ('<col>', 'count'), so hard-coded labels break on upgrade.
df = (
    df_train['user_answer']
    .value_counts()
    .rename_axis('user_answer')
    .reset_index(name='count')
)

# One slice per answer value, sized by how often that value occurs.
fig = px.pie(df, values='count', names='user_answer')
fig.show()

In [None]:
"""
answered_correctly denotes if a user answered the question correctly.
-1: it's a lecture not a question.
0: wrong answer.
1: correct answer.
"""
# Name both output columns explicitly. A bare value_counts().reset_index()
# labels its columns differently on pandas 1.x ('index', '<col>') and on
# pandas 2.x ('<col>', 'count'), so hard-coded labels break on upgrade.
df = (
    df_train['answered_correctly']
    .value_counts()
    .rename_axis('answered_correctly')
    .reset_index(name='count')
)

# One slice per outcome (-1, 0, 1), sized by the number of rows with it.
fig = px.pie(df, values='count', names='answered_correctly')
fig.show()

In [None]:
"""
Drop useless columns in train.csv for sake of saving memory.
"""
# Columns that are not used in the rest of the exploration.
unused_columns = ['row_id', 'timestamp', 'content_type_id', 'task_container_id']
df_train = df_train.drop(columns=unused_columns)

# lectures.csv
shape=(418, 4)

In [None]:
# Load the lecture metadata and preview its first rows.
lectures_path = root + 'lectures.csv'
df_lectures = pd.read_csv(lectures_path)
df_lectures.head(10)

In [None]:
"""
type_of indicates what a lecture is about.
"""
# Name both output columns explicitly. A bare value_counts().reset_index()
# labels its columns differently on pandas 1.x ('index', '<col>') and on
# pandas 2.x ('<col>', 'count'), so hard-coded labels break on upgrade.
df = (
    df_lectures['type_of']
    .value_counts()
    .rename_axis('type_of')
    .reset_index(name='count')
)

# One slice per lecture type, sized by the number of lectures of that type.
fig = px.pie(df, values='count', names='type_of')
fig.show()

# questions.csv
shape=(13523, 5)

In [None]:
# Load the question metadata and preview its first rows.
questions_path = root + 'questions.csv'
df_questions = pd.read_csv(questions_path)
df_questions.head(10)

In [None]:
"""
Lecture ids and question ids overlap. This is a little strange.
I was expecting no overlap, with each id corresponding to the 'content_id' column in train.csv...
"""
# Distinct ids on each side; the cell output is the set of ids found in both.
lecture_ids = df_lectures['lecture_id'].unique()
question_ids = df_questions['question_id'].unique()
set(lecture_ids) & set(question_ids)

# XGBoost model [WIP]

In [None]:
%%time

"""
Read necessary cols from train.csv.
"""
# Columns needed for modelling, mapped to the dtype each one is parsed as.
necessary_cols = {
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    # float32, matching the exploratory load of this column earlier in the
    # notebook. float16 tops out at 65504, so any larger elapsed time would be
    # read as inf and poison the per-user mean/std computed later.
    # NOTE(review): the values are presumably milliseconds and can exceed that
    # limit - confirm against the competition's data description.
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

df_train = pd.read_csv(
    root+'train.csv',
    usecols=list(necessary_cols),
    dtype=necessary_cols,
    # The column at position 0 among the selected ones becomes the index
    # (presumably user_id - verify against the CSV column order).
    index_col=0
)

In [None]:
"""
Feature engineering: calc a user's historical performance.
"""
# Lecture rows are stored with answered_correctly == -1. Aggregating them with
# real answers drags every user's mean down and inflates the count, so the
# per-user statistics are computed on question rows only.
# NOTE: the filter makes a temporary copy of the question rows (extra memory).
question_rows = df_train[df_train['answered_correctly'] != -1]

df_train_groupByUser = question_rows.groupby('user_id')
df_train_groupByUserStats = df_train_groupByUser.agg({
    'answered_correctly':['mean', 'count', 'std', 'skew'],
    'prior_question_elapsed_time': ['mean', 'std']
})

# Flatten the (column, statistic) pairs into single-level names, in the same
# order as the aggregation spec above.
df_train_groupByUserStats.columns = [
    'user_answered_correctly_mean',
    'user_answered_correctly_count',
    'user_answered_correctly_std',
    'user_answered_correctly_skew',
    'user_prior_question_elapsed_time_mean',
    'user_prior_question_elapsed_time_std',
]

# Turn user_id back into an ordinary column so it can serve as a merge key.
df_user_stats = df_train_groupByUserStats.reset_index()

# Drop the temporary name; the groupby object keeps the rows it still needs.
del question_rows
df_user_stats

In [None]:
"""
Feature engineering: calc a specific content's stats.
"""
# Lecture rows are stored with answered_correctly == -1, and lecture ids
# overlap with question ids, so grouping every row by content_id would mix
# lecture placeholders into question statistics. Use question rows only.
# NOTE: the filter makes a temporary copy of the question rows (extra memory).
question_rows = df_train[df_train['answered_correctly'] != -1]

df_train_groupByContent = question_rows.groupby('content_id')
df_train_groupByContentStats = df_train_groupByContent.agg({
    'answered_correctly': ['mean', 'count', 'std', 'skew']})

# Drop the temporary name; the groupby object keeps the rows it still needs.
del question_rows
df_train_groupByContentStats

In [None]:
"""
Feature engineering: combine lectures.csv and questions.csv
"""
# One row per lecture id / question id, each tagged with a 0/1 indicator.
# The id column is selected by name rather than by dropping all the others,
# so an extra column in either CSV cannot break the renaming.
df_lectures_droped = (
    df_lectures[['lecture_id']]
    .rename(columns={'lecture_id': 'content_id'})
    .assign(is_lecture=1)
)
df_questions_droped = (
    df_questions[['question_id']]
    .rename(columns={'question_id': 'content_id'})
    .assign(is_question=1)
)

# Outer merge keeps ids that exist on only one side; the indicator missing on
# the other side becomes 0.
df_contents = pd.merge(df_lectures_droped, df_questions_droped, on='content_id', how='outer')
df_contents[['is_lecture', 'is_question']] = df_contents[['is_lecture', 'is_question']].fillna(0)

# The aggregated stats carry two-level (column, statistic) labels. Merging them
# against single-level columns only warns on older pandas and is rejected by
# newer releases, so flatten a copy first and expose content_id as a column.
content_stats_flat = df_train_groupByContentStats.copy()
content_stats_flat.columns = [
    'content_answered_correctly_mean',
    'content_answered_correctly_count',
    'content_answered_correctly_std',
    'content_answered_correctly_skew',
]
content_stats_flat = content_stats_flat.reset_index()

# Resulting columns: content_id, the four statistics, is_lecture, is_question.
df_contents_stats = pd.merge(content_stats_flat, df_contents, on='content_id', how='left')
del content_stats_flat
df_contents_stats

In [None]:
# Release every intermediate frame in one statement, then run the garbage
# collector so the memory is reclaimed before the next heavy step.
del (
    df_questions,
    df_lectures,
    df_lectures_droped,
    df_questions_droped,
    df_contents,
    df_train_groupByUserStats,
    df_train_groupByContent,
    df_train_groupByContentStats,
    df_train_groupByUser,
)
gc.collect()

In [None]:
"""
Util fn: reduce memory usage of a dataframe.
"""
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns in place to the narrowest dtype holding their range.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32, smallest first. Only the observed min/max are
    compared with the dtype limits (inclusive), so a float downcast can lose
    precision. A column is never widened: if no candidate fits (for example
    because the column is empty, all-NaN or contains inf), it keeps its dtype.
    Columns whose dtype is not in the supported list are left untouched.

    Args:
        df: pandas dataframe; its columns are modified in place.
        verbose: when true, print the final memory usage and the reduction.
    Returns:
        df: the same pandas dataframe object, with reduced memory
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value exactly at a dtype limit still fits.
            # NaN min/max compare False everywhere, leaving the column as is.
            if limits.min <= c_min and c_max <= limits.max:
                if col_type != candidate:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio: a frame reporting zero bytes has nothing to reduce.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Shrink the three frames that take part in the upcoming merges, in this order.
df_train, df_user_stats, df_contents_stats = (
    reduce_mem_usage(df_train),
    reduce_mem_usage(df_user_stats),
    reduce_mem_usage(df_contents_stats),
)

In [None]:
"""
Save df_train, df_user_stats, df_contents_stats before merge. 
Memory error is likely to occur. Painful...
"""
# Save dataframe
# df_train.to_pickle('df_train.pkl')
# df_user_stats.to_pickle('df_user_stats.pkl')
# df_contents_stats.to_pickle('df_contents_stats.pkl')

In [None]:
# Load saved dataframe
# df_train = pd.read_pickle('df_train.pkl')
# df_user_stats = pd.read_pickle('df_user_stats.pkl')
# df_contents_stats = pd.read_pickle('df_contents_stats.pkl')

In [None]:
"""
Constructing training dataframe.
"""
# Rows with answered_correctly == -1 are lectures; keep the question rows only.
df_train = df_train.loc[df_train['answered_correctly'] != -1]

# Attach the per-user aggregates to every remaining row.
df_train = pd.merge(df_train, df_user_stats, on='user_id', how='left')

In [None]:
# Attach the per-content aggregates to every row.
df_train = pd.merge(df_train, df_contents_stats, on='content_id', how='left')

# Missing explanation flags count as False; the column becomes a plain bool.
df_train['prior_question_had_explanation'] = (
    df_train['prior_question_had_explanation']
    .fillna(value=False)
    .astype(bool)
)
# Every other missing value is replaced by the constant 0.5.
df_train = df_train.fillna(0.5)
df_train

In [None]:
# Show the column labels of df_train.
df_train.columns

In [None]:
"""
Final step before split.
Only choose necessary features.
"""
# Model inputs, one per line.
features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_answered_correctly_mean',
    'user_answered_correctly_count',
    'user_answered_correctly_std',
    'user_answered_correctly_skew',
    'user_prior_question_elapsed_time_mean',
    'user_prior_question_elapsed_time_std',
    'content_answered_correctly_mean',
    'content_answered_correctly_count',
    'content_answered_correctly_std',
    'content_answered_correctly_skew',
    'is_lecture',
    'is_question',
]
# Column the model is trained to predict.
target = 'answered_correctly'

# Keep only the model inputs followed by the target column.
selected_columns = features + [target]
df_train = df_train[selected_columns]

In [None]:
# fillna returns a new frame; without the assignment the result was discarded
# and the statement had no effect.
df_train = df_train.fillna(0.5)
df_train

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
df_train, df_test = train_test_split(
    df_train,
    test_size=0.2,
    random_state=1,
)

In [None]:
"""
Build XGBoost model.
"""
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Wrap the training features and their labels in XGBoost's matrix container.
xgb_matrix = xgb.DMatrix(df_train[features], label=df_train[target])

# Booster settings passed to the training call.
our_params = {
    'eta': 0.05,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',  # output probability [0, 1]
    'max_depth': 10,
    # default=1, prevent overfitting, high value may cause under fitting
    'min_child_weight': 1,
}

In [None]:
%%time
# Fit the booster for 10 rounds on the training matrix.
final_gb = xgb.train(
    our_params,
    xgb_matrix,
    num_boost_round=10,
    verbose_eval=5,
    # early_stopping_rounds=150,  # kept disabled, as in the original cell
)

In [None]:
"""
Predict using df_test.
"""
from sklearn.metrics import roc_auc_score

# Features only: the labels are kept aside for scoring.
xgb_matrix_test = xgb.DMatrix(
    df_test[features],
)

test_predict = final_gb.predict(xgb_matrix_test)

# A notebook cell only echoes its last expression, so the score was computed
# and then thrown away. Store it and print it explicitly.
test_auc = roc_auc_score(df_test[target].values, test_predict)
print('Hold-out ROC AUC: {:.4f}'.format(test_auc))
test_predict

In [None]:
"""
RiiiD API. Only run this cell once!
"""
import riiideducation

# Create the competition environment, then obtain from it the iterator that the
# submission loop consumes.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
"""
Create submission.
"""

for (df_test, sample_prediction_df) in iter_test:
    # Attach the precomputed per-user and per-content aggregates to the batch.
    df_test = pd.merge(df_test, df_user_stats, on='user_id', how='left')
    df_test = pd.merge(df_test, df_contents_stats, on='content_id', how='left')

    # Same missing-value handling as for the training frame.
    explanation_flag = df_test['prior_question_had_explanation'].fillna(value=False)
    df_test['prior_question_had_explanation'] = explanation_flag.astype(bool)
    df_test.fillna(value = 0.5, inplace = True)

    # Score the batch with the trained booster.
    df_test['answered_correctly'] = final_gb.predict(xgb.DMatrix(df_test[features]))

    # Submit predictions for rows whose content_type_id is 0 only.
    is_question_row = df_test['content_type_id'] == 0
    env.predict(df_test.loc[is_question_row, ['row_id', 'answered_correctly']])

# Some notes
Memory management is crucial for this competition.
ToDo: hyperparameter optimization.