In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plots
import seaborn as sns # plots
import gc
import riiideducation
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
# Install the pre-packaged RAPIDS 0.17.0 environment that ships as a Kaggle dataset:
# copy the archive into the conda envs directory and unpack it there
# (tar's verbose file listing is discarded to keep the cell output short).
!cp ../input/rapids/rapids.0.17.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Each assignment prepends to sys.path, so the final search order is:
#   .../rapids/lib, .../rapids/lib/python3.7, .../rapids/lib/python3.7/site-packages,
# followed by the interpreter's original entries.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the bundled libxgboost.so into the main conda lib directory.
# NOTE(review): presumably so that `import xgboost` loads the RAPIDS GPU build — confirm.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

# Riiid answer prediction - XGBoost

## Steps
1. Load
2. Process
3. Model
4. Evaluate

## 1. Load

### Issue    : Data volume  
### Solution : RAPIDS library & Kaggle GPU (39H/week)

In [None]:
# Rapids Imports
import cudf
import cupy # CuPy is an open-source array library accelerated with NVIDIA CUDA.

### Data : *train.csv*

In [None]:
%%time

# Read in data
# Explicit per-column dtypes for train.csv, passed to the CSV reader so that
# each column is stored with the narrowest type chosen by the author.
dtypes = dict(
    row_id="int64",
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    task_container_id="int16",
    user_answer="int8",
    answered_correctly="int8",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="int8",
)

# Load the competition's train.csv into a cuDF DataFrame, using the explicit
# column dtypes declared in `dtypes`.
train = cudf.read_csv('../input/riiid-test-answer-prediction/train.csv', dtype=dtypes)

In [None]:
# Replace nulls in the two "prior question" columns with the sentinel value -1.
for nullable_col in ("prior_question_elapsed_time", "prior_question_had_explanation"):
    train[nullable_col] = train[nullable_col].fillna(-1)

___

# 2. Process

In [None]:
def aggregations(frame, target):
    """
    Add per-group aggregate features of `target` to `frame`.

    For every grouping column (currently only 'user_id') the mean of `target`
    is computed per group and merged back onto `frame` as a new column.  The
    new column is named with the first letter of the grouping column followed
    by the aggregation name (e.g. 'user_id' + 'mean' -> 'umean').

    Parameters
    ----------
    frame : DataFrame
        Input data; must contain the grouping column(s) and `target`.
    target : str
        Name of the column to aggregate.

    Returns
    -------
    frame : DataFrame
        `frame` with the aggregate column(s) merged in (inner join on the
        grouping column).
    aggs_dfs : list of DataFrame
        One lookup table per grouping column, holding the grouping key as a
        regular column next to its aggregate value(s); these can be merged
        onto new data later.
    """
    cols = ['user_id']  # Columns to aggregate

    aggs = ['mean']  # List of aggregation functions

    aggs_dfs = []  # List of aggregated DataFrames

    for col in cols:  # Loop over the columns to aggregate

        df = frame.groupby(col).agg({target: aggs})

        # Flatten the (target, agg) column MultiIndex and prefix each
        # aggregation name with the first letter of the grouping column.
        # NOTE: two grouping columns sharing a first letter would collide.
        df.columns = [col[0] + new_column for new_column in df.columns.droplevel()]

        # Turn the group key from the index into an ordinary column.  Copying
        # the index into a same-named column would leave `col` present both as
        # index name and as column, which makes `merge(on=col)` ambiguous.
        df = df.reset_index()

        frame = frame.merge(df, on=col)  # Merge based on the same column

        aggs_dfs.append(df)

    return frame, aggs_dfs

def preprocess_frame(frame, features, target):
    """
    Preprocess the dataframe and run the feature engineering.

    The frame is first restricted to the `features` columns, then handed to
    `aggregations` together with `target`.  Returns the same pair that
    `aggregations` returns: the enriched frame and the list of aggregate
    tables.
    """
    selected = frame[features]  # Keep only the requested columns
    return aggregations(selected, target)

# 3. Model

In [None]:
# Switch cuDF's memory allocator to "managed" mode before the heavy processing starts.
# NOTE(review): presumably CUDA managed (unified) memory, so allocations may exceed
# physical GPU memory — confirm against the cuDF/RMM documentation.
cudf.set_allocator("managed")

In [None]:
%%time

# Keep only rows whose content_type_id is not 1 and whose answered_correctly
# is not -1, then renumber the index from zero.
train = train[
    (train['content_type_id'] != 1) & (train['answered_correctly'] != -1)
].reset_index(drop=True)

In [None]:
%%time

# RAPIDS roc_auc_score is 16x faster than sklearn. - cdeotte
from cuml.metrics import roc_auc_score
from cuml.preprocessing.model_selection import train_test_split
import xgboost
import pickle

In [None]:
def train_xgb_model(X_train, X_test, y_train, y_test, params, prints=True):
    '''Fit an XGBoost booster and score it on the held-out split.

    Parameters:
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: features and labels used for scoring.
        params: parameter dict forwarded to `xgboost.train`.
        prints: when true, print the ROC AUC value.

    Returns:
        (booster, auc) - the trained model and its ROC AUC on the test split.
    '''
    # Wrap the training data in a DMatrix, XGBoost's own data container.
    dtrain = xgboost.DMatrix(data=X_train, label=y_train)

    # No number of boosting rounds is passed, so the library default applies.
    booster = xgboost.train(params, dtrain=dtrain)

    # Score the held-out rows; labels are cast to int32 for the metric.
    dtest = xgboost.DMatrix(X_test)
    scores = booster.predict(dtest)
    auc = roc_auc_score(y_test.astype('int32'), scores)

    if prints:
        print("ROC: {:.5}".format(auc))

    return booster, auc


def param_tuning_graph(param_values, roc_values, palette=None):
    '''Draw a bar chart of the ROC results obtained while tuning one parameter.

    Parameters:
        param_values: the tried parameter values (x axis).
        roc_values: the ROC value obtained for each parameter value (y axis).
        palette: optional seaborn palette.  When omitted, a notebook-level
            `custom_colors` is used if one has been defined; otherwise
            seaborn's default colours are used.

    Returns:
        None - the figure is drawn on the current matplotlib backend.
    '''
    if palette is None:
        # `custom_colors` is not guaranteed to exist in the notebook; look it
        # up defensively instead of raising NameError at call time.
        palette = globals().get('custom_colors')

    plt.figure(figsize=(18, 3))
    ax = sns.barplot(x=param_values, y=roc_values, palette=palette)

    # Label every bar with its value, centred horizontally and placed just
    # above the top of the bar.
    for p in ax.patches:
        width = p.get_width()
        height = p.get_height()
        x, y = p.get_xy()
        ax.annotate(f'{height:.5%}', (x + width/2, y + height*1.02), ha='center')

In [None]:
%%time

# Name of the label column the model is trained to predict.
target = 'answered_correctly'

# Preprocessing
# Every column of `train` is passed as "features", so preprocess_frame keeps the
# whole frame. It returns the frame with the aggregate columns merged in and the
# list of aggregate tables (u_aggs), which is reused when predicting on test batches.
train_proc, u_aggs = preprocess_frame(train, train.columns.tolist(), target)
#train_proc = train

# Start from all columns of the processed frame; columns that must not be used
# as model inputs are removed from this list afterwards.
features = train_proc.columns.tolist()

In [None]:
# Columns that must not be fed to the model: the target itself, the raw answer,
# and the row/user identifiers (user_id is still used as a merge key elsewhere,
# just not as a model input).
NON_FEATURE_COLUMNS = ('answered_correctly', 'user_answer', 'row_id', 'user_id')

# Filtering (instead of list.remove) keeps this cell idempotent: re-running it
# does not raise ValueError once the columns are already gone.
features = [name for name in features if name not in NON_FEATURE_COLUMNS]

In [None]:
%%time

# Features, target and train/test split
X = train_proc[features]
y = train_proc[target]

# Fixed seed so the shuffled, stratified 75/25 split (and therefore the
# reported ROC) is reproducible across kernel restarts.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, stratify=y, random_state=SPLIT_SEED
)

In [None]:
# Baseline XGBoost parameter set handed to xgboost.train.
params1 = dict(
    max_depth=4,
    max_leaves=2 ** 4,
    tree_method='gpu_hist',
    objective='reg:logistic',
    grow_policy='lossguide',
)

In [None]:
# Fit the baseline model on the training split and print its ROC on the held-out split.
model, roc = train_xgb_model(X_train, X_test, y_train, y_test, params1, prints=True)

# Tag used as the file name when saving the trained model (saving is currently disabled).
version = "xgb_v7"
#model.save_model(version)

In [None]:
# model = xgboost.Booster(model_file=version)  # reload a model saved with model.save_model(version)

# 4. Hyperparameter tuning

___

In [None]:
def predict_from(model, Xs, threshold=0.6):
    """
    Turn model scores for `Xs` into hard 0/1 predictions.

    `Xs` is wrapped in an xgboost.DMatrix (the input format the model
    expects), scored with `model.predict`, and every score strictly greater
    than `threshold` becomes 1 while all others become 0.
    """
    probabilities = model.predict(xgboost.DMatrix(Xs))  # Raw model scores

    return (probabilities > threshold).astype(int)  # True/False -> 1/0

In [None]:
def link_to_aggs(Xs, aggs, col):
    """
    Left-join the aggregate table `aggs` onto the pandas frame `Xs`.

    `Xs` is converted to cuDF, merged with `aggs` on column `col`
    (how='left'), and the result is converted back to pandas.
    """
    merged = cudf.from_pandas(Xs).merge(aggs, how='left', on=col)

    return merged.to_pandas()

# 5. Predict and Submit

In [None]:
# Column names listed as available at submission time.
features_submission = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'prior_group_answers_correct',
    'prior_group_responses',
]

f_sub = {*features_submission}  # Features available for submission

# Target dtypes for the test-batch columns; applied with DataFrame.astype
# in the prediction loop.
dtypes_sub = dict(
    timestamp="int64",
    user_id="int32",
    content_id="int16",
    content_type_id="int8",
    task_container_id="int16",
    prior_question_elapsed_time="float32",
    prior_question_had_explanation="int8",
)

In [None]:
# Set view of the training feature names, so they can be compared with the
# submission-time columns in f_sub using set operations.
f_train = set(features)  # Features used for training

In [None]:
# print("Intersection :", f_sub & f_train)
# print("Difference   :", f_sub - f_train)
# print("Difference   :", f_train - f_sub)

In [None]:
# Display the final list of feature columns used by the model.
features

___

In [None]:
# Create the competition environment through the riiideducation module;
# it supplies the test batches and receives the predictions (env.predict).
env = riiideducation.make_env()

In [None]:
# Create the iterator over test batches; each item is unpacked as
# (test_df, sample_prediction_df) in the prediction loop.
iter_test = env.iter_test()

In [None]:
# Iter and predict

# The per-user aggregates are the same for every batch, so convert them from
# cuDF to pandas once instead of on every iteration.
user_aggs_pd = u_aggs[0].to_pandas().reset_index(drop=True)

for (test_df, sample_prediction_df) in iter_test:

    X = test_df.copy()

    # Add the aggregated data; users absent from the aggregate table get NaN.
    X = X.merge(user_aggs_pd, how='left', on='user_id')

    # Preprocessing block: use the same -1 sentinel for missing values that
    # was applied to the training data, so the model sees consistent inputs.
    X["prior_question_elapsed_time"]    = X["prior_question_elapsed_time"].fillna(-1)
    X["prior_question_had_explanation"] = X["prior_question_had_explanation"].fillna(-1)

    X = X.astype(dtypes_sub)  # Cast the listed columns to the expected dtypes

    # NOTE(review): predict_from returns hard 0/1 labels (threshold 0.65).
    # If the competition is scored on ROC AUC, submitting the raw
    # probabilities would likely score better — confirm before changing.
    predictions = predict_from(model, X[features], 0.65)  # Get predictions

    test_df['answered_correctly'] = predictions  # Assign predictions

    # Submit predictions for question rows only (content_type_id == 0).
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

___