In [None]:
import numpy as np
import pandas as pd
import datatable as dt
import gc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file mounted under the Kaggle input directory, one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Locations of the competition CSV files, relative to the working directory.
INPUT_DIR = "../input/riiid-test-answer-prediction"
train_path = INPUT_DIR + "/train.csv"
questions_path = INPUT_DIR + "/questions.csv"
lectures_path = INPUT_DIR + "/lectures.csv"
test = INPUT_DIR + "/example_test.csv"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Compact dtype per training-log column, plus the name of the label column.
# NOTE(review): nothing in this notebook passes this mapping to `pd.read_csv`;
# confirm whether it was meant to be used as the `dtype=` argument.
data_types_dict = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)
target = 'answered_correctly'


In [None]:
# Load only the first million rows of the training log; column dtypes are
# left for pandas to infer (no `dtype=` mapping is supplied here).
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    nrows=1_000_000,
    low_memory=False,
)

In [None]:
# Model inputs, grouped by where each column comes from. The final order of
# `features` is the column order handed to the model.
_raw_features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
_user_features = [
    'mean_user_acc',
    'median_user_acc',
    'std_user_acc',
    'skew_user_acc',
    'number_of_answered_q',
]
_task_container_features = [
    'mean_task_acc',
    'median_task_acc',
    'std_task_acc',
    'skew_task_acc',
    'number_of_asked_task_containers',
]
_content_features = [
    'mean_acc',
    'median_acc',
    'std_acc',
    'skew_acc',
    'number_of_asked_q',
]
# 'user_correctness' is computed further down but deliberately left out here.
features = (
    _raw_features
    + _user_features
    + _task_container_features
    + _content_features
    + ['seen_lecture']
)


# drop the unnecessary columns, and only save the last 20 questions answered by each user
# train_df = train_df.drop(columns=['row_id','timestamp'])
# train_df = train_df.groupby('user_id').tail(20)

# Missing "prior question" values: elapsed time becomes 0 (stored as float32),
# the explanation flag becomes False (stored as pandas' nullable boolean).
elapsed_time = train_df["prior_question_elapsed_time"]
train_df["prior_question_elapsed_time"] = elapsed_time.fillna(0).astype("float32")

had_explanation = train_df["prior_question_had_explanation"]
train_df["prior_question_had_explanation"] = had_explanation.fillna(False).astype("boolean")

# Last expression of the cell: renders the frame for inspection.
train_df

In [None]:
# Build the per-user `seen_lecture` feature and attach it to every training row.
#
# Each distinct (user, content_type_id) pair is scored 1 when the type is 0 and
# 2 otherwise; the scores are summed per user and shifted down by one. With
# content_type_id restricted to {0, 1} this yields:
#   0 -> only type-0 rows, 1 -> only type-1 rows, 2 -> both kinds of rows.
df2 = (
    train_df[['user_id', 'content_type_id']]
    .drop_duplicates()
    .assign(content_type_id=lambda pairs: pairs['content_type_id'].ne(0).astype('int64') + 1)
    .groupby('user_id')
    .sum()
    .reset_index()
    .rename(index=str, columns={"content_type_id": "seen_lecture"})
)
df2['seen_lecture'] = df2['seen_lecture'] - 1

# `df2` is kept around: the inference loop merges it into each test batch too.
train_df = train_df.merge(df2, how='left', on='user_id')

In [None]:
# Drop rows whose target is -1 (the lecture rows, per the original note) and
# renumber the index from 0 so later positional logic sees a clean RangeIndex.
is_answered = train_df[target].ne(-1)
train_df = train_df.loc[is_answered].reset_index(drop=True)

In [None]:
# Running accuracy of each user, computed only from answers given *before*
# the current row.
# Previous answer of the same user (NaN on every user's first row).
previous_answer = train_df.groupby('user_id')[target].shift()
previous_by_user = previous_answer.groupby(train_df['user_id'])

# 'cumsum'   -> running total of the previous answers,
# 'cumcount' -> 0-based position of the row within its user.
cum = pd.DataFrame({
    'cumsum': previous_by_user.cumsum(),
    'cumcount': previous_by_user.cumcount(),
})

# The ratio is NaN on a user's first row, where there is no history yet.
train_df['user_correctness'] = cum['cumsum'] / cum['cumcount']

# Totals of the target per user and per content item (sum and row count).
user_agg = train_df.groupby('user_id')[target].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')[target].agg(['sum', 'count'])

In [None]:
# Load two columns of the question metadata (positions 0 and 3 - presumably
# `question_id` and `part`, the names used in the dtype map) and attach them
# to the training rows by matching `content_id` to `question_id`.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    dtype={'question_id': 'int16', 'part': 'int8'},
    usecols=[0, 3],
)

# Left join keeps every training row; the join key from the right side is
# redundant afterwards and is removed.
train_df = (
    train_df
    .merge(questions_df, how='left', left_on='content_id', right_on='question_id')
    .drop(columns=['question_id'])
)

# How many questions have been answered in each content ID?
# train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')


In [None]:
# Per-user summary of the target: central tendency, spread, asymmetry and the
# number of rows, indexed by `user_id` with one flat column per statistic.
user_characteristics = (
    train_df
    .groupby('user_id')['answered_correctly']
    .agg(
        mean_user_acc='mean',
        median_user_acc='median',
        std_user_acc='std',
        skew_user_acc='skew',
        number_of_answered_q='count',
    )
)

In [None]:
# Earlier exploration suggested a link between the target and how often a
# task_container_id occurs, so the same summary statistics are computed per
# task container (indexed by `task_container_id`).
task_container_characteristics = (
    train_df
    .groupby('task_container_id')['answered_correctly']
    .agg(
        mean_task_acc='mean',
        median_task_acc='median',
        std_task_acc='std',
        skew_task_acc='skew',
        number_of_asked_task_containers='count',
    )
)

In [None]:
# Summary statistics of the target per content item, indexed by `content_id`.
content_characteristics = (
    train_df
    .groupby('content_id')['answered_correctly']
    .agg(
        mean_acc='mean',
        median_acc='median',
        std_acc='std',
        skew_acc='skew',
        number_of_asked_q='count',
    )
)


In [None]:
# Assemble the modelling frame: training rows enriched with the per-user,
# per-task-container and per-content statistics.
# `merge` already returns a new DataFrame, so no up-front copy of `train_df`
# is needed; `train_df` itself is left untouched.
df = (
    train_df
    .merge(user_characteristics, how='left', on='user_id')
    .merge(task_container_characteristics, how='left', on='task_container_id')
    .merge(content_characteristics, how='left', on='content_id')
)

# Keep only the model inputs and the label; the order of the surviving
# columns is unchanged.
columns_to_keep = set(features + [target])
df = df.drop(columns=[col for col in df.columns if col not in columns_to_keep])

# The explanation flag is fed to the model as 0.0 / 1.0.
df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(value=False).astype(float)
# Any value still missing after the joins is replaced by the neutral 0.5.
df = df.fillna(value=0.5)

In [None]:
# Render the assembled feature/target frame as a visual sanity check.
df

In [None]:
# Model training
import riiideducation
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.utils import shuffle
import lightgbm as lgb
from lightgbm import LGBMClassifier
import eli5
# Handle to the competition environment; it later supplies the test batches
# and receives the predictions.
env = riiideducation.make_env()

# 80/20 hold-out split with a fixed seed.
# NOTE(review): this rebinds `train_df` to the training feature matrix, and the
# aggregate features in `df` were computed before the split, so held-out rows
# contributed to their own features - treat any validation score as optimistic.
train_df, test_df, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=777,
)

In [None]:
# Hyper-parameters for the gradient-boosted classifier; every entry is passed
# to LGBMClassifier as a keyword argument.
# NOTE(review): 'lambda', 'min_data_in_leaf' and 'feature_fraction' look like
# native LightGBM parameter names/aliases rather than scikit-learn-style
# constructor arguments - presumably LightGBM resolves them; confirm in its docs.
params = {
    'num_leaves': 30, 
    'n_estimators': 300, 
    'min_data_in_leaf': 100, 
    'max_depth': 5, 
    'lambda': 0.0, 
    'feature_fraction': 1.0
}
model = LGBMClassifier(**params)
# `train_df` is the training feature matrix produced by train_test_split.
# Kept as the cell's last expression, so the return value of `fit` is displayed.
model.fit(train_df, y_train)


In [None]:
# Render eli5's weight table for the fitted model, limited to the top 20 entries.
eli5.show_weights(model, top=20)

In [None]:
# roc_auc_score(y_test.values, model.predict_proba(test_df)[:, 1])

In [None]:
# Iterator over the competition's test batches; consumed by the prediction loop below.
iter_test = env.iter_test()

In [None]:
# Score every test batch and hand the predictions back to the competition API.
# NOTE(review): the loop variable rebinds `test_df`, which previously held the
# validation split from train_test_split.
for (test_df, sample_prediction_df) in iter_test:
    # Enrich the batch with the statistics learned from the training data.
    # Left joins keep every batch row; users, containers or questions that
    # were never seen in training simply get missing values here.
    test_df = (
        test_df
        .merge(user_characteristics, on="user_id", how="left")
        .merge(task_container_characteristics, on="task_container_id", how="left")
        .merge(content_characteristics, on="content_id", how="left")
        .merge(questions_df, left_on='content_id', right_on='question_id', how='left')
        .drop(columns=['question_id'])
        .merge(df2, how='left', on='user_id')
    )

    # Same encoding as the training frame: missing -> False, then 0.0 / 1.0.
    # (Training casts this column to float, so inference must not use bool.)
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].fillna(value=False).astype(float)
    )

    # Turn infinities into NaN first so that a single fill covers both kinds
    # of unusable values with the neutral 0.5 used during training.
    test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

    # Probability of the positive class for every row of the batch.
    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:, 1]

    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])