In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import riiideducation
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

LGBM

In [None]:
# Column -> dtype map for the subset of train.csv that this notebook loads.
# Narrow dtypes are requested explicitly to keep the frame's memory footprint down.
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Select columns by name (the keys of the dtype map) instead of by position, so the
# set of loaded columns and their dtypes cannot drift apart if the CSV layout changes.
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=list(train_dtypes),
    dtype=train_dtypes,
)

In [None]:
# Keep only rows with content_type_id == 0 (this drops the lecture rows), order them
# by timestamp, then discard the two columns that are not used afterwards.
train = (
    train.loc[train['content_type_id'] == 0]
    .sort_values(['timestamp'], ascending=True)
    .drop(columns=['timestamp', 'content_type_id'])
)

In [None]:
# Mean of answered_correctly for each content_id, indexed by content_id.
# NOTE(review): computed over every row of `train`, including rows later used for fitting/validation.
results_c = (
    train.groupby('content_id')['answered_correctly']
    .agg(answered_correctly_content='mean')
)

In [None]:
# Per-user statistics of answered_correctly (mean, sum, row count), indexed by user_id.
results_u = (
    train.groupby('user_id')['answered_correctly']
    .agg(
        answered_correctly_user='mean',
        sum_user_ans='sum',
        count_user_ans='count',
    )
)

In [None]:
# Per-task-container statistics of answered_correctly (mean, sum, row count),
# indexed by task_container_id.
results_t = (
    train.groupby('task_container_id')['answered_correctly']
    .agg(
        answered_correctly_task='mean',
        sum_task_ans='sum',
        count_task_ans='count',
    )
)

In [None]:
# Replace missing prior-question elapsed times with the fixed value 13238.
elapsed_col = "prior_question_elapsed_time"
train[elapsed_col] = train[elapsed_col].fillna(13238)

In [None]:
# A missing "had explanation" flag is treated as True.
explanation_col = 'prior_question_had_explanation'
train[explanation_col] = train[explanation_col].fillna(True)

In [None]:
# Number of missing values left in each column of `train`.
train.isna().sum()

In [None]:
# Checked both mean and max for the same column.
#train['prior_question_elapsed_time'].min()

In [None]:
# Scaled copy of the elapsed time: the raw value divided by the constant 300000.
train["duration"] = train["prior_question_elapsed_time"].div(300000)

In [None]:
# Work only with the rows from position 70,000,000 onward (train is timestamp-sorted).
X = train.iloc[70_000_000:]

In [None]:
#X.shape

In [None]:
# Attach the per-user statistics; the left join keeps every row of X.
X = X.merge(results_u, how="left", on="user_id")

In [None]:
# Attach the per-content mean correctness; the left join keeps every row of X.
X = X.merge(results_c, how="left", on="content_id")

In [None]:
# Attach the per-task-container statistics; the left join keeps every row of X.
X = X.merge(results_t, how="left", on="task_container_id")

In [None]:
import gc
# Run a garbage-collection pass after the merges; as the last expression of the
# cell, the value returned by collect() is shown as the cell output.
gc.collect()

In [None]:
# Drop rows whose answered_correctly is -1, then order the frame by user_id.
X = X.loc[X['answered_correctly'].ne(-1)].sort_values(['user_id'])

In [None]:
# Target: the answered_correctly column, kept as a one-column DataFrame.
Y = X.loc[:, ["answered_correctly"]]

In [None]:
# Number of missing values in each column of X after the merges.
X.isna().sum()

In [None]:
# The label now lives in Y, so remove it from the feature frame.
X = X.drop(columns=["answered_correctly"])

In [None]:
# Integer-encode the explanation flag into a new column. The fitted encoder is
# kept in `le` because the inference loop at the end of the notebook uses it too.
le = LabelEncoder()
flag_col = "prior_question_had_explanation"
X["prior_question_had_explanation_enc"] = le.fit_transform(X[flag_col])

In [None]:
#X.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Display the column labels of X before choosing the model's feature columns.
X.columns

In [None]:
# Columns fed to the model, in the order the model will be trained on.
model_features = [
    'prior_question_elapsed_time',
    'duration',
    'answered_correctly_user',
    'sum_user_ans',
    'count_user_ans',
    'answered_correctly_content',
    'answered_correctly_task',
    'sum_task_ans',
    'count_task_ans',
    'prior_question_had_explanation_enc',
]
X = X[model_features]

In [None]:
# Hold out the final 1% of rows for validation; shuffle=False keeps the current
# row order, so the hold-out is the tail of the frame.
X_t, X_tt, y_t, y_tt = train_test_split(
    X,
    Y,
    shuffle=False,
    test_size=0.01,
)

In [None]:
# Garbage-collection pass after the train/validation split; the return value of
# collect() is displayed as the cell output.
gc.collect()

In [None]:
# LightGBM training parameters: binary objective, 650 histogram bins,
# learning rate 0.04 and up to 80 leaves per tree.
params = dict(
    objective='binary',
    max_bin=650,
    learning_rate=0.04,
    num_leaves=80,
)

In [None]:
# Wrap both splits as LightGBM datasets; the validation set references the
# training set.
lgb_train = lgb.Dataset(data=X_t, label=y_t)
lgb_eval = lgb.Dataset(data=X_tt, label=y_tt, reference=lgb_train)

In [None]:
# Garbage-collection pass before training starts; the return value of collect()
# is displayed as the cell output.
gc.collect()

In [None]:
# Train for up to 10000 rounds, reporting metrics every 10 rounds on both the
# training and validation sets, with an early-stopping patience of 200 rounds.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
# older LightGBM releases; confirm the installed version still accepts them.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=200,
    verbose_eval=10,
)

In [None]:
# Model scores for the held-out validation rows.
y_pred = model.predict(data=X_tt)

In [None]:
# Validation labels as a NumPy array (a fresh copy of the one-column frame's values).
y_true = y_tt.to_numpy(copy=True)


In [None]:
# Display the smallest predicted value on the validation split (quick look at the score range).
y_pred.min()

In [None]:
# ROC AUC of the model's scores on the validation split.
roc_auc_score(y_true=y_true, y_score=y_pred)

In [None]:
# Plot the trained booster's feature importances and render the figure.
lgb.plot_importance(booster=model)
plt.show()

In [None]:
# Create the competition environment object provided by the riiideducation module.
env = riiideducation.make_env()

In [None]:
# Iterator over test batches; the loop below unpacks each item as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Inference loop: for every test batch, rebuild the training features, score the
# batch with the trained booster and submit predictions for the question rows.
#
# The preprocessing here deliberately mirrors the training cells:
#   * missing elapsed time is filled with 13238 (the value used on `train`), not
#     with a per-batch mean that changes from batch to batch and is NaN when the
#     whole batch is missing;
#   * `duration` is elapsed time / 300000 exactly as in training, not a per-batch
#     min-max scaling (which also divides by zero when a batch holds a single
#     distinct value);
#   * the explanation flag is mapped False -> 0 / True -> 1 directly, instead of
#     refitting the LabelEncoder on each batch (a batch containing only True
#     would otherwise be encoded as 0);
#   * features are passed in the order the booster was trained on, because the
#     frame's columns are consumed in the order given.

# Values for rows whose user / content / task container has no statistics in the
# aggregate tables (the left merges leave NaN there): 0.5 for the mean
# correctness columns and 0 for the sum / count columns.
unseen_fill = {
    'answered_correctly_user': 0.5,
    'answered_correctly_content': 0.5,
    'answered_correctly_task': 0.5,
    'sum_user_ans': 0,
    'sum_task_ans': 0,
    'count_user_ans': 0,
    'count_task_ans': 0,
    'prior_question_elapsed_time': 13238,
}

# Feature names in training order, taken from the trained booster itself.
feature_cols = model.feature_name()

for (test_df, sample_prediction_df) in iter_test:
    # Attach the precomputed user / content / task statistics.
    test_df = test_df.merge(results_u, on='user_id', how='left')
    test_df = test_df.merge(results_c, on='content_id', how='left')
    test_df = test_df.merge(results_t, on='task_container_id', how='left')

    # One non-inplace fill for all columns (chained `fillna(inplace=True)` on a
    # column selection is not guaranteed to write back to the frame).
    test_df = test_df.fillna(unseen_fill)

    test_df['duration'] = test_df['prior_question_elapsed_time'] / 300000

    # Missing flag -> True (as in training), then False/True -> 0/1.
    test_df['prior_question_had_explanation_enc'] = (
        test_df['prior_question_had_explanation']
        .astype('boolean')
        .fillna(True)
        .astype('int8')
    )

    test_df['answered_correctly'] = model.predict(test_df[feature_cols])

    # Only rows with content_type_id == 0 are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])