In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline


Use the first 10**5 rows of the train dataset for data exploration, loaded with the more memory-efficient dtypes shown in the introduction notebook.

In [None]:
# Explicit compact dtypes for the exploration sample (first 10**5 rows of train.csv).
explore_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train_df_105 = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype=explore_dtypes,
    nrows=10**5,
    low_memory=False,
)

In [None]:
# Data is ordered by user and timestamp, so the row cut-off may have split the
# last user's history; remove every row belonging to that user.
last_user_id = train_df_105.user_id.iloc[-1]
last_user_rows = train_df_105.index[train_df_105.user_id == last_user_id]
train_df_105.drop(index=last_user_rows, inplace=True)

Exploring questions stats

In [None]:
# Keep question rows only (content_type_id == 0); lecture rows are left out.
is_question = train_df_105.content_type_id == 0
train_df_105_quest = train_df_105[is_question]

In [None]:
# Show the first rows of the question-only sample.
train_df_105_quest.head()

In [None]:
# Per-user share of correct answers (mean of the 0/1 answered_correctly flag).
answers_by_user = train_df_105_quest.groupby('user_id')['answered_correctly']
user_quest_stats = answers_by_user.agg(correct_answers_percentage='mean')

In [None]:
# Histogram of per-user accuracy: one observation per user, 100 bins.
ax = user_quest_stats.plot.hist(bins=100)
ax.set_xlabel("Share of correct answers")
# Title typo fixed ("ansewrs" -> "answers").
plt.title("Correct answers percentage distribution by users")
plt.show()

In [None]:
# Compare prior_question_elapsed_time for correct vs. incorrect answers.
# Both histograms share the same bin edges and are semi-transparent, so the
# second series neither hides the first nor uses differently sized bins.
elapsed_time = train_df_105_quest.prior_question_elapsed_time
shared_bins = np.histogram_bin_edges(elapsed_time.dropna(), bins=100)
for outcome, outcome_label in ((1, 'correct'), (0, 'incorrect')):
    outcome_mask = train_df_105_quest.answered_correctly == outcome
    elapsed_time[outcome_mask].plot.hist(bins=shared_bins, alpha=0.5, label=outcome_label)
plt.xlabel("prior_question_elapsed_time")
plt.title("Correct and incorrect elapsed time")
plt.legend()
plt.show()

In [None]:
# Share of correct answers, split by whether the prior question had an explanation.
answers_by_explanation = train_df_105_quest.groupby('prior_question_had_explanation')['answered_correctly']
answers_by_explanation.agg(correct_answers_percentage='mean')

In [None]:
# Overall share of correct answers across all question rows in the sample,
# for comparison with the per-group rates.
train_df_105_quest['answered_correctly'].agg(correct_answers_percentage='mean')

Exploring questions and lectures

In [None]:
# Load the question and lecture metadata tables.
questions_df, lectures_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/riiid-test-answer-prediction/questions.csv',
        '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
    )
)

In [None]:
# Show the first rows of the question metadata.
questions_df.head()

In [None]:
# Attach question metadata to each answered question (content_id <-> question_id).
train_df_105_quest_with_questions = pd.merge(
    train_df_105_quest,
    questions_df,
    left_on='content_id',
    right_on='question_id',
)

In [None]:
# Split the whitespace-separated `tags` string into a list per row.
# Missing tags become an empty list rather than raising AttributeError on a
# NaN value, which the plain `x.split()` comprehension did.
train_df_105_quest_with_questions['tags_list'] = (
    train_df_105_quest_with_questions['tags'].fillna('').str.split()
)

In [None]:
# Show the first rows of the merged frame, including the new tags_list column.
train_df_105_quest_with_questions.head()

In [None]:
# Share of correct answers for each value of `part`.
answers_by_part = train_df_105_quest_with_questions.groupby('part')['answered_correctly']
part_correct_answers = answers_by_part.agg(correct_answers_percentage='mean')

In [None]:
# Accuracy per part, with the y-axis fixed to [0, 1] so differences are not exaggerated.
ax = part_correct_answers.plot.line()
ax.set_xlabel("part")
ax.set_ylabel("Share of correct answers")
# Title typo fixed ("ansewrs" -> "answers").
plt.title("Correct answers in parts")
plt.ylim(0,1)
plt.show()

In [None]:
# One row per (answer, tag): explode the tag lists and name the resulting column `tag`.
tags_correct_answers = (
    train_df_105_quest_with_questions[['answered_correctly', 'tags_list']]
    .explode('tags_list')
    .rename(columns={'tags_list': 'tag'})
)

In [None]:
# Share of correct answers for each tag.
answers_by_tag = tags_correct_answers.groupby('tag')['answered_correctly']
tags_correct_answers_percentage = answers_by_tag.agg(correct_answers_percentage='mean')

In [None]:
# Histogram of per-tag accuracy (one observation per tag).
tags_correct_answers_percentage.hist(bins=100)
# Title typo fixed ("ansewrs" -> "answers").
plt.title("Correct answers percentage distribution by tags")
plt.show()

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load only the columns used for modelling, each with an explicit compact dtype.
model_dtypes = {
    'row_id': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_had_explanation': 'boolean',
    'prior_question_elapsed_time': 'float32',
}
model_columns = ['row_id', 'user_id', 'answered_correctly', 'content_id',
                 'prior_question_had_explanation', 'prior_question_elapsed_time']
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=model_columns,
    dtype=model_dtypes,
)

train_df.shape

In [None]:
# Per-user answer count and accuracy, computed over rows with answered_correctly != -1.
# NOTE(review): this runs on the full train_df, before the train/validation split
# further down, so validation rows contribute to these statistics.
answered_rows = train_df[train_df.answered_correctly != -1]
user_df = (
    answered_rows
    .groupby('user_id')['answered_correctly']
    .agg(user_questions='count', user_mean='mean')
    .reset_index()
)
user_df.head()

In [None]:
# Flag users that have at least one lecture row (answered_correctly == -1).
# The flag is computed directly instead of unstacking the value counts and
# renaming to three hard-coded columns, which raised a ValueError whenever
# one of -1 / 0 / 1 was absent from the data.
is_lecture_row = train_df['answered_correctly'].eq(-1)
user_lect = (
    is_lecture_row
    .groupby(train_df['user_id'])
    .any()
    .astype('int64')          # 1 = watched at least one lecture, 0 = none
    .rename('watches_lecture')
    .reset_index()
)
user_lect.head()

In [None]:
# Add the watches_lecture flag to the per-user stats, then drop the helper frame.
user_df = pd.merge(user_df, user_lect, how="left", on="user_id")
del user_lect
user_df.head()

In [None]:
# Per-content answer count and accuracy, computed over rows with answered_correctly != -1.
answered_rows = train_df[train_df.answered_correctly != -1]
content_df = (
    answered_rows
    .groupby('content_id')['answered_correctly']
    .agg(content_questions='count', content_mean='mean')
    .reset_index()
)
content_df.head()

In [None]:
# Show the first rows of the raw modelling frame.
train_df.head()

In [None]:
# row_id sets defining the pre-computed train/validation split.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles from a trusted dataset.
cv2_train, cv2_valid = (
    pd.read_pickle(pickle_path)['row_id']
    for pickle_path in (
        "../input/riidvalidationpickle/cv2_train.pickle",
        "../input/riidvalidationpickle/cv2_valid.pickle",
    )
)

In [None]:
import gc
# Keep answered rows only, then remember the global mean elapsed time
# (computed in float64) for later imputation.
has_answer = train_df.answered_correctly != -1
train_df = train_df[has_answer]
mean_prior = train_df.prior_question_elapsed_time.astype("float64").mean()

# Split by the pre-computed row_id sets; row_id is not needed afterwards.
in_validation = train_df.row_id.isin(cv2_valid)
in_training = train_df.row_id.isin(cv2_train)
validation_df = train_df[in_validation].drop(columns="row_id")
train_df = train_df[in_training].drop(columns="row_id")

del cv2_train, cv2_valid
gc.collect()

In [None]:
# Encoder used by later cells for prior_question_had_explanation.
label_enc = LabelEncoder()

# Attach the per-user and per-content aggregate features to the training rows.
train_df = pd.merge(train_df, user_df, how="left", on="user_id")
train_df = pd.merge(train_df, content_df, how="left", on="content_id")


In [None]:
# Show the first training rows after the user/content feature merges.
train_df.head()

In [None]:
# Impute missing feature values. Each column is re-assigned explicitly:
# chained `df[col].fillna(..., inplace=True)` operates on a temporary Series
# and does not update the frame under pandas Copy-on-Write.
train_fill_values = {
    'content_questions': 0,
    'content_mean': 0.5,
    'watches_lecture': 0,
    'user_questions': 0,
    'user_mean': 0.5,
    'prior_question_elapsed_time': mean_prior,
    'prior_question_had_explanation': False,
}
for column, fill_value in train_fill_values.items():
    train_df[column] = train_df[column].fillna(fill_value)

# Fixed False -> 0 / True -> 1 encoding, independent of which values are present.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].astype(bool).astype(int)
)
train_df[['content_questions', 'user_questions']] = train_df[['content_questions', 'user_questions']].astype(int)
train_df.sample(5)

In [None]:
# Attach the per-user and per-content aggregate features to the validation rows.
validation_df = pd.merge(validation_df, user_df, how="left", on="user_id")
validation_df = pd.merge(validation_df, content_df, how="left", on="content_id")


In [None]:
# Impute missing feature values with the same defaults as the training frame.
# Each column is re-assigned explicitly: chained `df[col].fillna(..., inplace=True)`
# operates on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
validation_fill_values = {
    'content_questions': 0,
    'content_mean': 0.5,
    'watches_lecture': 0,
    'user_questions': 0,
    'user_mean': 0.5,
    'prior_question_had_explanation': False,
    'prior_question_elapsed_time': mean_prior,
}
for column, fill_value in validation_fill_values.items():
    validation_df[column] = validation_df[column].fillna(fill_value)

# Fixed False -> 0 / True -> 1 encoding. Re-fitting a LabelEncoder here could
# assign different codes than on the training data if only one value occurs.
validation_df['prior_question_had_explanation'] = (
    validation_df['prior_question_had_explanation'].astype(bool).astype(int)
)
validation_df[['content_questions', 'user_questions']] = validation_df[['content_questions', 'user_questions']].astype(int)
validation_df.sample(5)

In [None]:
# Row and column count of the training frame at this point.
train_df.shape

In [None]:
# Model input columns, in the order passed to LightGBM.
features = ['user_questions', 'user_mean', 'content_questions', 'content_mean',
            'prior_question_had_explanation', 'prior_question_elapsed_time', 'watches_lecture']

# Train on a reproducible random subsample of at most 5M rows. The cap is
# clamped to the frame length because `sample(n=...)` raises when n exceeds
# the number of available rows.
train_sample_size = min(5000000, len(train_df))
train = train_df.sample(n=train_sample_size, random_state = 1)

y_train = train['answered_correctly']
train = train[features]

y_val = validation_df['answered_correctly']
validation = validation_df[features]

In [None]:
# LightGBM training parameters: binary objective evaluated with AUC.
params = dict(
    objective='binary',
    metric='auc',
    seed=42,
    learning_rate=0.1,
    boosting_type="gbdt",
)

In [None]:
# Wrap the feature matrices and labels as LightGBM datasets (no categorical features).
lgb_train = lgb.Dataset(data=train, label=y_train, categorical_feature=None)
lgb_eval = lgb.Dataset(data=validation, label=y_val, categorical_feature=None)

# Drop the notebook-level names for the raw matrices and trigger a collection.
del train, y_train, validation, y_val
gc.collect()

In [None]:
# Train with early stopping on the validation AUC.
# `verbose_eval` and `early_stopping_rounds` are no longer accepted by
# `lgb.train` in LightGBM 4.x, so the same behaviour is requested through
# callbacks: stop after 8 rounds without improvement, log every 50 rounds.
model = lgb.train(
    params, lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=8),
        lgb.log_evaluation(period=50),
    ],
)

In [None]:
# Plot the trained model's feature importances.
lgb.plot_importance(model)
plt.show()

In [None]:
import riiideducation
# Create the competition environment that serves test batches and receives predictions.
env = riiideducation.make_env()

In [None]:
# Iterator over the test batches; each item is unpacked below as (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Defaults for feature values that are missing after the merges, e.g. for
# users or contents with no row in user_df / content_df.
test_fill_values = {
    'content_questions': 0,
    'content_mean': 0.5,
    'watches_lecture': 0,
    'user_questions': 0,
    'user_mean': 0.5,
    'prior_question_elapsed_time': mean_prior,
    'prior_question_had_explanation': False,
}

for (test_df, sample_prediction_df) in iter_test:
    # Build the same features as for training.
    test_df = test_df.merge(user_df, on = "user_id", how = "left")
    test_df = test_df.merge(content_df, on = "content_id", how = "left")

    # Re-assign each column explicitly: chained `df[col].fillna(..., inplace=True)`
    # does not update the frame under pandas Copy-on-Write.
    for column, fill_value in test_fill_values.items():
        test_df[column] = test_df[column].fillna(fill_value)

    # Fixed False -> 0 / True -> 1 encoding. Re-fitting a LabelEncoder on every
    # batch mapped a batch containing only True to 0, contradicting training.
    test_df['prior_question_had_explanation'] = (
        test_df['prior_question_had_explanation'].astype(bool).astype(int)
    )
    test_df[['content_questions', 'user_questions']] = test_df[['content_questions', 'user_questions']].astype(int)

    test_df['answered_correctly'] =  model.predict(test_df[features])
    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])