In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# datatable installation
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

import riiideducation
import gc
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict
import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
# Create the competition environment object; later cells use it for
# env.iter_test() (test batches) and env.predict() (submitting predictions).
env = riiideducation.make_env()

Load data using datatable

In [None]:
%%time
# Columns of train.csv used by this notebook, mapped to the compact dtype each
# one is cast to after loading.
train_data_types = {
    'row_id': 'int32',
    'timestamp': 'int64',
    'user_id': 'int32', 
    'content_id': 'int16', 
    'answered_correctly': 'int8', 
    'prior_question_had_explanation': 'bool',
}
# Read only the columns named above with datatable, then convert to pandas.
train_df = dt.fread("../input/riiid-test-answer-prediction/train.csv", columns=set(train_data_types.keys())).to_pandas()
# Cast one column at a time to the dtypes declared above.
# NOTE(review): astype('bool') leaves no missing values behind, so if
# prior_question_had_explanation contains missing entries they are coerced here
# and the later fillna in merge_fill_na has nothing to fill — confirm the
# resulting encoding matches what the test loop produces.
for column, dtype in train_data_types.items():
    train_df[column] = train_df[column].astype(dtype) 
# Last expression of the cell: per-column memory usage of the loaded frame.
train_df.memory_usage(deep=True)

In [None]:
def correct(field, df=None):
    """Return the share of correct answers for every value of ``field``.

    Rows whose ``answered_correctly`` equals -1 are ignored. Groups that have
    only right or only wrong answers are counted with 0 for the missing
    outcome, so they yield 1.0 / 0.0 instead of NaN.

    Parameters
    ----------
    field : str
        Name of the column to group by.
    df : pandas.DataFrame, optional
        Frame containing ``field`` and ``answered_correctly``. When omitted,
        the notebook-level ``train_df`` is used.

    Returns
    -------
    pandas.Series
        Series named ``Percent_correct`` (fraction of correct answers, rounded
        to two decimals), indexed by ``field`` and sorted in descending order.
    """
    if df is None:
        df = train_df

    answered = df[df['answered_correctly'] != -1]
    counts = (
        answered
        .groupby([field, 'answered_correctly'])
        .size()
        .unstack(fill_value=0)
        # Guarantee both outcome columns exist even if one never occurs.
        .reindex(columns=[0, 1], fill_value=0)
    )
    wrong, right = counts[0], counts[1]
    percent_correct = (right / (wrong + right)).round(2).rename('Percent_correct')
    return percent_correct.sort_values(ascending=False)

In [None]:
%%time
# Labels for six quantile-based timestamp buckets.
group_labels_6 = [
    'Group_1', 'Group_2', 'Group_3',
    'Group_4', 'Group_5', 'Group_6',
]
# Assign each row to one of len(group_labels_6) quantile buckets of its timestamp.
train_df['timestamp_group'] = pd.qcut(
    x=train_df['timestamp'],
    q=len(group_labels_6),
    labels=group_labels_6,
)

In [None]:
# Timestamp threshold used in the inference loop: test rows with
# timestamp <= this value are flagged as new_users.
# NOTE(review): hard-coded value; presumably the upper edge of 'Group_1' from the
# six-way qcut on the training timestamps — confirm it still matches the data.
new_user_cut_point = 177725917.833

In [None]:
%%time
# Flag rows that fall in the first timestamp bucket, then drop the helper column.
train_df['new_users'] = (train_df['timestamp_group'] == 'Group_1').to_numpy()
train_df.drop(columns=['timestamp_group'], inplace=True)

In [None]:
# Load the lecture and question metadata tables from the competition dataset.
lectures, questions = (
    pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv'),
    pd.read_csv('../input/riiid-test-answer-prediction/questions.csv'),
)

In [None]:
# Turn each question's space-separated tag string into a list of tag tokens
# (a missing value is stringified first and therefore becomes ['nan']).
questions['tags'] = [tag_string.split() for tag_string in questions['tags'].astype(str)]

# Count wrong (0) and right (1) answers per question, ignoring rows marked -1.
# The table gets its own name so it does not overwrite the `correct()` helper
# defined earlier in the notebook.
question_outcomes = (
    train_df[train_df['answered_correctly'] != -1]
    .groupby(['content_id', 'answered_correctly'])
    .size()
    .unstack(fill_value=0)
    # Make sure both outcome columns exist before they are renamed.
    .reindex(columns=[0, 1], fill_value=0)
)
question_outcomes.columns = ['Wrong', 'Right']
question_outcomes = question_outcomes.astype(int)

# Attach the counts to the question metadata; questions that never appear in
# train_df keep missing values in Wrong/Right.
questions = questions.merge(question_outcomes, left_on="question_id", right_on="content_id", how="left")
questions.head()

In [None]:
# Collect every distinct tag token. The 'nan' token (produced when a missing
# tags value was stringified) is not a real tag and is skipped; the earlier
# `questions.tags != "nan"` filter compared list-valued cells with a string and
# so never excluded anything.
tags = sorted({tag for tag_list in questions['tags'] for tag in tag_list if tag != 'nan'})

# Build one summary record per tag and create the frame once at the end:
# DataFrame.append inside a loop is quadratic and was removed in pandas 2.0.
tag_records = []
for tag in tags:
    tagged = questions[questions['tags'].apply(lambda tag_list: tag in tag_list)]
    wrong = int(tagged['Wrong'].sum())
    right = int(tagged['Right'].sum())
    tag_records.append({
        'tag': tag,
        'Wrong': wrong,
        'Right': right,
        'Total_questions': wrong + right,
        'Question_ids_with_tag': len(tagged),
    })

# Explicit columns keep the frame well-formed even when no tags were found.
tags_df = pd.DataFrame(
    tag_records,
    columns=['tag', 'Wrong', 'Right', 'Total_questions', 'Question_ids_with_tag'],
).set_index('tag')
tags_df['Percent_correct'] = tags_df['Right'] / tags_df['Total_questions']
tags_df = tags_df.sort_values(by="Percent_correct")

tags_df.head()

In [None]:
# tags_df is sorted ascending by Percent_correct, so the first rows are the
# hardest tags and the last rows the easiest. Positions are derived from the
# actual frame length instead of a hard-coded row number, and the column is
# selected by name rather than by position.
n_tags = len(tags_df)
select_rows = list(range(min(10, n_tags))) + list(range(max(n_tags - 10, 10), n_tags))
tags_select = tags_df['Percent_correct'].iloc[select_rows]

fig, ax = plt.subplots(figsize=(12, 6))
# Red bars for tags answered correctly less than 60% of the time, green otherwise.
clrs = ['red' if value < 0.6 else 'green' for value in tags_select.values]
tags_select.plot.bar(ax=ax, color=clrs)
ax.set_title("Ten hardest and ten easiest tags")
ax.set_xlabel("Tag")
ax.set_ylabel("Percent answers correct of questions with the tag")
plt.xticks(rotation=90)
plt.show()

In [None]:
%%time
# Per-user features: number of answered questions and mean correctness,
# computed over rows whose answered_correctly is not -1.
# NOTE(review): this runs before the train/validation split further down, so
# validation rows contribute to these statistics — confirm that is intended.
user_df = (
    train_df.loc[train_df['answered_correctly'] != -1]
    .groupby('user_id')['answered_correctly']
    .agg(user_questions='count', user_mean='mean')
    .reset_index()
)

# Row counts per user and outcome; the three outcome columns (-1, 0, 1) are
# renamed to Lecture / Wrong / Right.
outcome_counts = train_df.groupby(["user_id", "answered_correctly"]).size().unstack()
outcome_counts.columns = ['Lecture', 'Wrong', 'Right']

# A user has watched a lecture when at least one 'Lecture' row exists for them.
user_lect = (
    (outcome_counts['Lecture'].fillna(0) > 0)
    .rename('Watched_lecture')
    .reset_index()
)
user_df = user_df.merge(user_lect, on="user_id", how="left")
del outcome_counts, user_lect
user_df.head()

In [None]:
%%time
# Per-content features: how often each content_id was answered and its mean
# correctness, computed over rows whose answered_correctly is not -1.
content_df = (
    train_df.loc[train_df['answered_correctly'] != -1]
    .groupby('content_id')['answered_correctly']
    .agg(content_questions='count', content_mean='mean')
    .reset_index()
)
content_df.head()

In [None]:
%%time
# Load the pre-computed row_id collections that define the train/validation split.
# Context managers make sure both file handles are closed once read.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load these from a trusted dataset.
with open("../input/cv-index-for-riiid/train_index.pkl", 'rb') as train_index_file:
    cv2_train = pickle.load(train_index_file)
with open("../input/cv-index-for-riiid/valid_index.pkl", 'rb') as valid_index_file:
    cv2_valid = pickle.load(valid_index_file)

In [None]:
# Force a garbage-collection pass; the return value is shown as the cell output.
gc.collect()

In [None]:
%%time
# Partition the rows into validation and training frames by row_id membership
# in the loaded index collections; row_id is dropped right after selection.
validation_df = train_df.loc[train_df['row_id'].isin(cv2_valid)].drop(columns="row_id")
train_df = train_df.loc[train_df['row_id'].isin(cv2_train)].drop(columns="row_id")

# The index collections are no longer needed once the split is done.
del cv2_train, cv2_valid
gc.collect()

In [None]:
%%time
# Attach the per-user and per-content features to every training row
# (left joins keep rows without a match, leaving missing values).
train_df = (
    train_df
    .merge(user_df, on="user_id", how="left")
    .merge(content_df, on="content_id", how="left")
)
train_df.sample(5)

In [None]:
%%time
# Attach the per-user and per-content features to every validation row
# (left joins keep rows without a match, leaving missing values).
validation_df = (
    validation_df
    .merge(user_df, on="user_id", how="left")
    .merge(content_df, on="content_id", how="left")
)
validation_df.sample(5)

In [None]:
# Force a garbage-collection pass; the return value is shown as the cell output.
gc.collect()

In [None]:
# Preview the first rows of the training frame after the feature merges.
train_df.head()

In [None]:
def merge_fill_na(df):
    """Fill missing feature values left by the left joins and normalise dtypes.

    The frame is modified in place and also returned.

    Defaults applied to missing values:
      * ``content_questions`` / ``user_questions`` -> 0 (then cast to int)
      * ``content_mean`` / ``user_mean`` -> 0.5
      * ``new_users`` -> True
      * ``Watched_lecture`` -> False
      * ``prior_question_had_explanation`` -> True

    Columns are reassigned explicitly rather than using
    ``df[col].fillna(..., inplace=True)``: that chained in-place form warns in
    recent pandas and does not update ``df`` under Copy-on-Write. The flag
    columns are cast to ``bool`` so they never remain ``object`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all seven columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns filled and cast.
    """
    count_defaults = {'content_questions': 0, 'user_questions': 0}
    mean_defaults = {'content_mean': 0.5, 'user_mean': 0.5}
    flag_defaults = {
        'new_users': True,
        'Watched_lecture': False,
        'prior_question_had_explanation': True,
    }

    for column, default in count_defaults.items():
        df[column] = df[column].fillna(default).astype(int)
    for column, default in mean_defaults.items():
        df[column] = df[column].fillna(default)
    for column, default in flag_defaults.items():
        df[column] = df[column].fillna(default).astype(bool)
    return df

In [None]:
# Load the example test batch and the matching sample submission file.
example_test, example_sample_submission = (
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv'),
    pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv'),
)

In [None]:
%%time
# Fill the gaps left by the feature joins, first in the training frame and
# then in the validation frame.
train_df, validation_df = map(merge_fill_na, (train_df, validation_df))

In [None]:
%%time
#build final train/validation set
features = ['content_id', 'prior_question_had_explanation', 'Watched_lecture', 'new_users', 
            'user_questions', 'user_mean', 'content_questions', 'content_mean']

# Down-sample the training rows with a fixed seed. The sample size is capped at
# the frame length so a smaller training frame no longer raises in sample().
train_df = train_df.sample(n=min(10000000, len(train_df)), random_state=1)

# Rows with answered_correctly == -1 (lecture rows) carry no 0/1 target, so they
# are excluded from the model's training and evaluation labels. This is a no-op
# when the split already contains question rows only.
train_df = train_df[train_df['answered_correctly'] != -1]
validation_df = validation_df[validation_df['answered_correctly'] != -1]

y_train = train_df['answered_correctly']
train = train_df[features]

y_val = validation_df['answered_correctly']
validation = validation_df[features]

In [None]:

# LightGBM settings: binary objective evaluated with AUC, fixed seed.
# learning_rate and boosting_type are spelled out although they are the defaults.
params = dict(
    objective='binary',
    metric='auc',
    seed=2020,
    learning_rate=0.1,
    boosting_type='gbdt',
)

In [None]:
# Wrap the feature frames and labels as LightGBM datasets; only
# prior_question_had_explanation is declared as a categorical feature.
lgb_train = lgb.Dataset(
    data=train,
    label=y_train,
    categorical_feature=['prior_question_had_explanation'],
)
lgb_eval = lgb.Dataset(
    data=validation,
    label=y_val,
    categorical_feature=['prior_question_had_explanation'],
)

# Drop the notebook-level references to the raw frames and labels.
del train, y_train, validation, y_val
gc.collect()

In [None]:
%%time
# Train the booster: up to 10000 rounds, evaluated on both the training and the
# validation dataset, logging every 50 rounds, with early stopping patience of 8.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=8,
    verbose_eval=50,
)

In [None]:
# Iterator over the test batches served by the competition environment; each
# item is unpacked in the prediction loop as (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
# Score every test batch and hand the predictions back to the environment.
for test_df, sample_prediction_df in iter_test:
    # Flag rows whose timestamp does not exceed the stored cut point.
    test_df['new_users'] = (test_df['timestamp'] <= new_user_cut_point).to_numpy()

    # Attach the per-user / per-content features, then fill the gaps left by
    # users or contents that have no precomputed statistics.
    test_df = (
        test_df
        .merge(user_df, on="user_id", how="left")
        .merge(content_df, on="content_id", how="left")
    )
    test_df = merge_fill_na(test_df)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype('bool')

    # Predict for all rows, but submit only rows with content_type_id == 0.
    test_df['answered_correctly'] = model.predict(test_df[features])
    is_question = test_df['content_type_id'] == 0
    env.predict(test_df.loc[is_question, ['row_id', 'answered_correctly']])