## Reading Data and Importing Libraries ##

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
import riiideducation
# import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

In [None]:
# Location of the training data and the dtype requested for each loaded column.
train_csv_path = '/kaggle/input/riiid-test-answer-prediction/train.csv'
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Only the columns at these positions are read (positions 0 and 6 are skipped).
train = pd.read_csv(
    train_csv_path,
    usecols=[1, 2, 3, 4, 5, 7, 8, 9],
    dtype=train_dtypes,
)

In [None]:
# Keep only the rows whose content_type_id is False/0 (drops the True/1 rows).
keep_rows = train['content_type_id'] == False
train = train.loc[keep_rows]

# Order the remaining rows by ascending timestamp and renumber the index.
train = train.sort_values(by='timestamp').reset_index(drop=True)

In [None]:
# Aggregates over the full training frame, computed up front for use at
# prediction time so that `train` can be reduced afterwards.

# Mean correctness per content_id.
results_c_final = (
    train.groupby('content_id')['answered_correctly']
    .agg(['mean'])
    .rename(columns={'mean': 'answered_correctly_content'})
)

# Mean, total and number of answers per user_id.
results_u_final = (
    train.groupby('user_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
    .rename(columns={'mean': 'answered_correctly_user'})
)

In [None]:
# Mean and total correctness per task_container_id over the full training frame.
results_t_final = (
    train.groupby('task_container_id')['answered_correctly']
    .agg(['mean', 'sum'])
    .rename(columns={'mean': 'answered_correctly_task', 'sum': 'sum_task'})
)

In [None]:
# Mean elapsed time, kept as the fill value for missing
# prior_question_elapsed_time entries later on.
time_mean = train['prior_question_elapsed_time'].mean()

## Data Exploration ##

In [None]:
# Mean correctness over rows whose timestamp is exactly 0.
train.loc[train['timestamp'] == 0, 'answered_correctly'].mean()

In [None]:
# Mean correctness over rows whose timestamp is not 0.
train.loc[train['timestamp'] != 0, 'answered_correctly'].mean()

In [None]:
# Mean correctness over rows with 0 < timestamp < 1000000.
early_rows = (train['timestamp'] > 0) & (train['timestamp'] < 1000000)
train.loc[early_rows, 'answered_correctly'].mean()

Are early questions fundamentally different? Comparing the mean correctness rates above, the best answer I could get was: not really.

In [None]:
# Frequency of each prior_question_had_explanation value.
train['prior_question_had_explanation'].value_counts()

In [None]:
# Overall mean of answered_correctly.
train['answered_correctly'].mean()

In [None]:
# These two columns are not referenced on `train` by any later cell.
train = train.drop(columns=['timestamp', 'content_type_id'])

## Creating Validation Set (Most Recent Answers by User) ##

In [None]:
# Empty frame that the following cell fills with the held-out validation rows.
validation = pd.DataFrame()

In [None]:
# Four times over, move each user's last remaining row out of `train` and into
# the validation set, so every user contributes up to 4 of their final rows.
#
# `DataFrame.append` was removed in pandas 2.0, so the pieces are collected in a
# list and concatenated once. An empty starting frame is left out of the
# concatenation so it cannot influence the resulting dtypes.
held_out = [] if validation.empty else [validation]
for _ in range(4):
    last_records = train.drop_duplicates('user_id', keep='last')
    train = train[~train.index.isin(last_records.index)]
    held_out.append(last_records)
validation = pd.concat(held_out)

In [None]:
# Number of rows left in the training frame.
train.shape[0]

In [None]:
# Number of rows held out for validation.
validation.shape[0]

In [None]:
# Mean of answered_correctly in the validation rows.
validation['answered_correctly'].mean()

In [None]:
# Mean of answered_correctly in the rows remaining in `train`.
train['answered_correctly'].mean()

Does it make sense to use last questions as validation? Why is the rate of correct answers so low?

## Extracting Training Data ##

In [None]:
# Empty frame that the following cell fills with the rows used to fit the model.
X = pd.DataFrame()

In [None]:
# Fifteen times over, move each user's last remaining row out of `train` and
# into `X`, so every user contributes up to 15 rows to the model-fitting set.
#
# `DataFrame.append` was removed in pandas 2.0, so the pieces are collected in a
# list and concatenated once. An empty starting frame is left out of the
# concatenation so it cannot influence the resulting dtypes.
sampled = [] if X.empty else [X]
for _ in range(15):
    last_records = train.drop_duplicates('user_id', keep='last')
    train = train[~train.index.isin(last_records.index)]
    sampled.append(last_records)
X = pd.concat(sampled)

In [None]:
# Number of rows extracted into X.
X.shape[0]

In [None]:
# Number of rows still left in `train` after extracting X.
train.shape[0]

In [None]:
# Mean of answered_correctly in X.
X['answered_correctly'].mean()

In [None]:
# Mean of answered_correctly in the rows remaining in `train`.
train['answered_correctly'].mean()

## Aggregating and Shaping Data ##

In [None]:
# Aggregates over the rows still in `train` (the validation and X rows have
# already been removed from it by the earlier cells).

# Mean correctness per content_id.
results_c = (
    train.groupby('content_id')['answered_correctly']
    .agg(['mean'])
    .rename(columns={'mean': 'answered_correctly_content'})
)

# Mean, total and number of answers per user_id.
results_u = (
    train.groupby('user_id')['answered_correctly']
    .agg(['mean', 'sum', 'count'])
    .rename(columns={'mean': 'answered_correctly_user'})
)

In [None]:
# Mean and total correctness per task_container_id over the remaining `train` rows.
results_t = (
    train.groupby('task_container_id')['answered_correctly']
    .agg(['mean', 'sum'])
    .rename(columns={'mean': 'answered_correctly_task', 'sum': 'sum_task'})
)

In [None]:
#clearing memory
#del(train)


In [None]:
# Attach the per-user and per-content aggregates to X; a left join keeps every
# X row even when its key has no aggregate.
X = X.merge(results_u, on='user_id', how='left')
X = X.merge(results_c, on='content_id', how='left')

In [None]:
# Attach the per-task-container aggregates to X (left join keeps every X row).
X = X.merge(results_t, on='task_container_id', how='left')

In [None]:
# Attach the per-user and per-content aggregates to the validation rows; a left
# join keeps every validation row even when its key has no aggregate.
validation = validation.merge(results_u, on='user_id', how='left')
validation = validation.merge(results_c, on='content_id', how='left')

In [None]:
# Attach the per-task-container aggregates to the validation rows (left join).
validation = validation.merge(results_t, on='task_container_id', how='left')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Retained so that any later cell referring to `lb_make` keeps working.
lb_make = LabelEncoder()

# Treat a missing explanation flag as False, then encode False -> 0 / True -> 1.
#
# Two fixes compared with fitting a LabelEncoder on each frame:
#   * the filled column is assigned back explicitly; calling
#     `fillna(..., inplace=True)` on a column selection acts on a temporary under
#     pandas Copy-on-Write and would leave the frame unchanged;
#   * the mapping is a fixed cast rather than a per-frame fit, so the integer
#     codes cannot depend on which values happen to occur in a given frame.
for frame in (X, validation):
    had_explanation = frame['prior_question_had_explanation'].fillna(False)
    frame['prior_question_had_explanation'] = had_explanation
    frame['prior_question_had_explanation_enc'] = had_explanation.astype(bool).astype('int8')

In [None]:
# Separate the label column from the remaining columns for both sets.
label_col = 'answered_correctly'

y, X = X[label_col], X.drop(columns=[label_col])
y_val, X_val = validation[label_col], validation.drop(columns=[label_col])

In [None]:
# Columns fed to the model, in a single shared order for both sets.
feature_cols = [
    'answered_correctly_user',
    'answered_correctly_content',
    'sum',
    'count',
    'answered_correctly_task',
    'sum_task',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]

X = X[feature_cols]
X_val = X_val[feature_cols]

In [None]:
#X['answered_correctly_user'].mean()

In [None]:
#X['answered_correctly_content'].mean()

In [None]:
#X['answered_correctly_content'].mode()

In [None]:
#X['answered_correctly_task'].mean()

In [None]:
#X.isnull().sum()

In [None]:
#X['sum'].mode()

In [None]:
#X['count'].mode()

In [None]:
# Replace missing feature values in X with fixed fallbacks (hard-coded 0.5 / 0.6
# rates, the stored mean elapsed time, and 0 for the explanation flag).
#
# The result is assigned back rather than calling `fillna(..., inplace=True)` on
# a column selection: that chained form acts on a temporary under pandas
# Copy-on-Write and would leave X unchanged.
X = X.fillna({
    'answered_correctly_user': 0.5,
    'answered_correctly_content': 0.6,
    'prior_question_elapsed_time': time_mean,
    'prior_question_had_explanation_enc': 0,
})

In [None]:
# Replace missing feature values in X_val with the same fixed fallbacks used
# for X (hard-coded 0.5 / 0.6 rates, the stored mean elapsed time, and 0 for the
# explanation flag).
#
# The result is assigned back rather than calling `fillna(..., inplace=True)` on
# a column selection: that chained form acts on a temporary under pandas
# Copy-on-Write and would leave X_val unchanged.
X_val = X_val.fillna({
    'answered_correctly_user': 0.5,
    'answered_correctly_content': 0.6,
    'prior_question_elapsed_time': time_mean,
    'prior_question_had_explanation_enc': 0,
})

In [None]:
# Fallbacks for rows with no per-user aggregate: a hard-coded 4.0 for `sum` and
# 0 for `count`. Assigned back instead of using a chained in-place `fillna`,
# which would not modify X under pandas Copy-on-Write.
X = X.fillna({'sum': 4.0, 'count': 0})

In [None]:
# Fallbacks for rows with no per-user aggregate: 0 for `count` and a hard-coded
# 4.0 for `sum`. Assigned back instead of using a chained in-place `fillna`,
# which would not modify X_val under pandas Copy-on-Write.
X_val = X_val.fillna({'count': 0, 'sum': 4.0})

## Modeling ##

In [None]:
import lightgbm as lgb

# Training parameters passed to LightGBM (binary objective).
params = dict(
    objective='binary',
    max_bin=1000,
    learning_rate=0.01,
    num_leaves=80,
)

# Wrap both sets as LightGBM datasets; the evaluation set references the
# training set.
lgb_train = lgb.Dataset(X, label=y)
lgb_eval = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

In [None]:
# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose_eval` keyword
# arguments of `lgb.train`; the same behaviour is requested through callbacks.
# `log_evaluation` replaced `print_evaluation` in LightGBM 3.3, so fall back to
# the older name when the newer one is not available.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Train for up to 20000 rounds, logging every 25 rounds and stopping once the
# validation score has not improved for 50 consecutive rounds.
model = lgb.train(
    params, lgb_train,
    num_boost_round=20000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        log_every(period=25),
    ],
)

In [None]:
#y_pred = model.predict(X_val)
#y_true = np.array(y_val)
#roc_auc_score(y_true, y_pred)

In [None]:
#y_predt = model.predict(X)
#y_truet = np.array(y)
#roc_auc_score(y_truet, y_predt)

We will have to look out for signs of over-fitting.

## Examining Feature Importance ##

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot the trained model's feature importances.
importance_ax = lgb.plot_importance(model)
plt.show()

## Making Predictions for New Data ##

In [None]:
# Environment object from the competition's `riiideducation` module; it supplies
# the test batches and receives the predictions in the cells below.
env = riiideducation.make_env()

In [None]:
# Iterator over the test data; the prediction loop unpacks each item as a
# (test_df, sample_prediction_df) pair.
iter_test = env.iter_test()

In [None]:
# Feature columns passed to the model, in the same order used for training.
feature_cols = [
    'answered_correctly_user', 'answered_correctly_content', 'sum', 'count',
    'answered_correctly_task', 'sum_task', 'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]

# Fallback values for features that are missing after the joins (same constants
# as used when preparing the training features).
fill_values = {
    'answered_correctly_user': 0.5,
    'answered_correctly_content': 0.6,
    'sum': 4.0,
    'count': 0,
    'prior_question_elapsed_time': time_mean,
}

for (test_df, sample_prediction_df) in iter_test:
    # Attach the aggregates computed on the full training data; left joins keep
    # every test row even when its key was never seen.
    test_df = test_df.merge(results_u_final, on='user_id', how='left')
    test_df = test_df.merge(results_c_final, on='content_id', how='left')
    test_df = test_df.merge(results_t_final, on='task_container_id', how='left')

    # Assign the filled frame back: a chained `fillna(..., inplace=True)` on a
    # column selection would not modify test_df under pandas Copy-on-Write.
    test_df = test_df.fillna(fill_values)

    # Encode the explanation flag with a fixed False -> 0 / True -> 1 mapping.
    # Re-fitting a LabelEncoder on every batch would encode True as 0 whenever a
    # batch contains no False value, disagreeing with the training encoding.
    had_explanation = test_df['prior_question_had_explanation'].fillna(False)
    test_df['prior_question_had_explanation'] = had_explanation
    test_df['prior_question_had_explanation_enc'] = had_explanation.astype(bool).astype('int8')

    test_df['answered_correctly'] = model.predict(test_df[feature_cols])

    # Submit predictions only for rows whose content_type_id is 0.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])