In [1]:
# Most of this code is adapted from the kernel https://www.kaggle.com/lgreig/simple-lgbm-baseline

import riiideducation
import dask.dataframe as dd
import  pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Competition environment handle. The last cell of the notebook pulls test
# batches from it (env.iter_test()) and submits predictions through it
# (env.predict()).
env = riiideducation.make_env()


# train.csv

row_id: (int64) ID code for the row.

timestamp: (int64) the time between this user interaction and the first event completion from that user.

user_id: (int32) ID code for the user.

content_id: (int16) ID code for the user interaction

content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.

task_container_id: (int16) ID code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.

answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.

prior_question_elapsed_time: (float32) The average time it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.

prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [2]:
# Load only the columns used downstream; row_id, task_container_id and
# user_answer are not needed. Columns are selected by name from the dtype map
# so the column list and the dtypes cannot drift apart.
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=list(train_dtypes),
    dtype=train_dtypes,
)

# Keep question rows only (content_type_id is 0 for questions, 1 for lectures).
train = train[train.content_type_id == 0]

# Order the interactions chronologically.
train = train.sort_values('timestamp', ascending=True)

# Both helper columns have served their purpose above.
train = train.drop(columns=['timestamp', 'content_type_id'])

# Per-question statistic: one row per unique content_id holding the mean of
# answered_correctly over all of its rows.
results_c = train[['content_id', 'answered_correctly']].groupby('content_id').agg(['mean'])
results_c.columns = ["answered_correctly_content"]

# Per-user statistics: mean of answered_correctly and its sum (the number of
# correct answers) for every user_id.
results_u = train[['user_id', 'answered_correctly']].groupby('user_id').agg(['mean', 'sum'])
results_u.columns = ["answered_correctly_user", 'sum']

In [3]:
# Sanity check of the two aggregate tables: print each table's shape and its
# first rows. results_c is indexed by content_id, results_u by user_id.
print("results_c shape = ",results_c.shape)
print(results_c.head(),"\n\n\n")
print("results_u shape = ",results_u.shape)
print(results_u.head())

results_c shape =  (13523, 1)
            answered_correctly_content
content_id                            
0                             0.907721
1                             0.890646
2                             0.554281
3                             0.779437
4                             0.613215 



results_u shape =  (393656, 2)
         answered_correctly_user   sum
user_id                               
115                     0.695652  32.0
124                     0.233333   7.0
2746                    0.578947  11.0
5382                    0.672000  84.0
8623                    0.642202  70.0


In [4]:
# Size of the frame that remains after keeping question rows only and dropping
# the timestamp / content_type_id columns.
print(train.shape)

(99271300, 5)


In [5]:
# Build the modelling frame from the tail of `train` (rows 90,000,000 onward,
# i.e. only part of the data), attach the per-user and per-question aggregates
# with left joins, discard rows whose label is -1 (the null marker), and order
# the result by user_id.
# NOTE(review): results_u / results_c were aggregated over all of `train`,
# which includes these very rows and their answered_correctly labels, so the
# aggregate features leak the target and the validation score is likely
# optimistic.
X = (
    train.iloc[90000000:, :]
    .merge(results_u, on=['user_id'], how="left")
    .merge(results_c, on=['content_id'], how="left")
    .loc[lambda frame: frame.answered_correctly != -1]
    .sort_values(['user_id'], ascending=True)
)

# Split the label off into its own single-column frame.
Y = X[["answered_correctly"]]
X = X.drop(columns=["answered_correctly"])

In [6]:
# Preview the merged frame: the raw columns plus the per-user and per-question
# aggregate columns.
X.head()

Unnamed: 0,user_id,content_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly_user,sum,answered_correctly_content
1119023,138650,4931,16000.0,True,0.713628,775.0,0.42679
1119056,138650,9113,22000.0,True,0.713628,775.0,0.430757
1119070,138650,4079,27000.0,True,0.713628,775.0,0.421442
1119081,138650,5000,14000.0,True,0.713628,775.0,0.469838
1119089,138650,3609,8000.0,True,0.713628,775.0,0.752486


In [7]:
from sklearn.preprocessing import LabelEncoder

# Turn the explanation flag into integer codes.
lb_make = LabelEncoder()
explanation_codes = lb_make.fit_transform(X["prior_question_had_explanation"])

# Pick the model inputs explicitly instead of dropping the unused columns
# (user_id, content_id, the raw flag): this pins the feature order, which must
# be the same everywhere the model is fed.
X = X.assign(prior_question_had_explanation_enc=explanation_codes)[
    [
        'answered_correctly_user',
        'answered_correctly_content',
        'sum',
        'prior_question_elapsed_time',
        'prior_question_had_explanation_enc',
    ]
]

X.head()


Unnamed: 0,answered_correctly_user,answered_correctly_content,sum,prior_question_elapsed_time,prior_question_had_explanation_enc
1119023,0.713628,0.42679,775.0,16000.0,1
1119056,0.713628,0.430757,775.0,22000.0,1
1119070,0.713628,0.421442,775.0,27000.0,1
1119081,0.713628,0.469838,775.0,14000.0,1
1119089,0.713628,0.752486,775.0,8000.0,1


In [8]:
# Report whether any feature value is missing, replace missing values with
# 0.5, then report again.
print("Before : ", X.isnull().to_numpy().any())
X = X.fillna(0.5)
print("After  : ", X.isnull().to_numpy().any())

Before :  False
After  :  False


In [9]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hold out the last 5% of the rows for validation. shuffle=False keeps the
# existing row order, and X was sorted by user_id when it was built, so the
# validation rows are the ones with the highest user_ids.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.05, shuffle=False)

params = {
    'objective': 'binary',
    'max_bin': 600,
    'learning_rate': 0.1,
    'num_leaves': 80
}

lgb_train = lgb.Dataset(X_train, Y_train)
lgb_eval = lgb.Dataset(X_val, Y_val, reference=lgb_train)

# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0, while the callbacks exist in old and new
# releases. `log_evaluation` replaced `print_evaluation` in LightGBM 3.3, so
# fall back to the old name when the new one is missing.
if hasattr(lgb, "log_evaluation"):
    log_evaluation = lgb.log_evaluation
else:
    log_evaluation = lgb.print_evaluation

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    # Upper bound only; early stopping ends training once the validation
    # score has not improved for 10 consecutive rounds.
    num_boost_round=100000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        log_evaluation(period=10),  # print the metrics every 10 rounds
    ],
)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.547892	valid_1's binary_logloss: 0.54318
[20]	training's binary_logloss: 0.534266	valid_1's binary_logloss: 0.529325
[30]	training's binary_logloss: 0.531477	valid_1's binary_logloss: 0.526592
[40]	training's binary_logloss: 0.530687	valid_1's binary_logloss: 0.525992
[50]	training's binary_logloss: 0.53029	valid_1's binary_logloss: 0.52585
[60]	training's binary_logloss: 0.529982	valid_1's binary_logloss: 0.525783
Early stopping, best iteration is:
[58]	training's binary_logloss: 0.530051	valid_1's binary_logloss: 0.52578


In [10]:
# ROC AUC of the trained model on the held-out validation rows.
y_true = Y_val.to_numpy()
y_pred = model.predict(X_val)
roc_auc_score(y_true, y_pred)

0.7596138747797274

In [11]:
# Load the example test file shipped with the competition data and preview its
# columns.
test =  pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
test.head()

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,
3,3,0,62798072960,288641214,13262,0,266,23000.0,True,,
4,4,0,10585422061,1728340777,6119,0,162,72400.0,True,,


In [12]:
# Dry run of the inference preprocessing on example_test.csv, mirroring the
# submission loop: attach the aggregates, fill the gaps left by unseen users
# and questions, encode the explanation flag, and predict.
feature_cols = [
    'answered_correctly_user',
    'answered_correctly_content',
    'sum',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]

test = pd.merge(test, results_u, on=['user_id'], how="left")
test = pd.merge(test, results_c, on=['content_id'], how="left")

# Users / questions absent from the training aggregates: neutral 0.5 accuracy
# and zero correct answers (the same fill values the submission loop uses).
test = test.fillna({
    'answered_correctly_user': 0.5,
    'answered_correctly_content': 0.5,
    'sum': 0,
})

# Encode the flag with a fixed mapping (False/missing -> 0, True -> 1), which
# is the coding the training features show (True was encoded as 1).
# Re-fitting the LabelEncoder here would derive the codes from whatever values
# happen to be present in this file and could map True to 0.
test["prior_question_had_explanation_enc"] = (
    test["prior_question_had_explanation"]
    .astype("boolean")
    .fillna(False)
    .astype("int8")
)

y_pred = model.predict(test[feature_cols])

test['answered_correctly'] = y_pred

# results_c / results_u are not recomputed here: `train` has not changed since
# they were first built, so a second pass would reproduce identical tables.

In [13]:
# Feature order must match the order the model was trained with.
feature_cols = [
    'answered_correctly_user',
    'answered_correctly_content',
    'sum',
    'prior_question_elapsed_time',
    'prior_question_had_explanation_enc',
]

# Submission loop: the environment hands over one batch of test rows at a
# time, and every batch must be answered through env.predict().
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.merge(test_df, results_u, on=['user_id'], how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'], how="left")

    # Users / questions absent from the training aggregates get a neutral 0.5
    # accuracy and zero correct answers. fillna returns a new frame; the
    # chained `col.fillna(..., inplace=True)` form is not guaranteed to write
    # back to the frame under pandas Copy-on-Write.
    test_df = test_df.fillna({
        'answered_correctly_user': 0.5,
        'answered_correctly_content': 0.5,
        'sum': 0,
    })

    # Fixed mapping False/missing -> 0, True -> 1 (the coding the training
    # features show). Re-fitting a LabelEncoder on each batch would assign
    # codes from the values present in that batch, so a batch containing only
    # True would encode True as 0.
    test_df["prior_question_had_explanation_enc"] = (
        test_df["prior_question_had_explanation"]
        .astype("boolean")
        .fillna(False)
        .astype("int8")
    )

    test_df['answered_correctly'] = model.predict(test_df[feature_cols])

    # Submit only question rows (content_type_id == 0); .loc selects those
    # rows and the two columns being submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])