# pyDataTable Starter

Following on from Vopani's kernel (https://www.kaggle.com/rohanrao/riiid-with-blazing-fast-rid), which outlines the benefits of pydatatable over pandas in terms of memory usage, I have decided to write up an LGBM model using a datatable processed with pydatatable instead of pandas.

# So, is it better and does it solve memory problems? 

I would say that this is a good alternative for preprocessing the data, but running out of memory still becomes an issue. I'm sure I could have written this notebook to be somewhat more memory-efficient, though...

Also, one drawback is that I had issues using categorical features with LGBM after using pydatatable, which I didn't encounter with pandas, so I ended up just mean-encoding them. Also, the `env.iter_test()` method seems to be compatible only with pandas and not with pydatatable objects, so we need to convert the aggregated pydatatables to pandas dataframes before using them in joins when dealing with the test data, but this is a small fix.

I began writing this before finding out that BigQuery is okay for use in this competition and I think this will be my next approach but I have shared this as a point of interest.

In [None]:
# installation with internet
# !pip install datatable==0.11.0

In [None]:
# Offline installation (no internet): install datatable 0.11.0 from a wheel
# file stored under ../input/python-datatable/.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl

In [None]:
import datatable as dt
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Shared LabelEncoder instance, used further down to encode the
# prior_question_had_explanation column.
lb_make = LabelEncoder()


In [None]:
# saving the dataset in .jay (binary format)
#timestamp	
#dt.fread("../input/riiid-test-answer-prediction/train.csv", skip_to_line=90000000).to_jay("train.jay")
#train = dt.fread("train.jay")

In [None]:
# Read only the columns used below from the competition CSV, cache them in
# datatable's binary .jay format, then load the working frame from that cache.
train_csv_path = "../input/riiid-test-answer-prediction/train.csv"
train_columns = {
    "timestamp",
    "user_id",
    "content_id",
    "content_type_id",
    "answered_correctly",
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
}
# The freshly parsed frame is not bound to a name, so it can be released as
# soon as it has been written to disk.
dt.fread(train_csv_path, columns=train_columns).to_jay("train.jay")
train = dt.fread("train.jay")

In [None]:
from datatable import (dt, f, by, ifelse, update, sort,
                      count, min, max, mean, sum, rowsum)

In [None]:
# Keep only the rows whose content_type_id equals 0.
# NOTE(review): presumably 0 marks questions (as opposed to lectures) --
# confirm against the competition's data description.
train = train[f.content_type_id==0,:]

In [None]:
# Preview the first rows of the filtered frame.
train.head()

In [None]:
# Order the rows by ascending timestamp; Frame.sort returns a new frame.
train = train.sort("timestamp")

In [None]:
def _keyed_group_stat(frame, key_col, stat_expr, stat_name):
    """Aggregate `stat_expr` per `key_col` and return the result keyed on `key_col`.

    The returned frame has the columns [key_col, stat_name] and its key set,
    which is what `dt.join` needs on the right-hand side.
    """
    grouped = frame[:, stat_expr, by(key_col)]
    grouped.names = [key_col, stat_name]
    grouped.key = key_col
    return grouped


# Number of leading rows the aggregate features are computed from:
# 80% of a hard-coded 98,000,000.
num = round(0.8 * 98000000)

# Mean of answered_correctly per user over the first `num` rows.
user_m_all = _keyed_group_stat(
    train[0:num, :], "user_id", mean(f.answered_correctly), "user_mean"
)
train = train[:, :, dt.join(user_m_all)]

# Count of answered_correctly per user over the first `num` rows.
user_c_all = _keyed_group_stat(
    train[0:num, :], "user_id", count(f.answered_correctly), "user_count"
)
train = train[:, :, dt.join(user_c_all)]

# Mean of answered_correctly per content_id over the first `num` rows.
content_m_all = _keyed_group_stat(
    train[0:num, :], "content_id", mean(f.answered_correctly), "content_mean"
)
train = train[:, :, dt.join(content_m_all)]
train.tail()


In [None]:
# Question metadata: cache three columns of questions.csv as .jay and reload.
questions_csv_path = "../input/riiid-test-answer-prediction/questions.csv"
question_columns = {"question_id", "part", "tags"}
dt.fread(questions_csv_path, columns=question_columns).to_jay("questions.jay")
questions = dt.fread("questions.jay")

# Names are assigned positionally; this assumes the frame's columns come back
# in the order question_id, part, tags -- TODO confirm.
questions.names = ["content_id", "part", "tags"]
questions.key = "content_id"
train = train[:, :, dt.join(questions)]

# Mean of answered_correctly per part over the first `num` rows, joined back
# onto train as part_mean.
part_m_all = train[0:num, :][:, mean(f.answered_correctly), by("part")]
part_m_all.names = ["part", "part_mean"]
part_m_all.key = "part"
train = train[:, :, dt.join(part_m_all)]
train.tail()

In [None]:
# Can we get information from the question tag that is valuable?
# Mean of answered_correctly per distinct `tags` value over the first `num`
# rows, joined back onto train as tags_mean.
tag_m_all = train[0:num, :][:, mean(f.answered_correctly), by("tags")]
tag_m_all.names = ["tags", "tags_mean"]
tag_m_all.key = "tags"
train = train[:, :, dt.join(tag_m_all)]


In [None]:
# Preview the first rows after the tags_mean join.
train.head()

In [None]:
# Deal with missing values.

# Columns whose missing entries are replaced by the column mean taken over
# the first `num` rows.
mean_filled_cols = (
    "prior_question_elapsed_time",
    "user_mean",
    "part_mean",
    "content_mean",
    "tags_mean",
)
for col in mean_filled_cols:
    a = dt.math.isna(train[:, col])
    m = train[0:num, col].mean()
    train[a, col] = m

# Columns whose missing entries are replaced by a constant.
for col, constant in (
    ("user_count", 0),
    ("prior_question_had_explanation", False),
):
    a = dt.math.isna(train[:, col])
    train[a, col] = constant

# Whole-frame means of the filled columns, kept for filling the test data later.
m1, m2, m3, m4, m5 = (train[:, col].mean() for col in mean_filled_cols)

In [None]:
# Inspect the last rows after missing-value handling.
train.tail()

In [None]:
# Label-encode prior_question_had_explanation into a new column; this call
# also (re)fits lb_make on the train values.
train["prior_question_had_explanation_enc"] = lb_make.fit_transform(train["prior_question_had_explanation"])

In [None]:
# Mean of answered_correctly per encoded prior_question_had_explanation value
# over the first `num` rows, joined back onto train as pq_mean.
pq_key = "prior_question_had_explanation_enc"
pq_m_all = train[0:num, :][:, mean(f.answered_correctly), by(pq_key)]
pq_m_all.names = [pq_key, "pq_mean"]
pq_m_all.key = pq_key
train = train[:, :, dt.join(pq_m_all)]


In [None]:
# Preview the first rows after the pq_mean join.
train.head()

In [None]:
# Hold-out set for out-of-sample testing: every row from index 98,000,000 on.
holdout_feature_cols = [
    "user_mean",
    "user_count",
    "content_mean",
    "part_mean",
    "prior_question_elapsed_time",
    "tags_mean",
]
holdout = train[98000000:, :]
X_hold = holdout[:, holdout_feature_cols].to_numpy()
# Flatten the single-column target to a 1-D array.
Y_hold = holdout[:, "answered_correctly"].to_numpy().ravel()

In [None]:
# Feature frames: training rows are [30,000,000, num), validation rows are
# [num, 98,000,000).
split_feature_cols = [
    "user_mean",
    "user_count",
    "content_mean",
    "part_mean",
    "prior_question_elapsed_time",
    "tags_mean",
]
X_train = train[30000000:num, split_feature_cols]
X_val = train[num:98000000, split_feature_cols]

In [None]:
# 1-D target arrays for the same row ranges as X_train and X_val.
target_col = "answered_correctly"
Y_train = train[30000000:num, target_col].to_numpy().ravel()
Y_val = train[num:98000000, target_col].to_numpy().ravel()

In [None]:
# Preview the first rows of the training feature frame.
X_train.head()

In [None]:
# LGBM model
import lightgbm as lgb

# Booster settings for a binary objective.
params = dict(
    objective='binary',
    max_bin=600,
    learning_rate=0.02,
    num_leaves=80,
)

# The validation Dataset references the training Dataset.
lgb_train = lgb.Dataset(X_train, label=Y_train)
lgb_eval = lgb.Dataset(X_val, label=Y_val, reference=lgb_train)

# At most 1000 boosting rounds, evaluation logged every 10 rounds, early
# stopping with a patience of 10 rounds.
train_options = {
    "num_boost_round": 1000,
    "early_stopping_rounds": 10,
    "verbose_eval": 10,
}
model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    **train_options,
)


In [None]:
# Evaluate: ROC AUC of the model's predictions on the hold-out set.
from sklearn.metrics import roc_auc_score

y_true = Y_hold
y_pred = model.predict(X_hold)
holdout_auc = roc_auc_score(y_true, y_pred)
print(holdout_auc)

In [None]:
# Preview train once more before it is trimmed for the test-time aggregates.
train.head()

In [None]:
# Let's prepare the test file and check the methodology works.
import pandas as pd

# Reduce train to the columns needed to rebuild the aggregates, dropping the
# previously joined *_mean / user_count feature columns.
kept_train_cols = [
    "user_id",
    "content_id",
    "content_type_id",
    "answered_correctly",
    "prior_question_elapsed_time",
    "prior_question_had_explanation_enc",
    "part",
    "tags",
]
train = train[:, kept_train_cols]

# The example test file is read with pandas, not datatable.
example_test_path = "../input/riiid-test-answer-prediction/example_test.csv"
test = pd.read_csv(example_test_path)

In [None]:
# Preview the first rows of the example test file.
test.head()

In [None]:
def _group_stat_as_pandas(key_col, stat_expr, stat_name):
    """Aggregate `stat_expr` per `key_col` over train[0:num] as a pandas DataFrame.

    The result has the columns [key_col, stat_name] so that it can be
    left-merged onto a pandas test frame.
    """
    grouped = train[0:num, :][:, stat_expr, by(key_col)]
    grouped.names = [key_col, stat_name]
    return grouped.to_pandas()


# The aggregates are now computed over every row of train.
num = train.shape[0]

user_m_all = _group_stat_as_pandas("user_id", mean(f.answered_correctly), "user_mean")
test = pd.merge(test, user_m_all, on=["user_id"], how="left")

user_c_all = _group_stat_as_pandas("user_id", count(f.answered_correctly), "user_count")
test = pd.merge(test, user_c_all, on=["user_id"], how="left")

content_m_all = _group_stat_as_pandas("content_id", mean(f.answered_correctly), "content_mean")
test = pd.merge(test, content_m_all, on=["content_id"], how="left")

# part and tags come from the question metadata, so it is merged before the
# aggregates that are keyed on them.
questions = questions.to_pandas()
test = pd.merge(test, questions, on=["content_id"], how="left")

part_m_all = _group_stat_as_pandas("part", mean(f.answered_correctly), "part_mean")
test = pd.merge(test, part_m_all, on=["part"], how="left")

tag_m_all = _group_stat_as_pandas("tags", mean(f.answered_correctly), "tags_mean")
test = pd.merge(test, tag_m_all, on=["tags"], how="left")

test.head()

In [None]:
# Deal with missing values in the example test frame.

# m1..m5 are single-element frames holding the train means; take element 0
# explicitly to get plain Python floats.
test_fill_values = {
    "prior_question_elapsed_time": float(m1.to_numpy().reshape(-1)[0]),
    "user_count": 0,
    "user_mean": float(m2.to_numpy().reshape(-1)[0]),
    "part_mean": float(m3.to_numpy().reshape(-1)[0]),
    "content_mean": float(m4.to_numpy().reshape(-1)[0]),
    # tags_mean is a model feature and is mean-filled on train, so it is
    # filled here as well.
    "tags_mean": float(m5.to_numpy().reshape(-1)[0]),
    "prior_question_had_explanation": False,
}
# fillna returns a new object, so the result has to be assigned back; calling
# it without assignment (or inplace on a selected column) can leave the frame
# unchanged.
test = test.fillna(value=test_fill_values)

# Encode without re-fitting the label encoder on test data: a re-fit derives
# the codes from whatever values this particular frame contains, so they can
# differ from the train encoding. A direct bool -> int cast always yields
# False -> 0, True -> 1.
# NOTE(review): this matches the train encoding provided the train column held
# both False and True when lb_make was fitted -- confirm.
test["prior_question_had_explanation_enc"] = (
    test["prior_question_had_explanation"].astype(bool).astype(int)
)

# Mean of answered_correctly per encoded value over train[0:num], merged onto
# test as pq_mean.
pq_m_all = train[0:num, :]
pq_m_all = pq_m_all[:, mean(f.answered_correctly), by('prior_question_had_explanation_enc')]
pq_m_all.names = ["prior_question_had_explanation_enc", "pq_mean"]
pq_m_all = pq_m_all.to_pandas()
test = pd.merge(test, pq_m_all, on=["prior_question_had_explanation_enc"], how="left")

test.head()

In [None]:
# Score the example test rows with the same six features, in the same order,
# that the model was trained on.
example_feature_cols = [
    "user_mean",
    "user_count",
    "content_mean",
    "part_mean",
    "prior_question_elapsed_time",
    "tags_mean",
]
test['answered_correctly'] = model.predict(test[example_feature_cols])

In [None]:
# Preview the example test rows together with their predictions.
test.head()

In [None]:
# Run on the test set: create the competition environment and obtain the
# iterator that yields the test batches consumed by the loop below.
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Loop-invariant setup, computed once instead of once per batch.

# Lookup tables (already converted to pandas) and the key each is merged on.
# `questions` must come before the part/tags tables because it supplies the
# `part` and `tags` columns they are keyed on.
batch_lookups = (
    (user_m_all, "user_id"),
    (user_c_all, "user_id"),
    (content_m_all, "content_id"),
    (questions, "content_id"),
    (part_m_all, "part"),
    (tag_m_all, "tags"),
)

# Fill values matching the handling applied to train: train means for the
# mean-encoded features and elapsed time, 0 for user_count. m1..m5 are
# single-element frames, so element 0 is taken explicitly.
batch_fill_values = {
    "prior_question_elapsed_time": float(m1.to_numpy().reshape(-1)[0]),
    "user_count": 0,
    "user_mean": float(m2.to_numpy().reshape(-1)[0]),
    "part_mean": float(m3.to_numpy().reshape(-1)[0]),
    "content_mean": float(m4.to_numpy().reshape(-1)[0]),
    "tags_mean": float(m5.to_numpy().reshape(-1)[0]),
}

batch_feature_cols = [
    "user_mean",
    "user_count",
    "content_mean",
    "part_mean",
    "prior_question_elapsed_time",
    "tags_mean",
]

for (test, sample_prediction_df) in iter_test:

    # Join the precomputed aggregates onto the batch; left merges keep every
    # test row.
    for lookup, key_col in batch_lookups:
        test = pd.merge(test, lookup, on=[key_col], how="left")

    # NA fill. fillna returns a new frame, so the result is assigned back;
    # previously the user_count fill was discarded and tags_mean was never
    # filled.
    test = test.fillna(value=batch_fill_values)

    # pq_mean is not one of the model's features, so
    # prior_question_had_explanation is neither encoded nor merged here.

    # Predict
    test['answered_correctly'] = model.predict(test[batch_feature_cols])
    env.predict(test.loc[test['content_type_id'] == 0, ['row_id', 'answered_correctly']])