In [None]:
# Install datatable 0.11.0 from a local wheel file under ../input;
# pip's standard output is redirected to /dev/null to keep the cell quiet.
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null

Load Data

In [None]:
import datatable as dt
import pandas as pd
import numpy as np

In [None]:
# Read the training CSV with datatable's fread and hand it over to pandas.
train_csv_path = '../input/riiid-test-answer-prediction/train.csv'
train = dt.fread(train_csv_path).to_pandas()

In [None]:
# Show the first rows of the loaded frame as the cell's output.
train.head()

In [None]:
# Print max and min of each id column (user first, then task container),
# followed by the total number of rows.
for id_column in ('user_id', 'task_container_id'):
    print(train[id_column].max())
    print(train[id_column].min())
print(len(train))

Take a Sample by User ID

In [None]:
# Keep only rows whose content_type_id is 0 and whose user_id does not
# exceed 21474828, then report how many rows are left.
content_type_is_zero = train['content_type_id'] == 0
user_in_sample = train['user_id'] <= 21474828
train = train[content_type_is_zero & user_in_sample]
print(len(train))

In [None]:
# Order rows by user and then by timestamp so the shift-based lag features
# below look back along each user's own sequence. sort_values returns a new
# frame, so the result has to be assigned back; a bare call leaves the row
# order untouched.
train = train.sort_values(by=['user_id', 'timestamp'])

# Mean of answered_correctly per user and per task container, broadcast back
# onto every row of the group. The built-in 'mean' aggregation yields the same
# values as a Python lambda without invoking it once per group.
# NOTE(review): these means are computed before the train/test split and
# include each row's own label, so they also see held-out outcomes -- confirm
# that this is intended.
train['u_avg'] = train.groupby('user_id')['answered_correctly'].transform('mean')
train['q_avg'] = train.groupby('task_container_id')['answered_correctly'].transform('mean')

# `col` lists the columns kept in the final frame; `features` lists the model
# inputs ('l_pred' is added to the frames by a later cell).
col = ['user_id','task_container_id','answered_correctly', 'u_avg', 'q_avg']
features = ['l_pred', 'u_avg', 'q_avg']

# Lag features l1..l100: the user's answered_correctly value k rows earlier.
# GroupBy.shift is the vectorised equivalent of apply(lambda x: x.shift(k)).
# Rows without k earlier rows get 0 instead of NaN.
answers_by_user = train.groupby('user_id')['answered_correctly']
for i in range(100):
    c = 'l' + str(i+1)
    train[c] = answers_by_user.shift(i+1).fillna(0)
    col.append(c)
    features.append(c)

train = train[col]

from sklearn import model_selection as cv

# Random 80/20 row split. The fixed seed makes Restart & Run All reproduce
# the same partition and therefore the same scores.
train_data, test_data = cv.train_test_split(train, test_size=0.2, random_state=42)
del train

Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the most recent lag plus the two group means,
# with answered_correctly as the target. The fitted model then scores both
# partitions and the result is stored as 'l_pred'.
base_inputs = ['l1', 'u_avg', 'q_avg']
l = LinearRegression().fit(train_data[base_inputs], train_data.answered_correctly)
for frame in (train_data, test_data):
    frame['l_pred'] = l.predict(frame[base_inputs])

In [None]:
from sklearn import metrics

# ROC AUC of the linear-regression output on the held-out partition.
linear_auc = metrics.roc_auc_score(test_data.answered_correctly, test_data.l_pred)
print(linear_auc)

In [None]:
# Scalar truncation rule: values inside [0, 1] pass through, values below 0
# become 0 and everything else becomes 1.
f = lambda x: x if x >= 0 and x <= 1 else 0 if x < 0 else 1


def vf(x):
    """Clamp an array-like of predictions to [0, 1] and return a float ndarray.

    This replaces ``np.vectorize(f)``. np.vectorize picks its output dtype
    from the result of the first element, and ``f`` returns the *int* 0 or 1
    for out-of-range inputs. An out-of-range first prediction therefore made
    the whole output integer-typed and truncated every other prediction.
    np.clip on a float array has no such dependence on element order.

    NaN inputs stay NaN here, whereas ``f`` maps NaN to 1.
    """
    return np.clip(np.asarray(x, dtype=float), 0.0, 1.0)


test_data['l_pred_trunc'] = vf(test_data.l_pred)
print(metrics.roc_auc_score(test_data.answered_correctly, test_data.l_pred_trunc))

In [None]:
from sklearn.linear_model import Lasso

# Users with at least 500 training rows and task containers with at least
# 2000 training rows; their ids are kept as plain lists.
u_tbl = train_data.groupby(['user_id']).size().reset_index(name='counts')
u_tbl = u_tbl[u_tbl['counts'] >= 500]
q_tbl = train_data.groupby(['task_container_id']).size().reset_index(name='counts')
q_tbl = q_tbl[q_tbl['counts'] >= 2000]
u_list = list(u_tbl.user_id.unique())
q_list = list(q_tbl.task_container_id.unique())

# One Lasso per selected user, fitted on that user's training rows with the
# residual of the linear model (answered_correctly - l_pred) as the target.
# max_iter is written as an int: 10e5 is a float, which recent scikit-learn
# releases reject during parameter validation.
l_list = []
train_rows_by_user = train_data.groupby('user_id')
for u in u_list:
    user_rows = train_rows_by_user.get_group(u)
    l_list.append(
        Lasso(alpha=0.01, max_iter=1_000_000).fit(
            user_rows[features],
            user_rows.answered_correctly - user_rows.l_pred,
        )
    )

# Only models that kept between 2 and 20 non-zero coefficients are used.
# Every other user, and every user without a model, gets a correction of 0.
usable_models = {
    u: model
    for u, model in zip(u_list, l_list)
    if 2 <= np.count_nonzero(model.coef_) <= 20
}


def predict_residuals(frame):
    """Return the per-user Lasso residual prediction for each row of ``frame``.

    Rows are predicted one user at a time in a single batch, instead of one
    row at a time with a linear ``list.index`` search per row. Rows whose
    user has no usable model keep the value 0. The result is a float ndarray
    aligned positionally with ``frame``.
    """
    residuals = np.zeros(len(frame))
    # Maps each user_id present in the frame to the positions of its rows.
    row_positions = frame.groupby('user_id').indices
    for u, model in usable_models.items():
        positions = row_positions.get(u)
        if positions is not None:
            residuals[positions] = model.predict(frame.iloc[positions][features])
    return residuals


# The correction is limited to +/-0.5 before it is added to the linear
# prediction; clip(-0.5, 0.5) is the vectorised form of that rule.
for frame in (train_data, test_data):
    frame['l_err_pred'] = predict_residuals(frame)
    frame['l_pred_multi'] = frame['l_pred'] + frame['l_err_pred'].clip(-0.5, 0.5)

In [None]:
# Held-out ROC AUC of l_pred_multi, first as-is and then passed through vf.
multi_scores = test_data.l_pred_multi
for scores in (multi_scores, vf(multi_scores)):
    print(metrics.roc_auc_score(test_data.answered_correctly, scores))

Similarity of Bias

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Residual left after the per-user correction, averaged for every
# (user, task container) pair seen in the training partition.
train_data['bias'] = train_data.answered_correctly - train_data.l_pred_multi
train_bias = pd.DataFrame(train_data.groupby(['user_id', 'task_container_id'])['bias'].mean()).reset_index()

# Replace raw ids by their position in u_list / q_list (-1 when absent).
# Dictionary lookups avoid a linear list.index search for every row.
u_pos = {u: i for i, u in enumerate(u_list)}
q_pos = {q: j for j, q in enumerate(q_list)}
train_bias['user_id'] = train_bias['user_id'].map(u_pos).fillna(-1).astype(int)
train_bias['task_container_id'] = train_bias['task_container_id'].map(q_pos).fillna(-1).astype(int)

# Dense user x container matrix of mean bias, limited to +/-0.25.
# Pairs that were never observed stay at 0.
bias_matrix = np.zeros((len(u_list), len(q_list)))
known = train_bias[(train_bias['user_id'] != -1) & (train_bias['task_container_id'] != -1)]
bias_matrix[known['user_id'].to_numpy(), known['task_container_id'].to_numpy()] = (
    known['bias'].clip(-0.25, 0.25).to_numpy()
)

# Cosine similarity between containers, computed on their bias columns.
# (An earlier experiment that zeroed similarities below 0.01 in magnitude is
# not applied.)
similarity = cosine_similarity(bias_matrix.T)

# Similarity-weighted average of each user's biases. A container whose bias
# column is all zero has zero similarity to everything, so its weight total
# is 0; dividing would give NaN that then propagates into b_pred and the AUC
# computation. Such containers get a predicted bias of 0 instead.
weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
weighted_bias = bias_matrix.dot(similarity)
bias_pred = np.divide(
    weighted_bias,
    weight_totals,
    out=np.zeros_like(weighted_bias),
    where=weight_totals != 0,
)

# Look up the predicted bias for each test row; rows whose user or container
# is outside u_list / q_list get 0.
test_u = test_data['user_id'].map(u_pos)
test_q = test_data['task_container_id'].map(q_pos)
covered = (test_u.notna() & test_q.notna()).to_numpy()
b_pred = np.zeros(len(test_data))
b_pred[covered] = bias_pred[
    test_u[covered].astype(int).to_numpy(),
    test_q[covered].astype(int).to_numpy(),
]
test_data['b_pred'] = b_pred

In [None]:
# Add the similarity-based bias to l_pred_multi, with the bias limited to
# the range [-0.1, 0.1].
bounded_bias = test_data['b_pred'].clip(-0.1, 0.1)
test_data['l_pred_sim'] = test_data['l_pred_multi'] + bounded_bias

In [None]:
# Held-out ROC AUC of l_pred_sim, first as-is and then passed through vf.
sim_scores = test_data.l_pred_sim
for scores in (sim_scores, vf(sim_scores)):
    print(metrics.roc_auc_score(test_data.answered_correctly, scores))

In [None]:
# Compare l_pred_multi and l_pred_sim (both passed through vf) on only the
# test rows whose b_pred is non-zero.
has_bias = test_data['b_pred'] != 0
subset_labels = test_data[has_bias].answered_correctly
for column in ('l_pred_multi', 'l_pred_sim'):
    print(metrics.roc_auc_score(subset_labels, vf(test_data[column][has_bias])))