### Load necessary modules and data:

In [None]:
# Loading modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import riiideducation
import gc
import tqdm
import time
import os
import lightgbm as lgb

!pip --quiet install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
!pip install --quiet -r ../input/treelite-treelite-runtime-version-093/treelite/requirements.txt --no-index --find-links ../input/treelite-treelite-runtime-version-093/treelite

from bitarray import bitarray
import datatable as dt

tqdm.tqdm.pandas()

%matplotlib inline
# plt.style.use("dark_background")

# Loading the API
env = riiideducation.make_env()

In [None]:
# check the files we have been given
!ls ../input/riiid-test-answer-prediction/

Right now we are going to subsample the dataset instead of reading it all to the memory, let's try to understand what data we have been provided with before training on all the data given.

We need to change the dtypes of certain columns and read a subset of the entire data to be able to fit it all to the memory:

In [None]:
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "boolean",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"
}

data = pd.read_csv("../input/riiid-test-answer-prediction/train.csv", dtype=dtypes, nrows=int(1e6))

ques = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
lectures = pd.read_csv("../input/riiid-test-answer-prediction/lectures.csv")
ex_sub = pd.read_csv('../input/riiid-test-answer-prediction/example_sample_submission.csv')
ex_test = pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')

data.shape, ques.shape, lectures.shape, ex_sub.shape, ex_test.shape

In [None]:
# returns total bytes consumed
temp = data.memory_usage(deep=True).sum()

# bytes to MB
print (f"{temp / (2**20):.2f}", 'MB')

### Cutting out EDA to include only necessary parts:

In [None]:
# combining questions and lectures together; lectures reuse the
# `question_id` column name so both kinds of content share one key column
ql = pd.concat([ques, lectures.rename({"lecture_id": "question_id"}, axis=1)], axis=0).reset_index(drop=True)

# overlap the tags from both columns
ql.tags = ql.tags.fillna(ql.tag)

# rows without a `type_of` are labelled with the custom type "question"
ql.type_of = ql.type_of.fillna("question")

# for distinguishing between lectures and questions (True -> lecture)
ql["content_type_id"] = ql["type_of"] != 'question'

# bundle id and correct ans & tags is filled with -1
# tag is missing for 1 row -> 10033
ql = ql.fillna(-1)

# drop the unneeded tag feature; the keyword form is used because the
# positional `axis` argument of DataFrame.drop no longer exists in pandas 2.x
ql = ql.drop(columns="tag")

# rename the column for easy merge
ql = ql.rename({"question_id": "content_id"}, axis=1)

# convert all the tags to a list of ints: strings are split on whitespace,
# anything else (a single number, or the -1 filler) becomes a one-element list
ql.tags = ql.tags.apply(lambda x: list(map(int, x.split())) if isinstance(x, str) else [int(x)])

# some might find reading section easier and listening tougher or vice versa
ql['listening_lvl'] = ql.part.map({1: 2, 2: 2, 3: 1, 4: 1, 5: 0, 6: 0, 7: 0})

# mulitple subquestions per question?
ql['multiple_q'] = ql.part.isin([3, 4, 6, 7])

# Number of tag counts
ql['tag_c'] = ql.tags.apply(len)

In [None]:
# we create a feature that counts the number of lectures available per tag:
# first tally how many lectures exist for each (first) lecture tag ...
lec_count = ql.loc[ql.content_type_id, 'tags'].transform(lambda tag_list: tag_list[0]).value_counts()

# ... then, for every question, add up the tallies of all of its tags
# (tags that no lecture covers contribute 0)
ql['lec_available'] = ql.loc[~ql.content_type_id, 'tags'].transform(
    lambda tag_list: sum(lec_count.get(tag, 0) for tag in tag_list)
)

In [None]:
# merge just the essentials, a whole merge is on the way
data = data.merge(ql[['content_id', 'part', 'bundle_id', 'content_type_id']], on=['content_id', 'content_type_id'])

f, ax = plt.subplots(ncols=3, figsize=(18, 5))
ql.groupby("part")['tags'].apply(lambda x: x.explode().nunique()).plot(
    kind='bar', rot=0, ax=ax[0], title='Unique Tags Per Part')

ql[~ql.content_type_id].groupby("part")['content_id'].nunique().plot(
    kind='bar', rot=0, ax=ax[1], title='Unique Questions Per Part')

ql[ql.content_type_id].groupby("part")['content_id'].nunique().plot(
    kind='bar', rot=0, ax=ax[2], title='Unique Lectures Per Part')

f.suptitle("Part Wise Count");

In [None]:
ql['bundle_q_count'] = ql.groupby("bundle_id")['content_id'].transform('count')
ql.loc[ql.content_type_id, 'bundle_q_count'] = -1

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

te = TransactionEncoder()

# questions only; lectures get the sentinel cluster -1 further down
temp = ql[~ql.content_type_id]

# attach how often each question was answered and its mean correctness
temp = temp.merge(
    data[~data.content_type_id].groupby("content_id")['answered_correctly'].agg(['count', 'mean']),
    on='content_id', how='left')

# questions absent from the loaded sample get a neutral mean and a zero count
temp['mean'] = temp['mean'].fillna(0.5)
temp['count'] = temp['count'].fillna(0)

# feature matrix: one-hot encoded tags + dummies for part / tag count + count / mean
temp = np.hstack([
    te.fit_transform(temp.tags.to_list()).astype(int), 
    pd.get_dummies(temp[['tag_c', 'part', 'count', 'mean']], columns=['part', 'tag_c'])
])

# fixed random_state so the projection (and therefore the clusters) is
# reproducible across kernel restarts
temp = PCA(n_components=150, random_state=0).fit_transform(StandardScaler().fit_transform(temp))

# assign clusters to the questions; seeded so cluster ids do not change between runs
ql.loc[~ql.content_type_id, 'cluster'] = KMeans(n_clusters=30, random_state=0).fit_predict(temp)
ql.loc[ql.content_type_id, 'cluster'] = -1

# convert to int
ql.cluster = ql.cluster.astype(int)

ql.sample(5)

Another important feature we could create is the tags themselves. We see in this [notebook](https://www.kaggle.com/jsylas/riiid-lgbm-starter) that tag seems to have a higher feature importance. Let's create this feature:

In [None]:
# expand every tag list into its own row of columns (0, 1, 2, ...);
# positions a shorter list does not have come out as NaN
temp = ql.tags.progress_apply(pd.Series)

# we retain just the first three tags (tagF, tagS, tagT), drop the rest
ql['tagF'] = temp[0]
ql['tagS'] = temp[1]
ql['tagT'] = temp[2]

# last tag is imp since we observed it determining the bundle_id
ql['tagL'] = ql.tags.apply(lambda x: x[-1])

# although there will be overlap between tagL and tagF, tagS we
# donot remove it. As this may help the model understand
# if the number of tags there is 1, 2 or etc
# we also impute the missing values as 0 after incrementing 
# all the tags by 1 (so 0 is free to mean "no tag at this position")
ql[['tagF', 'tagS', 'tagL', 'tagT']] = (ql[['tagF', 'tagS', 'tagL', 'tagT']] + 1).fillna(0)

ql.sample(5)

We will use this cluster information for making group wise predictions for each user. Before we proceed further, let's merge `ql` with our dataframe:

In [None]:
# drop the columns merged earlier so the full `ql` merge does not create
# duplicated `part` / `bundle_id` columns; `columns=` is used because the
# positional `axis` argument of DataFrame.drop no longer exists in pandas 2.x
data = data.drop(columns=['part', 'bundle_id']).merge(ql, on=['content_id', 'content_type_id'], how='left')
data.shape

In [None]:
# timetaken by user since her last interaction
data = data.sort_values(by=['user_id', 'timestamp'])

# per user: difference to the previous timestamp; a zero difference is treated
# as missing and inherits the previous non-zero gap; the remaining leading
# gaps become 0.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data['response_time'] = (
    data.groupby("user_id")['timestamp']
    .transform(lambda x: x.diff().replace(0, np.nan).ffill().fillna(0))
)

There could be a still easier approach to obtaining the response time:

$timestamp - (timestamp * \frac{task\_container\_id - 1}{task\_container\_id})$

The advantage with this approach is that it doesn't require shifting or merging operations. Let's verify their efficiency:

In [None]:
data['res_time_avg'] = (
    data.timestamp - 
    (data.timestamp * (data.task_container_id - 1) / data.task_container_id)
)

# a task_container_id of 0 divides by zero above; treat those rows as missing
data['res_time_avg'] = data['res_time_avg'].replace(np.inf, np.nan)

# Correlate only the three columns we report, and only once. Calling
# data.corr() on the whole frame also visits the list / string columns merged
# in from `ql`, which is wasteful and raises on pandas versions that no longer
# silently drop non-numeric columns.
rt_corr = data[['response_time', 'res_time_avg', 'answered_correctly']].corr()

print ("Corelation to response_time:\n", 
       rt_corr['response_time'].loc[['res_time_avg']], sep='')

print ("\nCorrelation to ans_correctness:\n", 
       rt_corr['answered_correctly'].loc[['response_time', 'res_time_avg']], sep='')

(data[['timestamp', 'response_time', 'res_time_avg']]
 .sample(10).fillna(0).astype(int))

`Prior_question_had_explanation` needs to be mapped to previous bundle and it might serve as a powerful feature. We create features:
1. `pqet_shifted`
2. `pqhe_shifted`

In [None]:
# sorting to ensure groupby works as intended
data = data.sort_values(['user_id', 'timestamp'])

# For question rows: average the two prior_* columns per (user, task container),
# then shift by -1 within each user so that every container receives the values
# recorded on that user's NEXT container. The result is merged back onto all
# rows of the container as `pqet_shifted` / `pqhe_shifted`.
data = data.merge(
    (data[~data.content_type_id].groupby(['user_id', 'task_container_id'])
     [['prior_question_elapsed_time', 'prior_question_had_explanation']]
     .mean().groupby("user_id").shift(-1).reset_index()
     .rename({"prior_question_elapsed_time": 'pqet_shifted', 
              'prior_question_had_explanation': "pqhe_shifted"}, axis=1)),
    on=['user_id', 'task_container_id'], how='left')

In [None]:
# rows must be ordered by user and time before taking per-user differences
data = data.sort_values(by=['user_id', 'timestamp'])

cut_off = 1000 * 60 * 60  # one hour (1000 * 60 * 60)
cut_off *= 1  # number of hours of inactivity that starts a new session, tweak it!

# a row opens a new session when the gap to the user's previous row exceeds
# cut_off; the running count of such rows per user is the session number
data['sessions'] = (
    data.groupby("user_id")['timestamp'].diff().gt(cut_off)
    .groupby(data['user_id']).cumsum()
)

In [None]:
data['sess_event_count'] = data.groupby(["user_id", 'sessions']).cumcount()
data[['sessions', 'sess_event_count', 'answered_correctly']].corr().style.background_gradient()

In [None]:
temp = data.loc[data.user_id == np.random.choice(data[data.content_type_id]['user_id'].unique()), 
         ['timestamp', 'task_container_id', 'content_id', 
          'tags', 'type_of', 'answered_correctly']]

mask = list()
step = 10
for i in temp[temp.answered_correctly == -1].index:
    
    tag = temp.loc[i, 'tags'][0]
    for j in np.arange(i - step, i + step):
        if j in temp.index and tag in temp.loc[j, 'tags']:
            mask.append(j)
    
mask = np.unique(mask)

def highlight_row(x, mask):
    """Build the CSS frame used by ``Styler.apply(..., axis=None)``.

    Parameters
    ----------
    x : DataFrame with an ``answered_correctly`` column.
    mask : collection of index labels that should be highlighted.

    Returns
    -------
    DataFrame of CSS strings with the same index and columns as ``x``:
    rows outside ``mask`` get the empty style, rows inside ``mask`` are
    green when ``answered_correctly == -1`` and red otherwise.
    """
    in_mask = x.index.isin(mask)
    minus_one = (x.answered_correctly == -1).to_numpy()

    # Build a separate all-string frame instead of writing style strings into
    # a copy of the data: that relied on numeric columns being silently upcast
    # to object, which recent pandas versions warn about.
    styles = pd.DataFrame('background-color: ""', index=x.index, columns=x.columns)
    styles.loc[in_mask & ~minus_one, :] = 'background-color: red'
    styles.loc[in_mask & minus_one, :] = 'background-color: green'
    return styles

print (temp.loc[temp['answered_correctly'] == -1, 'task_container_id'].values)

filter_mask = np.unique([[j for j in range(i-step, i+step)] for i in temp[temp.answered_correctly == -1].index])
filter_mask = np.intersect1d(filter_mask, temp.index)

temp.loc[filter_mask].style.apply(lambda x: highlight_row(x, mask), axis=None)

What we observe:
- For most cases, the tags of lectures repeat afterwards or before like we assumed it would. 
- The tags to the *left* however are the tags that match with the lectures, meaning that they might be more important.
- A student after seeing a lecture answers corresponding questions better.
- For those cases the lecture tags match the preceding questions, these were the same questions the user answered incorrectly.
- There were no *tests* as in formal examinations.
- *We need some way to encode if a person has seen lecture of that particular tag.*

*More EDA at a later time*

#### We now try to understand `ex_sub` & `ex_test` csv files:

These files are provided as samples of what `env.iter_test` yields. Each call gives us only a small batch. We need to make predictions on it with our models and submit them with `env.predict` before we can request the next batch. *This is done to mimic real-life scenarios where future data is not available for model training.*

From data description:

- `prior_group_responses` (string) provides all of the user_answer entries for previous group in a string representation of a list in the **first row** of the group. **All other rows in each group are null**. If you are using Python, you will likely want to call eval on the non-null rows. Some rows may be null, or empty lists.

- `prior_group_answers_correct` (string) provides all the answered_correctly field for previous group, with the *same format and caveats as prior_group_responses*. Some rows may be null, or empty lists.

A more thorough understanding can be obtained from this post [here](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/190430) by Alex:

     Once you submitted your predictions for a group, you get the next test_df off the iterator and that immediately tells you whether you were right or not. You can use this information to improve your model before continuing with going through the test set, or you can just ignore it.
    As you can't submit predictions for the same group twice, you can't cheat with it. It's just meant to be used for improving your prediction algorithm as you get more information, as is typical for realtime applications.

In [None]:
# example input from the API looks like:
ex_test.head()

- Only the first row of each group would contain the answers and scores. The rest of the rows are all null.
- We are *NOT* provided with the `user_answer` during the predictions we are to make. We are only provided that information at the next batch along with whether the user's predictions were indeed correct. If this had not been the case, we could simply compare with the `questions.csv` and be able to perfectly predict if the users were correct ;)
- During the prediction time we only have features such as the timestamp, question meta data and info regarding prior groups response.

In [None]:
# example prediction to the API must look like:
ex_sub.head()

- For submission, we only pass in row_id, predictions. Group_num although present here is not required for submission. (check 'Making Our Predictions' part)

Some more insights about the time series API testframes:
- All shapes for each batch aren't identical, each batch may have differing no of samples
- From this discussion [here](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/190698): Every test_iter will only have one group number -> The first row is a list in str format (could be an empty list too).

Also from [this](https://www.kaggle.com/dwit392/riiid-challenge-time-since-last-action-for-test) notebook it is said that time elapsed between the last interaction and current one is a good predictor of the answer correctness which would make sense since that time could be used by a student to prepare before taking up the next test. However with data scattered around and us loading only a tiny fraction of the actual dataset, creating this feature would prove really difficult (for a later time).

Let's now write a function that when given testframe_1 and testframe_2 returns testframe_1 with `user_answer` and `answered_correctly` columns merged to it:

In [None]:
def post_process(fn0, fn1):
    '''
    Attach the outcomes of test batch `fn0` that are revealed by the next batch.

    fn0 is test dataframe at time t
    fn1 is test dataframe at time t + 1

    The first row of `fn1` carries, as strings, the lists of
    `answered_correctly` and `user_answer` values for the rows of `fn0`.

    If however no fn1 is available, any dataframe whose first row is NaN in
    both `prior_group_*` columns will suffice: `user_answer` and
    `answered_correctly` are then filled with NaN.

    Returns a new dataframe (fn0 is not modified) without the two
    `prior_group_*` columns and with `answered_correctly` / `user_answer` added.
    '''

    def _parse_prior(cell):
        # a non-string cell (NaN / None) means nothing was reported
        if not isinstance(cell, str):
            return np.nan
        # NOTE(review): eval executes arbitrary code; the text comes from the
        # competition API here, but ast.literal_eval would be the safer parser.
        return eval(cell)

    # keyword form: the positional `axis` argument of drop no longer exists in pandas 2.x
    fn_processed = fn0.drop(columns=['prior_group_answers_correct', 'prior_group_responses'])

    fn_processed['answered_correctly'] = _parse_prior(fn1['prior_group_answers_correct'].iloc[0])
    fn_processed['user_answer'] = _parse_prior(fn1['prior_group_responses'].iloc[0])

    return fn_processed

#### Let's now load the data from the API for the purpose of understanding it, we will disable it when we wish to make a submission:

We have saved these batches as pickle to bench mark how fast our end pipeline is. We can also use this to check if our `post_process` function works good as intended.

Code to save sample_batches:

```
iter_test = env.iter_test()

batches = []
while True:
    try:
        batches.append(next(iter_test))
        env.predict(ex_sub)
    except StopIteration:
        break
```

In [None]:
import pickle

with open("../input/riiid-final-model-inputs/sample-batches.pkl", 'rb') as f:
    batches = pickle.load(f)

print ("Batch sizes for each test sample:", list(map(lambda x: x[0].shape[0], batches)))

In [None]:
# working good? Check: prior_group_ans correct correctly mapped
post_process(batches[0][0], batches[1][0]).sample(5)

Post_processed frame could then be used to help our model to learn about a students behaviour better in real time during submission. And since we are able to learn from the test data as well, it's best to create an *online / incremental model* for this competition.


#### Evaluation metric:

Let's understand the evaluation metric - `roc_auc_score`. Random (or constant) predictions yield a score of about 0.5. If our score is below 0.5, something is wrong with our predictions (for example, the labels are inverted): the model carries signal, but in the wrong direction. A score of 1 means the model ranks every correct answer above every incorrect one; a score of 0 means the ranking is perfectly inverted.

Let's verify this with some random guesses:

In [None]:
# Predicting constant values for the entire train dataset.
from sklearn.metrics import roc_auc_score
temp = data.loc[~data.content_type_id, "answered_correctly"]
for value in [0, 1, 0.5, temp.mean()]:
    # score the constant `value` itself; a float array is built explicitly
    # because full_like on the int8 target would truncate 0.5 and the mean to 0
    print ("At {:.2f} the score is: {:.3f}".format(
        value, roc_auc_score(temp, np.full(len(temp), value, dtype=float))))

A highly robust metric indeed. So we have to be a bit more smart in making predictions to beat this. What about random predictions *per question*?

In [None]:
roc_auc_score(temp, np.random.rand(len(temp)))

Does marginally (very marginally) better than previous predictions. Our next naive idea is to use per-user mean accuracy as predictions, but before we do that we need to do *something else*.

Let's now write a function to split the data to train/val as reliably as possible to mimic the test case scenario. Further it should also function as a CV generator, given some train ids:

In [None]:
def generate_train_val(train, train_users=None, tp=.70, vp=None, put=None, dyn_p=False, return_mask=False): 
    '''
    Split interaction rows into train / validation, mimicking the test setup.

    What we already know:
    - Test dataset has new users but no new questions. 
    - Test follows chronologically after train

    * Parameters *  
    train       -> DataFrame with at least `user_id` and `timestamp` columns
    train_users -> The user ids completely used for training
                   (If None, sampled at random using tp)
    tp          -> train users percentage (completely used for training)
    vp          -> fraction of the remaining users that is used completely
                   for validation; the rest become partial users.
                   If None, randomly chosen from 0.20 ... 0.80
    put         -> per-user timestamp quantile for partial users: rows whose
                   timestamp is below it are train, the rest validation.
                   If None the value is the same as `vp`, or randomly
                   chosen for each user when dyn_p is set
    dyn_p       -> Each user may have a different threshold at which they were
                   split. If dyn_p is set to true (and put is None), it is
                   drawn separately for each user.
    return_mask -> If set to true, returns the boolean mask instead of
                   returning a modified copy of the dataframe

    * Output *
    Either the mask (True -> train row), or a copy of `train` with the mask
    added as the `train` column.

    * Note *
    1. Partial users are those users whose data is used for training and validation
    2. The share of users contributing training rows is at least tp
    3. Row share used for training is roughly tp + (1 - tp) * (1 - vp) * put
       if all users were equally active (approximation, not enforced)
    '''   

    def threshold_user(arr, put=None):
        if put is None: # each user may have a distinct threshold
            put = np.random.choice(np.linspace(.20, .80, 13))
        return arr < np.quantile(arr, put)

    total_users = train.user_id.unique()

    if train_users is None:
        train_users = np.random.choice(total_users, int(len(total_users) * tp), replace=False)

    if vp is None: # validation percent 
        vp = np.random.choice(np.linspace(.20, .80, 13))

    # threshold for timestamp cutoff: with dyn_p it stays None so that
    # threshold_user draws one per user
    if put is None and not dyn_p:
        put = vp

    remaining_users = np.setdiff1d(total_users, train_users)
    val_users = np.random.choice(remaining_users, int(len(remaining_users) * vp), replace=False)
    partial_users = np.setdiff1d(remaining_users, val_users)

    # generating the train mask: early rows of partial users (rows of other
    # users are filled with False by the reindex) OR any row of a train user
    mask = (
        train[train.user_id.isin(partial_users)]
        .groupby("user_id")['timestamp']
        .transform(lambda x: threshold_user(x.values, put))
        .reindex(train.index, fill_value=False) | 

        train.user_id.isin(train_users))

    # we will tinkering with it, best to copy it 
    # beware of passing in large sized DataFrames
    if return_mask:
        return mask
    else:
        Train = train.copy()
        Train['train'] = mask
        return Train

How does the above function work? Lets see the split via a pie chart:

In [None]:
temp = generate_train_val(data, return_mask=True)
(temp.astype(int).value_counts()
 .plot(kind='pie', autopct=lambda x: f"{int(x)}%", 
       title='Train/Val Split',
       colors=['r', 'g'], explode=[.1, .15],
       labels=["Train", "Validation"]));

We are going to be using a feature to mark whether a user has already seen the question they are currently solving. The most memory-efficient way to do that would be to use a *bitarray*. Let's acquaint ourselves a bit with this module:

In [None]:
from bitarray import bitarray
ba = bitarray(15000, endian='little')

# initialize this way
ba.setall(False)
repeat_c = 0

# the question rows of one randomly picked user
temp = data[~data.content_type_id].loc[data.user_id == np.random.choice(data.user_id.unique()), ['content_id']]

# Series.items() replaces Series.iteritems(), which no longer exists in pandas 2.x
for _, c in temp['content_id'].items():
    if ba[c]:
        print (f"{c:<5} was already viewed by the user!")
        repeat_c += 1
    else:
        # first time this content id is seen: set its bit
        ba[c] = 1
    
if repeat_c == 0:
    print ("No repeated questions for this user")
else:
    print (f"\n{repeat_c} question(s) were repeated")
    
# simple sanity check. This way we can perform multiple indexing
np.array(ba.tolist())[temp.content_id.values].all() # must equal true

### Data Modelling (Prototype Version):

##### Let's create a few numba helper functions! Its from here that we start defining useful numba functions. These functions would be useful when we generate the train data on the entire 100M training data.

As we define the numba functions, we compare them with their pandas equivalent to ensure that they are bug free. Numba functions have two limitations over pandas functions:
1. Since they parallelize data code, we may run into OOM sometimes due to sudden increase in computations performed.
2. Unlike pandas, a single nan value encountered by numba would be propagated until the end of the array. 

We need to manually ensure that these two don't happen.

In [None]:
import numba as nb

@nb.njit(parallel=True)
def expanding_mean(arr):
    """Running mean of `arr`: element i is the mean of arr[0..i]."""
    running_total = np.cumsum(arr)
    # 1-based position of each element == number of values seen so far
    seen = np.arange(arr.size) + 1
    return running_total / seen

# for compilation purposes
expanding_mean(np.array([1, 1, 0]))

temp = np.random.randint(0, 2, int(1e5))

%timeit -n100 expanding_mean(temp)
%timeit -n100 np.cumsum(temp) / (np.arange(int(1e5)) + 1)
%timeit -n100 pd.Series(temp).expanding(1).mean()

((expanding_mean(temp) == pd.Series(temp).expanding(1).mean()).all() and
 (np.cumsum(temp) / (np.arange(int(1e5)) + 1) == pd.Series(temp).expanding(1).mean()).all())

In [None]:
@nb.njit
def shifted_expanding_mean(arr, keep_last=False):
    """Expanding mean of `arr` moved one position to the right (NaN first).

    With keep_last=False the output has the same length as `arr` (the final
    running mean is dropped). With keep_last=True nothing is dropped, so the
    output is one element longer than `arr`.
    keep_last is set to true when we apply this func to pqet_mean. SPECIAL USE CASE!
    """
    running = expanding_mean(arr)
    head = np.array([np.nan])
    if keep_last:
        return np.concatenate((head, running))
    return np.concatenate((head, running[:-1]))

# for compilation purposes
shifted_expanding_mean(temp)

%timeit -n 20 shifted_expanding_mean(temp)
%timeit -n 20 pd.Series(temp).expanding(1).mean().shift()

# throws error if they aren't equal
np.testing.assert_allclose(pd.Series(temp).expanding(1).mean().shift(), shifted_expanding_mean(temp))

Code to calculate response time:

Diff operation followed by replacing 0 with NaN and forward-filling the latest value into the NaN positions. This is required because a plain diff would be erroneous for questions sharing the same bundle_id.

In [None]:
@nb.njit
def rt_func(arr, pred=False):
    """Gap to the previous element of `arr`, with zero gaps forward-filled.

    A zero gap is treated as missing and inherits the latest non-zero gap
    before it. Gaps that still cannot be filled (the leading ones) become 0,
    or the corresponding raw value of `arr` when pred=True.
    """
    # first difference, with NaN standing in for the non-existent first gap
    gaps = np.concatenate((np.array([np.nan]), arr[1:] - arr[:-1]))
    gaps = np.where(gaps == 0, np.nan, gaps)

    missing = np.isnan(gaps)
    # own position for every valid gap, 0 for every missing one ...
    src = np.arange(len(missing))
    src = np.where(missing, 0, src)

    # ... turned into "position of the latest valid gap so far" by a running max
    latest = src[0]
    for pos, val in enumerate(src):
        if val > latest:
            latest = val
        src[pos] = latest

    gaps[missing] = gaps[src[missing]]

    # whatever is still NaN had no valid gap before it
    if pred:
        gaps = np.where(np.isnan(gaps), arr, gaps)
    else:
        gaps = np.where(np.isnan(gaps), 0, gaps)

    return gaps

In [None]:
%time data.groupby("user_id")['timestamp'].transform(lambda x: rt_func(x.values)) 
%time data.groupby("user_id")['timestamp'].transform(lambda x: x.diff().replace(0, np.nan).fillna(method='ffill').fillna(0))

(
    data.groupby("user_id")['timestamp'].transform(lambda x: rt_func(x.values)) 
    == data['response_time']
).all()

Function to alter answered_correctly according to the question's difficulty: 

We weigh the user's responses based on the actual question difficulty. A user is given negative marks if the question was answered wrong and positive marks when they get it right. Additionally, a tough question answered correctly yields a better score than an easy question answered correctly, while an easy question answered wrongly carries a higher penalty than a difficult question answered wrongly.

In [None]:
@nb.njit
def modify_ac(arr, c_mean):
    """Difficulty-weighted score for each answer.

    arr    -> answered_correctly values (0 means wrong)
    c_mean -> mean correctness of the content, same length as `arr`

    A wrong answer scores -c_mean (larger penalty on easier content), any
    other answer scores 1 - c_mean (larger reward on harder content). A
    negative c_mean on a non-wrong row is passed through unchanged.

    The explicit `wrong` test matters when c_mean is 0: the signed value is
    then -0.0, which is not < 0, so relying on the sign alone would hand a
    wrong answer the maximum reward of 1 instead of no penalty.
    """
    wrong = arr == 0
    signed = np.where(wrong, -1, 1) * c_mean
    return np.where(wrong | (signed < 0), signed, 1 - signed)

In [None]:
%%time 

## Give more weightage to user ans acc to question difficulty?
# penalize easy wrong questions more than tough wrong questions
# reward tough right questions more than easy right questions
# (second argument: mean of answered_correctly per content, broadcast to every row)
data['ac_modified'] = modify_ac(
    data['answered_correctly'].values, 
    data.groupby(["content_id", 'content_type_id'])['answered_correctly'].transform('mean').values)

# set ac_modified as 0 for lectures (rows where content_type_id is True)
data.loc[data.content_type_id, 'ac_modified'] = 0

Let's compute the part wise expanding mean for users. First we create a numba function that can be reused later on.

In [None]:
@nb.njit
def fillnshift(arr):
    
    '''Function to fwdfill and shift by 1 position. To replace pandas functionality.

    NaNs are replaced by the latest non-NaN value before them (leading NaNs
    stay NaN), then everything moves one position to the right with NaN in
    front. The input array is left untouched.
    '''
    
    # fill logic, done on a copy: filling `arr` itself would silently modify
    # the caller's data and make any later comparison against it meaningless
    filled = arr.copy()
    
    mask = np.isnan(filled)
    idx = np.arange(len(mask))
    idx = np.where(mask, 0, idx)
    
    # running max turns idx into "position of the latest non-NaN value so far"
    rmax = idx[0]
    
    for i, val in enumerate(idx):
        if val > rmax: 
            rmax = val
        idx[i] = rmax
    
    filled[mask] = filled[idx[mask]] 
    
    # shift logic
    
    return np.concatenate((np.array([np.nan]), filled[:-1]))

Check if the above function works as intended:

In [None]:
temp = data.loc[data.user_id == np.random.choice(data.user_id.unique()), 'answered_correctly']

# for compilaton purposes
fillnshift(temp.values)

# pd function timing
%timeit -n 20 temp.fillna(method='ffill').shift()

# numba timing
%timeit -n 20 fillnshift(temp.values)

# if all goes fine no error is raised
np.testing.assert_allclose(fillnshift(temp.values), temp.fillna(method='ffill').shift())

We find ourselves calculating moving averages (MA) pretty frequently, and we also shift the values again after a groupby. This could be replaced with a numba function as follows:

In [None]:
@nb.njit
def moving_average(arr, n=10, shift=True):
    """Mean over the last `n` positions of `arr`, ignoring NaNs inside the window.

    Positions where `arr` itself is NaN stay NaN. With shift=True the result
    is moved one position to the right (NaN first, last value dropped).
    """
    missing = np.isnan(arr)
    present = ~missing

    # windowed sum via differences of the cumulative sum, NaNs counted as 0
    window_sum = np.cumsum(np.where(missing, 0, arr))
    window_sum[n:] = window_sum[n:] - window_sum[:-n]

    # number of non-NaN values inside each window, same trick
    window_cnt = np.cumsum(present)
    window_cnt[n:] = window_cnt[n:] - window_cnt[:-n]

    window_sum[present] /= window_cnt[present]
    window_sum[missing] = np.nan

    if not shift:
        return window_sum

    # perform shifting
    return np.concatenate((np.array([np.nan]), window_sum[:-1]))

In [None]:
temp = data.loc[data.user_id == np.random.choice(data.user_id.unique()), 'answered_correctly']

print ("Simple Moving Average:")

# numba timing
%timeit -n 20 moving_average(temp.values.astype('float'), shift=False)

# pd Series timing
%timeit -n 20 pd.Series(temp).rolling(10, min_periods=1).mean()

# throws error if something is off
np.testing.assert_allclose(
    moving_average(temp.values.astype('float'), shift=False), 
    pd.Series(temp).rolling(10, min_periods=1).mean())

print ("\nNow Moving Average followed by one position shift:")

# numba timing
%timeit -n 20 moving_average(temp.values.astype('float'))

# pd Series timing
%timeit -n 20 pd.Series(temp).rolling(10, min_periods=1).mean().shift()

# throws error if something is off
np.testing.assert_allclose(
    moving_average(temp.values.astype('float')), 
    pd.Series(temp).rolling(10, min_periods=1).mean().shift())

Numba function to calculate shifted expanding cumsum:

In [None]:
@nb.njit()
def shifted_expanding_sum(arr):
    """Cumulative sum of `arr` moved one position to the right (NaN first)."""
    totals = np.cumsum(arr)
    lead = np.array([np.nan])
    return np.concatenate((lead, totals[:-1]))

temp = pd.Series(np.random.choice(2, size=100))

np.testing.assert_allclose(shifted_expanding_sum(temp.values), temp.expanding().sum().shift())

%timeit -n 10 shifted_expanding_sum(temp.values)
%timeit -n 10 temp.cumsum().shift()

Let's create these features now:

In [None]:
# per (user, part) running mean of ac_modified over question rows only; the
# shifted variant puts NaN first, so a row never includes its own answer
data['up_mean'] = data[~data.content_type_id].groupby(['user_id', 'part'])['ac_modified'].transform(
    lambda x: shifted_expanding_mean(x.values))

# per part count relative to all count
# (both are cumcounts, i.e. numbers of EARLIER question rows; 0 / 0 gives NaN)
data['up_count'] = (data[~data.content_type_id].groupby(['user_id', 'part']).cumcount() 
                    / data[~data.content_type_id].groupby("user_id").cumcount())

# up_mean can't be zero since we ac_modified
# (presumably: 0 is a meaningful ac_modified value, so NaN is kept - confirm)
# up_count can be filled with 0 though
data['up_count'] = data['up_count'].fillna(0)

Using modified shifted_expanding_mean to compute expanding_mean for pqet:

In [None]:
temp = np.random.randint(0, 2, size=int(1e5)).astype('float')

# injecting nan like it would usually be in pqet
# however there are few users with first question as a bundle
# these first entries didn't have np.nan for pqet
temp[0] = np.nan 
temp = pd.Series(temp)

np.testing.assert_allclose(
    temp.expanding().mean(), 
    shifted_expanding_mean(temp.values[1:], keep_last=True)
)

In [None]:
# number of rows in the loaded data for each (content_id, content_type_id)
data['content_c'] = data.groupby(["content_id", 'content_type_id'])['row_id'].transform("count")

# per user, over question rows: running (unshifted) mean of
# prior_question_had_explanation, with missing values counted as False
data['seen_ratio'] = (data.loc[~data.content_type_id, ['user_id', 'prior_question_had_explanation']]
                      .fillna(False).astype(float).groupby("user_id") # convert to float for using nb func
                      .transform(lambda x: expanding_mean(x.values)))

# per user: the first value is skipped and keep_last=True restores the length,
# giving NaN followed by the running mean of all values after the first
data['pqet_mean'] = (
    data[(~data.content_type_id)].groupby("user_id")['prior_question_elapsed_time']
     .transform(lambda x: shifted_expanding_mean(x.values[1:], keep_last=True))
)

# True from the second time a user meets the same question; lecture rows are
# not assigned here and are filled with False below
data.loc[~data.content_type_id, 'repeat_c'] = (
    data[~data.content_type_id].groupby(["user_id", 'content_id'])
    .cumcount().astype(bool))
data['repeat_c'] = data['repeat_c'].fillna(False)

# density of user interactions
# NOTE(review): 60000 ms is one minute, so despite the name this looks like a
# per-minute rate; a timestamp of 0 also divides by zero - confirm intent
data['uq_per_hr'] = data.groupby("user_id").cumcount() / (data['timestamp'] / 60000)

# when the task_container_id occurs might be good feature
# (median task_container_id at which each question appears)
data['tmed'] = data[~data.content_type_id].groupby("content_id")[['task_container_id']].transform('median')

# create features counting the number of incorrect responses
# (per user, counting only answers given before the current row)
data['uwrong_sum'] = (
    data[~data.content_type_id]['answered_correctly']
    .map({0: 1, 1: 0}).groupby(data['user_id'])
    .transform(lambda x: shifted_expanding_sum(x.values))
)

In [None]:
# True on lecture rows and on up to 10 rows of the same user that follow a
# lecture (forward fill with limit=10); False everywhere else
data['lec_recent'] = (
    data.loc[data.content_type_id, 'content_type_id']
    .reindex(data.index).groupby(data['user_id'])
    .fillna(method='ffill', limit=10)
    .fillna(False).astype(bool)
)

# bundle_id of each user's first row (in the current row order)
data['uf_bundle'] = data.groupby("user_id")['bundle_id'].transform('first')

# unshifted moving average (window 10) of prior_question_elapsed_time per user,
# question rows only
data['pqetmr_10'] = (data[~data.content_type_id].groupby("user_id")['prior_question_elapsed_time'].transform(
        lambda x: moving_average(x.values.astype('float32'), shift=False, n=10)))

# per question: sum of pqhe_shifted over all of its rows; rows that get no
# value (lectures) are set to 0
data['che_sum'] = data[~data.content_type_id].groupby("content_id")['pqhe_shifted'].transform('sum')
data['che_sum'] = data['che_sum'].fillna(0).astype('int')

In [None]:
# flag rows where pqhe_shifted is set and the answer was wrong, then turn the
# flag into a per-user count of such rows BEFORE the current one (question rows only)
data['seen_exp_when_wrong'] = (data['pqhe_shifted'].fillna(False) & (data['answered_correctly'] == 0)).astype(int)
data['seen_exp_when_wrong'] = (data[~data.content_type_id].groupby('user_id')['seen_exp_when_wrong']
 .transform(lambda x: shifted_expanding_sum(x.values)))

# same construction for correctly answered rows
data['seen_exp_when_right'] = (data['pqhe_shifted'].fillna(False) & (data['answered_correctly'] == 1)).astype(int)
data['seen_exp_when_right'] = (data[~data.content_type_id].groupby('user_id')['seen_exp_when_right']
 .transform(lambda x: shifted_expanding_sum(x.values)))

# NOTE(review): this overwrites the `sessions` / `sess_event_count` columns
# computed earlier with a one-hour cut-off; here a new session starts after a
# response_time above 15 minutes (15*60*1000 ms)
data['sessions'] = (data['response_time'] > (15*60*1000)).groupby(data['user_id']).cumsum()
data['sess_event_count'] = data.groupby(['user_id', 'sessions']).cumcount()

# gap to the user's previous row within the same part (rt_func semantics)
data['up_recency'] = data.groupby(['user_id', 'part'])['timestamp'].transform(lambda x: rt_func(x.values))
# time elapsed since the user's row 10 (resp. 5) positions earlier; NaN when
# there is no such row
data['ts_recency_10'] = data['timestamp'] - data.groupby("user_id")['timestamp'].shift(10)
data['ts_recency_5'] = data['timestamp'] - data.groupby("user_id")['timestamp'].shift(5)

#### Training on a small subset of data (1M) to test how our features work so far:

In [None]:
import optuna.integration.lightgbm as lgbo
import lightgbm as lgb
import time
import warnings

warnings.filterwarnings("ignore", category=UserWarning) 

# Features taken straight from `data`. The fold-dependent features (u_mean, ummr_10,
# rt_per_task, c_mean, c_std, h_mean, uph_mean, wrong_et_med, right_et_med) are created
# inside the fold loop below, because they must only use that fold's training users.
train_cols = [
    
    # defintely improves score besides u_mean and c_mean
    'repeat_c',
    'tagF', 'tagS', 'tagL', 'tagT',
    'response_time', 
    'prior_question_elapsed_time',
    
    'up_mean', 'up_count', 'uq_per_hr',
    'uwrong_sum', 'lec_recent',
    
    'pqet_mean', 'seen_ratio', 'tmed', 
    'up_recency', 'ts_recency_10', 
    'ts_recency_5', 
    
#     'pqet_shifted',
    
    # for stability  
    'timestamp', 'task_container_id', 'content_c',
    
    # nice to add features
#     'res_time_avg', 'lec_available',
    
    'che_sum', 'seen_exp_when_wrong', 
    'seen_exp_when_right',
    'sessions','sess_event_count', 
    'pqetmr_10', 'uf_bundle', 
    
    # useless features
#     "cluster", 'tag_c', 'listening_lvl', 'part',
]

cat_cols = ['tagF', 'tagS', 'tagT', 'tagL']

# helper columns needed to build the per-fold features; dropped before fitting
other_cols = ['train', 'user_id', 'content_id', 'ac_modified', 
              'pqet_shifted', 'bundle_q_count', 'part']

folds = 7

# leave this dict empty to trigger the Optuna hyper-parameter search on the first fold
best_params = {
 'learning_rate':0.075,
 'num_leaves': 200,
 'objective': 'binary',
 'metric': 'auc'}

# Folds are split by user: each fold holds out one contiguous slice of the shuffled user ids.
# The len(total_users) % folds users at the tail of the shuffle are never held out.
total_users = data.user_id.unique()
np.random.shuffle(total_users)
val_size = len(total_users) // folds
scores = {'TRAIN': [], 'VAL': []}
models = []

for i in range(folds):
    
    start_time = time.time()
    
    val_users = total_users[i * val_size:(i + 1) * val_size]
    train_users = np.setdiff1d(total_users, val_users)
    temp = generate_train_val(data, train_users, dyn_p=True)

    # keep question rows only, restricted to the columns used below
    temp = temp.loc[~temp.content_type_id, train_cols + other_cols + ['answered_correctly']]

    temp['u_mean'] = (
        temp.groupby(["user_id"])['ac_modified']
        .transform(lambda x: shifted_expanding_mean(x.values))
    )
    
    temp['ummr_10'] = (
        temp.groupby('user_id')['ac_modified']
        .transform(lambda x: moving_average(x.values, n=10))
    )
    
    temp['rt_per_task'] = temp['response_time'] / temp['bundle_q_count']
    
    # data preprocessing
    temp[cat_cols] = temp[cat_cols].fillna(0)
    temp[cat_cols] = temp[cat_cols].astype(int)
    
    # save the content wise mean score and create feature in train and val
    # (statistics come from the training rows only, so validation labels do not leak)
    temp = temp.merge(
        temp[temp.train].groupby("content_id")['answered_correctly'].agg(c_mean='mean', c_std='std'),
        left_on=['content_id'], right_index=True, how='left')
    
    # impute missing c_mean
    temp['c_mean'] = temp['c_mean'].fillna(0.5)
    
    # harmonic mean idea from here: https://www.kaggle.com/markwijkhuizen/riiid-training-and-prediction-using-a-state
    temp['h_mean'] = temp.groupby("user_id")['answered_correctly'].transform(
        lambda x: shifted_expanding_mean(x.values)
    )
    
    temp['h_mean'] = (2 * temp['h_mean'] * temp['c_mean']) / (temp['h_mean'] + temp['c_mean'])
    
    # doing the same for part wise mean
    temp['uph_mean'] = temp.groupby(['user_id', 'part'])['answered_correctly'].transform(
        lambda x: shifted_expanding_mean(x.values))
    
    temp['uph_mean'] = (2 * temp['uph_mean'] * temp['c_mean']) / (temp['uph_mean'] + temp['c_mean'])
    
    # add wrong and right response mean for each content
    temp = temp.merge(
        temp[temp.train].groupby(["content_id", 'answered_correctly'])['pqet_shifted'].agg('median').unstack(),
        left_on='content_id', right_index=True, how='left')
    # the unstacked label columns (0 and 1) arrive last; give them readable names
    temp.columns = temp.columns[:-2].tolist() + ['wrong_et_med', 'right_et_med']
    
    # split into train/val
    train, val = temp[temp['train']], temp[~temp['train']]

    # drop the useless columns
    train, val = train.drop(other_cols, axis=1), val.drop(other_cols, axis=1)
    
    if i == 0:
        if not best_params: # we do a hyper parameter tuning the first time
            
            # creating the lgb dataset for training
            train_lgb = lgb.Dataset(
                train.drop('answered_correctly', axis=1), train['answered_correctly'], 
                free_raw_data=False, categorical_feature=cat_cols)

            val_lgb = lgb.Dataset(
                val.drop('answered_correctly', axis=1), val['answered_correctly'], 
                free_raw_data=False, categorical_feature=cat_cols)

            print ("Searching for Optimal Hyperparameters! This may take some time to complete.")
            model = lgbo.train(
                params={'objective':'binary', 'metric':'auc'}, 
                train_set=train_lgb, valid_sets=[train_lgb, val_lgb],
                verbose_eval=0, early_stopping_rounds=10, 
                show_progress_bar=False,
                time_budget=60*60, # search budget in seconds (one hour); increase for a deeper search
            )
            params = model.params

        else:
            params = best_params
            
        print (f"Starting folds: {'='*80}>")
        
    # time for data preprocessing
    d_time = time.time() - start_time 
    
    model = lgb.LGBMClassifier(**params)
    
    # NOTE(review): `cat_cols` is not passed to fit(), so on this path the tag columns
    # appear to be treated as plain integers -- confirm whether that is intended.
    model.fit(train.drop('answered_correctly', axis=1), train['answered_correctly'], 
              eval_set=[(val.drop('answered_correctly', axis=1), val['answered_correctly'])],
              early_stopping_rounds=10, verbose=0)
    
    # keep only the underlying booster
    model = model.booster_
    models.append(model)

    # roc_auc_score is expected to have been imported earlier in the notebook
    scores['TRAIN'].append(roc_auc_score(train['answered_correctly'], model.predict(
        train.drop("answered_correctly", axis=1),  categorical_feature=cat_cols)))

    scores['VAL'].append(roc_auc_score(val['answered_correctly'], model.predict(
        val.drop("answered_correctly", axis=1),  categorical_feature=cat_cols)))

    print ("AUC for Fold# {}/{}:> | T: {:.3f} | V: {:.3f} | Preprocess Time: {:5.2f}s | Total Fold Time: {:.2f}s"
           .format(i+1, folds, scores['TRAIN'][-1], scores['VAL'][-1], d_time, time.time() - start_time))

print ("Summary stats : {}\n\nMean AUC for {} folds: T: {:.4f} @ {:.4f} std | V: {:.4f} @ {:.4f} std"
       .format('='*76+">", folds, np.mean(scores['TRAIN']), np.std(scores['TRAIN']), 
               np.mean(scores['VAL']), np.std(scores['VAL'])))

# `val` still holds the last fold's frame: its width (minus the label) is the feature count
print (f"\nAdditional Info: {'='*80+'>'}\nTrain Features Used By Model: {val.shape[1] - 1} Features")
print (f"Hypothetical Space Required For Entire Data: {(val.shape[1] * 101230332 * 32) / 8e+9:.2f} GB (float 32)")
print (f"Hypothetical Space Required For Entire Data: {(val.shape[1] * 101230332 * 64) / 8e+9:.2f} GB (float 64)")

We plot the feature importance averaged over all the fold models, weighting each model by its validation score. Let's define a helper function for that:

In [None]:
def plot_ensemble_importance(models, scores=None, col_sampling=False):
    """Plot the (optionally weighted) average feature importance of several models.

    Two horizontal bar charts are drawn, one for the averaged ``gain`` importance and
    one for the averaged ``split`` importance, with features sorted by ``split``.

    Parameters
    ----------
    models : sequence
        Trained models exposing ``feature_name()`` and
        ``feature_importance(importance_type=...)``.
    scores : sequence of float, optional
        One weight per model, forwarded to ``np.average``. ``None`` gives a plain mean.
    col_sampling : bool, default False
        Set to True when the models were trained on different feature subsets: a feature
        missing from a model then borrows the value of a neighbouring model (forward
        fill, then backward fill across models) before averaging.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("plot_ensemble_importance needs at least one model")

    # one small frame per model, joined once on the feature names
    per_model = []
    for i, m in enumerate(models):
        per_model.append(pd.DataFrame(
            {
                f"{i}_split": m.feature_importance(importance_type='split'),
                f"{i}_gain": m.feature_importance(importance_type='gain'),
            },
            index=m.feature_name(),
        ))
    result = pd.concat(per_model, axis=1)

    gain_cols = [col for col in result.columns if 'gain' in col]
    split_cols = [col for col in result.columns if 'split' in col]

    if col_sampling:
        result[gain_cols] = result[gain_cols].fillna(axis=1, method='ffill').fillna(axis=1, method='bfill')
        result[split_cols] = result[split_cols].fillna(axis=1, method='ffill').fillna(axis=1, method='bfill')

    result['gain'] = np.average(result[gain_cols], axis=1, weights=scores)
    result['split'] = np.average(result[split_cols], axis=1, weights=scores)

    result = result[['gain', 'split']]
    result = result.sort_values("split")

    # half an inch per feature, but never a zero-height figure for tiny feature sets
    height = max(1, int(0.5 * len(result)))
    f, ax = plt.subplots(figsize=(20, height), nrows=2, sharey=False)
    result.plot(kind='barh', subplots=True, ax=ax, legend=False)

In [None]:
# plot the fold-averaged importances, weighting each model by its validation score
plot_ensemble_importance(models, scores["VAL"])

### Data Generation & Training Logic (Entire dataset):
Now our prototype works well. Let's create these features on the entire dataset to be able to train our model on them:

In [None]:
# delete data no longer needed
del data, temp, train, val
# rebind the names to None so that later references fail softly instead of with a NameError
data = temp = train = val = None
# force a collection pass so the memory is released before the full-data pipeline starts
gc.collect()

What follows is the optimized pipeline code for train data generation. I have used `gc.collect()` liberally. We create the features piece by piece, save each one to disk, and delete it from memory once we are done with it:

In [None]:
# monitor the time passing
start_time = time.time()

# window sizes used by the rolling features, and the session gap in milliseconds
LEC_RECENT_ROLL = 10
ROLL_WINDOW = 10
ROLL_WINDOW_PQET = 10
SESSION_DURATION = 15 * 60 * 1000

# save op as n number of chunked files
CHUNKS = 3

# the presence of the first pre-built chunk decides whether the pipeline below runs at all
SAVE_LOC = "../input/riiid-final-model-inputs/model_train_c1.feather"
MODEL_LOC = "../input/riiid-final-model-inputs/trained_model.txt"

if not os.path.exists(SAVE_LOC):
    
    ########################## BASIC INFO GATHERING ##########################
    
    # only the label column is needed to tell the question rows apart
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                       columns=['answered_correctly'])

    # rows labelled -1 are excluded from every saved feature file
    q_mask = data.answered_correctly != -1
    # rows per output chunk; the remainder of the integer division is not written out
    BATCH_SIZE = q_mask.sum() // CHUNKS
    
    del data
    gc.collect()

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s", end=' ')
    print("| Percentage of Data used: {:.2f}% | Single chunk Size: {:9} rows".format(
        q_mask.sum()*100/len(q_mask), BATCH_SIZE
    ))
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Entering Data Prep Bottleneck 1!")
    
    ################ UP_RECENCY, TS_RECENCY_5, TS_RECENCY_10 ################
    
    # All rows are loaded here; the q_mask filter is applied only just before saving, so
    # the recency features are computed over every event of a user.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                           columns=['user_id', 'timestamp', 'content_id', 'content_type_id'])
    
    # attach `part` from the `ql` lookup table (built elsewhere in the notebook)
    data = data.merge(
        ql[['content_id', 'content_type_id', 'part']], 
        on=['content_id', 'content_type_id'], how='left')
    
    data.drop(['content_id', 'content_type_id'], axis=1, inplace=True)
    data['part'] = data['part'].astype('uint8')
    gc.collect()
    
    # accumulator: each row belongs to exactly one part, so summing the per-part results
    # (0 outside their own part) rebuilds a single column
    up_recency = pd.Series(data=np.zeros(len(data), dtype='float64'), index=data.index, name='up_recency')
    
    # one pass per part value keeps the working set small compared with grouping on
    # (user_id, part) in a single call
    for i in range(1, 8):
        temp = (data[data.part == i].groupby("user_id")['timestamp']
                .transform(lambda x: rt_func(x.values)))
        
        # rows of other parts (and NaN results) contribute 0 to the sum
        temp = temp.reindex(data.index).fillna(0).values.astype('float64')
        up_recency = up_recency + temp
        
        del temp
        gc.collect()
        
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | * Done with up_recency_{i}!")
    
    data['up_recency'] = up_recency
    data.drop('part', axis=1, inplace=True)
    del up_recency
    gc.collect()
    
    # time elapsed since the same user's 10th / 5th previous event (NaN when there is none)
    data['ts_recency_10'] = data['timestamp'] - data.groupby('user_id')['timestamp'].shift(10)
    data['ts_recency_5'] = data['timestamp'] - data.groupby('user_id')['timestamp'].shift(5)
    
    # keep the masked rows only; reset_index() stores the original row position in an
    # "index" column, which later steps use to line the feature files up again
    data = data.loc[q_mask]
    data.drop(['user_id', 'timestamp'], axis=1, inplace=True)
    data.reset_index().to_feather("ts_up_recency.feather")
    
    del data
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Done creating up/ts recency features!")
    
    ################ SEEN_EXP_WHEN_WRONG, SEEN_EXP_WHEN_RIGHT ################
    
    # Per-user running counts of answers given while `pqhe_shifted` was set, split by
    # whether the answer was wrong (label 0) or right (label 1).
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                           columns=['user_id', 'answered_correctly']).loc[q_mask]
    
    pq_shifted = pd.read_feather("../input/riiid-final-model-inputs/pq_shifted.feather", 
                                 columns=['index', 'pqhe_shifted']).set_index("index")

    # output column -> label value it counts
    seen_exp_targets = {'seen_exp_when_wrong': 0, 'seen_exp_when_right': 1}

    # step 1: per-row 0/1 indicator for each outcome
    for column, outcome in seen_exp_targets.items():
        data[column] = (pq_shifted['pqhe_shifted'] & (data['answered_correctly'] == outcome)).astype(int)
    
    # the lookup is no longer needed once the indicators exist
    del pq_shifted
    gc.collect()
    
    # step 2: replace each indicator with its per-user cumulative form
    for column in seen_exp_targets:
        data[column] = (
            data.groupby('user_id')[column]
            .transform(lambda x: shifted_expanding_sum(x.values)))
    
    data = data[list(seen_exp_targets)].fillna(0).astype("uint16")
    data.reset_index().to_feather("seen_exp.feather")
    
    del data
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating seen_exp features!")
    
    ############################## UF_BUNDLE ##############################

    # uf_bundle: the bundle_id of each user's first retained row, broadcast to all rows
    content_df = pd.read_csv("../input/riiid-final-model-inputs/content-df.csv", index_col=0, 
                             usecols=['content_id', 'bundle_id'])
    content_df['bundle_id'] = content_df['bundle_id'].astype("uint16")

    data = (
        pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                        columns=['user_id', 'content_id'])
        .loc[q_mask]
        .merge(content_df['bundle_id'], left_on='content_id', right_index=True, how='left')
        .drop(['content_id'], axis=1)
    )

    data = data.groupby("user_id")['bundle_id'].transform("first")
    (data.rename("uf_bundle")
     .astype("uint16")
     .reset_index()
     .to_feather("uf_bundle.feather"))
    
    del data, content_df
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating uf_bundle!")
    
    ################################ PQETMR ################################
    
    # Per-user moving average of prior_question_elapsed_time over the retained rows.
    # The window comes from ROLL_WINDOW_PQET (declared with the other settings at the top
    # of this cell) instead of a second hard-coded 10, so the setting and the column name
    # cannot drift apart.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                           columns=['user_id', 'prior_question_elapsed_time']).loc[q_mask]
    
    data[f'pqetmr_{ROLL_WINDOW_PQET}'] = data.groupby("user_id")['prior_question_elapsed_time'].transform(
        lambda x: moving_average(x.values, n=ROLL_WINDOW_PQET, shift=False)
    )
    
    # only the new feature column is written out
    data.drop(['prior_question_elapsed_time', 'user_id'], axis=1, inplace=True)
    data.astype('float32').reset_index().to_feather("pqetr.feather")
    
    del data
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating pqet rolling features!")
    
    ################ HMEAN, UPHMEAN, UPMEAN, UPCOUNT UMEAN, UMMR ###########
    
    # One frame (`data`) is shared by every feature of this section. The statement order
    # matters: columns are dropped and labels are overwritten as the section proceeds.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                           columns=['user_id', 'content_id', 'answered_correctly']).loc[q_mask]

    content_df = pd.read_csv("../input/riiid-final-model-inputs/content-df.csv", index_col=0, 
                             usecols=['part', 'c_mean', 'content_id'])
    
    # to save space
    content_df['part'] = content_df['part'].astype('uint8')
    content_df['c_mean'] = content_df['c_mean'].astype('float32')

    data = data.merge(content_df, left_on='content_id', right_index=True, how='left')
    data.drop(['content_id'], axis=1, inplace=True)
    del content_df
    gc.collect()
    
    # h_mean: harmonic mean of the user's running accuracy and the content mean
    temp = (
        data.groupby("user_id")['answered_correctly']
        .transform(lambda x: shifted_expanding_mean(x.values))
    )
    
    temp = (2 * temp * data['c_mean'].values) / (temp + data['c_mean'].values)
    
    (temp.astype('float32').rename("h_mean")
     .reset_index().to_feather("h_mean.feather"))
    
    del temp
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating h_mean!")
    
    # uwrong_sum: per-user running count of wrong answers
    data['answered_wrongly'] = data['answered_correctly'] == 0
    
    (data.groupby("user_id")['answered_wrongly']
     .transform(lambda x: shifted_expanding_sum(x.values))
     .fillna(0).astype("uint16").rename("uwrong_sum")
     .reset_index().to_feather("uwrong_sum.feather"))
    
    data.drop(['answered_wrongly'], axis=1, inplace=True)
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating uwrong_sum!")
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Entering Data Prep Bottleneck 2!")
    
    # uph_mean: like h_mean, but the running accuracy is taken per (user, part).
    # Each row belongs to exactly one part, so the per-part results (0 elsewhere) are summed.
    uph_mean = pd.Series(data=np.zeros(len(data), dtype='float32'), index=data.index, name='uph_mean')
    
    for i in range(1, 8):
        temp = (data[data.part == i].groupby("user_id")['answered_correctly']
                .transform(lambda x: shifted_expanding_mean(x.values)))
        
        temp = temp.reindex(data.index).fillna(0).values.astype('float32')
        uph_mean = uph_mean + temp
        
        del temp
        gc.collect()
        
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | * Done with uph_mean_{i}!")
        
    uph_mean = (2 * uph_mean * data['c_mean'].values) / (uph_mean + data['c_mean'].values)
    uph_mean.reset_index().to_feather("uph_mean.feather")
    
    del uph_mean
    gc.collect()
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Done creating uph_mean!")
    
    # perform ops for converting ans_crctly to weighted values based on c_mean
    # (from here on `answered_correctly` holds the modified labels, not the raw 0/1 values)
    data['answered_correctly'] = modify_ac(data['answered_correctly'].values, data['c_mean'].values)
    # `axis` is passed by keyword: the positional form is deprecated in recent pandas
    data.drop(['c_mean'], axis=1, inplace=True)
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Modified labels according to Importance!")
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Entering Data Prep Bottleneck 3!")
    
    # up_mean: running mean of the modified labels per (user, part), built part by part
    up_mean = pd.Series(data=np.zeros(len(data), dtype='float32'), index=data.index, name='up_mean')
    
    for i in range(1, 8):
        temp = (data[data.part == i].groupby("user_id")['answered_correctly']
                .transform(lambda x: shifted_expanding_mean(x.values)))
        temp = temp.reindex(data.index).fillna(0).values.astype('float32')
        up_mean = up_mean + temp
        
        del temp
        gc.collect()
        
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | * Done with up_mean_{i}!")

    up_mean.reset_index().to_feather("up_mean.feather")
    
    del up_mean
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Done creating up_mean!")
    
    # up_count: share of the user's earlier rows that fall in the current row's part.
    # A user's first row evaluates 0 / 0 and is therefore stored as NaN.
    data['up_count'] = data.groupby(['user_id', 'part']).cumcount().fillna(0)
    data['up_count'] = data['up_count'] / data.groupby('user_id').cumcount()
    data['up_count'].astype('float32').reset_index().to_feather("up_count.feather")
    
    data.drop(['part'], axis=1, inplace=True)
    gc.collect()

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating up_count!")
    
    # ummr_<ROLL_WINDOW>: per-user moving average of the modified labels
    (data.groupby(["user_id"])['answered_correctly']
     .transform(lambda x: moving_average(x.values, n=ROLL_WINDOW))
     .rename(f'ummr_{ROLL_WINDOW}').astype('float32').reset_index()
     .to_feather(f'ummr_{ROLL_WINDOW}.feather'))
    
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating ummr_{ROLL_WINDOW}!") 

    # u_mean: per-user running mean of the modified labels
    (data.groupby("user_id")['answered_correctly']
     .transform(lambda x: shifted_expanding_mean(x.values))
     .rename("u_mean").astype('float32').reset_index()
     .to_feather("u_mean.feather"))
    
    del data
    gc.collect()

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating u_mean!")
    
    ############################## LEC_RECENT ##############################
    
    # lec_recent: True on rows with content_type_id set and on up to LEC_RECENT_ROLL
    # following rows of the same user; only the q_mask rows are saved.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather",
                          columns=['user_id', 'content_type_id'])
    
    # keep True where the flag is set and blank out the rest, so that a forward fill
    # spreads the flag over the rows that follow
    lecture_marks = data['content_type_id'].replace(False, np.nan)
    data = (
        lecture_marks
        .groupby(data['user_id'])
        .fillna(method='ffill', limit=LEC_RECENT_ROLL)
        .fillna(False)
        .astype(bool)
    )
    del lecture_marks
    
    (data.loc[q_mask]
     .rename("lec_recent")
     .reset_index()
     .to_feather("lec_recent.feather"))
    
    del data
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating lec_recent!")
    
    ################################ REPEAT_C ################################

    # repeat_c: True when the user already has an earlier retained row for this content_id
    data = (
        pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                        columns=['user_id', 'content_id'])
        .loc[q_mask]
        .groupby(["user_id", 'content_id'])
        .cumcount()
    )

    # a count of 0 (first occurrence) becomes False, anything later becomes True
    (data.astype(bool)
     .rename("repeat_c")
     .reset_index()
     .to_feather("repeat_c.feather"))
    
    del data
    gc.collect()

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating repeat_c!")
    
    ############################# SEEN_RATIO, PQET_MEAN #############################
    
    # Both features are per-user running statistics of the "prior question" columns and
    # overwrite those columns in place before they are renamed for saving.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                       columns=['user_id', 'prior_question_elapsed_time', 'prior_question_had_explanation'])
    
    # filter out the questions
    data = data.loc[q_mask]

    # NOTE(review): the helper receives the values without their first element together
    # with keep_last=True; presumably this re-aligns the "prior question" values with the
    # rows they describe -- confirm against shifted_expanding_mean's definition
    data['prior_question_elapsed_time'] = (
        data.groupby("user_id")['prior_question_elapsed_time'].transform(
            lambda x: shifted_expanding_mean(x.values[1:], keep_last=True)))
    
    # a missing flag counts as "no explanation"; cast to float so that a mean can be taken
    data['prior_question_had_explanation'] = data['prior_question_had_explanation'].fillna(False).astype('float32')
    data['prior_question_had_explanation'] = data.groupby("user_id")['prior_question_had_explanation'].transform(
        lambda x: expanding_mean(x.values))

    data = data.rename({"prior_question_had_explanation": "seen_ratio", 
                        'prior_question_elapsed_time': "pqet_mean"}, axis=1)
    
    # drop the column, no longer needed
    data.drop(["user_id"], axis=1, inplace=True)
    data.astype('float32').reset_index().to_feather("pq_se.feather")
    
    del data
    gc.collect() 
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating pqet_mean and seen_ratio!")
    
    ################################# UQ_PER_HR #################################
    
    # Number of earlier events of the user divided by the elapsed time of the current row.
    # Only user_id and timestamp are needed, so no other column is loaded.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather",
                           columns=['user_id', 'timestamp'])
    
    # NOTE(review): 60000 ms is one minute, so despite the feature name this is a
    # per-minute rate; left unchanged so the values match what was used during prototyping
    data = data.groupby("user_id").cumcount() / (data['timestamp'] / 60000)
    # 0 / 0 (first event at timestamp 0) yields NaN and becomes 0 here.
    # NOTE(review): a later event that is still at timestamp 0 yields inf, which is kept.
    data.fillna(0, inplace=True)
        
    data.loc[q_mask].astype("float32").rename('uq_per_hr').reset_index().to_feather("uq_per_hr.feather")
    
    del data
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating uq_per_hr!")
    
    ################ INTERMEDIATE FILE CONCAT FOR SAVING SPACE ################
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Beginning intermediate file concat!")
    
    def _concat_feather_files(names, out_path):
        """Join per-feature feather files column-wise into one file at ``out_path``.

        Every input file must carry the original row position in an "index" column.
        Each input file is deleted right after it has been read, to free disk space.
        """
        frames = []
        
        for name in names:
            frames.append(pd.read_feather(name).set_index("index"))
            os.remove(name)
            
            gc.collect()
            # pause kept from the original pipeline -- NOTE(review): purpose not evident
            # from the code, presumably to let memory/disk usage settle
            time.sleep(2)
        
        # a single column-wise join instead of growing the frame file by file
        merged = pd.concat(frames, axis=1)
        del frames
        
        merged.reset_index().to_feather(out_path)
        del merged
        gc.collect()
    
    # the features are merged in four groups so that no single join gets too wide
    intermediate_groups = [
        ['up_mean.feather', "uph_mean.feather", f"ummr_{ROLL_WINDOW}.feather"],
        ["repeat_c.feather", "uf_bundle.feather", "uwrong_sum.feather", 
         "seen_exp.feather", "u_mean.feather"],
        ["pq_se.feather", "pqetr.feather"],
        ["uq_per_hr.feather", "up_count.feather", "lec_recent.feather", "h_mean.feather"],
    ]
    
    for group_no, names in enumerate(intermediate_groups, start=1):
        _concat_feather_files(names, f"./concat_intermediate_{group_no}.feather")
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |  * Done with Intermediate Concat {group_no}!")
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <= Intermediate file concat Complete!")
    
    ############################ CONTENT_DF FEATURES ############################
    
    # Static per-content features: look them up by content_id for every retained row.
    data = (pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                       columns=['content_id']).loc[q_mask])
    
    # down-cast the lookup table before the merge to keep the merged frame small;
    # `cat_cols` is the list defined in the prototype training cell above
    content_df = pd.read_csv("../input/riiid-final-model-inputs/content-df.csv", index_col=0)    
    content_df[cat_cols + ['bundle_q_count', 'part']] = content_df[cat_cols + ['bundle_q_count', 'part']].astype('uint8')
    content_df[['wrong_et_med', 'right_et_med']] = content_df[['wrong_et_med', 'right_et_med']].astype('float32')
    content_df[['c_mean', 'c_std']] = content_df[['c_mean', 'c_std']].astype('float32')
    content_df[['content_c', 'tmed', 'che_sum']] = (
        content_df[['content_c', 'tmed', 'che_sum']]).astype('uint16')
    # these two columns are not written to c_df.feather
    content_df.drop(['part', 'bundle_id'], axis=1, inplace=True)
    
    data = data.merge(content_df, how='left', left_on='content_id', right_index=True)
    data.drop(['content_id'], axis=1, inplace=True)
    
    # bundle_q_count stays in this file for now: the response-time step reads it back
    data.reset_index().to_feather("c_df.feather")
    
    del data, content_df
    gc.collect()
    
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating content_df for merge!")
    
    ########### RESPONSE_TIME, SESSIONS, SESS_EVENT_COUNT, RT_PER_TASK ###########

    # All rows are loaded: the per-user timestamp sequence must include every event.
    data = pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                       columns=['timestamp', 'user_id'])
    
    data['response_time'] = ( # response time requires the lectures as well
        data.groupby("user_id")['timestamp']
        .transform(lambda x: rt_func(x.values)))
    
    data.drop(['timestamp'], axis=1, inplace=True)
    gc.collect()
    
    # a row whose response_time exceeds SESSION_DURATION opens a new session; the per-user
    # cumulative sum of those flags is the session number
    data['sessions'] = (data['response_time'] > SESSION_DURATION).astype(bool)
    data['sessions'] = data.groupby('user_id')['sessions'].cumsum()
    # position of the row inside its session, starting at 0
    data['sess_event_count'] = data.groupby(['user_id', 'sessions']).cumcount()
    data[['sessions', 'sess_event_count']] = data[['sessions', 'sess_event_count']].astype('uint16')
    
    # filter out lectures no longer needed
    data = data.loc[q_mask]
    
    # creating rt_per_task
    temp = pd.read_feather("./c_df.feather", columns=['index', 'bundle_q_count'])
    temp = temp.set_index("index")
    # positional division (.values): relies on c_df.feather holding the same rows in the
    # same order as `data` after the q_mask filter
    data['rt_per_task'] = data['response_time'] / temp['bundle_q_count'].values
    
    # save as file
    data.drop(['user_id'], axis=1, inplace=True)
    data.reset_index().to_feather("response_time.feather")
    
    del data, temp
    gc.collect()

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done creating reponse_time!")
    print ("\nDone creating all required intermediate files.\n")
    
    ############################ CHUNKED CONCAT LOGIC ############################
    
    # The final training table is written as CHUNKS files of BATCH_SIZE rows each.
    # Since BATCH_SIZE is a floor division, up to CHUNKS - 1 trailing rows are not written.
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | <== Starting merge operation!")
    
    for i in range(CHUNKS):
        
        print (f"\nTime Elapsed: {time.time() - start_time:10.2f} s |  <= Beginning chunk {i + 1} merge!")
        # row range of this chunk, counted over the filtered (q_mask) rows
        front, rear = i * BATCH_SIZE, (i + 1) * BATCH_SIZE
        
        data = (pd.read_feather("../input/riiid-train-data-multiple-formats/riiid_train.feather", 
                                columns=['timestamp', 'user_id', 'task_container_id',
                                         'prior_question_elapsed_time', 'answered_correctly'])
                .loc[q_mask] # filter the lectures
                .iloc[front: rear])

        for name in ['c_df.feather', 'ts_up_recency.feather', "response_time.feather",
                     "concat_intermediate_2.feather", "concat_intermediate_4.feather",
                     "concat_intermediate_1.feather", "concat_intermediate_3.feather"]:

            # Destructive read: the file is rewritten without its first BATCH_SIZE rows, so
            # on the next iteration the head of the file is already the next chunk.
            # This loop is therefore not re-runnable without regenerating the files.
            temp = pd.read_feather(name)
            temp.iloc[BATCH_SIZE:].reset_index(drop=True).to_feather(name)
            temp = temp.iloc[:BATCH_SIZE].set_index("index")

            # bundle_q_count was only kept for rt_per_task and is not a model feature
            if name == 'c_df.feather':
                temp.drop("bundle_q_count", axis=1, inplace=True)
                
            gc.collect()
            # NOTE(review): purpose of the pauses is not evident from the code
            time.sleep(3)

            # concat the dataframes together
            # (guard: both sides must hold exactly one chunk of rows)
            assert len(data) == len(temp) == BATCH_SIZE
            data = pd.concat([data, temp], axis=1)

            del temp
            gc.collect()
            time.sleep(2)

            print (f"Time Elapsed: {time.time() - start_time:10.2f} s |  * Done merging {name}!")
        
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |  * Saving chunk {i + 1} onto Disk!")
        # every chunk file starts again at row 0
        data.reset_index(drop=True, inplace=True) 
        data.to_feather(f"model_train_c{i + 1}.feather")
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |  * Done saving chunk {i + 1} to Disk!")
        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |  <= Done Merging chunk {i + 1}!")
        
        del data
        gc.collect()
        time.sleep(3)
        
    print (f"\nTime Elapsed: {time.time() - start_time:10.2f} s | <== Merge Complete!")        
    print ("Deleting unneeded intermediate files.\n")
    # IPython shell escapes: remove what is left of the intermediate files
    ! rm c_df.feather concat_intermediate_*.feather ts_up_recency.feather
    ! rm response_time.feather
    
else:
    
    # SAVE_LOC exists: the chunk files were generated in an earlier run, so the whole
    # feature pipeline is skipped and the files are only copied into the working directory.
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Found PreSaved File(s). Load it manually if need be!")
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Copying Train File(s) back to Disk!")
    
    for i in range(CHUNKS):
        # IPython shell escape; {i + 1} is interpolated from the Python loop variable
        ! cp ../input/riiid-final-model-inputs/model_train_c{i + 1}.feather ./model_train_c{i + 1}.feather
        
    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done copying Train File(s) back to Disk!")

#### LGB Training:
Done with data generation. We can't fit an LGBM model on this entire dataset at once, since that would inevitably exhaust all the available RAM. So we need to load the data in chunks and train on it piece by piece. This could be better accomplished with a framework such as Dask; unfortunately I wasn't familiar with Dask, so I stuck with simple Python logic.

We define two functions. The first returns random front and rear indices for a required batch size. The second maps that front/rear pair to the respective data chunks (we have three of them). Note that each chunk has its index reset, so we need to map the indices manually:

In [None]:
def return_random_slice(batch_size, nrows):
    '''Randomly returns a slice of `batch_size` from nrows.

    Parameters
    ----------
    batch_size : int
        Number of consecutive rows wanted.
    nrows : int
        Total number of rows available.

    Returns
    -------
    (front, rear) : tuple
        Half-open row range ``[front, rear)`` with ``rear - front == batch_size`` and
        ``rear <= nrows``.

    Raises
    ------
    ValueError
        If ``batch_size`` is larger than ``nrows``.
    '''
    
    if batch_size > nrows:
        raise ValueError(
            f"batch_size ({batch_size}) cannot be larger than nrows ({nrows})")
    
    # valid start offsets are 0 .. nrows - batch_size inclusive; the +1 makes the last
    # one reachable and lets batch_size == nrows return the whole range
    front = np.random.choice(nrows - batch_size + 1)
    rear = front + batch_size
        
    return front, rear

In [None]:
def return_chunk_indices(start, end, nrows, chunks=3):
    '''Map a global half-open row range onto per-chunk row ranges.

    The rows are assumed to be split into equally sized chunks of
    ``nrows // chunks`` rows each, every chunk having its own 0-based index.

    Parameters
    ----------
    start, end : int
        Global half-open row range ``[start, end)``.
    nrows : int
        Total number of rows across all chunks.
    chunks : int, default 3
        Number of chunk files the rows were split into.

    Returns
    -------
    list of (chunk_number, chunk_start, chunk_end)
        `chunk_number` is 1-based; `chunk_start`/`chunk_end` are positions local
        to that chunk. An empty or inverted range returns an empty list.

    Raises
    ------
    ValueError
        If `nrows` is too small to give every chunk at least one row.

    Notes
    -----
    If `end` runs past ``chunks * (nrows // chunks)`` the result can contain
    chunk numbers larger than `chunks`; callers are expected to skip those.
    NOTE(review): any remainder rows (``nrows % chunks``) are therefore mapped to
    a chunk beyond the last one - confirm this matches how the files were written.
    '''

    chunk_size = nrows // chunks
    if chunk_size <= 0:
        raise ValueError(
            f"nrows ({nrows}) must be at least chunks ({chunks}) to form non-empty chunks")

    # nothing to load; previously this produced bogus full-chunk spans whenever
    # `start` sat on a chunk boundary
    if end <= start:
        return []

    first_chunk = start // chunk_size
    last_chunk = (end - 1) // chunk_size       # `end` is exclusive

    indices = []
    for chunk in range(first_chunk, last_chunk + 1):

        # only the first chunk starts mid-way, only the last one ends mid-way
        local_start = start - chunk_size * chunk if chunk == first_chunk else 0
        local_end = end - chunk_size * chunk if chunk == last_chunk else chunk_size

        indices.append((chunk + 1, local_start, local_end))

    return indices

In [None]:
%%time

# Every feature column a model can be trained on. When COL_SAMPLING is False
# this exact list (in this order) becomes the model's feature set.
ALL_FEATURES = [
    'repeat_c', 'tagF', 'tagS', 'tagL', 'tagT', 'response_time', 'prior_question_elapsed_time',
    'up_mean', 'up_count', 'uq_per_hr', 'uwrong_sum', 'lec_recent', 'pqet_mean', 'seen_ratio',
     'tmed', 'up_recency', 'ts_recency_10', 'ts_recency_5', 'timestamp', 'task_container_id',
     'content_c', 'che_sum', 'seen_exp_when_wrong', 'seen_exp_when_right', 'sessions', 'sess_event_count',
     'pqetmr_10', 'uf_bundle', 'u_mean', 'ummr_10', 'rt_per_task', 'c_mean', 'c_std', 'h_mean', 'uph_mean',
     'wrong_et_med', 'right_et_med'
]
    
# Features that are always kept when COL_SAMPLING is enabled; the remaining
# slots (up to N_FEATURES) are drawn at random from ALL_FEATURES - CORE.
# (Column sampling was the initial idea; it was later dropped in favour of all columns.)
CORE = ['repeat_c', 'tagF', 'tagS', 'tagL', 'response_time', 'up_mean', 'up_count', 'pqet_mean', 
        'seen_ratio', 'tmed', 'up_recency', 'ts_recency_5', 'content_c', 'pqetmr_10', 'u_mean', 
        'rt_per_task', 'c_mean', 'c_std', 'h_mean', 'ts_recency_10', 'uph_mean']

N_FEATURES = 30            # features per model; only used when COL_SAMPLING is True

BATCH_SIZE = int(2.5e7)    # rows taken at a time for one model's train/val window
STRIDE = int(1e7)          # window j starts at row j * STRIDE
HOLD_OUT = int(2.5e6)      # trailing rows that no model ever sees (used for OOF scoring)

models = []                # trained/reloaded boosters, collected to form a bagging ensemble

COL_SAMPLING = False       # sample a random subset of feature columns per model? if False, ALL_FEATURES is used
ALREADY_TRAINED = 8        # number of previously trained models to reload (0 = none)
TRAIN_FOR = 0              # number of additional models to train in this run; None = all remaining windows
ITERATIVE_TRAIN = False    # continue boosting from the previous model (init_model) instead of starting fresh?

OOF_VALIDATE = False       # score the models on the hold-out rows after training/reloading?

N_ROUNDS = 3000            # maximum boosting rounds per model (early stopping may end sooner)

# Training data location (split as three files)
LOC1 = "../input/riiid-final-model-inputs/model_train_c1.feather"
LOC2 = "../input/riiid-final-model-inputs/model_train_c2.feather"
LOC3 = "../input/riiid-final-model-inputs/model_train_c3.feather"

# row count of each file; only a single column is read to keep this cheap
SIZE1 = len(pd.read_feather(LOC1, columns=['answered_correctly']))
SIZE2 = len(pd.read_feather(LOC2, columns=['answered_correctly']))
SIZE3 = len(pd.read_feather(LOC3, columns=['answered_correctly']))

# total number of rows in training data
N_ROWS = SIZE1 + SIZE2 + SIZE3

# https://stackoverflow.com/questions/53580088/calculate-the-output-size-in-convolution-layer
# number of BATCH_SIZE windows that fit when sliding by STRIDE over the rows
# left after reserving HOLD_OUT (same formula as a 1-D convolution output size)
CHUNKS = ((N_ROWS - BATCH_SIZE - HOLD_OUT) // STRIDE) + 1

# if not specified train for remaining number of chunks
if TRAIN_FOR is None:
    TRAIN_FOR = CHUNKS - ALREADY_TRAINED

# a simple sanity check: never ask for more models than there are windows
assert TRAIN_FOR + ALREADY_TRAINED <= CHUNKS

# restore boosters from an earlier run, together with their OOF bookkeeping
if ALREADY_TRAINED:

    # row indices that were still out-of-fold when those models were saved
    OOF = np.load("../input/riiid-final-model-inputs/OOF.npy")

    for j in range(ALREADY_TRAINED):
        # load from the input dataset ...
        saved_path = f"../input/riiid-final-model-inputs/trained_model_c{j+1}.txt"
        model = lgb.Booster(model_file=saved_path)
        models.append(model)

        # ... and write a copy into the working directory
        model.save_model(f"trained_model_c{j+1}.txt")

    print (f"Successfully loaded {len(models)} model(s)!")

# Train TRAIN_FOR new models, one per sliding window of BATCH_SIZE rows.
# Each iteration: load the window -> split into train/val -> spill both to disk
# -> rebuild them as float64 numpy arrays piece by piece -> build LGB datasets
# -> train -> save. The many gc.collect()/time.sleep() calls are deliberate
# pauses meant to keep peak memory down.
if TRAIN_FOR:
        
    if not ALREADY_TRAINED:
        print ("Beginning Model Training from Scratch!")
        
        # nothing was reloaded, so start with every row marked as out-of-fold
        OOF = np.arange(N_ROWS) 
        
    # continue boosting from the most recently reloaded model
    if ITERATIVE_TRAIN and ALREADY_TRAINED:
        pre_model = model
        
    # if we wish not to iterative train or no model has been trained so far
    else:    
        pre_model = None

    print (f"{BATCH_SIZE} rows would be used for {CHUNKS} model(s)!", end=' ')
    print (f"{TRAIN_FOR} model(s) would be trained for {N_ROUNDS} rounds!")
    print (f"The models would {'BE' if ITERATIVE_TRAIN else 'NOT BE'} iteratively built!")
    print (f"Col sampling has been {'ENABLED' if COL_SAMPLING else 'DISABLED'}!")
    print (f"{HOLD_OUT + N_ROWS - ((CHUNKS - 1) * STRIDE + BATCH_SIZE)} rows goes straight to OOF!")
    print ("Space Req. For One Chunk: {:.2f} GB ({})".format(
        (N_FEATURES + 1 if COL_SAMPLING else len(ALL_FEATURES)) * BATCH_SIZE * 64 / 8e+9, 'float64'))
    
    # train for TRAIN_FOR number of times
    for j in range(ALREADY_TRAINED, ALREADY_TRAINED + TRAIN_FOR):
        
        # global half-open row window [front, rear) used by model j
        front, rear = j * STRIDE, j * STRIDE + BATCH_SIZE
        print (f"\nBeginning chunk {j + 1}/{ALREADY_TRAINED + TRAIN_FOR}! Loading the Feather File to Memory => ")
        
        if COL_SAMPLING:
        
            # always keep CORE, fill the remaining slots with a random draw
            FEATURES = (CORE + np.random.choice(list(set(ALL_FEATURES) - set(CORE)), 
                                                size=(N_FEATURES - len(CORE)), 
                                                replace=False).tolist())

            print (f"Selected Features ({N_FEATURES}/{len(ALL_FEATURES)}): {list(set(FEATURES) - set(CORE))}")
            cat_cols = np.intersect1d(['tagF', 'tagS', 'tagT', 'tagL'], FEATURES).tolist()        
            COLUMNS = ['user_id', *FEATURES, 'answered_correctly']

            # timestamp is loaded even when it was not sampled as a feature
            # NOTE(review): presumably generate_train_val needs it - confirm
            if 'timestamp' not in COLUMNS:
                COLUMNS += ['timestamp']
                
        else:
            
            FEATURES = ALL_FEATURES
            COLUMNS = ['user_id', *FEATURES, 'answered_correctly']
            cat_cols = ['tagF', 'tagS', 'tagT', 'tagL']
        
        print (f"Slicing from {front} to {rear}!")
        
        # stitch the window together from whichever chunk files it overlaps
        data = pd.DataFrame()
        for file, start, end in return_chunk_indices(front, rear, nrows=N_ROWS):
            
            data = pd.concat([
                data,              
                (pd.read_feather(f"../input/riiid-final-model-inputs/model_train_c{file}.feather", columns=COLUMNS)
                 .iloc[start: end])
            ])
            
        # relabel the rows with their global positions so they line up with OOF
        data.index = range(front, rear)
        
        # a simple buffer to prevent OOMs
        gc.collect()
        time.sleep(3)
        
        print ("Generating train/val Masks for the data...")
        # boolean mask over the window: True rows are written to the train file
        # below, False rows to the validation file (helper defined elsewhere)
        mask = generate_train_val(data, return_mask=True, tp=0.95, dyn_p=True).values.astype(bool)
        print("Done generating masks! Saving to intermediate train/val files..")
        
        # remove the indices from oof that are used for model train
        OOF = np.setdiff1d(OOF, data.index[mask])

        # user_id was only loaded alongside the features; it is not one itself
        data.drop(['user_id'], axis=1, inplace=True)
        gc.collect()
        time.sleep(3)
        
        # save it and del from memory to save space
        data.loc[mask].reset_index(drop=True).to_feather("train_intermediate.feather") 
        data.loc[~mask].reset_index(drop=True).to_feather("val_intermediate.feather")

        del data, mask
        gc.collect()
        time.sleep(3)
        
        ####### LOGIC FOR CREATING NUMPY ARRAY WITHOUT MEMORY SPIKE #######
        
        print ("Done saving! Beginning logic for LGB dataset creation..")
        
        # fraction of the frame moved into the numpy array per step
        chunk_per = 0.025
        
        # load and reorder the columns (label goes last)
        tr_data = pd.read_feather("train_intermediate.feather")
        tr_data = tr_data[FEATURES + ['answered_correctly']]

        ## initialize an empty array to which we would be assigning
        tr_result = np.empty(shape=(len(tr_data), len(FEATURES) + 1), dtype='float64')
        step = int(len(tr_data) * chunk_per)
        # one step more than len // step so the remainder rows are covered too
        end = (len(tr_data) // step) + 1

        for i in tqdm.tqdm(range(1, end + 1)):

            # assign to numpy slice and drop those rows from the dataframe
            tr_result[(i - 1) * step: i * step] = tr_data.iloc[:step]
            tr_data = tr_data.iloc[step:]

            # clear memory and let it go for a while unused
            gc.collect()
            time.sleep(3)

        del tr_data
        gc.collect()
        ! rm ./train_intermediate.feather
        time.sleep(3)
        
        # create LGB dataset (last column of the array is the label)
        tr_data = lgb.Dataset(
            tr_result[:, :-1], label=tr_result[:, -1], feature_name=FEATURES, 
            categorical_feature=cat_cols, free_raw_data=not ITERATIVE_TRAIN)
        
        # construct the dataset (Memory Bottleneck)
        tr_data.construct()
        
        del tr_result
        gc.collect()
        time.sleep(2)
        
        # load and reorder the columns (label goes last)
        vl_data = pd.read_feather("./val_intermediate.feather")
        vl_data = vl_data[FEATURES + ['answered_correctly']]

        #  Val Data Next
        ## initialize an empty array to which we would be assigning
        vl_result = np.empty(shape=(len(vl_data), len(FEATURES) + 1), dtype='float64')
        step = int(len(vl_data) * chunk_per)
        # one step more than len // step so the remainder rows are covered too
        end = (len(vl_data) // step) + 1

        for i in tqdm.tqdm(range(1, end + 1)):

            # assign to numpy slice and drop those rows from the dataframe
            vl_result[(i - 1) * step: i * step] = vl_data.iloc[:step]
            vl_data = vl_data.iloc[step:]

            # clear memory and let it go for a while unused
            gc.collect()
            time.sleep(3)

        del vl_data
        gc.collect() 
        ! rm ./val_intermediate.feather
        time.sleep(3)
        
        # validation set is built with the training set as its reference
        vl_data = lgb.Dataset(
            vl_result[:, :-1], label=vl_result[:, -1], feature_name=FEATURES, 
            categorical_feature=cat_cols, reference=tr_data, 
            free_raw_data=not ITERATIVE_TRAIN)
        
        # construct the dataset (Memory Bottleneck)
        vl_data.construct()        
        
        # no longer needed
        del vl_result
        gc.collect()
        time.sleep(2)

        ###################################################################

        print ("Done generating LGB datasets! Starting training..")

        model = lgb.train(
            
            params={'metric': 'auc', 'objective': 'binary', 
                    'num_leaves': 200, 'learning_rate': 0.075},
            
            train_set=tr_data,
            feature_name=FEATURES,
            categorical_feature=cat_cols,
            num_boost_round=N_ROUNDS,
            valid_sets=[tr_data, vl_data], 
            init_model=pre_model,
            early_stopping_rounds=50,
            verbose_eval=50
        )
        
        del tr_data, vl_data
        gc.collect()
        
        print (f"Done training chunk {j + 1}!")
        
        # save the model & append to list
        model.save_model(f"trained_model_c{j + 1}.txt")
        models.append(model)
        
        # reassign pre_model for iterative training
        if ITERATIVE_TRAIN:
            pre_model = model
        
if OOF_VALIDATE:

    ############### score every model on the hold-out rows ###############

    print (f"\nWe would be evaluating the model on {HOLD_OUT} rows out of {len(OOF)} OOF rows.")

    # the hold-out rows are the last HOLD_OUT rows of the third chunk file
    data = pd.read_feather(LOC3).iloc[-HOLD_OUT:]
    actual = data['answered_correctly']
    data.drop(['user_id', 'answered_correctly'], axis=1, inplace=True)

    # one column of predictions per model
    preds = np.zeros(shape=(len(actual), len(models)))

    print ("\nIndividual Model's OOF Scores:")
    for i, booster in enumerate(models):
        # each booster is fed exactly the columns it was trained on
        FEATURES = booster.feature_name()
        cat_cols = np.intersect1d(['tagF', 'tagS', 'tagT', 'tagL'], FEATURES)
        preds[:, i] = booster.predict(data[FEATURES], categorical_feature=cat_cols)
        print(f"\tModel #{i+1}'s Score: {roc_auc_score(actual, preds[:, i]):.4f}")

    # unweighted average across the models
    print (f"\nThe Ensemble's Score: {roc_auc_score(actual, preds.mean(1)):.4f}")

    # evaluation artefacts are not needed afterwards
    del data, actual, preds
    gc.collect()

# with iterative training the last model already contains all earlier rounds
if ITERATIVE_TRAIN:
    models = models[-1:]

# persist the remaining out-of-fold indices for later runs
np.save("OOF.npy", OOF)
print (f"\nNumber of Models: {len(models)}\n")

A custom class to make ensemble predictions:

In [None]:
class ensemble(object):
    '''Weighted-average ensemble over a list of already trained models.

    Parameters
    ----------
    models : sequence
        Trained models. With ``treelite=False`` each must offer
        ``feature_name()`` and ``predict(x, categorical_feature=...)``; with
        ``treelite=True`` each must offer ``predict(batch)``.
    weights : sequence of float, optional
        One weight per model. ``None`` (or an empty sequence) means equal
        weights of ``1 / n_models``.
    treelite : bool, default True
        Whether `models` are treelite predictors rather than LightGBM boosters.

    Raises
    ------
    ValueError
        If `weights` is given but its length differs from the number of models.
    '''

    def __init__(self, models, weights=None, treelite=True):
        self.n_models = len(models)
        self.models = models

        # `weights is None` instead of `not weights`: the truth value of a
        # multi-element numpy array is ambiguous and used to raise here
        if weights is None or len(weights) == 0:
            weights = [1 / self.n_models] * self.n_models

        # zip() in the predict methods would silently drop the surplus
        elif len(weights) != self.n_models:
            raise ValueError(
                f"expected {self.n_models} weight(s), got {len(weights)}")

        self.weights = weights
        self.cat_cols = ['tagF', 'tagS', 'tagL', 'tagT']
        self.treelite = treelite
        
        # specify feature order
        self.feats = ['repeat_c', 'tagF', 'tagS', 'tagL', 'tagT', 'response_time', 'prior_question_elapsed_time',
                       'up_mean', 'up_count', 'uq_per_hr', 'uwrong_sum', 'lec_recent', 'pqet_mean', 'seen_ratio',
                       'tmed', 'up_recency', 'ts_recency_10', 'ts_recency_5', 'timestamp', 'task_container_id',
                       'content_c', 'che_sum', 'seen_exp_when_wrong', 'seen_exp_when_right', 'sessions', 
                      'sess_event_count', 'pqetmr_10', 'uf_bundle', 'u_mean', 'ummr_10', 'rt_per_task', 'c_mean',
                       'c_std', 'h_mean', 'uph_mean', 'wrong_et_med', 'right_et_med']
        
        # every expected feature must be known to the (first) LightGBM model
        if not self.treelite:
            assert len(np.setdiff1d(self.feats, self.models[0].feature_name())) == 0
    
    def __getattr__(self, attr):
        '''Fall back to the first model for attributes the ensemble lacks.

        Solely to prevent errors from mistaken access in future. Delegated
        values such as feature importances describe the first model only and
        are not reliable for the ensemble!
        '''

        # Read `models` straight from the instance dict: going through
        # `self.models` on a half-built instance (as created by copy/pickle)
        # would re-enter __getattr__ and recurse without end.
        models = self.__dict__.get('models')
        if not models:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}")

        return getattr(models[0], attr)
    
    def feature_name(self):
        '''Return the feature names in the order the models expect them.'''
        return self.feats
    
    def predict(self, x, **kwargs):
        '''Return the weighted prediction for the rows of `x`.

        `x` is indexed with the feature list, so it must support column
        selection by name (e.g. a DataFrame). Extra keyword arguments are
        accepted for call compatibility and ignored.
        '''
        
        if not self.treelite:
            return self.predict_lgb(x[self.feats])
        
        else:
            return self.predict_treelite(x[self.feats].values)
        
    def predict_treelite(self, x):
        '''Weighted prediction using treelite predictors on a 2-D array `x`.'''
        
        # cast to treelite compatible batch
        # NOTE(review): `treelite_runtime` must already be imported in the kernel
        X = treelite_runtime.Batch.from_npy2d(x)
        
        pred = np.zeros(len(x))
        for m, w in zip(self.models, self.weights):
            pred += m.predict(X) * w
            
        return pred
        
    def predict_lgb(self, x):
        '''Weighted prediction using LightGBM boosters on feature frame `x`.'''
        
        pred = np.zeros(len(x))
        for m, w in zip(self.models, self.weights):
            pred += m.predict(x, categorical_feature=self.cat_cols) * w
            
        return pred

#### FTRL Model Training:
This part has been added from my other notebook. This part trains the FTRL model over the LGB ensemble. We feed in the LGB predictions. This is a bottleneck so we load the predictions we had precomputed:

In [None]:
%%time

import glob

# track time passing
start_time = time.time()

if not os.path.exists("../input/col-sampled-train-dataset/lgb_pred_1.npy"):

    # we take only a subset of the 8 models for faster approximation
    models = [models[3], models[6]]

    # Rows taken at a time for model predictions
    BATCH_SIZE = int(3.0e7)  

    CHUNKS = list(range(0, N_ROWS, BATCH_SIZE))

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Predictions will be chunked for", len(CHUNKS), "Chunks!")
    print (CHUNKS)

    for j, front in enumerate(CHUNKS):
        rear = front + BATCH_SIZE

        print (f"\nBeginning chunk {j+1}! Loading the Feather File to Memory => ")    
        print (f"Slicing from {front} to {rear}!")

        data = pd.DataFrame()
        for file, start, end in return_chunk_indices(front, rear, nrows=N_ROWS):

            file = f"../input/riiid-final-model-inputs/model_train_c{file}.feather"

            if not os.path.exists(file):
                continue

            data = pd.concat([data, pd.read_feather(file).iloc[start: end]])

        gc.collect()
        time.sleep(2)

        # rearange columns (only keep those needed)
        data = data[LGB_COLS]

        # compute len before we create treelite batch
        data_shape = data.shape

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |", end=' ')
        print (f"Done loading! Data Dimensions: {data_shape}")
        print ("Beginning logic for treelite batch dataset creation..")

        gc.collect()
        time.sleep(2)

        chunk_per = 0.025

        ## initialize an emtpy array to which we would be assigning
        darray = np.empty(shape=(data_shape[0], data_shape[1]), dtype='float64')
        step = int(data_shape[0] * chunk_per)
        end = (data_shape[0] // step) + 1

        for i in tqdm.tqdm(range(1, end + 1)):

            # assing to numpy slice and remove dataframe
            darray[(i - 1) * step: i * step] = data.iloc[:step]
            data = data.iloc[step:]

            # clear memory and let it go for a while unused
            gc.collect()
            time.sleep(3)

        del data
        gc.collect()
        time.sleep(3)

        # convert to tlite batch for prediction
        darray = treelite_runtime.Batch.from_npy2d(darray)

        # a simple buffer to prevent OOMs
        gc.collect()
        time.sleep(3)

        lgb_pred = np.zeros(shape=data_shape[0], dtype='float32')

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Making Predictions using Treelite models =>")

        for k, model in enumerate(models):

            lgb_pred = lgb_pred + model.predict(darray)

            print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done with model# {k + 1}")

        # compute mean 
        lgb_pred = lgb_pred / len(models)

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done with Chunk {j+1}! Saving predictions to Disk!")
        np.save(f"lgb_pred_{j+1}.npy", lgb_pred)

        del darray, lgb_pred
        gc.collect() 
        
else:
    print(f"Time Elapsed: {time.time() - start_time:10.2f} s | Copying LGB pred numpy arrays back to Disk!")
    for i in glob.glob("../input/col-sampled-train-dataset/lgb_pred_*"):
        ! cp {i} {i.split("/")[-1]}
    print(f"Time Elapsed: {time.time() - start_time:10.2f} s | Done copying to Disk!")

In [None]:
%%time

# track time passing
start_time = time.time()

if not os.path.exists("../input/col-sampled-train-dataset/lgb2_pred_1.npy"):

    # we take only a subset of the 8 models for faster approximation
    models = [models[1], models[4]]

    # Rows taken at a time for model predictions
    BATCH_SIZE = int(3.0e7)  

    CHUNKS = list(range(0, N_ROWS, BATCH_SIZE))

    print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Predictions will be chunked for", len(CHUNKS), "Chunks!")
    print (CHUNKS)

    for j, front in enumerate(CHUNKS):
        rear = front + BATCH_SIZE

        print (f"\nBeginning chunk {j+1}! Loading the Feather File to Memory => ")    
        print (f"Slicing from {front} to {rear}!")

        data = pd.DataFrame()
        for file, start, end in return_chunk_indices(front, rear, nrows=N_ROWS):

            file = f"../input/riiid-final-model-inputs/model_train_c{file}.feather"

            if not os.path.exists(file):
                continue

            data = pd.concat([data, pd.read_feather(file).iloc[start: end]])

        gc.collect()
        time.sleep(2)

        # rearange columns (only keep those needed)
        data = data[LGB_COLS]

        # compute len before we create treelite batch
        data_shape = data.shape

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s |", end=' ')
        print (f"Done loading! Data Dimensions: {data_shape}")
        print ("Beginning logic for treelite batch dataset creation..")

        gc.collect()
        time.sleep(2)

        chunk_per = 0.025

        ## initialize an emtpy array to which we would be assigning
        darray = np.empty(shape=(data_shape[0], data_shape[1]), dtype='float64')
        step = int(data_shape[0] * chunk_per)
        end = (data_shape[0] // step) + 1

        for i in tqdm.tqdm(range(1, end + 1)):

            # assing to numpy slice and remove dataframe
            darray[(i - 1) * step: i * step] = data.iloc[:step]
            data = data.iloc[step:]

            # clear memory and let it go for a while unused
            gc.collect()
            time.sleep(3)

        del data
        gc.collect()
        time.sleep(3)

        # convert to tlite batch for prediction
        darray = treelite_runtime.Batch.from_npy2d(darray)

        # a simple buffer to prevent OOMs
        gc.collect()
        time.sleep(3)

        lgb_pred = np.zeros(shape=data_shape[0], dtype='float32')

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Making Predictions using Treelite models =>")

        for k, model in enumerate(models):

            lgb_pred = lgb_pred + model.predict(darray)

            print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done with model# {k + 1}")

        # compute mean 
        lgb_pred = lgb_pred / len(models)

        print (f"Time Elapsed: {time.time() - start_time:10.2f} s | Done with Chunk {j+1}! Saving predictions to Disk!")
        np.save(f"lgb2_pred_{j+1}.npy", lgb_pred)

        del darray, lgb_pred
        gc.collect() 
        
else:
    print(f"Time Elapsed: {time.time() - start_time:10.2f} s | Copying LGB pred numpy arrays back to Disk!")
    for i in glob.glob("../input/col-sampled-train-dataset/lgb_pred_*"):
        ! cp {i} {i.split("/")[-1]}
    print(f"Time Elapsed: {time.time() - start_time:10.2f} s | Done copying to Disk!")

Creating dataset for FTRL training:

In [None]:
%%time

data = pd.read_feather(
    "../input/riiid-train-data-multiple-formats/riiid_train.feather", 
    columns=['content_id', 'content_type_id'])

data = data.loc[~data['content_type_id']].iloc[:N_ROWS]

data = data.merge(
    ques[['question_id', 'part']].set_index("question_id"), 
    left_on=['content_id'], right_index=True, how='left')

# drop unncessary columns
data.drop(['content_type_id'], axis=1, inplace=True)

# reset the indices
data.reset_index(drop=True, inplace=True)

# temp save to save memory
data.to_feather("./intermediate_train.feather")
del data
gc.collect()

data = pd.DataFrame()
for LOC in [LOC1, LOC2, LOC3]:
    
    data = pd.concat([
        data, 
        
        (pd.read_feather(LOC, columns=['user_id', 'task_container_id', 'content_c', 'c_mean', 'h_mean', 
                                    'answered_correctly', 'ummr_10']))
    ], axis=0)

# reset the indices to match
data.reset_index(drop=True, inplace=True)

# we create some meta features to help FTRL
data['ummr_10_50'] = data['ummr_10'] > 0
data['h_mean_50'] = data['h_mean'] > 0.5
data['c_mean_50'] = data['c_mean'] > 0.5
data['c_mean_25'] = data['c_mean'] > 0.25
data['c_mean_75'] = data['c_mean'] > 0.75

# delete those not needed
data.drop(['ummr_10', 'c_mean', 'h_mean'], axis=1, inplace=True)

# lets now concatenate both the frames
data = pd.concat([
    data, pd.read_feather("./intermediate_train.feather")
], axis=1)

# delete intermediate file
! rm ./intermediate_train.feather

lgb_pred = np.array([])

for file in sorted(glob.glob("../input/col-sampled-train-dataset/lgb_pred_*")):
    lgb_pred = np.concatenate([lgb_pred, np.load(file)])
    
lgb2_pred = np.array([])

for file in sorted(glob.glob("../input/col-sampled-train-dataset/lgb2_pred_*")):
    lgb2_pred = np.concatenate([lgb2_pred, np.load(file)])
    
assert len(lgb_pred) == len(lgb2_pred)

# we divide them by two 
lgb_pred = (lgb_pred + lgb2_pred) / 2

del lgb2_pred
gc.collect()

# save the loaded values to new column
data['lgb_pred'] = lgb_pred
data['lgb_75'] = data['lgb_pred'] > 0.75
data['lgb_50'] = data['lgb_pred'] > 0.5
data['lgb_25'] = data['lgb_pred'] > 0.25

del lgb_pred
gc.collect()

print (data.shape)
print (data.columns)

Unsigned integer columns (uint8, uint16, ...) don't work with datatable frames, so we need to cast them to signed types. Let's write a utility function for that:

In [None]:
def df_to_dt_format(df):
    '''Cast every unsigned-integer column of `df` to a signed integer type, in place.

    Each unsigned column is widened to the next larger signed type so that all
    values still fit: uint8 -> int16, uint16 -> int32, uint32 -> int64.
    uint64 has no wider signed counterpart and is cast to int64, which is only
    done when every value fits. All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; modified in place.

    Returns
    -------
    None

    Raises
    ------
    OverflowError
        If a uint64 column holds a value larger than the int64 maximum.
    '''

    int64_max = np.iinfo(np.int64).max

    for col in df.columns:
        dtype = df[col].dtype

        # only plain numpy unsigned integers (dtype kind 'u') need converting
        if not (isinstance(dtype, np.dtype) and dtype.kind == 'u'):
            continue

        # double the width, capped at 64 bits (there is no int128)
        bits = min(dtype.itemsize * 8 * 2, 64)

        # int() keeps the comparison exact instead of going through float64
        if dtype.itemsize == 8 and len(df[col]) and int(df[col].max()) > int64_max:
            raise OverflowError(
                f"column {col!r} holds uint64 values that do not fit into int64")

        df[col] = df[col].astype(f"int{bits}")

In [None]:
# cast the unsigned-integer columns in place (the function returns nothing)
df_to_dt_format(data)

# convert to datatable frame for training; from here on `data` is a
# dt.Frame and no longer a pandas DataFrame
data = dt.Frame(data)

In [None]:
# columns of `data` that are passed to the FTRL model as features
FTRL_COLS = [
    'user_id', 'task_container_id', 'content_c', 'ummr_10_50', 'h_mean_50', 
    'c_mean_50', 'c_mean_25', 'c_mean_75', 'content_id', 'part', 'lgb_pred', 'lgb_75',
    'lgb_50', 'lgb_25'
]

# passed as Ftrl(interactions=...); None means no feature interactions are requested
INTERACTIONS = None

Train FTRL Model. Notice the speed at which FTRL trains!

In [None]:
%%time

from datatable.models import Ftrl

# the first 9.5e7 rows are used for fitting, the remainder is kept for evaluation
train_rows = int(9.5e7)

ftrl = Ftrl(nepochs=1, interactions=INTERACTIONS, alpha=0.005, double_precision=True)

ftrl.fit(data[:train_rows, FTRL_COLS], data[:train_rows, ['answered_correctly']])

#### FTRL Evaluation:

Without online Learning:
1. With only the 4 model predictions:

In [None]:
%%time 

# rows from here on were not used to fit the FTRL model
valid_start = int(9.5e7)

# FTRL predictions and the true labels for the validation rows
preds = ftrl.predict(data[valid_start:, FTRL_COLS]).to_pandas()
actual = data[valid_start:, 'answered_correctly'].to_numpy()

# the plain LGB prediction is the baseline to beat
baseline_auc = roc_auc_score(actual, data[valid_start:, 'lgb_pred'].to_pandas())
print ("Baseline Score to beat: {:.4f}".format(baseline_auc))

# FTRL score
roc_auc_score(actual, preds)

2. With all 8 model predictions.

At model inference we need to run the model using all the LGB models we have trained. So lets do an evaluation that would mimic that scenario:

In [None]:
# wrap whatever is currently bound to `models` into an ensemble that predicts
# through LightGBM (not treelite)
# NOTE(review): `models` is rebound to a two-model subset by the prediction
# cells above if they actually ran - confirm it still holds all 8 models here
model = ensemble(models, treelite=False)
# show the ensemble object and the type of its first member
model, type(model.models[0])

In [None]:
final_pred_path = "../input/treelite-converted-ensemble-8-models-25m/lgb_pred_final.npy"

if os.path.exists(final_pred_path):
    # precomputed ensemble predictions are available
    lgb_pred = np.load(final_pred_path)

else:
    # predict the validation tail (all rows after the first 9.5e7) with the ensemble
    tail_rows = N_ROWS - int(9.5e7)
    temp = pd.read_feather(LOC3).iloc[-tail_rows:][model.feature_name()]
    lgb_pred = model.predict(temp)

    del temp
    gc.collect()

Our goal here is to make only the last piece of the data use all the LGB model predictions rather than just 4 of them:

In [None]:
valid_start = int(9.5e7)

# overwrite the LGB prediction of the validation rows with the ensemble output ...
data[valid_start:, 'lgb_pred'] = lgb_pred

# ... and refresh the thresholded flags derived from it
for flag_name, cutoff in (('lgb_75', 0.75), ('lgb_50', 0.50), ('lgb_25', 0.25)):
    data[valid_start:, flag_name] = data[valid_start:, dt.f.lgb_pred > cutoff]

# make our predictions
preds = ftrl.predict(data[valid_start:, FTRL_COLS])

# the plain LGB prediction is the baseline to beat
baseline_auc = roc_auc_score(actual, data[valid_start:, 'lgb_pred'].to_pandas())
print ("Baseline Score to beat: {:.4f}".format(baseline_auc))

# FTRL score
roc_auc_score(data[valid_start:, 'answered_correctly'].to_pandas(), preds)

We use FTRL for its ability to learn from test instances as well. So evaluation would be better performed if the model is trained and predicted batchwise. Let's do that:

In [None]:
from copy import deepcopy

# a save point: `ol_ftrl` is an independent copy, so the online updates applied
# to it later do not touch the already fitted `ftrl`
ol_ftrl = deepcopy(ftrl)

# reduce learning rate (applies to the copy only)
ol_ftrl.alpha = 0.005

In [None]:
%%time 

ol_preds = []
preds = []

# smaller batch sizes would give more closer approximation
BATCH_SIZE = 1000

for front in range(int(9.5e7), N_ROWS, BATCH_SIZE):
    pred = ol_ftrl.predict(data[front:front+BATCH_SIZE, FTRL_COLS]).to_list()[0]
    ol_preds.append(pred)
    
    pred = ftrl.predict(data[front:front+BATCH_SIZE, FTRL_COLS]).to_list()[0]
    preds.append(pred)
    
    # learn from the batches
    ol_ftrl.fit(data[front:front+BATCH_SIZE, FTRL_COLS], data[front:front+BATCH_SIZE, 'answered_correctly'])

In [None]:
# labels of the validation slice
actual = data[int(9.5e7):, 'answered_correctly'].to_numpy()

# Base line to beat: the raw LGB prediction
baseline_auc = roc_auc_score(actual, data[int(9.5e7):, 'lgb_pred'].to_pandas())
print ("Baseline Score to beat: {:.4f}".format(baseline_auc))

# per-batch prediction lists are flattened before scoring
online_auc = roc_auc_score(actual, np.concatenate(ol_preds))
offline_auc = roc_auc_score(actual, np.concatenate(preds))

print ("\nModel Score Comparison: Online: {:.4f} | Offline: {:.4f}".format(online_auc, offline_auc))

### Model State Generation:
Load the models (Choose between LGB or Treelite)

In [None]:
%%time 

TREELITE = False

if TREELITE:
    models = []
    for model in sorted(glob.glob("../input/treelite-converted-ensemble-8-models-25m/tl_*.so")):
        models.append(treelite_runtime.Predictor(model, verbose=False, nthread=1))
    
else:
    models = []
    for file in sorted(glob.glob("../input/riiid-final-model-inputs/trained_model_*.txt")):
        models.append(lgb.Booster(model_file=file))

model = ensemble(models, treelite=TREELITE)

model, model.n_models, type(model.models[0])

Generate the states for all the users and contents:

In [None]:
# set to false in case the dataset changes
# When True, the state cells below read previously saved files instead of
# rebuilding every table from the full training feather file.
load_from_file = True
# content-level columns that the content cell casts to uint8
cat_cols = ['tagF', 'tagS', 'tagT', 'tagL']

In [None]:
# Build (or load) `content_df`: one row per question content_id holding answer
# statistics, elapsed-time medians, explanation counts and question metadata.
start_time = time.time()

if not load_from_file:    
    
    pq_shifted = pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather", 
        columns=['user_id', 'task_container_id', 'content_type_id', 
                 'prior_question_elapsed_time', 'prior_question_had_explanation'])
    
    # question mask for future use
    q_mask = pq_shifted['content_type_id'] == 0
    
    # filter out the lectures
    pq_shifted = pq_shifted.loc[q_mask] 
    
    # type cast to save memory and space
    # (missing values are filled BEFORE the cast so they become 0)
    pq_shifted['prior_question_had_explanation'] = (
        pq_shifted['prior_question_had_explanation']
        .fillna(False).astype('uint8'))
    
    # we take only one value per task_container_id, user_id pair
    # (all rows of a container are collapsed with the mean)
    temp = (pq_shifted.groupby(['user_id', 'task_container_id'])
            [['prior_question_elapsed_time', 'prior_question_had_explanation']]
            .mean())
    
    # shift(-1) within each user: every container receives the values that were
    # recorded on that user's NEXT container; the last container of a user gets NaN
    temp = temp.groupby("user_id").shift(-1)
    temp.columns = ['pqet_shifted', 'pqhe_shifted']
    
    # drop, no longer useful
    pq_shifted.drop(
        ['content_type_id', 'prior_question_elapsed_time', 'prior_question_had_explanation'], 
        axis=1, inplace=True)
    
    gc.collect()
    
    # broadcast the per-container values back onto the individual question rows
    pq_shifted = pq_shifted.merge(temp, left_on=['user_id', 'task_container_id'], right_index=True, how='left')
    pq_shifted.drop(['user_id'], axis=1, inplace=True)
    
    # saving it for futher use down the line
    pq_shifted['pqet_shifted'] = pq_shifted['pqet_shifted'].astype('float32')
    pq_shifted['pqhe_shifted'] = pq_shifted['pqhe_shifted'].fillna(False).astype(bool)
    # reset_index() keeps the original row labels in an 'index' column, which the
    # user-state cell reads back to realign with the filtered training rows
    pq_shifted.reset_index().to_feather("./pq_shifted.feather")
    
    content_df = (pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['content_id', 'answered_correctly']).loc[q_mask])
    
    # both frames were filtered with the same q_mask, so the column-wise concat
    # lines up on the shared row labels
    content_df = pd.concat([content_df, pq_shifted], axis=1)
    
    del temp, pq_shifted
    gc.collect()
    
    # per-content MEDIAN of the shifted elapsed time, split by answer outcome
    et_med = content_df.groupby(["content_id", 'answered_correctly'])['pqet_shifted'].median()
    et_med = et_med.unstack()
    # assumes the unstacked columns are answered_correctly 0 then 1 — TODO confirm
    et_med.columns = ['wrong_et_med', 'right_et_med']
    
    # create the che_sum for all contents
    che_sum = content_df.groupby(['content_id'])['pqhe_shifted'].sum().rename("che_sum")
    
    # create the task_container_id median
    tmed = content_df.groupby('content_id')['task_container_id'].median().rename("tmed")

    # save the content_mean
    # (from here on content_df is the aggregated per-content table, not the row-level frame)
    content_df = (content_df.groupby('content_id')['answered_correctly'].agg(
        content_c='count', c_mean='mean', c_std='std'))
    
    # merge the response_times and c_mean
    content_df = pd.concat([content_df, et_med, tmed, che_sum], axis=1)
    
    del et_med, tmed, che_sum
    gc.collect()

    # save the categorical columns required
    # (`ql` is not defined in this cell; it must already exist in the notebook namespace)
    content_df = content_df.merge(
        ql.loc[~ql.content_type_id, ['content_id', *cat_cols, 'part', 'bundle_q_count', 'bundle_id']]
        .set_index("content_id"), on='content_id', how='left')
    
else:
    
    # previously saved table; the first CSV column is used as the index
    content_df = pd.read_csv("../input/riiid-final-model-inputs/content-df.csv", index_col=0)

# set size to minimum possible for saving space
# NOTE(review): the unsigned integer casts assume these columns hold no NaN — confirm
content_df[cat_cols + ['bundle_q_count', 'part']] = content_df[cat_cols + ['bundle_q_count', 'part']].astype('uint8')
content_df[['wrong_et_med', 'right_et_med']] = content_df[['wrong_et_med', 'right_et_med']].astype('float32')
content_df[['c_mean', 'c_std']] = content_df[['c_mean', 'c_std']].astype('float32')
content_df[['content_c', 'tmed', 'che_sum', 'bundle_id']] = (
    content_df[['content_c', 'tmed', 'che_sum', 'bundle_id']]).astype('uint16')

print (f"Time Elapsed: {(time.time() - start_time):6.2f} Sec")
print (f"Memory Usage: {content_df.memory_usage(deep=True).sum() / (2**20):6.2f}", 'MB')
print (f"Length      : {len(content_df):7}")

content_df.sample(5)

In [None]:
# Build (or load) `user_df`: one row per user holding the running counters that
# the end pipeline later updates online (timestamps, sessions, per-part sums, ...).
start_time = time.time()
LEC_RECENT_ROLL = 10
# 15 minutes, assuming timestamps are expressed in milliseconds — TODO confirm
SESSION_DURATION = 15 * 60 * 1000

if not load_from_file:

    user_df = (pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['user_id', 'answered_correctly', 'timestamp', 'content_id', 
                 'prior_question_elapsed_time', 'prior_question_had_explanation']))
    
    # merge with part to be able to gather part wise statistics
    user_df = user_df.merge(
        content_df[['part', 'c_mean', 'bundle_id']], left_on='content_id', 
        right_index=True, how='left')
    
    # create the uf_bundle feature (first non-null bundle_id seen per user)
    uf_bundle = user_df.groupby("user_id")['bundle_id'].first().rename("uf_bundle")
    user_df.drop(['bundle_id', 'content_id'], axis=1, inplace=True)
    
    # before fitering out the lectures, we need to calc the timestamp
    # based features, once done we filter them out
    
    # `rt_func` is defined elsewhere in the notebook; its output is compared
    # against SESSION_DURATION to flag rows that open a new session
    user_df['sessions'] = user_df.groupby("user_id")['timestamp'].transform(lambda x: rt_func(x.values))
    user_df['sessions'] = (user_df['sessions'] > SESSION_DURATION).astype('uint8')
    
    sess = pd.concat([
        # number of session starts per user
        user_df.groupby('user_id')['sessions'].sum().rename("sessions"),    
        # argmax on the reversed flags = number of rows after the last session start;
        # users without any flagged row get len - 1
        user_df.groupby('user_id')['sessions'].agg(
            lambda x: np.argmax(x.values[::-1]) if x.any() else len(x) - 1).rename("sess_event_count")
    ], axis=1)
    
    # latest timestamp per (user, part); parts a user never touched become 0
    user_ts = user_df.groupby(['user_id', 'part'])['timestamp'].max().unstack().fillna(0)
    user_ts.columns = [f'ts_max_{i}' for i in range(1, 8)]
    
    user_ts = pd.concat([
        user_df.groupby("user_id")['timestamp'].agg(ts_max='max', uic='count'), 
        user_ts
    ], axis=1)
    
    # drop sessions, no longer needed
    user_df.drop(['sessions', 'timestamp'], axis=1, inplace=True)
    
    # create the lec_recent_feature
    # (answered_correctly == -1 marks a lecture row; only the last LEC_RECENT_ROLL
    # rows of each user are inspected)
    lec_recent = user_df[['user_id', 'answered_correctly']].groupby("user_id").tail(LEC_RECENT_ROLL)
    lec_recent['content_type_id'] = lec_recent['answered_correctly'].map({-1: 1, 0: 0, 1: 0})
    lec_recent.drop("answered_correctly", axis=1, inplace=True)
    lec_recent = (
        lec_recent.groupby('user_id')['content_type_id']
        # 1-based position of the most recent lecture inside the window, 0 if none
        .apply(lambda x: np.where(x)[0][-1] + 1 if x.any() else 0)
        .rename("lec_recent")
    )
    
    ############ TIME STAMP BASED FEATURE GENERATION ENDS HERE ############
    ## filtering out the lectures
    user_df = user_df[user_df.answered_correctly != -1]
    
    # casting pqhe for successful aggregation
    # The missing values must be filled BEFORE the cast: casting first would turn
    # NaN into True (or fail on a nullable boolean column) and make the fill a no-op.
    # This matches the order used when building content_df.
    user_df['prior_question_had_explanation'] = user_df['prior_question_had_explanation'].fillna(False).astype(bool)
    
    # generate the pq features
    pq_sum = user_df.groupby("user_id")[['prior_question_had_explanation', 'prior_question_elapsed_time']].sum()
    
    # drop these features, no longer needed
    user_df.drop(['prior_question_had_explanation', 'prior_question_elapsed_time'], axis=1, inplace=True)
    gc.collect()
    
    # loading pqhe_shifted from save file, can concat only with filtered df
    # (the saved 'index' column carries the original row labels of the question rows)
    seen_exp = pd.read_feather("./pq_shifted.feather", columns=['index', 'pqhe_shifted']).set_index('index')
    user_df = pd.concat([user_df, seen_exp], axis=1)
    
    # creating seen_exp_when_* features
    seen_exp = user_df.groupby(['user_id', 'answered_correctly'])['pqhe_shifted'].sum().unstack()
    # assumes the unstacked columns are answered_correctly 0 then 1 — TODO confirm
    seen_exp.columns = ['seen_exp_when_wrong', 'seen_exp_when_right']
    seen_exp.fillna(0, inplace=True)
    user_df.drop(['pqhe_shifted'], axis=1, inplace=True)
    
    # calculate the sum of answered_correctly for h_mean later on, User Correct Sum
    u_crct_sum = user_df.groupby("user_id")['answered_correctly'].sum().rename("ucs")
    
    # calculate up_org_sum (unmodified part wise) for uph_mean later on
    up_org_sum = user_df.groupby(['user_id', 'part'])['answered_correctly'].agg('sum').unstack()
    up_org_sum = up_org_sum.astype("float32").fillna(0)
    
    # modify the answered_correctly (we are going to weigh it based on c_mean)
    # (`modify_ac` is defined elsewhere in the notebook)
    user_df['answered_correctly'] = modify_ac(user_df['answered_correctly'].values, user_df['c_mean'].values)
    
    # drop c_mean, no longer needed
    user_df.drop(["c_mean"], axis=1, inplace=True)
    gc.collect()
    
    # per (user, part) sum and count of the weighted answers, one column per part
    user_df = user_df.groupby(['user_id', 'part'])['answered_correctly'].agg(['sum', 'count']).unstack()
    
    # impute missing values before adding
    user_df = user_df.astype('float32').fillna(0)
    user_df['u_sum'] = user_df['sum'].sum(1)
    user_df['u_count'] = user_df['count'].sum(1)

    # concat ts and u related data
    user_df = pd.concat([
        user_ts, sess, lec_recent, up_org_sum, 
        user_df, uf_bundle, u_crct_sum, 
        pq_sum, seen_exp
    ], axis=1)
    
    del user_ts, pq_sum, u_crct_sum, lec_recent
    del uf_bundle, sess, seen_exp
    gc.collect()
    
    # rename the aggregated columns
    # (purely positional: the order must mirror the concat above)
    user_df.columns = (
        ['ts_max', 'uic'] + [f'ts_max_{i}' for i in range(1, 8)] + 
        ['sess_max', 'sess_event_sum'] + ['lec_recent'] +
        [f'ups{i}' for i in range(1, 8)] + 
        [f's{i}' for i in range(1, 8)] + 
        [f'c{i}' for i in range(1, 8)] + 
        ['us', 'uc'] + ['uf_bundle'] + 
        ['ucs'] + ['pqhe_sum', 'pqet_sum'] + 
        ['seen_exp_when_wrong', 'seen_exp_when_right'])
    
else:
    
    # previously saved table; the first CSV column (user_id) is used as the index
    user_df = pd.read_csv("../input/riiid-final-model-inputs/user-df.csv", index_col=0)

## type cast to save space
user_df['lec_recent'] = user_df['lec_recent'].astype('uint8')
user_df[[f'ups{i}' for i in range(1, 8)]] = user_df[[f'ups{i}' for i in range(1, 8)]].astype("uint16")
user_df[[f'c{i}' for i in range(1, 8)]] = user_df[[f'c{i}' for i in range(1, 8)]].astype('uint16')
user_df[['uic', 'uc', 'ucs', 'pqhe_sum']] = user_df[['uic', 'uc', 'ucs', 'pqhe_sum']].astype("uint16")
user_df[['sess_event_sum', 'sess_max']] = user_df[['sess_event_sum', 'sess_max']].astype("uint16")
user_df[['seen_exp_when_wrong', 'seen_exp_when_right', 'uf_bundle']] = (
    user_df[['seen_exp_when_wrong', 'seen_exp_when_right', 'uf_bundle']].astype("uint16"))

print (f"Time Elapsed: {(time.time() - start_time):7.2f} Sec")
print (f"Memory Usage: {user_df.memory_usage(deep=True).sum() / (2**20):7.2f}", 'MB')
print (f"Length      : {len(user_df):8}")

user_df.sample(5).T


Pandas apply would take a lot of time. So let's try to parallelize them using multiprocessing. Source: https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply

In [None]:
from multiprocessing import  Pool
from functools import partial
import sys

# length of every per-user bitarray: largest question_id plus one
max_q = ques.question_id.max() + 1

def to_ba(indices, max_q=max_q):
    """Return a bitarray of length `max_q` whose bits at `indices` are set."""
    flags = np.zeros(max_q, dtype=bool)
    flags[indices] = True
    return bitarray(flags.tolist())

def parallelize(data, func, num_of_processes=8):
    """Apply `func` to chunks of `data` in worker processes and concatenate the results.

    Parameters
    ----------
    data : pandas Series/DataFrame (or anything `np.array_split` accepts)
        Input that is split into `num_of_processes` nearly equal chunks.
    func : callable
        Picklable callable that receives one chunk and returns a pandas object.
    num_of_processes : int, default 8
        Number of chunks and of worker processes.

    Returns
    -------
    The `pd.concat` of the per-chunk results, in chunk order.
    """
    if hasattr(data, "iloc"):
        # Split plain row positions and slice with iloc: the chunk boundaries are
        # the ones np.array_split would produce, without routing the pandas
        # object through the deprecated `swapaxes` path.
        positions = np.array_split(np.arange(len(data)), num_of_processes)
        data_split = [data.iloc[chunk] for chunk in positions]
    else:
        data_split = np.array_split(data, num_of_processes)

    pool = Pool(num_of_processes)
    try:
        results = pool.map(func, data_split)
    finally:
        # always release the workers, also when `func` raises inside a worker
        pool.close()
        pool.join()

    return pd.concat(results)

def run_on_subset(func, data_subset):
    """Apply `func` to `data_subset` through its `.apply` method and return the result."""
    applied = data_subset.apply(func)
    return applied

def parallelize_on_rows(data, func, num_of_processes=8):
    """Run `data.apply(func)` chunk-wise in `num_of_processes` worker processes."""
    # bind `func` so that every worker only has to receive its chunk
    chunk_worker = partial(run_on_subset, func)
    return parallelize(data, chunk_worker, num_of_processes)

start_time = time.time()

if load_from_file:

    # reuse the pickled per-user bitarrays
    repeat_c = pd.read_pickle("../input/riiid-final-model-inputs/repeat-c.pkl")

else:

    repeat_c = pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['user_id', 'content_id', 'answered_correctly'])

    # drop the lecture rows, then collect the distinct question ids of every user
    repeat_c = (
        repeat_c.loc[repeat_c['answered_correctly'] != -1]
        .groupby("user_id")['content_id']
        .unique()
    )

    # one bitarray per user, built in parallel
    repeat_c = parallelize_on_rows(repeat_c, to_ba)

print (f"Time Elapsed: {(time.time() - start_time):6.2f} Sec")
print (f"Memory Usage: {sys.getsizeof(repeat_c) / (2 ** 20):6.2f} MB")
print (f"{len(repeat_c)} users have been extracted!")

repeat_c.sample(3)

In [None]:
start_time = time.time()

# alter this window to alter the data gen
ROLL_WINDOW = 10
UROLL_NULL_FILL = -2

if load_from_file:

    u_roll = pd.read_csv("../input/riiid-final-model-inputs/u_roll.csv", index_col=0)

else:

    u_roll = pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['user_id', 'answered_correctly', 'content_id'])

    # questions only, and only the last ROLL_WINDOW of them per user
    u_roll = u_roll[u_roll.answered_correctly != -1].groupby("user_id").tail(ROLL_WINDOW)

    # weigh the answers with the content mean
    u_roll = u_roll.merge(content_df['c_mean'], left_on='content_id', right_index=True, how='left')
    u_roll['answered_correctly'] = modify_ac(u_roll['answered_correctly'].values, u_roll['c_mean'].values)

    # helper columns are no longer needed
    u_roll = u_roll.drop(['c_mean', 'content_id'], axis=1)
    gc.collect()

    # new responses are appended on the right and the window shifts left,
    # so users with a short history are padded on the left
    u_roll = u_roll.groupby("user_id")['answered_correctly'].apply(
        lambda answers: np.concatenate(
            [np.repeat(UROLL_NULL_FILL, ROLL_WINDOW - len(answers)), answers.values])
    )

    # expand every array into one column per window position (csv friendly)
    u_roll = parallelize_on_rows(u_roll, pd.Series)
    gc.collect()

# perform a type conversion
u_roll = u_roll.astype("float32")

print (f"Time Elapsed: {(time.time() - start_time):7.2f} Sec")
print (f"Memory Usage: {u_roll.memory_usage(deep=True).sum() / (2**20):7.2f}", 'MB')
print (f"Length      : {len(u_roll):7}")

u_roll.sample(5)

In [None]:
start_time = time.time()

# alter this window to alter the data gen
ROLL_WINDOW_PQET = 10
PQET_ROLL_NULL_FILL = -1

if load_from_file:

    pqet_roll = pd.read_csv("../input/riiid-final-model-inputs/pqet_roll.csv", index_col=0)

else:

    pqet_roll = pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['user_id', 'prior_question_elapsed_time', 'answered_correctly'])

    # questions only; the answer column is just the lecture filter
    pqet_roll = pqet_roll[pqet_roll.answered_correctly != -1].drop(['answered_correctly'], axis=1)

    # only the last ROLL_WINDOW_PQET rows of every user matter
    pqet_roll = pqet_roll.groupby("user_id").tail(ROLL_WINDOW_PQET)

    # new values are appended on the right and the window shifts left,
    # so users with a short history are padded on the left
    pqet_roll = pqet_roll.groupby("user_id")['prior_question_elapsed_time'].apply(
        lambda elapsed: np.concatenate(
            [np.repeat(PQET_ROLL_NULL_FILL, ROLL_WINDOW_PQET - len(elapsed)), elapsed.values])
    )

    # expand every array into one column per window position (csv friendly)
    pqet_roll = parallelize_on_rows(pqet_roll, pd.Series)
    gc.collect()

    # fill the missing values (pqet first value for new user is always missing)
    pqet_roll = pqet_roll.fillna(PQET_ROLL_NULL_FILL)

# perform a type conversion
pqet_roll = pqet_roll.astype("float32")

print (f"Time Elapsed: {(time.time() - start_time):7.2f} Sec")
print (f"Memory Usage: {pqet_roll.memory_usage(deep=True).sum() / (2**20):7.2f}", 'MB')
print (f"Length      : {len(pqet_roll):7}")

pqet_roll.sample(5)

In [None]:
start_time = time.time()

# alter this window to alter the data gen
TS_RECENCY_PERIOD = 10
TS_ROLL_NULL_FILL = np.nan

if load_from_file:

    ts_roll = pd.read_csv("../input/riiid-final-model-inputs/ts_roll.csv", index_col=0)

else:

    ts_roll = pd.read_feather(
        "../input/riiid-train-data-multiple-formats/riiid_train.feather",
        columns=['user_id', 'timestamp'])

    # only the last TS_RECENCY_PERIOD rows of every user matter (lectures included)
    ts_roll = ts_roll.groupby("user_id").tail(TS_RECENCY_PERIOD)

    # new timestamps are appended on the right and the window shifts left,
    # so users with a short history are padded on the left
    ts_roll = ts_roll.groupby("user_id")['timestamp'].apply(
        lambda stamps: np.concatenate(
            [np.repeat(TS_ROLL_NULL_FILL, TS_RECENCY_PERIOD - len(stamps)), stamps.values])
    )

    # expand every array into one column per window position (csv friendly)
    ts_roll = parallelize_on_rows(ts_roll, pd.Series)
    gc.collect()

    # impute missing values with padding
    ts_roll = ts_roll.fillna(TS_ROLL_NULL_FILL)

print (f"Time Elapsed: {(time.time() - start_time):7.2f} Sec")
print (f"Memory Usage: {ts_roll.memory_usage(deep=True).sum() / (2**20):7.2f}", 'MB')
print (f"Length      : {len(ts_roll):7}")

ts_roll.sample(5)

Write them down to separate files for faster loading the next time around:

In [None]:
%%time 

# Persist every state table into the working directory so that a later run can
# load them instead of rebuilding them.
user_df.to_csv("user-df.csv")
content_df.to_csv("content-df.csv")
u_roll.to_csv("u_roll.csv")
# repeat_c holds bitarray objects, hence pickle instead of csv
repeat_c.to_pickle("repeat-c.pkl")
pqet_roll.to_csv("pqet_roll.csv")
ts_roll.to_csv("ts_roll.csv")

# carry the shifted prior-question file over when the input dataset provides it
if os.path.exists("../input/riiid-final-model-inputs/pq_shifted.feather"):
    ! cp ../input/riiid-final-model-inputs/pq_shifted.feather ./pq_shifted.feather
    
# `batches` and `pickle` are not defined in this cell; both must already exist
# in the notebook namespace
with open("sample-batches.pkl", 'wb') as f:
    pickle.dump(batches, f)

### The End Pipeline:

The sample test data given does not have lectures in it. This is the reason why *so many* end pipelines fail: they do not account for the presence of lectures during inference. So we define a simple function that randomly inserts a lecture. We do this only when we are in *Validation Mode*.

Our end pipeline has two modes:
1. Validation Mode: When we need to test if our end pipeline works as intended, since once `iter_test` has been exhausted it can no longer be used.

2. Prediction Mode: Whenever we need to submit the kernel for evaluation.

In [None]:
def insert_lecture(batches):
    """Return a deep copy of `batches` in which one random row is turned into a lecture.

    For debugging purposes only: a random batch `i` and a random row `j` inside it
    are picked, the row is rewritten as a lecture, and, when a following batch
    exists, entry `j` of its `prior_group_answers_correct` and
    `prior_group_responses` lists is set to -1.

    Parameters
    ----------
    batches : sequence
        Sequence whose items hold the batch DataFrame at position 0. The input
        is not modified.

    Returns
    -------
    The modified deep copy of `batches`.
    """
    import ast
    from copy import deepcopy
    
    temp = deepcopy(batches)
    
    i = np.random.choice(len(temp))
    j = np.random.choice(len(temp[i][0]))
    
    print (f"Lecture inserted at {i+1} batch at {j} index!")
    
    # assign the content as a lecture
    # NOTE(review): the columns are addressed by position; 3 is presumably content_id
    # and 6 / 7 the prior_question_* columns — confirm against the batch layout
    temp[i][0].iloc[j, 4] = 1 # content_type_id
    temp[i][0].iloc[j, 3] = np.random.choice(lectures['lecture_id'].values)
    temp[i][0].iloc[j, 6] = np.nan
    temp[i][0].iloc[j, 7] = np.nan
    
    if len(temp) > i + 1:
        # The first row of the next batch stores the previous group's answers as the
        # string form of a list. literal_eval parses such literals without executing
        # arbitrary code, which a bare eval would do.
        pgac = ast.literal_eval(temp[i + 1][0].iloc[0]['prior_group_answers_correct'])
        pgr = ast.literal_eval(temp[i + 1][0].iloc[0]['prior_group_responses'])
        pgac[j] = -1
        pgr[j] = -1
        
        # the two prior-group columns are expected to be the last two columns
        temp[i + 1][0].iloc[0, -2] = str(pgac)
        temp[i + 1][0].iloc[0, -1] = str(pgr)
        
    return temp

In [None]:
# set to False to bench mark pipeline speeds
SUBMIT = False

if not SUBMIT:
    print ("Validation Mode.")
    iter_test = iter(insert_lecture(batches))
    
    # list to hold the batch wise processed dataframes
    op = []
    
else:
    print ("Prediction Mode.")
    iter_test = env.iter_test()

In [None]:
%%time

i, prev_test = 0, tuple()

for i, batch in enumerate(iter_test):

    ## =========================================================== ##
    ## ====================== Online Learning ==================== ##
    ## =========================================================== ##

    if len(prev_test):

        # masks, u_missing, u_present, etc can be reused
        processed_batch = post_process(prev_test[0], batch[0])
        
        ftrl.fit(dt.Frame(prev_test[1])[:, FTRL_COLS], 
                  dt.Frame(processed_batch.loc[q_mask, 'answered_correctly']))

        for _, ts, user, content, ans, pqhe, pqet in (
            processed_batch[[
                'timestamp', 'user_id', 'content_id', 'answered_correctly', 
                'prior_question_had_explanation', 'prior_question_elapsed_time']].itertuples()):
            
            org_ans = ans
            
            if org_ans == -1:
                part = lectures.loc[lectures.lecture_id == content, 'part'].values[0]
                c_mean = None
                
            else:
                part = content_df.at[content, 'part']
                c_mean = content_df.at[content, 'c_mean']
                
                # adding weight to ans
                ans = (1 - c_mean) if ans == 1 else (- c_mean)
                
                # alter pqhe, pqet
                pqet = pqet if not pd.isna(pqet) else np.nan
                pqhe = int(pqhe) if not pd.isna(pqhe) else 0
                
            if user not in user_df.index:
                
                # first user interaction is always a lecture, else code couldnt have run
                # init with zeros or required padding
                user_df.loc[user] = [0] * user_df.shape[1]
                ts_roll.loc[user] = ([TS_ROLL_NULL_FILL] * (TS_RECENCY_PERIOD - 1)) + [ts]
                              
                repeat_c.at[user] = to_ba([content])
                u_roll.loc[user] = ([UROLL_NULL_FILL] * (ROLL_WINDOW - 1)) + [ans]
                
                pqet_roll.loc[user] = [PQET_ROLL_NULL_FILL] * ROLL_WINDOW_PQET

                # for first user save the bundle_id
                user_df.at[user, 'uf_bundle'] = content_df.at[content, 'bundle_id'] 
            
            else:
                
                ts_roll.loc[user] = ts_roll.loc[user].values.tolist()[1:] + [ts]
                
                if org_ans != -1:
                    u_roll.loc[user] = u_roll.loc[user].values.tolist()[1:] + [ans]
                    
                    temp = repeat_c.at[user]
                    temp[content] = True
                    repeat_c.at[user] = temp
                    
                    pqet_roll.loc[user] = pqet_roll.loc[user].values.tolist()[1:] + [pqet]
                    
            ## update these features regardless of content being a lecture or question                
            user_df.at[user, 'ts_max'] = ts
            user_df.at[user, f'ts_max_{int(part)}'] = ts
            user_df.at[user, 'uic'] = user_df.at[user, 'uic'] + 1
            
            temp = (ts - user_df.at[user, 'ts_max']) > SESSION_DURATION # True if it is a new session
            user_df.at[user, 'sess_max'] = (user_df.at[user, 'sess_max'] + 1 if temp else user_df.at[user, 'sess_max'])
            user_df.at[user, 'sess_event_sum'] = 0 if temp else user_df.at[user, 'sess_event_sum'] + 1
                
            if org_ans == -1: # a lecture
                
                user_df.at[user, 'lec_recent'] = LEC_RECENT_ROLL
                
            else: # not a lecture
                
                temp = user_df.at[user, 'lec_recent']
                user_df.at[user, 'lec_recent'] = (temp -1) if temp > 0 else 0
                
                user_df.at[user, 'uc'] = user_df.at[user, 'uc'] + 1
                user_df.at[user, 'us'] = user_df.at[user, 'us'] + ans

                user_df.at[user, f'c{part}'] = user_df.at[user, f'c{part}'] + 1
                user_df.at[user, f's{part}'] = user_df.at[user, f's{part}'] + ans

                user_df.at[user, f'ups{part}'] = user_df.at[user, f'ups{part}'] + org_ans
                user_df.at[user, 'ucs'] = user_df.at[user, 'ucs'] + org_ans
                
                user_df.at[user, 'pqhe_sum'] = user_df.at[user, 'pqhe_sum'] + pqhe
                
                # the complication is to replicate the expanding_mean of pandas
                if np.isnan(pqet): # first time
                    user_df.at[user, 'pqet_sum'] = np.nan
                    
                elif np.isnan(user_df.at[user, 'pqet_sum']): # when accessing the second time
                    user_df.at[user, 'pqet_sum'] = pqet
                    
                else: # after second time
                    user_df.at[user, 'pqet_sum'] = user_df.at[user, 'pqet_sum'] + pqet
                    
                if pqhe: 
                    if org_ans == 1: # right ans and seen exp
                        user_df.at[user, 'seen_exp_when_right'] = user_df.at[user, 'seen_exp_when_right'] + 1
                        
                    else: # if wrong ans and seen exp
                        user_df.at[user, 'seen_exp_when_wrong'] = user_df.at[user, 'seen_exp_when_wrong'] + 1

    ## =========================================================== ##
    ## =================== Make the predictions ================== ##
    ## =========================================================== ##
    
    # rows mask (lectures filtered out)
    q_mask = batch[0]['content_type_id'] == 0
    
    # we do a reset_index wthout drop, grp_num is saved
    pred = batch[0].loc[q_mask,  [
        'row_id', 'timestamp', 'task_container_id', 'content_id', 
        'user_id', 'prior_question_elapsed_time']].reset_index()
    
    # add all the required columns from content_df
    pred = pd.concat([
        pred, content_df.reindex(pred['content_id']).reset_index(drop=True)], 
        axis=1)
    
    # add u_mean_roll from u_roll
    pred[f'ummr_{ROLL_WINDOW}'] = u_roll.reindex(pred['user_id']).apply(
        lambda x: float(np.ma.masked_values(x, UROLL_NULL_FILL).mean()), axis=1).values
    
    # add u_mean_roll from u_roll
    pred[f'pqetmr_{ROLL_WINDOW_PQET}'] = pqet_roll.reindex(pred['user_id']).apply(
        lambda x: float(np.ma.masked_values(x, PQET_ROLL_NULL_FILL).mean()), axis=1).values
    
    pred['ts_recency_10'] = pred['timestamp'] - ts_roll.reindex(pred['user_id']).iloc[:, 0].values
    pred['ts_recency_5'] = pred['timestamp'] - ts_roll.reindex(pred['user_id']).iloc[:, 4].values
    
    # we save the user_df seperately to perform ops on them
    temp = user_df.reindex(pred['user_id']).reset_index(drop=True)

    # concat part with it for future calc
    temp = pd.concat([temp, pred['part']], axis=1)
    
    # up_mean, uq_count, uwrong_sum, upwrong_sum lec_recent
    temp['up_count'] = temp.apply(lambda x: x[f"c{int(x['part'])}"] , axis=1)
    temp['up_mean'] = temp.apply(lambda x: x[f"s{int(x['part'])}"] / x["up_count"], axis=1)
    temp['uph_mean'] = temp.apply(lambda x: x[f"ups{int(x['part'])}"] / x["up_count"], axis=1)
    temp['up_recency'] = temp.apply(lambda x: x[f"ts_max_{int(x['part'])}"], axis=1)
    
    temp['up_count'] = temp['up_count'] / temp['uc'].values
    temp['uwrong_sum'] = temp['uc'] - temp['ucs'].values
    
    #  Impute the missing values appropriately    
    ## could be na when den is 0 -> No questions attempted
    ## WE donot impute to replicate the training scenario
    temp['u_mean'] = (temp['us'] / temp['uc'].values)
    temp['h_mean'] = (temp['ucs'] / temp['uc'].values)
    
    #  Compute mean using the sum
    ## we do a -1 for pqet since we want to ignore nans, 
    ## similar to what expanding_mean in pandas does
    temp['pqet_mean'] = temp['pqet_sum'] / (temp['uc'].values - 1)
    temp['seen_ratio'] = temp['pqhe_sum'] / temp['uc'].values
    
    # type cast lec recent as False
    temp['lec_recent'] = temp['lec_recent'].fillna(0).astype(bool) # when 0, it will be False
    
    # retain only those user features needed
    temp = temp[['ts_max', 'up_count', 'up_mean', 'u_mean', 'uwrong_sum', 'lec_recent', 
                 'h_mean', 'pqet_mean', 'seen_ratio', 'uic', 'uph_mean', 'uf_bundle',
                'seen_exp_when_right', 'seen_exp_when_wrong', 'up_recency', 
                 'sess_max', 'sess_event_sum']]
    
    pred = pd.concat([pred, temp], axis=1).set_index("group_num")
    
    # adding the uq_per_hr feature
    pred['uq_per_hr'] = pred['uic'] / (pred['timestamp'] / 60000)
    
    # create the harmonic mean feature
    pred['h_mean'] = (2 * pred['h_mean'] * pred['c_mean']) / (pred['h_mean'] + pred['c_mean'])
    pred['uph_mean'] = (2 * pred['uph_mean'] * pred['c_mean']) / (pred['uph_mean'] + pred['c_mean'])
    
    #  now we add the repeat_c feature, we donot use merge    
    pred['repeat_c'] = repeat_c.reindex(pred['user_id']).values
    pred['repeat_c'] = (pred[['repeat_c', 'content_id']].apply(
        lambda x: x['repeat_c'][x['content_id']]  if type(x['repeat_c']) != float 
        else False, axis=1))
    
    pred['ts_max'] = pred['ts_max'].fillna(0)
    pred['up_recency'] = pred['timestamp'] - pred['up_recency']
    pred['response_time'] = pred['timestamp'] - pred['ts_max']
    pred['rt_per_task'] = pred['response_time'] / pred['bundle_q_count']
    
    pred['sessions'] = (pred['response_time'] > SESSION_DURATION).astype(bool)
    pred['sess_event_count'] = np.where(pred['sessions'], 0, pred['sess_event_sum'] + 1)
    pred['sessions'] = pred['sess_max'] + pred['sessions']
    
    # Impute the features for missing users as required
    pred['uf_bundle'] = pred['uf_bundle'].fillna(pred['bundle_id'])
    
    # TODO: Check if fillna works here as intended
    
    pred[['seen_exp_when_wrong', 'seen_exp_when_right', 'up_mean', 'up_count', 
          'up_recency', 'seen_ratio', 'rt_per_task', 'response_time', 'sessions', 
          'sess_event_count', 'uwrong_sum']] = (
        pred[['seen_exp_when_wrong', 'seen_exp_when_right', 'up_mean', 'up_count', 
              'up_recency', 'seen_ratio', 'rt_per_task', 'response_time', 'sessions', 
              'sess_event_count', 'uwrong_sum']].fillna(0))
    
    pred['lgb_pred'] = model.predict(pred)
    
    # save the loaded values to new column
    pred['lgb_75'] = pred['lgb_pred'] > 0.75
    pred['lgb_50'] = pred['lgb_pred'] > 0.5
    pred['lgb_25'] = pred['lgb_pred'] > 0.25

    # we create some meta features to help FTRL
    pred['ummr_10_50'] = pred['ummr_10'] > 0
    pred['h_mean_50'] = pred['h_mean'] > 0.5
    pred['c_mean_50'] = pred['c_mean'] > 0.5    
    pred['c_mean_25'] = pred['c_mean'] > 0.25
    pred['c_mean_75'] = pred['c_mean'] > 0.75
    
    pred['pqet_proximity'] = ((pred['prior_question_elapsed_time'] - pred['right_et_med']).abs() < 
                          (pred['prior_question_elapsed_time'] - pred['wrong_et_med']).abs())
    
    df_to_dt_format(pred) 
    pred['answered_correctly'] = ftrl.predict(dt.Frame(pred)[:, FTRL_COLS]).to_list()[0]

    if SUBMIT:
        env.predict(pred[['row_id', 'answered_correctly']])
        
    else:
        op.append(pred)

    # retaining current batch data for next batch
    prev_test = (batch[0].copy(deep=True), pred)

else:
    print (f"Sucessfully completed after {i + 1} iterations!")