# Riiid TFRecords
Creating TFRecord files for Riiid.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score

import time
import random

## Load and prepare data

In [None]:
# Columns read from train.csv, mapped to the dtype each one is parsed as.
# Not loaded: row_id, content_type_id, user_answer,
# prior_question_had_explanation.
dtype = {
    'answered_correctly': 'int8',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'task_container_id': 'int16',
    'prior_question_elapsed_time': 'float32',
}

# Columns read from questions.csv, mapped to their dtype.
# Not loaded: bundle_id, correct_answer, tags.
dtype_questions = {
    'question_id': 'int32',
    'part': 'int8',
}

In [None]:
# Read only the columns named in `dtype`, parsed with the dtypes given there.
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=list(dtype),
    dtype=dtype,
)

# Keep only the last 500 rows of each user (in file order).
train = train.groupby('user_id').tail(500)

In [None]:
# Read only the columns named in `dtype_questions`.
questions = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    usecols=list(dtype_questions),
    dtype=dtype_questions,
)

# The `tags` column is not loaded, so the tag encoding stays disabled:
#tag_map = {tag : i + 1 for i, tag in enumerate(questions.tags.unique())}
#questions['tags'] = questions['tags'].map(tag_map)

In [None]:
%%time
# Pre-processing: every feature is turned into an integer token and shifted
# up by one, which leaves 0 unused by real rows (0 is the padding value).

# Drop lecture rows, i.e. those with answered_correctly == -1.
train = train[train['answered_correctly'] != -1]

# Attach the question columns (`part`); `content_id` is matched against the
# row index of `questions`.
train = train.join(questions, on='content_id')

# prior_question_elapsed_time: missing values become 0, then the column is
# quantile-binned (at most 150 bins, duplicate edges dropped) and shifted by 1.
train['prior_question_elapsed_time'], bins = pd.qcut(
    train['prior_question_elapsed_time'].fillna(0),
    150,
    labels=False,
    retbins=True,
    duplicates='drop',
)
train['prior_question_elapsed_time'] += 1
bins[-1] = float('inf')  # make the last bin edge open-ended

# lagtime (disabled)
#train['lagtime'] = train['timestamp'] - train.groupby('user_id')['timestamp'].shift(1, fill_value = 0.0)
#train.lagtime, bins_lt = pd.qcut(train.lagtime, 250, labels = False, retbins = True, duplicates = 'drop')
#train.lagtime = train.lagtime + 1
#bins_lt[-1] = float('inf')

# timestamp: quantile-binned (at most 250 bins) and shifted by 1.
train['timestamp'], bins_ts = pd.qcut(
    train['timestamp'],
    250,
    labels=False,
    retbins=True,
    duplicates='drop',
)
train['timestamp'] += 1
bins_ts[-1] = float('inf')  # make the last bin edge open-ended

# Ids are shifted by 1 as well.
train['content_id'] += 1
train['task_container_id'] += 1

# answered_correctly after the shift: 1 = incorrect, 2 = correct.
train['answered_correctly'] += 1

In [None]:
window_size = 100
PADDING_TOKEN = 0

# Feature columns: model inputs first, then the targets.
x_cols = ['content_id', 'part', 'task_container_id']
y_cols = ['answered_correctly', 'prior_question_elapsed_time', 'timestamp']
cols = [*x_cols, *y_cols]

# Vocabulary sizes: largest token value plus one (content ids were shifted by
# one after loading, hence the + 2 on the raw question ids).
content_id_size = questions.question_id.max() + 2
prior_q_size = train.prior_question_elapsed_time.max() + 1
#tags_size = train.tags.max() + 1
timestamp_size = train.timestamp.max() + 1
part_size = train.part.max() + 1
container_size = 10001

# Plain Python ints, in the same order as x_cols / y_cols.
input_vocab_sizes = list(map(int, (content_id_size, part_size, container_size)))
target_vocab_sizes = list(map(int, (3, prior_q_size, timestamp_size)))

In [None]:
# Show the input and target vocabulary sizes, one list per line.
print(input_vocab_sizes, target_vocab_sizes, sep='\n')

In [None]:
def strided_window(a, w, s, ret_weights = False):
    '''
    Applies a rolling window to the array which moves with a given stride length.

    a : 2d np.array of shape (m, n)
    w : window length int
    s : stride length int
    ret_weights : whether to also return the sample weights of each row

    Returns: array (n_windows, w, n) and, when ret_weights is True,
    weights (n_windows, w, 1) as float32.

    Window i covers rows i*s ... i*s + w - 1, and there are
    n_windows = ceil((m-w+1)/s) windows. Trailing rows that do not fit into
    a full window are not part of any window.

    Sample weights:
    Weight of each row is 1/k where k is the number of windows the row
    appears in, so the weights of every covered row sum to 1 across all
    windows. Rows that appear in no window have weight 0 (they are never
    returned anyway).

    Both results are strided views (no copy is made): the array view shares
    memory with `a`, so do not write into it.
    '''
    s0, s1 = a.strides  # bytes to advance one row / one column in a
    m, n = a.shape
    n_windows = int(np.ceil((m - w + 1) / s))

    windowed_arr = np.lib.stride_tricks.as_strided(
        a,
        shape = (n_windows, w, n),
        strides = (s * s0, s0, s1)
    )

    if not ret_weights:
        return windowed_arr

    # Row r belongs to window i iff i*s <= r <= i*s + w - 1, i.e.
    # ceil((r-w+1)/s) <= i <= r//s, clipped to the valid range [0, n_windows-1].
    rows = np.arange(m)
    first = np.maximum(-((w - 1 - rows) // s), 0)  # -(x // s) == ceil(-x / s)
    last = np.minimum(rows // s, n_windows - 1)
    counts = np.maximum(last - first + 1, 0)

    all_weights = np.zeros((m, 1), np.float32)
    covered = counts > 0
    all_weights[covered, 0] = 1 / counts[covered]

    ws0, ws1 = all_weights.strides

    windowed_weights = np.lib.stride_tricks.as_strided(
        all_weights,
        shape = (n_windows, w, 1),
        strides = (s * ws0, ws0, ws1)
    )

    return windowed_arr, windowed_weights

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    '''
    Data generator that turns per-user interaction histories into batches of
    fixed-length windows.

    A subset of the users is selected, each user's rows are front-padded to at
    least window_size, cut into strided windows, concatenated and shuffled.
    Batches are served as (inputs, targets, sample weights).

    regenerate() must be called once before the generator is indexed, and can
    be called again to switch to another subset of users.
    '''

    def __init__(self, data, x_cols, y_cols, window_size, subset_size = 0.2,
                 stride = 10, batch_size = 32, pad_token = 0):
        '''
        data : full dataset (DataFrame with a user_id column)
        x_cols : cols to use for input
        y_cols : cols to use for targets
        window_size : length of history to use
        subset_size : proportion of users to use for training data
        stride : stride length for windowing
        batch_size : batch size for training
        pad_token : value to pad with
        '''
        super().__init__()

        # unique() returns a fresh array, so shuffling it in place is safe
        all_users = data.user_id.unique()
        random.shuffle(all_users)

        self.data = data
        self.all_users = all_users
        self.n_users = len(all_users)
        self.x_cols = x_cols
        self.y_cols = y_cols
        self.window_size = window_size
        self.stride = stride
        self.pad_token = pad_token
        self.subset_size = subset_size
        self.batch_size = batch_size

        # number of update_users() calls so far; drives the cycling offset
        self.calls = 0

    def update_users(self):
        '''
        Selects users to use by cycling through the (shuffled) user list
        in chunks of size subset_size * n_users.
        '''
        start = int(self.n_users * ((self.subset_size * self.calls) % 1))
        end = int(min(start + self.n_users * self.subset_size, self.n_users))

        self.users = self.all_users[start:end]
        self.calls += 1

    def random_update_users(self):
        '''
        Randomly selects a subset of distinct users
        of size = subset_size * n_users (capped at n_users).
        '''
        size = min(int(self.n_users * self.subset_size), self.n_users)
        # draw actual user ids (not positions), without repeats
        self.users = np.random.choice(self.all_users, size, replace = False)

    def set_batch_size(self, batch_size):
        '''Changes the batch size used by __len__ and __getitem__.'''
        self.batch_size = batch_size

    def regenerate(self, random_choice = False):
        '''
        Picks a subset of users and builds X_train, Y_train and weights from
        it. Must be called before the generator can be used.

        random_choice : draw the users at random instead of cycling
        '''
        if random_choice:
            self.random_update_users()
        else:
            self.update_users()

        self.X_train, self.Y_train, self.weights = self.generate(self.users)

    def __len__(self):
        '''Number of full batches; a trailing partial batch is not served.'''
        return self.X_train.shape[0]//self.batch_size

    def __getitem__(self, idx):
        '''Returns batch idx as (X, Y, weights).'''
        idx_l = idx * self.batch_size
        idx_u = (idx + 1) * self.batch_size
        return self.X_train[idx_l:idx_u,:,:], self.Y_train[idx_l:idx_u], self.weights[idx_l:idx_u]

    def _pad_front(self, arr):
        '''Front-pads a (rows, features) array with pad_token up to window_size rows.'''
        missing = self.window_size - arr.shape[0]
        if missing <= 0:
            return arr
        return np.pad(arr, ((missing, 0), (0, 0)), constant_values = self.pad_token)

    def generate(self, users):
        '''
        Creates the windowed datasets for the given users.

        Returns X (n_samples, window_size, len(x_cols)),
        Y (n_samples, window_size, len(y_cols)) and
        weights (n_samples, window_size, 1), shuffled consistently.
        '''
        #get the features and targets as list of 2d np arrays (rows, features) per user
        group = self.data.loc[self.data.user_id.isin(users)].groupby('user_id')
        X = group[self.x_cols].aggregate(list).apply(lambda x: np.vstack(x).T, axis = 1).tolist()
        Y = group[self.y_cols].aggregate(list).apply(lambda x: np.vstack(x).T, axis = 1).tolist()

        #pad so length at least window_size
        X = [self._pad_front(x) for x in X]
        Y = [self._pad_front(y) for y in Y]

        #apply windowing; the weights only depend on the number of rows, so
        #they are computed once, alongside X
        X = [strided_window(x, self.window_size, self.stride, True) for x in X]
        X, ws = zip(*X) #unpack weights
        Y = [strided_window(y, self.window_size, self.stride) for y in Y]

        #concatenate (this also copies the strided views into real arrays)
        X = np.concatenate(X, axis = 0) # (n_samples, window_size, features_x)
        Y = np.concatenate(Y, axis = 0) # (n_samples, window_size, features_y)
        ws = np.concatenate(ws, axis = 0) # (n_samples, window_size, 1)

        #shuffle the three arrays with the same permutation
        X, Y, ws = shuffle(X, Y, ws)

        return X, Y, ws

In [None]:
class DataSampler(tf.keras.utils.Sequence):
    '''
    Sample users based on how much history they have. Select rows at random from each.

    NOTE(review): the class derives from Sequence but defines neither __len__
    nor __getitem__; in this file it is only used through sample_data().
    '''

    def __init__(self, data, cols, N, window):
        '''
        data : full dataset (DataFrame with a user_id column)
        cols : cols to use for model
        N : number of windows to sample per call of sample_data
        window : length of each returned window
        '''
        super().__init__()

        user_lengths = data.groupby('user_id').size() # number of rows for each user
        users = user_lengths.index.to_numpy() # user id_s
        self.user_lengths = user_lengths
        # probability of drawing a user is proportional to its number of rows
        self.user_dist = (user_lengths / user_lengths.sum()).to_numpy()
        self.users = users

        self.data = data
        self.cols = cols
        self.N = N
        self.w = window

    def sample_data(self):
        '''
        Draws N windows from the data.

        N users are drawn with replacement (weighted by history length). For
        each draw a random end position in that user's history is chosen and
        the up to `window` rows before it are taken, front-padded with 0.

        Returns: array (N, window, len(cols))
        '''

        # choose users (the same user can be drawn several times)
        users = np.random.choice(self.users, size = (self.N,), p = self.user_dist)

        # subset data and split it by user in a single pass
        data = self.data.loc[self.data.user_id.isin(users)]
        per_user = {user: frame[self.cols].to_numpy()
                    for user, frame in data.groupby('user_id', sort = False)}

        # one (rows, n_feats) array per draw
        X = [per_user[user] for user in users]

        # end position (exclusive) of each window: 1 ... number of rows, so
        # the final row of a user can be included and single-row users work
        lengths = np.array([x.shape[0] for x in X]) # (N,)
        rows = np.random.randint(1, high = lengths + 1)

        # extract rows and pad
        X = [user_data[max(r - self.w, 0):r] for r, user_data in zip(rows, X)]
        X = [np.pad(x, ((self.w - x.shape[0],0),(0,0))) if x.shape[0] < self.w else x for x in X]
        X = np.array(X) # (N, window, n_feats)

        return X

In [None]:
#save users for validation: 10% of the distinct users, drawn without
#replacement so that no user id is picked twice
val_users = np.random.choice(train.user_id.unique(), int(0.1 * len(train.user_id.unique())),
                             replace = False)

In [None]:
# Samplers: one over the training users, one over the held-out users.
# N is the number of windows drawn per sample_data() call.
N = 20000

train_sampler = DataSampler(
    data=train[~train.user_id.isin(val_users)],
    cols=cols,
    N=N,
    window=window_size,
)
val_sampler = DataSampler(
    data=train[train.user_id.isin(val_users)],
    cols=cols,
    N=N,
    window=window_size,
)

In [None]:
# Drop this reference to the full frame; the two samplers keep their own
# filtered subsets. Presumably done to reduce memory use.
train = None

## Write TFRecords
Write the dataset into a TFRecord file.

In [None]:
def write_tfrec(X, filename):
    '''
    Writes every window of X as one tf.train.Example into a TFRecord file.

    X : integer array (n_samples, window, 6); the last axis holds, in order,
        content, part, container, correct, prior_q, timestamp
    filename : path of the TFRecord file to create

    Each feature is stored as an int64 list of length `window`.
    '''

    def _int_feature(array):
        return tf.train.Feature(int64_list = tf.train.Int64List(value = array))

    # feature name of each column of the last axis of X
    names = ('content', 'part', 'container', 'correct', 'prior_q', 'timestamp')

    # the context manager flushes and closes the file, even on error
    with tf.io.TFRecordWriter(filename) as writer:
        for x in X:
            feature = {name : _int_feature(x[:,col]) for col, name in enumerate(names)}
            features = tf.train.Features(feature = feature)
            example = tf.train.Example(features = features)
            writer.write(example.SerializeToString())

In [None]:
# Training shards: 60 files, each holding one fresh draw from the train sampler.
for shard in range(60):
    filename = 'riiid_train_{}.tfrecords'.format(shard)
    print(shard)
    write_tfrec(train_sampler.sample_data(), filename)

In [None]:
# Validation shards: 12 files, each holding one fresh draw from the val sampler.
for shard in range(12):
    filename = 'riiid_val_{}.tfrecords'.format(shard)
    print(shard)
    write_tfrec(val_sampler.sample_data(), filename)

## Read TFRecords
How to read the TFRecord files into tensors to input to the model.

In [None]:
# Only the 60 training shards written above exist (riiid_train_0 ... 59);
# listing more makes the dataset fail on the first missing file. Keep this
# count in sync with the number of shards written.
train_files = ['riiid_train_{}.tfrecords'.format(i) for i in range(60)]
dataset = tf.data.TFRecordDataset(train_files)
# length of every stored feature; must equal the window size used when writing
maxlen = window_size

In [None]:
def _decode_features(example):
    '''
    Parses one serialized example into an (x, y) pair of int64 tensors.

    x : (maxlen, 3) stack of content, part, container
    y : (maxlen, 3) stack of correct, prior_q, timestamp
    '''
    input_keys = ('content', 'part', 'container')
    target_keys = ('correct', 'prior_q', 'timestamp')

    # every feature is a fixed-length int64 vector of length maxlen
    schema = {key : tf.io.FixedLenFeature((maxlen,), tf.int64)
              for key in input_keys + target_keys}

    parsed = tf.io.parse_single_example(example, schema)

    x = tf.stack([parsed[key] for key in input_keys], axis = 1)
    y = tf.stack([parsed[key] for key in target_keys], axis = 1)

    return x, y

In [None]:
# Turn every serialized record into an (x, y) pair of tensors.
dataset = dataset.map(_decode_features)