In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Ignore every FutureWarning raised from here on.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import LabelEncoder,StandardScaler
import dill

### Track everything:

1. In this notebook, I first show how to pick a subset of users from the training dataset so that the positive and negative classes are balanced.
2. I use an autoencoder to compress the tags into a single column.
3. I preprocess the selected subset and engineer features on it; this subset is what I use later for training.
4. I show how to track the last state of every user for each feature, so that it can be stored in a database.

We start from the dataset converted to **.feather** format; the conversion is done in this notebook: https://www.kaggle.com/tchaye59/riiid-dataset-to-feather

In [None]:
# Seed NumPy and TensorFlow with the same fixed value.
np.random.seed(seed=123)
tf.random.set_seed(seed=123)
# Feather file holding the training set (read with pd.read_feather below).
path_train = '/kaggle/input/riiid-dataset-to-feather/train.feather'

In [None]:
#From https://www.kaggle.com/rohanrao/ashrae-half-and-half
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
def reduce_mem_usage(df, use_float16=False,verbose=True):
    """
    Downcast the columns of ``df`` in place to smaller dtypes and return ``df``.

    Per column:
      * datetime and categorical columns are left untouched;
      * object / string columns are converted to ``category``;
      * boolean columns are left untouched (they were previously cast to
        float32 because their dtype name does not start with "int");
      * signed and unsigned integer columns are downcast to the smallest
        integer type of the same signedness whose range contains
        [min, max] (bounds inclusive);
      * float columns become float32 when their range allows it (float16
        when ``use_float16`` is set and the range allows it), else float64;
      * any other non-numeric dtype is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    use_float16 : bool, default False
        Allow float16 for float columns whose values fit.
    verbose : bool, default True
        Print memory usage before/after and the relative decrease.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    ptypes = pd.api.types
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if ptypes.is_datetime64_any_dtype(col_type) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object or ptypes.is_string_dtype(col_type):
            df[col] = df[col].astype("category")
            continue

        # Booleans already take one byte; non-numeric leftovers cannot be range-checked.
        if ptypes.is_bool_dtype(col_type) or not ptypes.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if ptypes.is_integer_dtype(col_type):
            # Empty or all-missing columns have no usable range, and nullable
            # integers holding missing values cannot be cast to a NumPy integer type.
            if pd.isna(c_min) or pd.isna(c_max) or df[col].isna().any():
                continue
            lo, hi = int(c_min), int(c_max)
            family = unsigned_types if ptypes.is_unsigned_integer_dtype(col_type) else signed_types
            for candidate in family:
                info = np.iinfo(candidate)
                if info.min <= lo and hi <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
%%time
# Name of the label column.
target = 'answered_correctly'
# Load the whole training set from the feather file.
train_df = pd.read_feather(path_train)

In [None]:
def get_splits(df,n=10,n_max=1000000):
    """
    Group users into splits whose correct / incorrect answer counts cancel out.

    Each pass computes, per user, ``class_diff = #correct - #incorrect``.
    Users with ``class_diff <= 0`` are collected until the running total drops
    below ``-n_max``; users with ``class_diff > 0`` are then added until the
    running total is back to ``>= 0``. The chosen users are removed and the
    process repeats on the rest.

    Rows with a negative ``answered_correctly`` (lectures are stored as -1)
    are ignored, so that they neither lower the "correct" sum nor inflate the
    row count used to derive the "incorrect" count.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``answered_correctly``.
    n : int, default 10
        Maximum number of splits to return.
    n_max : int, default 1000000
        Size control: how far below zero the running difference may go while
        collecting negative-difference users.

    Returns
    -------
    list[list]
        One list of user ids per split. When a pass can only pick a single
        user, every remaining user is put in a final split, which is therefore
        not necessarily balanced.
    """
    res = []
    # Only real answers take part in the balance computation.
    df = df.loc[df.answered_correctly >= 0, ['user_id','answered_correctly']]
    while df.shape[0]:
        stats = df.groupby('user_id')['answered_correctly'].agg(['sum','count'])
        stats.columns = ['class1','class_count']
        stats['class2'] = stats.class_count - stats.class1
        # The difference between positive and negative class counts
        stats['class_diff'] = stats.class1 - stats.class2
        stats = stats.reset_index()

        # Separate users by the sign of their difference. The order doesn't matter,
        # but priority is given to small negative users and big positive users.
        negative_df = stats[stats.class_diff <= 0].sort_values(by='class_count',ascending=True)
        positive_df = stats[stats.class_diff > 0].sort_values(by='class_count',ascending=False)

        s = 0
        user_ids = []

        # Collect users from the negative differences
        for user_id, diff in zip(negative_df.user_id.tolist(), negative_df.class_diff.tolist()):
            user_ids.append(user_id)
            s += diff
            if s < -n_max:
                break

        # 's' is now <= 0: collect users from the positive differences to rebalance
        for user_id, diff in zip(positive_df.user_id.tolist(), positive_df.class_diff.tolist()):
            user_ids.append(user_id)
            s += diff
            if s >= 0:
                break

        # A single pick means nothing is left to pair up: dump everybody that
        # remains into a last split and stop.
        if len(user_ids) == 1:
            user_ids = positive_df.user_id.tolist()
            user_ids.extend(negative_df.user_id.tolist())
            res.append(user_ids)
            break

        res.append(user_ids)
        if len(res) >= n:
            break
        # Remove the selected users from df and repeat the process
        df = df[~df.user_id.isin(user_ids)]
    return res
        

**get_splits** returns a list of splits, each one being a list of user_ids. The two classes are balanced within every split, except possibly the last one.

In [None]:
%%time
splits = get_splits(train_df[['user_id','answered_correctly']],n=10)
[len(s) for s in splits if s]

In [None]:
# let's look at the totals of first split:
# number of rows per `answered_correctly` value among the users of splits[0]
train_df.loc[train_df.user_id.isin(splits[0])].groupby('answered_correctly').size()#It can be perfect but here we are

In [None]:
# Drop the full training frame and run the garbage collector;
# later cells re-read the data from `path_train` when they need it.
del train_df
gc.collect()

## Reduce tags using AutoEncoder

In [None]:
# Question metadata. `question_id` is renamed to `content_id`, the column name
# on which `process` merges this frame.
content_df = (
    pd.read_csv("../input/riiid-test-answer-prediction/questions.csv")
      .rename(columns={'question_id': 'content_id'})
)

In [None]:
def tags_fn(x):
    """
    Normalise one raw `tags` value into a sorted list of tag numbers.

    * a string is split on whitespace and every token converted to int;
    * a single number (Python or NumPy scalar) becomes a one-element list;
    * any other value is treated as an iterable of numbers.

    NaN entries are dropped. They are detected with ``y == y`` because NaN
    never compares equal to anything, which is why the former
    ``y != float('nan')`` test kept every value.
    """
    if isinstance(x, str):
        x = [int(i) for i in x.split()]
    elif isinstance(x, (int, float, np.integer, np.floating)):
        x = [x]
    return sorted(y for y in x if y == y)
# One sorted tag list per question; missing tags are replaced by the string '0' before parsing.
tags = [tags_fn(raw) for raw in content_df.tags.fillna('0').values]
# Length of the longest non-empty tag list.
max_tags = max(len(seq) for seq in tags if seq)
# Pad every list to a common length (pad_sequences defaults).
tags = tf.keras.preprocessing.sequence.pad_sequences(tags)
# Standardise each padded position.
tags = StandardScaler().fit_transform(tags)
max_tags

In [None]:
# Encoder: max_tags inputs -> linear Dense(max_tags) -> 4 -> 2 -> 1 (tanh), i.e. one value per question.
encoder = keras.models.Sequential(
            [
                keras.layers.Dense(max_tags, input_shape=(max_tags,)),
                keras.layers.Dense(4, activation="tanh"),
                keras.layers.Dense(2, activation="tanh"),
                keras.layers.Dense(1, activation="tanh"),

            ]
)
# Decoder: mirrors the encoder, 1 -> 2 -> 4 (tanh) -> linear Dense(max_tags).
decoder = keras.models.Sequential(
            [
                keras.layers.Dense(1, activation="tanh", input_shape=(1,)),
                keras.layers.Dense(2, activation="tanh"),
                keras.layers.Dense(4, activation="tanh"),
                keras.layers.Dense(max_tags),
            ]
)
# Full model: reconstruct the scaled tag vector through the 1-unit bottleneck.
autoencoder = keras.models.Sequential(
            [encoder,decoder]
)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
autoencoder.compile(loss=keras.losses.mse,
                    optimizer=keras.optimizers.Adam(learning_rate=0.00001))

In [None]:
# Avoid training the AE every time we commit the notebook:
# copy a previously saved model into the working directory, where the next cell looks for it.
# NOTE(review): if the source file is absent `cp` only prints an error, and the next cell trains from scratch.
! cp /kaggle/input/riiid-preprocess-and-balance-the-dataset/autoencoder.h5 ./

In [None]:
%%time
# Train and save the autoencoder only when no saved model is present in the working directory;
# otherwise replace the freshly built model with the saved one.
if not os.path.exists('./autoencoder.h5'):
    autoencoder.fit(tags,tags,
                    shuffle=True,
                    batch_size=256,
                    epochs=30000)
    autoencoder.save('autoencoder.h5')
else:
    autoencoder = keras.models.load_model('autoencoder.h5')
# Re-bind the two halves from whichever model is now in use (layer 0 = encoder, layer 1 = decoder).
encoder = autoencoder.layers[0]
decoder = autoencoder.layers[1]
autoencoder.summary()

In [None]:
# Reconstruction loss of the autoencoder on the tag matrix it is fitted on.
autoencoder.evaluate(tags,tags)

In [None]:
# Replace the raw `tags` column by the encoder output, stored as `tag`.
content_df['tag'] = encoder(tags).numpy()
content_df = content_df.drop(columns='tags')

#### Add some features to content_df

In [None]:
# bundle_size: number of questions sharing the row's bundle_id
content_df['bundle_size'] = content_df.groupby('bundle_id')['content_id'].transform('count')

# part_size: number of questions in the row's part
content_df['part_size'] = content_df.groupby('part')['content_id'].transform('count')

# part_bundle_size: number of distinct bundles in the row's part
content_df['part_bundle_size'] = content_df.groupby('part')['bundle_id'].transform('nunique')

# Fill any missing value with 0, then shrink the dtypes.
content_df = reduce_mem_usage(content_df.fillna(0))
gc.collect()

### Utility functions

In [None]:
# `metadata` is a dictionary mapping a key (a tuple of column names) to a DataFrame holding the last known state for that key, e.g. the last state of each user.

In [None]:
def metadata_append(metadata,df,key):
    """
    Store ``df`` under ``key`` in ``metadata``, appending its rows when the key exists.

    Parameters
    ----------
    metadata : dict
        Maps a tuple of column names to a DataFrame; updated in place.
    df : pandas.DataFrame
        Rows to add. If every name in ``key`` is a column of ``df``, those
        columns become the index of the stored frame; otherwise ``df`` is
        stored as it is (it is expected to be indexed already). The caller's
        frame itself is not modified.
    key : str or sequence of str
        Column name(s); ``tuple(key)`` is the dictionary key.
    """
    key = [key] if isinstance(key, str) else list(key)
    key_h = tuple(key)

    if all(k in df.columns for k in key):
        df = df.set_index(key)
    if key_h not in metadata:
        metadata[key_h] = df
    else:
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        metadata[key_h] = pd.concat([metadata[key_h], df])

def metadata_add_append(metadata,df,key):
    """
    Store ``df`` under ``key`` in ``metadata``, or add it to the frame already stored.

    Unlike ``metadata_append``, rows that already exist are not duplicated:
    the stored frame and ``df`` are summed element-wise, with 0 used for
    entries missing on either side.

    When every name in ``key`` is a column of ``df``, ``df`` is re-indexed on
    those columns in place before being stored or added.
    """
    key_cols = [key] if type(key) == str else key
    key_h = tuple(key_cols)

    absent = [k for k in key_cols if k not in df.columns]
    if not absent:
        df.set_index(key_cols, inplace=True)

    if key_h in metadata:
        metadata[key_h] = metadata[key_h].add(df, fill_value=0)
    else:
        metadata[key_h] = df

In [None]:
def user_roll_features(df,cols):
    """
    Add, for every column in ``cols``, per-(user, value) running statistics to ``df``.

    For each ``col`` the following columns are written to ``df`` (in place):

    * ``{col}_roll_count`` / ``{col}_roll_sum``: number of rows and sum of
      ``answered_correctly`` seen so far for the (``col``, ``user_id``) pair;
    * ``{col}_win{W}_count`` / ``{col}_win{W}_sum`` for W in (50, 10): the same
      quantities, counted from the last row of the previous block of W rows
      (block index = running count // W).

    To avoid leaking the current answer, the values stored on a row are those
    reached at the end of the previous ``task_container_id`` of the same
    (``user_id``, ``col``) pair, and 0 for the first one. The shift is done
    per (``user_id``, ``col``) group so that rows of different users sharing
    the same ``col`` value never see each other's state, whatever the row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id``, ``answered_correctly``, ``task_container_id`` and
        every column named in ``cols``.
    cols : iterable of str
        Columns to build the statistics for.

    Returns
    -------
    (pandas.DataFrame, dict)
        ``df`` with the new columns, and a dict mapping ``('user_id', col)``
        to the un-shifted statistics of the last row of each (user, value)
        pair, indexed by ``user_id`` and ``col``.
    """
    metadata = {}
    for col in cols:
        # Cols to build
        c1 = f'{col}_roll_count'
        c2 = f'{col}_roll_sum'

        tmp_df = df[[col,'user_id','answered_correctly']].copy()
        grp = tmp_df.groupby([col,'user_id'])
        tmp_df[c1] = grp.answered_correctly.cumcount()+1
        tmp_df[c2] = grp.answered_correctly.cumsum()

        del tmp_df['answered_correctly']

        win_cols = []
        # In windows
        for win in [50,10]:
            sum_col = f'{col}_win{win}_sum'
            count_col = f'{col}_win{win}_count'
            win_id = f'win{win}_id'
            win_cols.extend([sum_col,count_col])

            tmp_df[sum_col] = tmp_df[c2]
            tmp_df[count_col] = tmp_df[c1]
            tmp_df[win_id] = tmp_df[count_col] // win

            # Look up the last row of the previous block (win_id - 1) of the same pair;
            # rows without a previous block get 0.
            tmp = tmp_df[['user_id',col]].copy()
            tmp[win_id]  = (tmp_df[win_id]-1).values
            tmp = tmp.merge(tmp_df.groupby(['user_id',col,win_id]).tail(1),how='left',on=[win_id,'user_id',col])
            tmp.fillna(0,inplace=True)

            # Running totals minus the totals at the end of the previous block.
            tmp_df[sum_col] = tmp_df[sum_col].values - tmp[sum_col].values
            tmp_df[count_col] = tmp_df[count_col].values - tmp[count_col].values

        feature_cols = [*win_cols,c1,c2]

        # save metadata: un-shifted state at the last row of every (user, col) pair
        grp = tmp_df.groupby(['user_id',col]).tail(1)
        metadata[('user_id',col)] = grp.set_index(['user_id',col])[[c1,c2,*win_cols]]

        # Now we can shift generated cols to prevent leaking some information:
        # keep the state at the end of each container, then give every container
        # the state of the previous container of the same (user, col) pair.
        tmp_df['task_container_id'] = df.task_container_id.values
        tmp = tmp_df.groupby(['user_id',col,'task_container_id']).tail(1).copy()
        shifted = tmp.groupby(['user_id',col])[feature_cols].shift(1,fill_value=0)
        for c in feature_cols:
            tmp[c] = shifted[c]

        # add to df
        k = [col,'user_id','task_container_id']
        tmp_df = tmp_df[k].merge(tmp,how='left',on=k)
        for c in feature_cols:
            df[c] = tmp_df[c].values
    return df,metadata

In [None]:
def process(data,metadata=None):
    """
    Build the training features for ``data`` and update the last-state ``metadata``.

    Steps, in order:

    1. merge the global ``content_df`` on ``content_id``;
    2. per-user running sum/count of ``answered_correctly`` and rolling
       sum/count over the last 50 and 10 rows, taken at the end of each
       ``task_container_id`` and shifted by one container so that a row only
       sees earlier containers;
    3. ``prev_user_answer``, ``prev_answered_correctly``, ``prev_timestamp``
       (previous row of the same user, -1 when there is none);
    4. ``prior_duration``: timestamp minus the user's previous timestamp
       (0 is used as previous timestamp for the first row);
    5. per-(user, content) and per-(user, bundle) features from
       ``user_roll_features``, plus ``prev_content_answered_correctly``
       (the user's previous answer on the same content, -1 when there is none);
    6. global count/sum of ``answered_correctly`` per content and per bundle,
       accumulated into ``metadata``.

    ``content_type_id`` is dropped and remaining missing values are filled with 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction rows. Columns read here: ``user_id``, ``content_id``,
        ``task_container_id``, ``answered_correctly``, ``user_answer``,
        ``timestamp``, ``content_type_id`` (plus whatever
        ``user_roll_features`` needs).
    metadata : dict, optional
        Dictionary updated through ``metadata_append`` /
        ``metadata_add_append``. A new dictionary is created when omitted
        (the former ``{}`` default was one object shared by all calls).

    Returns
    -------
    (pandas.DataFrame, dict)
        The feature frame and the updated ``metadata``.
    """
    global content_df
    if metadata is None:
        metadata = {}

    # Merge content file
    data = data.merge(content_df,how='left',on=['content_id',])

    ### Build User Specific Statistics

    # User performances over time
    tmp_df = data[['user_id','answered_correctly','task_container_id']].copy()
    grp = tmp_df.groupby(['user_id'])
    tmp_df['user_id_roll_sum'] = grp['answered_correctly'].cumsum().values
    tmp_df['user_id_roll_count'] = (grp['answered_correctly'].cumcount()+1).values
    # also check user's performance in small windows
    for win in [50,10]:
        sum_col = f'user_id_win{win}_sum'
        count_col = f'user_id_win{win}_count'
        rolling = grp['answered_correctly'].rolling(win,min_periods=1)
        # The grouped rolling result is ordered by user; dropping the group level
        # restores the original row labels so the assignment aligns by index
        # instead of relying on `data` being sorted by user_id.
        tmp_df[sum_col] = rolling.sum().reset_index(level=0,drop=True)
        tmp_df[count_col] = rolling.count().reset_index(level=0,drop=True)
    del tmp_df['answered_correctly']

    # We only need the last container's state
    tmp_df = tmp_df.groupby(['user_id','task_container_id']).tail(1).copy()
    grp = tmp_df.groupby(['user_id',])
    # shift to avoid leakage
    tmp_df['user_id_roll_count'] = grp.user_id_roll_count.shift(1,fill_value=0)
    tmp_df['user_id_roll_sum'] = grp.user_id_roll_sum.shift(1,fill_value=0)
    for win in [50,10]:
        sum_col = f'user_id_win{win}_sum'
        count_col = f'user_id_win{win}_count'
        tmp_df[sum_col] = grp[sum_col].shift(1,fill_value=0)
        tmp_df[count_col] = grp[count_col].shift(1,fill_value=0)
    del grp
    # merge with data
    data = data.merge(tmp_df,how='left',on=['user_id','task_container_id'])

    # Track last state
    # NOTE(review): this is taken after the shift, so the stored state is the one
    # before the user's last container — confirm this is what the consumer expects.
    user_metadata_df = tmp_df.groupby(['user_id']).tail(1).copy()
    del user_metadata_df['task_container_id']

    # Shift these columns to prevent leakage
    tmp_df = data[['user_answer','answered_correctly','user_id','timestamp']] .groupby('user_id').shift(1).fillna(-1)
    for col in tmp_df.columns:
        data[f'prev_{col}'] = tmp_df[col].values
    tmp_df = data[['user_id',*[f'prev_{col}' for col in tmp_df.columns]]].groupby('user_id').tail(1)
    user_metadata_df = user_metadata_df.merge(tmp_df,how='left',on=['user_id',])
    metadata_append(metadata,user_metadata_df,'user_id')# save last state
    del user_metadata_df

    # Time elapsed since the user's previous row
    tmp_df = data[['user_id','timestamp']]
    data['prior_duration'] = tmp_df.timestamp.values-tmp_df.groupby('user_id').timestamp.shift(1,fill_value=0).values


    # Add some rolling features
    cols = ['content_id','bundle_id']
    data,metadata_tmp = user_roll_features(data,cols)
    # Update metadata
    for key in metadata_tmp:
        tmp = metadata_tmp[key]
        if key == ('user_id','content_id'):
            # It is very likely that a user answers correctly a question
            # they already answered correctly before.
            # The user's previous answer on the same content; grouping by both
            # content and user keeps other users' answers out, and the first
            # occurrence (state unknown) is set to -1.
            tmp_df = data[['content_id','user_id','answered_correctly',]].copy()
            grp =  tmp_df.groupby(['content_id','user_id'])
            tmp_df['prev_content_answered_correctly'] = grp.answered_correctly.shift(1).fillna(-1)
            data['prev_content_answered_correctly'] = tmp_df.prev_content_answered_correctly.values
            # metadata: value held by the last row of each (content, user) pair
            tmp_df = tmp_df[['content_id','user_id','prev_content_answered_correctly',]]
            tmp_df = tmp_df.groupby(['content_id','user_id',]).tail(1)
            tmp_df = tmp_df.set_index(['user_id','content_id'])
            tmp = tmp.merge(tmp_df,how='left',on=['user_id','content_id'])
            del grp,tmp_df
        metadata_append(metadata,tmp,key)

    print("Global statistics")
    # Global statistics
    # content
    tmp_df = data[['content_id','answered_correctly']].groupby(['content_id'])
    tmp_df = tmp_df.agg({'answered_correctly': ['count','sum']})
    tmp_df.columns = ['content_count','content_sum']
    metadata_add_append(metadata,tmp_df,'content_id')

    # bundle
    tmp_df = data[['bundle_id','answered_correctly']]
    tmp_df = tmp_df.groupby(['bundle_id']).agg({'answered_correctly':['count','sum']})
    tmp_df.columns = ['bundle_count','bundle_sum']
    metadata_add_append(metadata,tmp_df,'bundle_id')

    #no need content_type_id
    del data['content_type_id']
    data.fillna(0,inplace=True)
    return data,metadata

## Process the training dataset

In [None]:
# Ignore the last split
# NOTE(review): this rebinds `splits`, so running the cell again drops one more split each time.
# NOTE(review): when get_splits stops because it reached `n` splits, the last one is a regular
# split rather than the left-over one — confirm dropping it is still intended in that case.
splits = splits[:-1]

In [None]:
%%time
metadata = {}
for i,ids in enumerate(splits):
    print(f"Split--> {i+1}")
    
    df = pd.read_feather(path_train,)
    # exclude lectures
    df = df[df.answered_correctly!=-1]
    df.prior_question_had_explanation.fillna(False,inplace=True)
    df.replace([np.inf, -np.inf], 999999999,inplace=True)

    data = df.loc[df.user_id.isin(ids)]
    assert data.shape[0]
    del df
    gc.collect()
    
    print(f"Processing {i+1} | Shape {data.shape}")
    data,metadata = process(data,metadata)
    print(f"Metadata {i+1}")
    for key in metadata:
        metadata[key].fillna(0,inplace=True)
        metadata[key] =  reduce_mem_usage(metadata[key],verbose=False)
    data = reduce_mem_usage(data)
    data.prior_question_had_explanation  = data.prior_question_had_explanation.values.astype(np.int8)  
    data.to_feather(f'train{i}.feather')
    del data
    gc.collect()
del splits

## Collect metadata of remaining users

In [None]:
%%time
df = pd.read_feather(path_train,)
# exclude lectures
df = df[df.answered_correctly!=-1]
df = df.loc[~df.user_id.isin(metadata[('user_id',)].index.values)]
df.prior_question_had_explanation.fillna(False,inplace=True)
df.replace([np.inf, -np.inf], 999999,inplace=True)

user_ids = df['user_id'].unique()

In [None]:
%%time
step = 10000
for i in range(0,user_ids.shape[0],step):
    users = user_ids[i:min(i+step,user_ids.shape[0])]
    selector = df.user_id.isin(users)
    data = df.loc[selector]
    df = df.loc[~selector]
    gc.collect()
    
    print(f"Processing {i}/{user_ids.shape[0]} | Shape {data.shape}")
    _,metadata = process(data,metadata,)
    print(f"Metadata {i}/{user_ids.shape[0]}")
    for key in metadata:
        metadata[key].fillna(0,inplace=True)
        metadata[key] =  reduce_mem_usage(metadata[key],verbose=False)
    del data
    gc.collect()    

We save the metadata below; the migration to a SQLite database is done in: https://www.kaggle.com/tchaye59/riiid-metadata-to-sqlite


To see how I train and submit my model you can check: https://www.kaggle.com/tchaye59/riiid-work-with-the-full-state-using-sqlalchemy

In [None]:
# Serialise the metadata dictionary with dill.
with open('metadata.dill', 'wb') as fh:
    dill.dump(metadata, fh)
# Save the enriched question table next to it.
content_df.to_feather('content.feather')