# Work in progress
Data dumped using the CV strategy by Tito and Marisaka Mozz (`cv1_train` file) using multiprocessing. The iterator is `itertuples()` instead of the usual `iterrows()`.


Reference:
- https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering
- https://www.kaggle.com/ragnar123/riiid-model-lgbm
- https://www.kaggle.com/ceshine/values-to-numpy-vs-itertuples-vs-iterrows

In [None]:
import pandas as pd
import numpy as np
import gc
from collections import defaultdict
from contextlib import contextmanager
import psutil
import math
from time import time
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pickle
import random
import os
import sys
from utils import *

# Parallelization
We first check whether `multiprocessing` recognizes all 4 Xeon CPU threads available on the system, not just the 2 physical cores.

In [None]:
def get_memory(num_var=10):
    '''
    Print the `num_var` largest module-level variables, biggest first.

    Sizes come from `sys.getsizeof`, so they are shallow: the contents of
    containers are not followed.
    '''
    sizes = [(var_name, sys.getsizeof(obj)) for var_name, obj in globals().items()]
    sizes.sort(key=lambda pair: -pair[1])
    for var_name, n_bytes in sizes[:num_var]:
        label = color(f"{var_name:>30}:", color=Colors.green)
        amount = color(f"{get_size(n_bytes):>8}", color=Colors.magenta)
        print(label, amount)

# NOTE(review): get_system is not defined in this notebook; presumably it comes
# from `utils` and reports the machine's CPU/memory -- confirm.
get_system()

In [None]:
import multiprocessing as mp
# Number of CPUs reported by the system; used later to size the worker pool.
num_cpu = mp.cpu_count()
print(f"Total number of CPU threads: {num_cpu}")

In [None]:
# DEBUG selects the small sample (first 1,000,000 rows) when the data is loaded;
# otherwise the last 20,000,000 rows are used.
DEBUG = True
# NOTE(review): FOLD is not referenced anywhere else in the visible notebook.
FOLD = 1

In [None]:
SEED = 1127 # my boy's bday just for luck


def seed_everything(seed):
    '''
    Seed Python's `random` and NumPy's global generator with `seed`, then
    record the seed in the PYTHONHASHSEED environment variable.
    '''
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(SEED)

# Loading train

In [None]:
train_parquet = '../input/cv-strategy-in-the-kaggle-environment/cv1_train.parquet'
question_file = '../input/riiid-test-answer-prediction/questions.csv'

# Columns kept from the training parquet, and the compact dtype used for each
features = ['timestamp', 
           'user_id', 
           'answered_correctly',
           'content_id', 
           'content_type_id', 
           'prior_question_elapsed_time', 
           'prior_question_had_explanation']
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'answered_correctly': 'int8', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}


with timer("Loading train and valid."):
    df = pd.read_parquet(train_parquet)[features].astype(train_dtypes)


# Keep only a slice of the training data to avoid RAM problems
if DEBUG:
    df = df.iloc[:1_000_000]
else:
    df = df.iloc[-20_000_000:]

# Filter by content_type_id to discard lectures (the column is int8, so
# compare against 0 rather than False); the index is reset so that it can be
# used directly as the position in the per-row feature arrays.
df = df.loc[df.content_type_id == 0].reset_index(drop = True)

# Changing dtype to avoid lightgbm error
df['prior_question_had_explanation'] = (
    df.prior_question_had_explanation.fillna(False).astype('int8')
)

# Fill prior question elapsed time with the mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[...]: that chained in-place call operates on a temporary Series and
# leaves `df` untouched when pandas Copy-on-Write is active.
prior_question_elapsed_time_mean = \
df['prior_question_elapsed_time'].dropna().mean()
df['prior_question_elapsed_time'] = (
    df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
)

# Merge with question dataframe
questions_df = pd.read_csv(question_file)
questions_df['part'] = questions_df['part'].astype(np.int32)
questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)

# Left join keeps every interaction row and attaches the question's `part`
df = pd.merge(df, questions_df[['question_id', 'part']], 
                 left_on = 'content_id', right_on = 'question_id', how = 'left')

# Feature engineering
This is the old feature generation function, with some modifications: the `if` checks are removed to reduce overhead, and the function is un-encapsulated to make multiprocessing easier to code. The new `add_features` function is applied to each row of the iterator.

Below is the new iterator; we first check what a single row looks like.

In [None]:
# Build a row iterator over the columns used by the feature pass and print the
# first tuple: element 0 is the DataFrame index, the rest are the column values.
iters = df[['user_id',
          'answered_correctly', 
          'content_id', 
          'prior_question_elapsed_time', 
          'prior_question_had_explanation',
          'timestamp']].itertuples()

for row in iters:
    print(row, '\n')
    print(row[0], '    ', row[1:])
    break  # only the first row is inspected

In the original function, `num` was the index obtained from `enumerate`; here, after resetting the index, the DataFrame index itself can be used directly.

In [None]:
# Per-row feature pass driven by running per-user / per-question state
def add_features(row):
    '''
    Fill the feature arrays for one interaction, then fold that interaction
    into the running state.

    `row` is one tuple from `itertuples()`:
    row[0]: DataFrame index (position written in the feature arrays),
    row[1]: 'user_id',
    row[2]: 'answered_correctly',
    row[3]: 'content_id',
    row[4]: 'prior_question_elapsed_time',
    row[5]: 'prior_question_had_explanation',
    row[6]: 'timestamp'

    Every feature is computed from the state as it was *before* this row, and
    only afterwards are the module-level dictionaries updated. Returns None.
    '''
    num = row[0]
    user_id, answered, content_id, elapsed, explained, ts = row[1:7]

    # ------------------------------------------------------------------
    # User features: running averages over the user's earlier interactions
    n_seen_u = answered_correctly_u_count[user_id]
    if n_seen_u != 0:
        answered_correctly_u_avg[num] = answered_correctly_u_sum[user_id] / n_seen_u
        elapsed_time_u_avg[num] = elapsed_time_u_sum[user_id] / n_seen_u
        explanation_u_avg[num] = explanation_u_sum[user_id] / n_seen_u
    else:
        answered_correctly_u_avg[num] = np.nan
        elapsed_time_u_avg[num] = np.nan
        explanation_u_avg[num] = np.nan

    # Time since the last / second-to-last / third-to-last interaction.
    # The history holds at most three timestamps, oldest first; a longer list
    # leaves the three recency slots untouched.
    history = timestamp_u[user_id]
    n_prev = len(history)
    if n_prev <= 3:
        recency_slots = (timestamp_u_recency_1,
                         timestamp_u_recency_2,
                         timestamp_u_recency_3)
        for back, slot in enumerate(recency_slots, start=1):
            slot[num] = ts - history[-back] if back <= n_prev else np.nan

    # Time since the user's most recent incorrect answer
    last_wrong = timestamp_u_incorrect[user_id]
    timestamp_u_incorrect_recency[num] = ts - last_wrong[0] if last_wrong else np.nan

    # ------------------------------------------------------------------
    # Question features: running averages over earlier answers to it
    n_seen_q = answered_correctly_q_count[content_id]
    if n_seen_q != 0:
        answered_correctly_q_avg[num] = answered_correctly_q_sum[content_id] / n_seen_q
        elapsed_time_q_avg[num] = elapsed_time_q_sum[content_id] / n_seen_q
        explanation_q_avg[num] = explanation_q_sum[content_id] / n_seen_q
    else:
        answered_correctly_q_avg[num] = np.nan
        elapsed_time_q_avg[num] = np.nan
        explanation_q_avg[num] = np.nan

    # ------------------------------------------------------------------
    # User x question: how many times this user already saw this question
    seen_by_user = answered_correctly_uq[user_id]
    answered_correctly_uq_count[num] = seen_by_user[content_id]

    # ------------------------------------------------------------------
    # State updates that do not depend on the answer
    answered_correctly_u_count[user_id] += 1
    elapsed_time_u_sum[user_id] += elapsed
    explanation_u_sum[user_id] += int(explained)

    if n_prev == 3:
        history.pop(0)  # drop the oldest so that only three are kept
    history.append(ts)

    answered_correctly_q_count[content_id] += 1
    elapsed_time_q_sum[content_id] += elapsed
    explanation_q_sum[content_id] += int(explained)

    seen_by_user[content_id] += 1

    # ------------------------------------------------------------------
    # State updates that depend on the answer
    answered_correctly_u_sum[user_id] += answered
    if answered == 0:
        if len(last_wrong) == 1:
            last_wrong.pop(0)  # only the latest incorrect timestamp is kept
        last_wrong.append(ts)

    answered_correctly_q_sum[content_id] += answered

In [None]:
# Group the interactions by user and inspect the first (key, frame) pair.
user_iters = df.groupby("user_id")
for group in user_iters:
    print(group[0], type(group[1])) # group[0] is the user_id
    break  # `group` stays bound to the first pair for the cells below

In [None]:
# Display the second element of the (key, frame) pair bound to `group`.
group[1] # group[1] is the user's DF

In [None]:
# First index label of this user's frame.
group[1].index[0] # global index corresponding to the original df

In [None]:
# Iterate the user's frame with itertuples() and print the index carried by
# the first tuple (element 0 of each tuple is the row's index label).
for row in group[1][['user_id',
                  'answered_correctly', 
                  'content_id', 
                  'prior_question_elapsed_time', 
                  'prior_question_had_explanation',
                  'timestamp']].itertuples():
    print(row[0]) # index
    break

In [None]:
def add_features_user(group,):
    '''
    Run the per-row feature pass over every interaction of a single user.

    group: one (user_id, user_df) pair, as produced by iterating over
           df.groupby("user_id"). user_df keeps the index labels of the
           original frame, so each row is written to its own position in the
           module-level feature arrays.

    Each row is handed to `add_features`, which computes the features from the
    current running state and then updates that state. Inside a `user_id`
    group every row's 'user_id' value equals the group key, so this is
    equivalent to the former copy of that logic keyed on the group key, and
    it keeps the two code paths from drifting apart.

    Returns None; prints the user_id as a progress trace.
    '''
    user_id = group[0]
    user_df = group[1]
    print(user_id)
    for row in user_df[['user_id',
                  'answered_correctly', 
                  'content_id', 
                  'prior_question_elapsed_time', 
                  'prior_question_had_explanation',
                  'timestamp']].itertuples():
        add_features(row)

In [None]:
# -----------------------------------------------------------------------
# Output buffers for the feature pass: one zero-initialised slot per row of df
# -----------------------------------------------------------------------
# Client (user) features, float32
(
    answered_correctly_u_avg,
    elapsed_time_u_avg,
    explanation_u_avg,
    timestamp_u_recency_1,
    timestamp_u_recency_2,
    timestamp_u_recency_3,
    timestamp_u_incorrect_recency,
) = (np.zeros(len(df), dtype=np.float32) for _ in range(7))

# Question features, float32
(
    answered_correctly_q_avg,
    elapsed_time_q_avg,
    explanation_q_avg,
) = (np.zeros(len(df), dtype=np.float32) for _ in range(3))

# User x question counter, int32
answered_correctly_uq_count = np.zeros(len(df), dtype=np.int32)

In [None]:
# Running state shared by the feature pass (module-level, updated in place).

# Per-user counters and sums, keyed by user_id
(
    answered_correctly_u_count,
    answered_correctly_u_sum,
    elapsed_time_u_sum,
    explanation_u_sum,
) = (defaultdict(int) for _ in range(4))

# Per-user timestamp histories, keyed by user_id
timestamp_u, timestamp_u_incorrect = (defaultdict(list) for _ in range(2))

# Per-question counters and sums, keyed by content_id
(
    answered_correctly_q_count,
    answered_correctly_q_sum,
    elapsed_time_q_sum,
    explanation_q_sum,
) = (defaultdict(int) for _ in range(4))

# user_id -> (content_id -> times seen); a user not seen yet gets a fresh
# defaultdict(int) on first access
answered_correctly_uq = defaultdict(lambda: defaultdict(int))

In [None]:
# Fan the per-user feature pass out to a pool of worker processes.
# NOTE(review): add_features_user writes into module-level arrays and dicts.
# Each worker process mutates its own copy of them, so nothing computed here
# is visible in this (parent) process -- confirm whether results should be
# returned from the workers and merged.
with mp.Pool(num_cpu) as pool: # one worker per CPU thread counted above
    user_iters = df.groupby("user_id")
    pending = []
    for group in tqdm(user_iters, total=len(user_iters)):
        # `args` must be a tuple of positional arguments: passing `group`
        # itself would unpack the (user_id, frame) pair into two arguments and
        # make every task fail with a TypeError inside the worker.
        pending.append(pool.apply_async(add_features_user, (group,)))
    # Wait for every task before the pool is torn down; .get() re-raises any
    # exception raised in a worker instead of dropping it silently.
    for task in pending:
        task.get()

In [None]:
# Sequential feature pass: feed every row of df, in index order, to
# add_features, which fills the feature arrays and updates the running
# dictionaries in this process.
iters = df[['user_id',
          'answered_correctly', 
          'content_id', 
          'prior_question_elapsed_time', 
          'prior_question_had_explanation',
          'timestamp']].itertuples()
for _row in tqdm(iters, total=len(df)):
    add_features(_row)

# Dumping features

In [None]:
# Print the first (user_id, sum of answered_correctly) entry as a sanity check.
for item in answered_correctly_u_sum.items():
    print(item)
    break

In [None]:
# Persist every running-state dictionary to its own pickle file in the
# current working directory, in a fixed order.
state_dumps = [
    ('answered_correctly_u_count.pickle', answered_correctly_u_count),
    ('answered_correctly_u_sum.pickle', answered_correctly_u_sum),
    ('elapsed_time_u_sum.pickle', elapsed_time_u_sum),
    ('explanation_u_sum.pickle', explanation_u_sum),
    ('answered_correctly_q_count.pickle', answered_correctly_q_count),
    ('answered_correctly_q_sum.pickle', answered_correctly_q_sum),
    ('elapsed_time_q_sum.pickle', elapsed_time_q_sum),
    ('explanation_q_sum.pickle', explanation_q_sum),
    ('timestamp_u.pickle', timestamp_u),
    ('timestamp_u_incorrect.pickle', timestamp_u_incorrect),
]

for pickle_name, state_table in state_dumps:
    with open(pickle_name, 'wb') as f:
        pickle.dump(state_table, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Copy the per-user question counters into a separate dictionary and pickle it.
# NOTE(review): presumably the copy exists because answered_correctly_uq uses
# a lambda as its default factory, which pickle cannot serialise, while its
# values are plain defaultdict(int) objects -- confirm.
answered_correctly_uq_dict = defaultdict(int)
# The frame loaded in this notebook is `df`; `train` is never defined here.
# One pass over the distinct user ids is enough, since each user maps to a
# single inner dictionary.
for user_id in df['user_id'].unique():
    answered_correctly_uq_dict[user_id] = answered_correctly_uq[user_id]
    
with open('answered_correctly_uq_dict.pickle', 'wb') as f:
    pickle.dump(answered_correctly_uq_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Print the 15 largest module-level variables (shallow sizes).
get_memory(num_var=15)

In [None]:
# NOTE(review): find_files / print_file_size are not defined in this notebook
# (presumably from `utils`); this looks like it lists the pickle files under
# ../working/ together with their sizes -- confirm.
files = find_files('pickle', '../working/')
print_file_size(files)

In [None]:
# Histogram of the predictions stored in a submission file.
# NOTE(review): nothing in the visible notebook writes ../working/submission.csv;
# it must already exist from another run -- confirm.
sub = pd.read_csv('../working/submission.csv')
sub['answered_correctly'].hist(bins=15);