In [None]:
# Logging 
# set up logging to file - see previous section for more details
def create_logging(OUTPUT_FOLDER):
    """Send root-logger output to ``<OUTPUT_FOLDER>/logger.log`` and to the console.

    The root logger level is set to DEBUG. The log file receives every record
    and is truncated on each call; the console stream (``sys.stderr``, the
    ``StreamHandler`` default) only receives records at INFO or above.

    Calling the function again (for example by re-running its notebook cell)
    is safe: handlers installed by an earlier call are removed first, so
    messages are not emitted several times. Handlers are attached explicitly
    rather than through ``logging.basicConfig`` because ``basicConfig`` does
    nothing when the root logger already has a handler, which would silently
    skip the log file.

    Args:
        OUTPUT_FOLDER: Existing directory in which ``logger.log`` is created.

    Returns:
        The ``logging`` module itself.
    """
    line_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    root_logger = logging.getLogger()

    # Drop handlers left behind by a previous call so output is not duplicated.
    for stale in list(root_logger.handlers):
        if getattr(stale, "_installed_by_create_logging", False):
            root_logger.removeHandler(stale)
            stale.close()

    root_logger.setLevel(logging.DEBUG)

    # File handler: full DEBUG log, timestamp with seconds, file overwritten each call.
    log_file = logging.FileHandler(os.path.join(OUTPUT_FOLDER, 'logger.log'), mode='w')
    log_file.setFormatter(logging.Formatter(fmt=line_format, datefmt='%d-%m %H:%M:%S'))

    # Console handler: INFO and above, shorter timestamp.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(fmt=line_format, datefmt='%d-%m %H:%M'))

    for handler in (log_file, console):
        # Marker used by the clean-up loop above on the next call.
        handler._installed_by_create_logging = True
        root_logger.addHandler(handler)
    return logging

In [None]:
def read_df_print(path, format = "feather"):
  """Load a DataFrame from ``path``, show its shape and first rows, and return it.

  Args:
    path: Location of the file to read.
    format: File format of ``path``; one of "feather" (default), "csv" or
      "parquet".

  Returns:
    The loaded DataFrame.

  Raises:
    ValueError: If ``format`` is not one of the supported formats.
  """
  readers = {
    "feather": pd.read_feather,
    "csv": pd.read_csv,
    "parquet": pd.read_parquet,
  }
  if format not in readers:
    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
      "Unsupported format {!r}; expected one of {}".format(format, sorted(readers))
    )

  df = readers[format](path)
  print(df.shape)
  display(df.head(3))
  return df

In [None]:
# Gets a feature as a sequence in lists (with pd.Series)
def get_user_sequence(feature):
  """Gather one column of the global ``train`` frame into a list per user.

  Args:
    feature: Name of the ``train`` column to collect.

  Returns:
    pd.Series indexed by ``user_id`` whose values are Python lists holding
    that user's ``feature`` values.
  """
  rows_by_user = train.groupby("user_id")
  return rows_by_user[feature].apply(list)

In [None]:
# Exercises
def return_E():
  """Return the exercise sequences: the ``content_id`` values grouped per user.

  Returns:
    Whatever ``get_user_sequence("content_id")`` produces.
  """
  return get_user_sequence("content_id")

In [None]:
# Results r
def return_r():
  """Return the result sequences: the ``answered_correctly`` values grouped per user.

  NOTE(review): a later cell defines ``return_r`` again with an
  ``add_start_token`` parameter; once that cell has run, it replaces this
  definition.

  Returns:
    Whatever ``get_user_sequence("answered_correctly")`` produces.
  """
  # All results (r)
  return get_user_sequence("answered_correctly")

In [None]:
# Results r
def return_r(add_start_token=False):
  """Return the result sequences (``answered_correctly`` per user).

  This definition replaces the earlier zero-argument ``return_r`` in the
  notebook; ``add_start_token`` defaults to ``False`` so that a plain
  ``return_r()`` call keeps returning the unmodified sequences.

  Args:
    add_start_token: When truthy, every user's list is prefixed with the
      start token ``N_response + 1`` (``N_response`` is a notebook global).

  Returns:
    The per-user lists from ``get_user_sequence("answered_correctly")``,
    optionally with the start token prepended to each list.
  """
  r_lists = get_user_sequence("answered_correctly") # All results (r)

  if add_start_token:
    # Prepend the start token to every user's result list.
    r_lists = r_lists.apply(lambda results: [N_response+1] + results)
  return r_lists

In [None]:
# Lag time - categorical
def return_ltc():
    """Build the categorical lag-time sequence for every user.

    The lag is the difference between a row's ``timestamp`` and the previous
    ``timestamp`` of the same user, divided by ``1000 * 60`` (minutes, with
    timestamps treated as milliseconds). Lags are clipped to ``[0, 1439]``,
    rows without a previous interaction get ``1440``, and the result is cast
    to ``int32``.

    A temporary ``"ltc"`` column is written to the global ``train`` frame and
    removed again before returning.

    Returns:
        The per-user lists produced by ``get_user_sequence("ltc")``.
    """
    previous_ts = train.groupby("user_id")["timestamp"].shift()

    # Lag in minutes
    lag_minutes = (train["timestamp"] - previous_ts) / (1000.0 * 60)

    # Cap real lags at 1439; 1440 is reserved for "no previous interaction"
    capped = np.clip(lag_minutes, 0, 1439).fillna(1440)
    train["ltc"] = capped.astype(np.int32)

    ltc_lists = get_user_sequence("ltc")
    del train["ltc"]
    return ltc_lists

In [None]:
# Lag time dense
def return_lt():
    """Build the dense (quantile-transformed) lag-time sequence for every user.

    The lag is the difference between a row's ``timestamp`` and the previous
    ``timestamp`` of the same user. A ``QuantileTransformer`` is fitted on the
    lag column of the whole ``train`` frame and applied to it; remaining NaNs
    are then replaced by ``0.5``.

    A temporary ``"lt"`` column is written to the global ``train`` frame and
    removed again before returning.

    Returns:
        Tuple ``(lt_lists, quantile_transformer)``: the per-user lists from
        ``get_user_sequence("lt")`` and the fitted transformer.

    Raises:
        AssertionError: If any lag is negative.
    """
    previous_ts = train.groupby("user_id")["timestamp"].shift()
    train["lt"] = train["timestamp"] - previous_ts
    # There should be no negative time differences
    assert (train["lt"] < 0).sum() == 0

    quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
    lag_as_column = train["lt"].values.reshape(-1, 1)  # one feature, one row per interaction
    train["lt"] = quantile_transformer.fit_transform(lag_as_column)
    # Missing lags are filled after the transform; 0.5 was picked over 0.0.
    train["lt"] = train["lt"].fillna(0.5)

    lt_lists = get_user_sequence("lt")
    del train["lt"]
    return lt_lists, quantile_transformer

In [None]:
# Elapsed time dense
def return_et():
    """Build the dense (quantile-transformed) elapsed-time sequence for every user.

    ``prior_question_elapsed_time`` is copied into a temporary ``"et"`` column
    of the global ``train`` frame with NaNs replaced by 0. A
    ``QuantileTransformer`` is fitted on the rows of ``train_users`` only and
    then applied to the rows of ``val_users``. The temporary column is removed
    before returning.

    NOTE(review): rows whose ``user_id`` is in neither ``train_users`` nor
    ``val_users`` keep their untransformed value - confirm the two sets cover
    every user.

    Returns:
        Tuple ``(et_lists, quantile_transformer)``: the per-user lists from
        ``get_user_sequence("et")`` and the fitted transformer.
    """
    train["et"] = train["prior_question_elapsed_time"].fillna(0) # Replace "start" with zero

    # Row masks depend only on user_id, so they are computed once and reused.
    is_train_row = train["user_id"].isin(train_users)
    is_val_row = train["user_id"].isin(val_users)

    quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
    # Fit on training users only, then apply the same mapping to validation users.
    scaled_train = quantile_transformer.fit_transform(train.loc[is_train_row, "et"].values.reshape(-1, 1))
    scaled_val = quantile_transformer.transform(train.loc[is_val_row, "et"].values.reshape(-1, 1))

    train.loc[is_train_row, "et"] = scaled_train
    train.loc[is_val_row, "et"] = scaled_val

    et_lists = get_user_sequence("et") # Elapsed times
    del train["et"]
    return et_lists, quantile_transformer

In [None]:
# Elapsed time - categorical
def return_etc():
    """Build the categorical elapsed-time sequence for every user.

    ``prior_question_elapsed_time`` is divided by 1000, missing values are
    replaced by the start token ``301``, and the result is cast to ``int32``.
    NOTE(review): 301 as start token presumably assumes real values never
    exceed 300 after the division - confirm against the data.

    A temporary ``"etc"`` column is written to the global ``train`` frame and
    removed again before returning.

    Returns:
        The per-user lists produced by ``get_user_sequence("etc")``.
    """
    elapsed = train["prior_question_elapsed_time"] / 1000.0

    # Missing elapsed time marks the start of a sequence -> start token 301
    train["etc"] = elapsed.fillna(301).astype(np.int32)

    etc_lists = get_user_sequence("etc")
    del train["etc"]
    return etc_lists

In [None]:
# Part
def return_p():
    """Build the sequence of question parts for every user.

    Each ``content_id`` in the global ``train`` frame is mapped to the
    ``part`` of the matching ``question_id`` in the global ``questions``
    frame; ids without a match get ``0``. Values are stored as ``int8``.

    A temporary ``"part"`` column is written to ``train`` and removed again
    before returning.

    Returns:
        The per-user lists produced by ``get_user_sequence("part")``.
    """
    question_to_part = dict(zip(questions.question_id, questions.part))
    mapped_parts = train["content_id"].map(question_to_part)
    train["part"] = mapped_parts.fillna(0).astype("int8")

    # All parts (p) of the exercises
    p_lists = get_user_sequence("part")
    del train["part"]
    return p_lists

In [None]:
# Separately for train/val (disabled alternative for the "lt" feature: fit on train users, apply to val users)
# qt_transform_train = quantile_transformer.fit_transform(train.loc[train["user_id"].isin(train_users), "lt"].values.reshape(-1, 1))
# qt_transform_val = quantile_transformer.transform(train.loc[train["user_id"].isin(val_users), "lt"].values.reshape(-1, 1))

# train.loc[train["user_id"].isin(train_users), "lt"] = qt_transform_train
# train.loc[train["user_id"].isin(val_users), "lt"] = qt_transform_val 