In [None]:
# Logging 
# Set up logging to a log file (DEBUG and above) and to the console (INFO and above)
def create_logging(OUTPUT_FOLDER):
    """Configure the root logger to write to a log file and to the console.

    Records at DEBUG level or above are written to ``<OUTPUT_FOLDER>/logger.log``
    (the file is opened in ``'w'`` mode, i.e. truncated on every call). Records at
    INFO level or above are additionally echoed through a ``logging.StreamHandler``.

    Handlers are attached explicitly rather than through ``logging.basicConfig``:
    ``basicConfig`` silently does nothing when the root logger already has
    handlers, which is common inside notebook kernels and always true on the
    second run of the cell. Handlers installed by a previous call of this
    function are removed first, so re-running the cell neither duplicates
    console output nor keeps writing to a stale log file.

    Args:
        OUTPUT_FOLDER: Directory that will contain ``logger.log``. It is created
            when it does not exist yet.

    Returns:
        The ``logging`` module itself, so callers can keep using the returned
        object as ``logger.info(...)`` / ``logger.getLogger(...)``.
    """
    log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Remove only the handlers this function installed earlier; handlers that
    # belong to the environment (e.g. the notebook kernel) are left untouched.
    for handler in list(root_logger.handlers):
        if getattr(handler, '_from_create_logging', False):
            root_logger.removeHandler(handler)
            handler.close()

    # os.makedirs('') raises, so only create the folder for a non-empty path.
    if OUTPUT_FOLDER:
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # File handler: full detail (inherits the DEBUG threshold of the root logger).
    file_handler = logging.FileHandler(os.path.join(OUTPUT_FOLDER, 'logger.log'), mode='w')
    file_handler.setFormatter(logging.Formatter(fmt=log_format, datefmt='%d-%m %H:%M:%S'))

    # Console handler: INFO or higher, with a shorter timestamp.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter(fmt=log_format, datefmt='%d-%m %H:%M'))

    for handler in (file_handler, console):
        # Tag the handler so a later call can recognise and replace it.
        handler._from_create_logging = True
        root_logger.addHandler(handler)
    return logging

In [None]:
def read_df_print(path, format = "feather"):
    """Read a DataFrame from disk, show its shape and first rows, and return it.

    Args:
        path: Location of the file to read; passed unchanged to the pandas reader.
        format: File format of ``path``. One of ``"feather"`` (default),
            ``"parquet"`` or ``"csv"``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If ``format`` is not one of the supported formats. (Previously
            an unsupported format fell through and failed later with an
            ``UnboundLocalError`` on ``df``.)
    """
    supported_formats = ("feather", "parquet", "csv")
    if format not in supported_formats:
        raise ValueError(
            "Unsupported format %r; expected one of %s" % (format, ", ".join(supported_formats))
        )

    # Each supported format maps onto the pandas reader named ``read_<format>``.
    reader = getattr(pd, "read_" + format)
    df = reader(path)

    print(df.shape)
    display(df.head(3))
    return df

In [None]:
# Split users into multiple parts based on THR_E
def split_into_more_users(train, ascending=True):
    """Add a ``new_user_id`` column that splits every user into fixed-size chunks.

    Rows are numbered within each ``user_id`` with ``groupby(...).cumcount``; that
    number is divided by the module-level ``THR_E`` and truncated to an integer,
    giving a chunk index. Every distinct ``(user_id, chunk index)`` pair then
    receives its own integer id via ``ngroup()``.

    Args:
        train: DataFrame with a ``user_id`` column. It is modified in place.
        ascending: Forwarded to ``cumcount``; ``True`` numbers rows from the start
            of each user's rows, ``False`` from the end.

    Returns:
        The same ``train`` object, now carrying a ``new_user_id`` column. The two
        helper columns used along the way are removed again.
    """
    # Position of each row inside its user's history.
    train["user_cumcount"] = train.groupby("user_id").cumcount(ascending)

    # Chunk index: THR_E consecutive rows share one value.
    # NOTE(review): THR_E is defined elsewhere; presumably a positive chunk length.
    chunk_index = (train["user_cumcount"] / THR_E).astype("int")
    train["cumcount_thr"] = chunk_index

    chunk_keys = ["user_id", "cumcount_thr"]
    train["new_user_id"] = train.groupby(chunk_keys).ngroup()

    # Drop the temporary helper columns.
    for helper_column in ("user_cumcount", "cumcount_thr"):
        del train[helper_column]
    return train

In [None]:
# Gets a feature as a sequence in lists (with pd.Series)
def get_user_sequence(feature, groupby_id = "new_user_id", to_dict = False):
    """Collect one column of the module-level ``train`` frame as a list per group.

    Args:
        feature: Name of the ``train`` column whose values are collected.
        groupby_id: Column used as grouping key (default ``"new_user_id"``).
        to_dict: When true, return a plain ``dict`` instead of a ``pd.Series``.

    Returns:
        A ``pd.Series`` indexed by group key whose values are Python lists of the
        ``feature`` values of that group, or the same mapping as a ``dict`` when
        ``to_dict`` is set.
    """
    # Alternative kept from the original: restrict to rows with content_type_id == 0
    #   train.loc[train["content_type_id"]==0].groupby(groupby_id)[feature].apply(list)
    sequences = train.groupby(groupby_id)[feature].apply(list)
    return sequences.to_dict() if to_dict else sequences

In [None]:
def standardization(feature_df, feature_train):
    """Standardize ``feature_df`` with the mean and std of ``feature_train``.

    The statistics are computed with ``np.nanmean`` / ``np.nanstd`` (NaNs in
    ``feature_train`` are ignored) and then applied to ``feature_df``, which may
    be the training feature itself or a validation/test feature.

    Args:
        feature_df: Values to transform (array-like or pandas object).
        feature_train: Values the mean and standard deviation are taken from.

    Returns:
        ``(feature_df - mean) / std``, with the same container type that the
        arithmetic on ``feature_df`` produces.
    """
    center = np.nanmean(feature_train)
    scale = np.nanstd(feature_train)

    # A constant training feature has zero spread; dividing by it would turn the
    # output into NaN/inf. Fall back to a unit scale so the result is only centered.
    if scale == 0:
        scale = 1.0

    return (feature_df - center) / scale

In [None]:
# Exercises
def return_E(groupby_id="new_user_id"):
    """Return the ``content_id`` sequence (the exercises) of every group.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        Whatever ``get_user_sequence("content_id", groupby_id)`` returns: one list
        of ``content_id`` values per group.
    """
    return get_user_sequence("content_id", groupby_id)

In [None]:
# Results r
def return_r(add_start_token, groupby_id="new_user_id"):
    """Return the ``answered_correctly`` sequence (the results r) of every group.

    Args:
        add_start_token: When true, every sequence is prefixed with the value
            ``N_response + 1`` (``N_response`` is a module-level constant).
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of results per group, optionally starting with the start token.
    """
    result_sequences = get_user_sequence("answered_correctly", groupby_id)
    if not add_start_token:
        return result_sequences

    def _with_start_token(sequence):
        # Build a new list: start token first, then the original results.
        return [N_response+1] + sequence

    return result_sequences.apply(_with_start_token)

In [None]:
# Task
def return_task_binary(groupby_id="new_user_id"):
    """Return, per group, a 0/1 sequence flagging rows that share a task container.

    A row is flagged 1 when its ``(user_id, task_container_id)`` pair occurs on
    more than one row of the module-level ``train`` frame, and 0 otherwise. The
    pair includes ``user_id`` on purpose, so containers are compared per user.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of int8 flags per group. The temporary ``task_binary`` column is
        removed from ``train`` before returning.
    """
    container_key = train[["user_id", "task_container_id"]]
    # keep=False marks every member of a repeated pair, not only the repeats.
    shares_container = container_key.duplicated(keep=False)
    train["task_binary"] = shares_container.astype("int8")

    flag_sequences = get_user_sequence("task_binary", groupby_id)
    del train["task_binary"]
    return flag_sequences

In [None]:
# # Bundle binary value => 0/1
# def return_bundle_binary(groupby_id="new_user_id"):
#     # Add whether exercise was part of the same task container i.e. bundle id
#     train["bundle_binary"] = train[["user_id", "task_container_id"]].duplicated(keep=False).astype("int8")
#     bundle_lists = get_user_sequence("bundle_binary", groupby_id)
#     del train["bundle_binary"]
#     return bundle_lists

In [None]:
# Part
def return_p(groupby_id="new_user_id"):
    """Return, per group, the sequence of ``part`` values of the exercises.

    The ``part`` of each row is looked up from the module-level ``questions``
    frame by matching ``train["content_id"]`` against ``questions.question_id``.
    Rows without a match get part 0.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of int8 part values per group. The temporary ``part`` column is
        removed from ``train`` before returning.
    """
    question_to_part = dict(zip(questions["question_id"], questions["part"]))

    looked_up = train["content_id"].map(question_to_part)
    train["part"] = looked_up.fillna(0).astype("int8")  # 0 = no matching question

    part_sequences = get_user_sequence("part", groupby_id)
    del train["part"]
    return part_sequences

In [None]:
# Elapsed time
 # TODOs: 1) make categorical 2) change error first task container (mentioned in discussions)
def return_et(groupby_id="new_user_id"):
    """Return, per group, the scaled ``prior_question_elapsed_time`` sequence.

    Missing elapsed times are replaced by 0, the values are divided by 300000 and
    stored as float32 in ``train["et"]``.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of scaled elapsed times per group.
    """
    # NOTE(review): 300000 is presumably the maximum elapsed time in ms, which
    # would map the values onto [0, 1] — confirm against the data description.
    scaled_elapsed = train["prior_question_elapsed_time"].fillna(0) / 300000
    train["et"] = scaled_elapsed.astype("float32")

    # NOTE(review): unlike the other feature helpers, the "et" column stays on
    # `train` after this call.
    return get_user_sequence("et", groupby_id)

In [None]:
# Elapsed time
 # TODOs: 1) make categorical 2) change error first task container (mentioned in discussions)
def return_et_std(groupby_id="new_user_id"):
    """Return quantile-transformed elapsed-time sequences and the fitted transformer.

    ``prior_question_elapsed_time`` (NaN replaced by 0, float32) is transformed
    with a ``QuantileTransformer``. The transformer is fitted only on rows whose
    ``new_user_id`` is below ``int(0.9 * number of distinct new_user_id)``; the
    remaining rows are transformed with the already fitted transformer.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``. The
            fit/transform split always uses ``new_user_id``.

    Returns:
        A tuple ``(et_lists, quantile_transformer)``: one list of transformed
        values per group, and the fitted transformer. The temporary ``et`` and
        ``et_std`` columns are removed from ``train`` before returning.
    """
    train["et"] = train["prior_question_elapsed_time"].fillna(0).astype("float32")

    # TODO (from the original): only fit_transform on train, transform on val?
    quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
    train_len = int(train["new_user_id"].nunique()*0.9)

    # Row masks for the part the transformer is fitted on and for the rest.
    fit_rows = train["new_user_id"] < train_len
    remaining_rows = train["new_user_id"] >= train_len

    # The transformer expects a 2-D array with a single feature column.
    fit_values = train.loc[fit_rows, "et"].values.reshape(-1, 1)
    remaining_values = train.loc[remaining_rows, "et"].values.reshape(-1, 1)

    fit_transformed = quantile_transformer.fit_transform(fit_values)
    remaining_transformed = quantile_transformer.transform(remaining_values)

    train.loc[fit_rows, "et_std"] = fit_transformed
    train.loc[remaining_rows, "et_std"] = remaining_transformed

    et_lists = get_user_sequence("et_std", groupby_id)
    del train["et"], train["et_std"]
    return et_lists, quantile_transformer

In [None]:
# # Elapsed time
#  # TODOs: 1) make categorical 2) change error first task container (mentioned in discussions)
# def return_et_std(groupby_id="new_user_id"):
#     quantile_transformer = []
#     train["et"] = train["prior_question_elapsed_time"].fillna(0)/300000
#     train["et"]= train["et"].astype("float32")
#     et_lists = get_user_sequence("et", groupby_id) # Elapsed  times
    
#     # Quantile transformer. TODO: only fit_transform on train, transform on val?
#     quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#     train["et_std"] = quantile_transformer.fit_transform(train["et"].values.reshape(-1, 1))
#     et_lists = get_user_sequence("et_std", groupby_id) # Elapsed  times
#     del train["et"], train["et_std"]
#     return et_lists, quantile_transformer

In [None]:
# Lag time
def return_lt(groupby_id="new_user_id"):
    """Return, per group, the quantile-transformed lag-time sequence.

    The lag time is the row-to-row difference of ``train["timestamp"]``. It is
    reset to missing where ``prior_question_elapsed_time`` is null and on the
    first row of every ``user_id``; missing values then become 0. The result is
    divided by 1000, checked to be non-negative, passed through a
    ``QuantileTransformer`` fitted on all rows, and stored as float32.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of transformed lag times per group. The fitted transformer is not
        returned. The temporary ``lt``, ``lt_std`` and ``task_cumcount`` columns
        are removed from ``train`` before returning.

    Raises:
        AssertionError: If any lag time is negative after the clean-up.
    """
    # Difference to the previous row of the whole frame.
    # NOTE(review): presumably relies on `train` being sorted by user and time.
    train["lt"] = train["timestamp"].diff()

    # Row counter inside each user; 0 marks the user's first row.
    # Open question from the original: use user_id or new_user_id here?
    train["task_cumcount"] = train.groupby("user_id")["task_container_id"].cumcount()

    no_prior_time = train["prior_question_elapsed_time"].isnull()
    train.loc[no_prior_time, "lt"] = np.nan  # original note: lectures / first user
    starts_user = train["task_cumcount"] == 0
    train.loc[starts_user, "lt"] = np.nan  # original note: first bundle
    del train["task_cumcount"]

    # Missing lag -> zero; original note: convert to seconds.
    train["lt"] = train["lt"].fillna(0) / 1000
    assert not (train["lt"] < 0).any()  # there should be no negative time differences

    # Transformation of lt. Currently implemented: QuantileTransformer
    # (alternative noted in the original: train["std_lt"] = train["lt"]**(1/4))
    quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
    lag_column = train["lt"].values.reshape(-1, 1)
    train["lt_std"] = quantile_transformer.fit_transform(lag_column)
    train["lt_std"] = train["lt_std"].astype("float32")

    lt_lists = get_user_sequence("lt_std", groupby_id)

    del train["lt"], train["lt_std"]
    return lt_lists

In [None]:
# Question tags
def keep_N_highest_tags(x, pad_value, max_tags=1): # TODO: not completed yet!!
    """Pick the tag with the highest count from one row, or a pad value.

    Args:
        x: Mapping/row with a ``"tag"`` sequence and a parallel ``"tag_count"``
            sequence.
        pad_value: Returned when the row has no tags.
        max_tags: Number of top entries selected internally. Only the first of
            them is returned, so the result is currently a single tag.

    Returns:
        The element of ``x["tag"]`` at the position of the largest
        ``x["tag_count"]``, or ``pad_value`` when ``x["tag"]`` is empty.
    """
    tag_array = np.array(x["tag"])
    if len(tag_array) == 0:
        return pad_value

    count_array = np.array(x["tag_count"])
    # Indices of the `max_tags` largest counts, largest first.
    # https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
    top_indices = np.argsort(count_array)[-max_tags:][::-1]
    return tag_array[top_indices[0]]

def return_N_highest_tags(groupby_id="new_user_id"):
    """Return, per group, the sequence of each exercise's most frequent tag.

    Steps, all on the module-level ``questions`` and ``train`` frames:
      1. split ``questions["tags"]`` on spaces into a list column ``"tag"``
         (an empty list where ``tags`` is null);
      2. count in how many questions every tag occurs;
      3. store those counts per question in ``"tag_count"`` and pick one tag per
         question with ``keep_N_highest_tags`` into ``"N_highest_tag"``;
      4. map that tag onto ``train`` through ``content_id`` and collect the
         sequences.

    Args:
        groupby_id: Grouping column forwarded to ``get_user_sequence``.

    Returns:
        One list of int32 tag values per group. The temporary ``N_highest_tag``
        column is removed from ``train``; the ``tag``, ``tag_count`` and
        ``N_highest_tag`` columns added to ``questions`` are kept.
    """
    no_tags = []  # one shared placeholder list for questions without tags
    questions["tag"] = questions["tags"].apply(
        lambda raw_tags: raw_tags.split(" ") if pd.notnull(raw_tags) else no_tags
    )

    # Number of questions each tag appears in (set() counts a tag once per question).
    # https://stackoverflow.com/questions/19211018/using-counter-with-list-of-lists
    tags_count = Counter(tag for tag_list in questions["tag"] for tag in set(tag_list))
    questions["tag_count"] = questions["tag"].apply(
        lambda tag_list: [tags_count[tag] for tag in tag_list]
    )

    # Apply functions to dataframe. TODO: not completed yet!!
    # Questions without tags receive the number of distinct tags as pad value.
    pad_value, max_tags = len(tags_count), 1
    questions["N_highest_tag"] = questions.apply(
        keep_N_highest_tags, axis=1, args=(pad_value, max_tags)
    )

    # Map to train.
    # NOTE(review): content ids without a matching question are filled with 0,
    # not with the pad value used above — confirm 0 cannot collide with a real tag.
    question_to_tag = dict(zip(questions["question_id"], questions["N_highest_tag"]))
    mapped_tags = train["content_id"].map(question_to_tag)
    train["N_highest_tag"] = mapped_tags.fillna(0).astype("int32")

    # Get sequence.
    tag_sequences = get_user_sequence("N_highest_tag", groupby_id)
    del train["N_highest_tag"]
    return tag_sequences