## Event log SURVEY extract / remover

In [1]:
### IMPORT ###
from pathlib import Path
import pandas as pd


### LOCAL IMPORT ###
from config import config_reader

In [2]:
### GLOBALS ###
# Level of the event log to process (alternatives noted by the author: PARA, PAGE)
level = "PAGE"
# Name of the event log file to be cleaned
file_name = f"edu_event_log_{level}_raw_filtered_DISCO_ter_enr.csv"
# Column whose values are tested for the SURVEY prefix
activity_column = "Activity"

# Directory containing the event logs, taken from the "LOG_DIR" entry of the YAML configuration
yaml_config = config_reader.config_read_yaml("config.yml", "config")
log_dir = str(yaml_config["LOG_DIR"])

In [3]:
# OUTPUT (cleaned file): input stem + "_no_SURVEY", e.g. "edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY.csv"
file_out = Path(file_name).stem + "_no_SURVEY.csv"

## MAIN

In [4]:
# Load the event log from the CSV file
print(">> Reading event log")
path_log = Path(log_dir, file_name)
print("Path:", path_log)
# Explicit dtypes: the case identifier is read as a generic object, while the
# case length and the tercile columns are read as integers
dic_t = {
    "Case ID": object,
    "CaseLength": int,
    "SUS_Tercile": int,
    "Apprendimento percepito_Tercile": int,
    "UEQ - Overall_Tercile": int,
}
df_log = pd.read_csv(path_log, sep=",", dtype=dic_t, low_memory=False)

>> Reading event log
Path: data_log/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr.csv


In [5]:
# Remove the columns added by DISCO, when present
col_del_list = ['Variant', 'Variant index']
# errors="ignore" skips labels that are missing from the frame, so no
# per-column membership check (and no explicit loop) is needed
df_log.drop(columns=col_del_list, errors="ignore", inplace=True)

In [6]:
# Display the column names of the loaded event log
df_log.columns

Index(['Case ID', 'Activity', 'Complete Timestamp', 'pageTitle', 'menu',
       'pageOrder', 'pagePara', 'eventPage', 'click_num', 'dbclick_num',
       'QuizSessionCount', 'QuizAnswerCorrectTotal', 'QuizAnswerWrongTotal',
       'QuizAnswerCorrectRatioOverCount', 'QuizAnswerCorrectRatioOverAll',
       'Q_1', 'Q_2', 'Q_3', 'Q_4', 'Q_5', 'Q_6', 'Q_7', 'Q_8', 'Q_9', 'Q_10',
       'Q_11', 'Q_12', 'Q_13', 'Q_14', 'Q_15', 'Q_16', 'Q_17', 'Q_18', 'Q_19',
       'Q_20', 'Q_21', 'Q_22', 'Q_23', 'Q_24', 'Q_25', 'Q_26', 'Q_27', 'Q_28',
       'SUS', 'Apprendimento percepito', 'UEQ - Pragmatic', 'UEQ - Hedonic',
       'UEQ - Overall', 'TotalTimeHH', 'TotalTimeMM', 'TotalTimeMM.1',
       'TotalTimeDD', 'CaseLength', 'Class', 'SUS_Tercile',
       'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile',
       'QuizAnswerCorrectRatioOverAll_Tercile', 'Class_Count', 'FN_N', 'FN_Q',
       'FN_A', 'FN_C', 'FN_I', 'FN_CT', 'A_Time_s', 'A_Time_m',
       'Backward_Jumps'],
      dtype='object')

In [7]:
# Shape (rows, columns) of the event log before the SURVEY rows are removed
df_log.shape

(8092, 68)

### Create an event log without SURVEY events

In [8]:
# Keep only the rows whose activity does not start with "SURVEY"
# (na=False: rows with a missing activity count as non-matching and are kept)
df_log_clean = df_log.loc[~df_log[activity_column].str.startswith("SURVEY", na=False)]

In [9]:
# Shape (rows, columns) of the event log after the SURVEY rows are removed
df_log_clean.shape

(7399, 68)

In [10]:
# Write the event log without SURVEY rows into the log directory
print(">> Saving cleaned event log")
# Note: this rebinds path_log, which until now held the path of the input file
path_log = Path(log_dir, file_out)
print("Path:", path_log)
df_log_clean.to_csv(path_log, sep=",", index=False)

>> Saving cleaned event log
Path: data_log/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY.csv


### Create an event log with only SURVEY events

In [11]:
# Keep only the rows whose activity contains "SURVEY-".
# The column is addressed through activity_column, like the filter that builds
# the cleaned log, instead of a hard-coded name.
# na=False treats a missing activity as a non-match: without it the mask would
# contain NaN and the boolean indexing would raise a ValueError.
# NOTE(review): "contains 'SURVEY-'" is not the exact complement of the
# startswith('SURVEY') test used for the cleaned log; confirm this is intended.
df_log_survey = df_log[df_log[activity_column].str.contains(r"SURVEY-", na=False)]

In [12]:
# Write the SURVEY-only event log into the same directory as the input file
# NOTE(review): the input stem already ends in "_enr", so the resulting name
# contains "_enr_enr"; confirm this is the intended file name.
log_file_name_enr = Path(file_name).stem + "_enr_only_SURVEY.csv"
path_log_file = Path(log_dir, log_file_name_enr)
print("Path:", path_log_file)
df_log_survey.to_csv(path_log_file, sep=",", index=False)

Path: data_log/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_enr_only_SURVEY.csv
