In [261]:
"""
02_log_encoding.ipynb
"""

'\n02_log_encoding.ipynb\n'

# Event log (prefix and ongoing) encoding

In [None]:
### IMPORT ###
from pathlib import Path
import pandas as pd
from datetime import datetime

### LOCAL IMPORT ###
from config import config_reader
from utilities import create_directories_with_gitkeep

In [263]:
### GLOBALS ###
yaml_config = config_reader.config_read_yaml("config.yml", "config")
# print(yaml_config) # debug
prefix_dir = str(yaml_config["PREFIX_DIR"])  # Directory the input event log is read from
csv_sep = ","

# INPUT
level = "PAGE"
# Prefix type of the input log. The settings cell prints it and substitutes it for a
# "PREFIX" placeholder in the file name, so it must be defined here for the notebook
# to run from a fresh kernel.
prefix_type = "P"
# The file filtered in the previous script (level and prefix type are part of its name)
log_file = f"edu_event_log_{level}_raw_filtered_DISCO_ter_enr_no_SURVEY_{prefix_type}.csv"

# Column (feature) names
id_variable = "Case ID"
event_variable = "Activity"
timestamp_variable = "Complete Timestamp"

In [264]:
# INPUT
target_variable = "UEQ - Overall_Tercile" # Target variable for the prediction

In [265]:
# OUTPUT
encoding_dir = str(yaml_config["ENCODING_DIR"])

## FUNCTIONS

In [266]:
def encode_feature_column_binary(df: pd.DataFrame, column_name: str, prefix: str) -> pd.DataFrame:
    """
    Replace one column of a DataFrame with its one-hot (0/1) indicator columns.

    Parameters:
        df (pd.DataFrame): The DataFrame holding the column to encode. It is not modified.
        column_name (str): The name of the column to one-hot encode.
        prefix (str): The prefix used to name the indicator columns.

    Returns:
        pd.DataFrame: All columns of df except column_name, followed by the integer indicator columns.

    Raises:
        ValueError: If column_name is not a column of df.
    """
    # Guard clause: refuse to encode a column that is not there
    if column_name not in df.columns:
        raise ValueError(f"The provided DataFrame does not contain a '{column_name}' column.")

    # Everything except the column being encoded
    remaining_columns = df.drop(columns=[column_name])
    # One indicator column per distinct value, stored as integers rather than booleans
    indicator_columns = pd.get_dummies(df[column_name], prefix=prefix).astype(int)

    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [267]:
def encode_feature_column_frequency(df: pd.DataFrame, id_column: str, activity_column: str, prefix: str) -> pd.DataFrame:
    """
    Adds columns to a copy of the DataFrame, where each column holds the cumulative frequency
    of one distinct value of the activity column, counted separately for each case ID. The
    column names are "<prefix>_<activity>" and appear in order of first appearance of the
    activity. The activity column is removed from the resulting DataFrame.

    The running counts follow the current row order of df; rows are not re-sorted here, so
    the caller is responsible for passing the events of each case in the intended order.

    Parameters:
        df (pd.DataFrame): The original DataFrame. It is not modified.
        id_column (str): The name of the column containing case IDs.
        activity_column (str): The name of the column containing activities.
        prefix (str): The prefix to use for the new frequency columns.

    Returns:
        pd.DataFrame: A copy of df with the cumulative frequency columns added and the
        activity column removed.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    encoded = df.copy()
    activities = df[activity_column]
    # Group on raw values (not on a Series) so no index alignment is involved;
    # this keeps the function usable when the index contains duplicate labels
    case_keys = df[id_column].to_numpy()

    for activity in activities.unique():
        # A missing activity never compares equal to itself, so match it explicitly
        is_activity = activities.isna() if pd.isna(activity) else activities.eq(activity)
        # Running number of occurrences of this activity within each case
        running_count = is_activity.astype("int64").groupby(case_keys).cumsum()
        # Rows whose case ID is missing belong to no group; keep their count at 0
        encoded[f"{prefix}_{activity}"] = running_count.fillna(0).astype("int64").to_numpy()

    # Remove the activity column
    return encoded.drop(columns=[activity_column])

In [268]:
def encode_feature_column_index(df: pd.DataFrame, id_column: str, activity_col: str, timestamp_col: str, prefix: str) -> pd.DataFrame:
    """
    Perform simple index encoding on the given DataFrame.

    Rows are sorted by case ID and timestamp. For every distinct activity a column
    "<prefix>_<activity>" is added: on a row it holds the 1-based position of that event
    within its case if the row's activity is that activity, and 0 otherwise.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the event log to be encoded. It is not modified.
        id_column (str): The name of the column representing case IDs.
        activity_col (str): The name of the column representing activities.
        timestamp_col (str): The name of the column representing timestamps.
        prefix (str): The prefix to use for the new index columns.

    Returns:
        pd.DataFrame: The sorted DataFrame with one integer position column per activity
        and the original activity column removed.
    """
    # Sort the DataFrame by case ID and timestamp (returns a new frame)
    df_sorted = df.sort_values(by=[id_column, timestamp_col])

    # 1-based position of each event inside its case. Kept as a local Series rather than
    # a temporary column, so it cannot clash with an input or generated column name.
    event_position = df_sorted.groupby(id_column).cumcount() + 1

    # Create a column for each activity with the position of its occurrence.
    # Only these generated columns are cast to integer; other input columns are left as is.
    for activity in df[activity_col].unique():
        activity_mask = df_sorted[activity_col] == activity
        positions = activity_mask.astype(int) * event_position
        df_sorted[f'{prefix}_{activity}'] = positions.fillna(0).astype(int)

    # Drop the original activity column
    df_final = df_sorted.drop(columns=[activity_col])

    return df_final

## MAIN

In [269]:
print()
print("*** PROGRAM START ***")
print()


*** PROGRAM START ***



In [270]:
start_time = datetime.now().replace(microsecond=0)
print("Start process:", str(start_time))
print()

Start process: 2025-02-06 09:30:37



In [271]:
# Report the run settings and build the path of the input event log.
# NOTE(review): prefix_type must already be defined when this cell runs; confirm it is
# assigned in the globals/configuration section.
print("> Settings")
print("CSV separator:", csv_sep)
print("Prefix type:", prefix_type)
# Substitutes the literal token "PREFIX" in the file name; if log_file does not contain
# that token, the name is used unchanged.
log_file_name = log_file.replace("PREFIX", prefix_type)
print("Input file:", log_file_name)
# The input log is looked up inside prefix_dir
path_log_file = Path(prefix_dir) /log_file_name 
print("Path file:", path_log_file)

> Settings
CSV separator: ,
Prefix type: P
Input file: edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY_P.csv
Path file: data_prefix/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY_P.csv


In [272]:
print("> Creating output directories")
# The encoding directory is the only output location this notebook writes to
dir_output = [encoding_dir]
create_directories_with_gitkeep(dir_output)

> Creating output directories
Directory 'data_encoding' and .gitkeep file created


In [273]:
# Load the filtered event log from CSV and report its size.
print("> Reading event log")
# Column (feature) types
# Only the case ID column gets an explicit dtype (object); all other column types are inferred by pandas
dic_t = {id_variable:object}
print("Input event log column types:", dic_t)
# No date parsing is requested here, so the timestamp column keeps the type pandas infers for it
df_log = pd.read_csv(path_log_file, dtype=dic_t, sep=csv_sep, low_memory=False)
print("Distinct cases read:", df_log[id_variable].nunique())
print("Dataframe shape:", df_log.shape)

> Reading event log
Input event log column types: {'Case ID': <class 'object'>}
Distinct cases read: 332
Dataframe shape: (2260, 26)


In [274]:
df_log.head(5)

Unnamed: 0,Case ID,Activity,Complete Timestamp,pageTitle,click_num,dbclick_num,QuizSessionCount_P3,QuizAnswerCorrectTotal_P3,QuizAnswerWrongTotal_P3,QuizAnswerCorrectRatioOverCount_P3,...,FN_N,FN_Q,FN_A,FN_C,FN_I,FN_CT,A_Time_s,A_Time_m,Forward_Jumps,Backward_Jumps
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,INTRO,2024-04-22 14:11:42,INTRO,1,0,3.0,2.0,1.0,0.67,...,0.0,0.0,1.0,1.0,0.0,FA,530.0,8.83,0,0
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,INTRO-Q,2024-04-22 14:20:32,INTRO-Q,1,0,3.0,2.0,1.0,0.67,...,0.0,0.0,1.0,1.0,0.0,FA,32.0,0.53,0,0
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,PROG,2024-04-22 14:21:04,PROG,1,0,3.0,2.0,1.0,0.67,...,0.0,0.0,1.0,1.0,0.0,FA,233.0,3.88,0,0
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,PROG-Q,2024-04-22 14:24:57,PROG-Q,1,0,3.0,2.0,1.0,0.67,...,0.0,0.0,1.0,1.0,0.0,FA,17.0,0.28,0,0
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,VARS,2024-04-22 14:25:14,VARS,1,0,3.0,2.0,1.0,0.67,...,0.0,0.0,1.0,1.0,0.0,FA,119.0,1.98,0,0


In [275]:
df_log.columns

Index(['Case ID', 'Activity', 'Complete Timestamp', 'pageTitle', 'click_num',
       'dbclick_num', 'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3',
       'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3',
       'QuizAnswerCorrectRatioOverAll_P3', 'Class', 'SUS_Tercile',
       'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile',
       'Class_Count', 'FN_N', 'FN_Q', 'FN_A', 'FN_C', 'FN_I', 'FN_CT',
       'A_Time_s', 'A_Time_m', 'Forward_Jumps', 'Backward_Jumps'],
      dtype='object')

In [276]:
df_log.dtypes

Case ID                                object
Activity                               object
Complete Timestamp                     object
pageTitle                              object
click_num                               int64
dbclick_num                             int64
QuizSessionCount_P3                   float64
QuizAnswerCorrectTotal_P3             float64
QuizAnswerWrongTotal_P3               float64
QuizAnswerCorrectRatioOverCount_P3    float64
QuizAnswerCorrectRatioOverAll_P3      float64
Class                                  object
SUS_Tercile                             int64
Apprendimento percepito_Tercile         int64
UEQ - Overall_Tercile                   int64
Class_Count                           float64
FN_N                                  float64
FN_Q                                  float64
FN_A                                  float64
FN_C                                  float64
FN_I                                  float64
FN_CT                             

In [277]:
print("Distinct cases before encoding:", df_log[id_variable].nunique())

Distinct cases before encoding: 332


## Encoding the event log

In [278]:
print(">> Encoding")

>> Encoding


In [279]:
# Checking actual column types
print("> Checking actual column types")
df_log.dtypes

> Checking actual column types


Case ID                                object
Activity                               object
Complete Timestamp                     object
pageTitle                              object
click_num                               int64
dbclick_num                             int64
QuizSessionCount_P3                   float64
QuizAnswerCorrectTotal_P3             float64
QuizAnswerWrongTotal_P3               float64
QuizAnswerCorrectRatioOverCount_P3    float64
QuizAnswerCorrectRatioOverAll_P3      float64
Class                                  object
SUS_Tercile                             int64
Apprendimento percepito_Tercile         int64
UEQ - Overall_Tercile                   int64
Class_Count                           float64
FN_N                                  float64
FN_Q                                  float64
FN_A                                  float64
FN_C                                  float64
FN_I                                  float64
FN_CT                             

In [280]:
print("> Creating list of actual column types")
# List of column names with data type 'object'
object_columns_list = df_log.select_dtypes(include='object').columns.tolist()
# List of column names with numeric data types. 'number' covers every integer and float
# width, so a column read as e.g. int32 or float32 is not silently left out of both lists.
numeric_columns_list = df_log.select_dtypes(include='number').columns.tolist()
print(f"Object columns ({len(object_columns_list)}):", object_columns_list)
print(f"Numeric columns ({len(numeric_columns_list)}):", numeric_columns_list)

> Creating list of actual column types
Object columns (6): ['Case ID', 'Activity', 'Complete Timestamp', 'pageTitle', 'Class', 'FN_CT']
Numeric columns (20): ['click_num', 'dbclick_num', 'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3', 'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3', 'QuizAnswerCorrectRatioOverAll_P3', 'SUS_Tercile', 'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile', 'Class_Count', 'FN_N', 'FN_Q', 'FN_A', 'FN_C', 'FN_I', 'A_Time_s', 'A_Time_m', 'Forward_Jumps', 'Backward_Jumps']


In [281]:
object_columns_list

['Case ID', 'Activity', 'Complete Timestamp', 'pageTitle', 'Class', 'FN_CT']

In [282]:
numeric_columns_list

['click_num',
 'dbclick_num',
 'QuizSessionCount_P3',
 'QuizAnswerCorrectTotal_P3',
 'QuizAnswerWrongTotal_P3',
 'QuizAnswerCorrectRatioOverCount_P3',
 'QuizAnswerCorrectRatioOverAll_P3',
 'SUS_Tercile',
 'Apprendimento percepito_Tercile',
 'UEQ - Overall_Tercile',
 'Class_Count',
 'FN_N',
 'FN_Q',
 'FN_A',
 'FN_C',
 'FN_I',
 'A_Time_s',
 'A_Time_m',
 'Forward_Jumps',
 'Backward_Jumps']

In [283]:
# The activity, case ID and timestamp columns are handled separately by the encodings below,
# so they are taken out of the list of object columns that get a plain binary encoding
print("> Removes the activity column that will be processed in various encodings")
print("> Removes the ID column that will be processed in various encodings")
print("> Removes the timestamp that will be processed in various encodings")
excluded_columns = (event_variable, id_variable, timestamp_variable)
object_columns_list = [col for col in object_columns_list if col not in excluded_columns]

print("Updated object columns:", object_columns_list)

> Removes the activity column that will be processed in various encodings
> Removes the ID column that will be processed in various encodings
> Removes the timestamp that will be processed in various encodings
Updated object columns: ['pageTitle', 'Class', 'FN_CT']


In [284]:
print(f"Object columns ({len(object_columns_list)}):", object_columns_list)
print(f"Numeric columns ({len(numeric_columns_list)}):", numeric_columns_list)

Object columns (3): ['pageTitle', 'Class', 'FN_CT']
Numeric columns (20): ['click_num', 'dbclick_num', 'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3', 'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3', 'QuizAnswerCorrectRatioOverAll_P3', 'SUS_Tercile', 'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile', 'Class_Count', 'FN_N', 'FN_Q', 'FN_A', 'FN_C', 'FN_I', 'A_Time_s', 'A_Time_m', 'Forward_Jumps', 'Backward_Jumps']


In [285]:
# NOTE(review): df_encoding_base and df_encoded_objects are not referenced by any later
# cell visible in this notebook; confirm whether this cell is still needed.
print("> Creating dataframe with encoding")
# First DataFrame: Add id_variable and numeric columns
df_encoding_base = pd.DataFrame()
df_encoding_base[id_variable] = df_log[id_variable]
df_encoding_base = pd.concat([df_encoding_base, df_log[numeric_columns_list].copy()], axis=1)

# Second DataFrame: Binary encoding of object columns
# drop_first=True omits the first level of each column, unlike encode_feature_column_binary,
# which keeps one indicator column for every level
df_encoded_objects = pd.get_dummies(df_log[object_columns_list], drop_first=True)

> Creating dataframe with encoding


In [286]:
# Binary encoding of activities
df_log_encoded_b = encode_feature_column_binary(df_log, event_variable, event_variable)
# Binary encoding of object columns
for col_name in object_columns_list:
        print("Econding object column:", col_name)
        df_log_encoded_b = encode_feature_column_binary(df_log_encoded_b, col_name, col_name)

Econding object column: pageTitle
Econding object column: Class
Econding object column: FN_CT


In [287]:
df_log_encoded_b.head()

Unnamed: 0,Case ID,Complete Timestamp,click_num,dbclick_num,QuizSessionCount_P3,QuizAnswerCorrectTotal_P3,QuizAnswerWrongTotal_P3,QuizAnswerCorrectRatioOverCount_P3,QuizAnswerCorrectRatioOverAll_P3,SUS_Tercile,...,Class_SAA,Class_SMCN1,Class_SMCN2,Class_SMTO1,Class_SMTO2,Class_SMTO3,FN_CT_FA,FN_CT_L,FN_CT_M,FN_CT_SA
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:11:42,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,0,1,0,0,0,1,0,0,0
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:20:32,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,0,1,0,0,0,1,0,0,0
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:21:04,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,0,1,0,0,0,1,0,0,0
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:24:57,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,0,1,0,0,0,1,0,0,0
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:25:14,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,0,1,0,0,0,1,0,0,0


In [288]:
df_log_encoded_b.columns

Index(['Case ID', 'Complete Timestamp', 'click_num', 'dbclick_num',
       'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3',
       'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3',
       'QuizAnswerCorrectRatioOverAll_P3', 'SUS_Tercile',
       'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile',
       'Class_Count', 'FN_N', 'FN_Q', 'FN_A', 'FN_C', 'FN_I', 'A_Time_s',
       'A_Time_m', 'Forward_Jumps', 'Backward_Jumps', 'Activity_INTRO',
       'Activity_INTRO-Q', 'Activity_PROG', 'Activity_PROG-Q', 'Activity_VARS',
       'Activity_VARS-Q', 'pageTitle_INTRO', 'pageTitle_INTRO-Q',
       'pageTitle_PROG', 'pageTitle_PROG-Q', 'pageTitle_VARS',
       'pageTitle_VARS-Q', 'Class_ECO', 'Class_SAA', 'Class_SMCN1',
       'Class_SMCN2', 'Class_SMTO1', 'Class_SMTO2', 'Class_SMTO3', 'FN_CT_FA',
       'FN_CT_L', 'FN_CT_M', 'FN_CT_SA'],
      dtype='object')

In [289]:
# Frequency encoding of activities
df_log_encoded_f = encode_feature_column_frequency(df_log, id_variable, event_variable, event_variable)
# Binary encoding of object columns
for col_name in object_columns_list:
        print("Econding object column:", col_name)
        df_log_encoded_f = encode_feature_column_binary(df_log_encoded_f, col_name, col_name)

Econding object column: pageTitle
Econding object column: Class
Econding object column: FN_CT


In [290]:
# Simple index encoding of activities
df_log_encoded_i = encode_feature_column_index(df_log, id_variable, event_variable, timestamp_variable, event_variable)
# Binary encoding of object columns
for col_name in object_columns_list:
        print("Econding object column:", col_name)
        df_log_encoded_i = encode_feature_column_binary(df_log_encoded_i, col_name, col_name)

Econding object column: pageTitle
Econding object column: Class
Econding object column: FN_CT


In [291]:
df_log_encoded_i.columns

Index(['Case ID', 'Complete Timestamp', 'click_num', 'dbclick_num',
       'QuizSessionCount_P3', 'QuizAnswerCorrectTotal_P3',
       'QuizAnswerWrongTotal_P3', 'QuizAnswerCorrectRatioOverCount_P3',
       'QuizAnswerCorrectRatioOverAll_P3', 'SUS_Tercile',
       'Apprendimento percepito_Tercile', 'UEQ - Overall_Tercile',
       'Class_Count', 'FN_N', 'FN_Q', 'FN_A', 'FN_C', 'FN_I', 'A_Time_s',
       'A_Time_m', 'Forward_Jumps', 'Backward_Jumps', 'Activity_INTRO',
       'Activity_INTRO-Q', 'Activity_PROG', 'Activity_PROG-Q', 'Activity_VARS',
       'Activity_VARS-Q', 'pageTitle_INTRO', 'pageTitle_INTRO-Q',
       'pageTitle_PROG', 'pageTitle_PROG-Q', 'pageTitle_VARS',
       'pageTitle_VARS-Q', 'Class_ECO', 'Class_SAA', 'Class_SMCN1',
       'Class_SMCN2', 'Class_SMTO1', 'Class_SMTO2', 'Class_SMTO3', 'FN_CT_FA',
       'FN_CT_L', 'FN_CT_M', 'FN_CT_SA'],
      dtype='object')

In [292]:
# Optional: Move target_variable as the last column in each DataFrame
for df in [df_log_encoded_b, df_log_encoded_f, df_log_encoded_i]:
    target_col = df.pop(target_variable)  # Remove and store the target column
    df[target_variable] = target_col      # Add the target column at the end

In [293]:
df_log_encoded_b.head()

Unnamed: 0,Case ID,Complete Timestamp,click_num,dbclick_num,QuizSessionCount_P3,QuizAnswerCorrectTotal_P3,QuizAnswerWrongTotal_P3,QuizAnswerCorrectRatioOverCount_P3,QuizAnswerCorrectRatioOverAll_P3,SUS_Tercile,...,Class_SMCN1,Class_SMCN2,Class_SMTO1,Class_SMTO2,Class_SMTO3,FN_CT_FA,FN_CT_L,FN_CT_M,FN_CT_SA,UEQ - Overall_Tercile
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:11:42,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:20:32,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:21:04,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:24:57,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:25:14,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1


In [294]:
df_log_encoded_f.head()

Unnamed: 0,Case ID,Complete Timestamp,click_num,dbclick_num,QuizSessionCount_P3,QuizAnswerCorrectTotal_P3,QuizAnswerWrongTotal_P3,QuizAnswerCorrectRatioOverCount_P3,QuizAnswerCorrectRatioOverAll_P3,SUS_Tercile,...,Class_SMCN1,Class_SMCN2,Class_SMTO1,Class_SMTO2,Class_SMTO3,FN_CT_FA,FN_CT_L,FN_CT_M,FN_CT_SA,UEQ - Overall_Tercile
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:11:42,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:20:32,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:21:04,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:24:57,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:25:14,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1


In [295]:
df_log_encoded_i.head()

Unnamed: 0,Case ID,Complete Timestamp,click_num,dbclick_num,QuizSessionCount_P3,QuizAnswerCorrectTotal_P3,QuizAnswerWrongTotal_P3,QuizAnswerCorrectRatioOverCount_P3,QuizAnswerCorrectRatioOverAll_P3,SUS_Tercile,...,Class_SMCN1,Class_SMCN2,Class_SMTO1,Class_SMTO2,Class_SMTO3,FN_CT_FA,FN_CT_L,FN_CT_M,FN_CT_SA,UEQ - Overall_Tercile
0,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:11:42,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
1,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:20:32,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
2,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:21:04,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
3,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:24:57,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1
4,0HcsM5K14bTga4CpYETnQuMBKMrDBCQgHeGk48sRul6Pha...,2024-04-22 14:25:14,1,0,3.0,2.0,1.0,0.67,0.67,1,...,0,1,0,0,0,1,0,0,0,1


In [296]:
print("> Saving the encoded data")
# One output file per activity encoding, named after the input file plus a suffix:
# B = binary, F = frequency, I = index
encoded_logs = {
    "B": df_log_encoded_b,
    "F": df_log_encoded_f,
    "I": df_log_encoded_i,
}
for suffix, df_encoded in encoded_logs.items():
    file_out = f"{Path(log_file_name).stem}_{suffix}.csv"
    dir_out = Path(encoding_dir) / file_out
    print("File path:", dir_out)
    df_encoded.to_csv(dir_out, sep=csv_sep, index=False)

> Saving the encoded data
File path: data_encoding/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY_P_B.csv
File path: data_encoding/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY_P_F.csv
File path: data_encoding/edu_event_log_PAGE_raw_filtered_DISCO_ter_enr_no_SURVEY_P_I.csv


In [297]:
# Take the end timestamp once and reuse it, so the reported finish time and the
# reported duration always refer to the same instant
end_time = datetime.now().replace(microsecond=0)
delta_time = end_time - start_time
total_seconds = int(delta_time.total_seconds())
minutes, seconds = divmod(total_seconds, 60)

print()
print("End process")
print("Script finished at:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
print(f"Time to finish ({delta_time}): {minutes} minutes, {seconds} seconds")


End process
Script finished at: 2025-02-06 09:30:37
Time to finish (0:00:00): 0 minutes, 0 seconds


In [298]:
# Closing banner: this is the last cell, so it marks the end of the run
print()
print("*** PROGRAM END ***")
print()


*** PROGRAM START ***

