### Import all Relevant Libraries

In [2]:
import os
import pandas as pd

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): a blanket 'ignore' may also hide pandas warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

### Import Data
* **Ex_ArcSwipe_Data.csv** contains the value of each feature at a given time step
* **ArcSwipe List of Metrics - Sheet1.csv** contains information about each metric name

In [3]:
# Both input tables are read from the local "datasets" folder; header=None means
# no row is consumed as a header, so columns start out with integer labels.
data_path = os.path.join("datasets", "Ex_ArcSwipe_Data.csv")
metrics_path = os.path.join("datasets", "ArcSwipe List of Metrics - Sheet1.csv")

file = pd.read_csv(data_path, header=None)
file2 = pd.read_csv(metrics_path, header=None)

In [4]:
# Column labels for the raw export, listed in file order (the CSV was read with header=None).
# 'Excercise Name' keeps its original spelling because the drop cell below refers to this exact label.
col_name = ['Session id', 'Start Time', 'Equipment Name', 'Excercise Name', 'Pass/Fail', 'Score', 'Metric Name',
            'Time Step', 'Value', 'Unit']

# Assign the labels directly: `set_axis(..., inplace=True)` was deprecated in pandas 1.5 and
# raises TypeError on pandas >= 2.0, whereas plain attribute assignment works on every version.
# A length mismatch between `col_name` and the frame still raises ValueError.
file.columns = col_name
file.head()

Unnamed: 0,Session id,Start Time,Equipment Name,Excercise Name,Pass/Fail,Score,Metric Name,Time Step,Value,Unit
0,5efb51adbcf5631c1400b415,Tue Jun 30 2020 10:52:29 GMT-0400 (Eastern Day...,VX_EX_EQP_NAME,EX_AS_NAME,,45,Application Extension Handler.Score.Inputs.Cur...,0.0,100.0,Percentage %
1,5efb51adbcf5631c1400b415,Tue Jun 30 2020 10:52:29 GMT-0400 (Eastern Day...,VX_EX_EQP_NAME,EX_AS_NAME,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
2,5efb51adbcf5631c1400b415,Tue Jun 30 2020 10:52:29 GMT-0400 (Eastern Day...,VX_EX_EQP_NAME,EX_AS_NAME,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
3,5efb51adbcf5631c1400b415,Tue Jun 30 2020 10:52:29 GMT-0400 (Eastern Day...,VX_EX_EQP_NAME,EX_AS_NAME,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
4,5efb51adbcf5631c1400b415,Tue Jun 30 2020 10:52:29 GMT-0400 (Eastern Day...,VX_EX_EQP_NAME,EX_AS_NAME,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count


In [5]:
# Discard the columns that are not used when building the feature table
file = file.drop(columns=["Start Time", "Equipment Name", "Excercise Name"])

In [6]:
# Preview the first five rows of the trimmed frame
file.head(n=5)

Unnamed: 0,Session id,Pass/Fail,Score,Metric Name,Time Step,Value,Unit
0,5efb51adbcf5631c1400b415,,45,Application Extension Handler.Score.Inputs.Cur...,0.0,100.0,Percentage %
1,5efb51adbcf5631c1400b415,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
2,5efb51adbcf5631c1400b415,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
3,5efb51adbcf5631c1400b415,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count
4,5efb51adbcf5631c1400b415,,45,ContentExtensionScene.CONS_ENV_MET_CAT.Inputs....,0.0,0.0,Count


#### Original metric names in the dataset are replaced with their human-readable form

In [7]:
# Lookup table built from the metrics sheet: first column (raw metric identifier)
# -> second column (human-readable metric name).
metric_name = dict(zip(file2[0], file2[1]))

# Only the "Metric Name" column holds raw identifiers, so the replacement is limited to it
# instead of scanning (and potentially rewriting) every column of the frame.
# Identifiers that are missing from the lookup table are left unchanged.
file["Metric Name"] = file["Metric Name"].replace(metric_name)

file.head()

Unnamed: 0,Session id,Pass/Fail,Score,Metric Name,Time Step,Value,Unit
0,5efb51adbcf5631c1400b415,,45,Current trainee score at that time,0.0,100.0,Percentage %
1,5efb51adbcf5631c1400b415,,45,Number of tennis balls knocked over by operator,0.0,0.0,Count
2,5efb51adbcf5631c1400b415,,45,Number of barrels knocked over,0.0,0.0,Count
3,5efb51adbcf5631c1400b415,,45,Number of barrels touches,0.0,0.0,Count
4,5efb51adbcf5631c1400b415,,45,Number of equipment collisions,0.0,0.0,Count


### Create new dataframe
* In the original dataset, the values of all metrics at a given time step are not specified clearly.
* To train an RL/ML model, the value of every feature at a given time step needs to be clearly defined.
* The following code processes the above dataset into a new dataframe.
* This new dataframe holds the values of all features for each time step.

In [8]:
# Output schema: the two identifying columns followed by one column per readable metric name
new_columns = ["Session id", "Time Step"] + list(metric_name.values())

In [9]:
# Distinct session identifiers, in order of first appearance in the data
all_sessions = pd.unique(file["Session id"])

def Duration_of_Single_Sessions(sess_id):
    """Return how many "Exercise Time" rows the global ``file`` frame holds for one session.

    Parameters
    ----------
    sess_id
        Value to match against the "Session id" column.

    Returns
    -------
    int
        Number of rows whose "Metric Name" is "Exercise Time" and whose
        "Session id" equals ``sess_id`` (0 when there is no such row).
    """
    # A single combined mask avoids chained boolean indexing (file[m1][m2]), which makes
    # pandas reindex the second mask and emit a "Boolean Series key will be reindexed" warning.
    mask = (file["Metric Name"] == "Exercise Time") & (file["Session id"] == sess_id)
    return int(mask.sum())
    
def Duration_of_All_Sessions():
    """Return the summed duration of every session listed in the global ``all_sessions``.

    Each session's duration is obtained from ``Duration_of_Single_Sessions``.
    """
    return sum(Duration_of_Single_Sessions(session) for session in all_sessions)

def value_at_trigger(sess_id, parameter):
    """Return the recorded time steps and values of one metric within one session.

    Parameters
    ----------
    sess_id
        Value to match against the "Session id" column of the global ``file`` frame.
    parameter
        Value to match against the "Metric Name" column.

    Returns
    -------
    tuple[list, list]
        ``(time_step, value)``: the "Time Step" and "Value" entries of the matching
        rows, in frame order. Both lists are empty when nothing matches.
    """
    # Build the row selection once and reuse it for both columns; a combined mask also
    # avoids the chained boolean indexing (file[m1][m2]) that triggers a pandas reindex warning.
    mask = (file["Metric Name"] == parameter) & (file["Session id"] == sess_id)
    matched = file.loc[mask, ["Time Step", "Value"]]
    time_step = matched["Time Step"].to_list()
    value = matched["Value"].to_list()
    return time_step, value

def check_len(sess_id, parameter):
    """Return how many rows of the global ``file`` frame belong to one metric in one session.

    Parameters
    ----------
    sess_id
        Value to match against the "Session id" column.
    parameter
        Value to match against the "Metric Name" column.

    Returns
    -------
    int
        Number of matching rows (0 when the metric has no row for the session).
    """
    # Counting True entries of one combined mask replaces the chained boolean indexing
    # (file[m1][m2]) that makes pandas reindex the second mask and emit a warning.
    mask = (file["Metric Name"] == parameter) & (file["Session id"] == sess_id)
    return int(mask.sum())

In [10]:
# Pre-allocate the output frame: one row per "Exercise Time" sample, summed over all sessions.
total_rows = Duration_of_All_Sessions()
new_df = pd.DataFrame(columns = new_columns, index = range(total_rows))

# [old_indx, new_indx) is the block of output rows that belongs to the session being processed.
old_indx = new_indx = 0

for ids in all_sessions:
    time_steps = Duration_of_Single_Sessions(ids)
    if time_steps == 0:
        # No "Exercise Time" rows means no output rows were reserved for this session.
        continue
    new_indx = old_indx + time_steps

    # All writes go through positional .iloc indexing. The chained form
    # `new_df[col][a:b] = ...` is not guaranteed to write through to the frame
    # and does nothing once pandas Copy-on-Write is enabled.
    new_df.iloc[old_indx:new_indx, new_df.columns.get_loc("Session id")] = ids
    new_df.iloc[old_indx:new_indx, new_df.columns.get_loc("Time Step")] = list(range(time_steps))

    # Reference time axis of the session, used to place sparsely recorded metrics.
    # It is fetched explicitly from "Exercise Time" (the metric that defines `time_steps`)
    # so the result does not depend on a variable left over from another loop.
    time_axis, _ = value_at_trigger(ids, "Exercise Time")

    # Count the rows of every metric once, then classify the metrics by how they were recorded.
    rows_per_metric = {m: check_len(ids, m) for m in metric_name.values()}
    full_columns = [m for m, n in rows_per_metric.items() if n == time_steps]
    one_value_columns = [m for m, n in rows_per_metric.items() if n == 1]
    left_columns = [m for m in rows_per_metric
                    if m not in full_columns and m not in one_value_columns]

    # Metrics recorded once: the single value is repeated on every row of the session.
    for param in one_value_columns:
        _, values = value_at_trigger(ids, param)
        new_df.iloc[old_indx:new_indx, new_df.columns.get_loc(param)] = values[0]

    # Metrics recorded at every time step: values are copied row for row.
    for param in full_columns:
        _, values = value_at_trigger(ids, param)
        new_df.iloc[old_indx:new_indx, new_df.columns.get_loc(param)] = values

    # Remaining metrics were recorded only at some time steps: each recorded value is held
    # until the first sample of the time axis that lies strictly after the next recording.
    for param in left_columns:
        t_step, values = value_at_trigger(ids, param)
        if not values:
            # The metric has no row for this session; its column stays NaN.
            continue

        column = new_df.columns.get_loc(param)
        init_indx = old_indx

        for i in range(len(t_step)-1):
            next_trigger = t_step[i+1]
            # Position of the first sample after the next recording. When no such sample
            # exists, the current value is held through the end of the session.
            trigr_indx = next((pos for pos, t in enumerate(time_axis) if t > next_trigger),
                              len(time_axis))
            new_init = old_indx + trigr_indx
            new_df.iloc[init_indx:new_init, column] = values[i]
            init_indx = new_init

        # The last recorded value fills whatever remains of the session block.
        new_df.iloc[init_indx:new_indx, column] = values[-1]

    old_indx = new_indx

In [11]:
# Preview the first five rows of the reshaped frame
new_df.head(n=5)

Unnamed: 0,Session id,Time Step,Current trainee score at that time,Number of tennis balls knocked over by operator,Number of barrels knocked over,Number of barrels touches,Number of equipment collisions,Number of poles that fell over,Number of poles touched,Number of times user had to restart an arc,...,Tracks Ground Pressure Rear Right,Safety violation bucket over truck cab,Safety violation dump truck contact,Safety violation electrical lines,Safety violation human contact,Safety violation load over human,Safety violation unsafe parking position,Safety violation Flipped Vehicle,Exercise Number of goals met,Exercise Time
0,5efb51adbcf5631c1400b415,0,100,0,0,0,0,0,0,0,...,73002.2,0,0,0,0,0,0,0,1,0.0
1,5efb51adbcf5631c1400b415,1,100,0,0,0,0,0,0,0,...,73002.2,0,0,0,0,0,0,0,1,0.983333
2,5efb51adbcf5631c1400b415,2,100,0,0,0,0,0,0,0,...,73002.1,0,0,0,0,0,0,0,1,1.98333
3,5efb51adbcf5631c1400b415,3,100,0,0,0,0,0,0,0,...,73002.1,0,0,0,0,0,0,0,1,2.98333
4,5efb51adbcf5631c1400b415,4,100,0,0,0,0,0,0,0,...,73002.0,0,0,0,0,0,0,0,1,3.98333


In [11]:
# Write the reshaped feature table into the "datasets" folder
output_path = os.path.join("datasets", "ExtractedFeatures.csv")
new_df.to_csv(output_path)