In [1]:
import pandas as pd

In [2]:
# Relative path to the raw work-queue log; it is read below as a
# semicolon-delimited, UTF-8 encoded CSV.
data_path = "../data/WorkQueueLog_2025.csv"

In [3]:
# Load the raw work-queue log. The file is semicolon-delimited UTF-8 and its
# first column becomes the row index. low_memory=False makes pandas read the
# file in one pass so column dtypes are not guessed chunk by chunk.
df = pd.read_csv(data_path, sep=";", encoding="utf-8", low_memory=False, index_col=0)

# Columns grouped by the dtype they must be coerced to.
timedelta_columns = ["Worktime", "AttemptWorktime"]
datetime_columns = ["Loaded", "Completed", "Exception", "Deferred", "DateInsert"]

# Bracket assignment is used on purpose: attribute-style assignment
# (`df.Worktime = ...`) only works while the column already exists and does not
# clash with a DataFrame attribute; otherwise pandas silently sets a plain
# Python attribute instead of a column.
# errors="coerce" turns unparseable values into NaT/NaN instead of raising.
for column in timedelta_columns:
    df[column] = pd.to_timedelta(df[column], errors="coerce")

# NOTE(review): no explicit datetime format is passed, so parsing relies on
# pandas' format inference; values that do not match are coerced to NaT.
# Confirm the export uses one uniform timestamp format.
for column in datetime_columns:
    df[column] = pd.to_datetime(df[column], errors="coerce")

df["Attempts"] = pd.to_numeric(df["Attempts"], errors="coerce")

# Display the column index as a quick sanity check of the loaded schema.
df.columns

Index(['Status', 'Attempts', 'Worktime', 'AttemptWorktime', 'Loaded',
       'Completed', 'Deferred', 'Exception', 'ExceptionReason', 'QueueName',
       'ProcessName', 'UserRobot', 'Resource', 'DateInsert', 'FinishStatus',
       'id'],
      dtype='object')

In [4]:
# Persist the typed (but otherwise unfiltered) frame, keeping its index, as a
# snappy-compressed parquet file written through pyarrow.
df.to_parquet("../clean_data/semi_raw.parquet", engine="pyarrow", compression="snappy", index=True)

In [5]:
# Row filters, both computed from the typed frame `df`:
#   * drop items whose FinishStatus mentions "Pending" or "Working"
#     (case-insensitive); rows with a missing FinishStatus are kept because
#     na=False makes the match False before it is negated;
#   * keep items whose Loaded timestamp falls in 2025 or later (NaT rows fail
#     the comparison and are dropped).
is_unfinished = df["FinishStatus"].str.contains("Pending|Working", case=False, na=False)
is_loaded_in_scope = df["Loaded"].dt.year >= 2025

# One chained expression instead of repeated reassignment. Only the four
# columns used downstream are selected, so the former explicit drop of
# 'QueueName' is unnecessary.
filtered_df = (
    df.loc[
        ~is_unfinished & is_loaded_in_scope,
        ["Loaded", "ProcessName", "UserRobot", "FinishStatus"],
    ]
    .assign(
        # True when the status text contains "Complete"; a missing status
        # becomes False and therefore counts towards the exception rate.
        FinishStatus=lambda frame: frame["FinishStatus"].str.contains("Complete", case=False, na=False),
        # Truncate the timestamp to a calendar date so rows group per day.
        Loaded=lambda frame: frame["Loaded"].dt.date,
    )
    .reset_index(drop=True)
)

# Per (day, process, robot): mean of the boolean completion flag and number of
# rows. The built-in "mean" replaces a per-group Python lambda; the complement
# taken below yields the same tasa_excepcion = 1 - mean(FinishStatus).
# NOTE(review): groupby drops rows whose ProcessName or UserRobot is NaN
# (pandas default dropna=True) - confirm that exclusion is intended.
par_robot_proceso = filtered_df.groupby(["Loaded", "ProcessName", "UserRobot"]).agg(
    tasa_excepcion=("FinishStatus", "mean"),
    entries=("FinishStatus", "count"),
)
par_robot_proceso["tasa_excepcion"] = 1 - par_robot_proceso["tasa_excepcion"]

# Store with 'Loaded' as the only index level; ProcessName and UserRobot are
# written as regular columns.
par_robot_proceso.reset_index().set_index("Loaded").to_parquet(
    "../clean_data/process_robot.parquet",
    index=True,
    engine="pyarrow",
    compression="snappy",
)

In [6]:
# Daily exception rate over all processes and robots together.
# Only the boolean FinishStatus column is aggregated, and it contains no NaN
# (it is produced by str.contains(..., na=False)), so the previous fillna(0)
# had no effect on the result and is dropped. Grouping directly on 'Loaded'
# already yields a frame indexed by 'Loaded', which makes the former
# reset_index()/set_index('Loaded') round-trip unnecessary.
avg_global = filtered_df.groupby("Loaded").agg(
    tasa_excepcion=("FinishStatus", "mean"),
    entries=("FinishStatus", "count"),
)
# Exception rate = 1 - share of completed rows for that day.
avg_global["tasa_excepcion"] = 1 - avg_global["tasa_excepcion"]

avg_global.to_parquet(
    "../clean_data/avg_global.parquet",
    index=True,
    engine="pyarrow",
    compression="snappy",
)