# Conectando com o Drive 

In [None]:
# Mount Google Drive at /gdrive so the results can be saved there
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Inicialização do ambiente

## Importando as bibliotecas

In [None]:
import os
import sys
import json
import math
import random
import pickle
import numpy as np
import pandas as pd
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

from google_drive_downloader import GoogleDriveDownloader as gdd
from google.colab import files

## Carregando dados

### Métodos

In [None]:
def get_duration_predictor():
  """Download the pickled duration predictor from Google Drive and load it.

  Returns:
    The object stored in 'duration_predictor.pickle' (used elsewhere in this
    notebook through its ``predict`` method).
  """
  google_id = '1fIHJUHoC-aj5rEACAsTfyvmKdlErrhaw'
  gdd.download_file_from_google_drive(file_id=google_id, 
                                      dest_path = './duration_predictor.pickle', 
                                      showsize = True)
  # NOTE(review): pickle.load runs arbitrary code from the file; only use with a trusted Drive file.
  # The context manager closes the handle, which the bare open() call never did.
  with open('duration_predictor.pickle', 'rb') as predictor_file:
    return pickle.load(predictor_file)

def get_duration_mean():
  """Download the mean-duration table from Google Drive and read it as a DataFrame."""
  download_args = {'file_id': '1TbJb2OBWLubEgWT-WEmn5s4hTwUSouw9',
                   'dest_path': './duration_predictor_mean.csv',
                   'showsize': True}
  gdd.download_file_from_google_drive(**download_args)
  mean_durations = pd.read_csv('duration_predictor_mean.csv')
  return mean_durations
  

def get_duration_predictor_features():
  """Download the pickled feature list of the duration predictor and load it.

  Returns:
    The object stored in 'duration_predictor_features.pickle'; the notebook
    iterates over it to build the columns of the model-input template.
  """
  google_id = '1tvmT3U1CZLmSIXhKR7krwPOVTbD7OP4W'
  gdd.download_file_from_google_drive(file_id=google_id, 
                                      dest_path = './duration_predictor_features.pickle', 
                                      showsize = True)
  # NOTE(review): pickle.load runs arbitrary code from the file; only use with a trusted Drive file.
  # The context manager closes the handle, which the bare open() call never did.
  with open('duration_predictor_features.pickle', 'rb') as features_file:
    return pickle.load(features_file)

def get_test_df():
  """Download the test log, normalise its column names and add the next-event columns.

  Returns:
    DataFrame with 'time:timestamp' parsed as datetime and the '*_next'
    columns produced by ``get_next_events``.
  """
  events = rename_cols(pd.read_csv(load_test_df()))
  events['time:timestamp'] = pd.to_datetime(events['time:timestamp'], format="%Y/%m/%d %H:%M:%S")
  return get_next_events(events)

def load_test_df():
  """Download the test-set CSV from Google Drive and return its local file name."""
  google_id = '1881WtafRdH_hk2gUxH18a2xr9MkG6mJs'
  csv_filename = 'data_test.csv'
  destination = './%s'%csv_filename
  gdd.download_file_from_google_drive(file_id=google_id,
                                      dest_path=destination,
                                      showsize=True)
  return csv_filename

def rename_cols(df):
  """Return a copy of ``df`` with the '*_shifted' columns renamed to '*_prev'.

  Columns that are not listed are left untouched; ``df`` itself is not modified.
  """
  shifted_to_prev = dict((
      ('lifecycle:transition_shifted', 'lifecycle:transition_prev'),
      ('Activity_shifted', 'Activity_prev'),
      ('org:resource_shifted', 'org:resource_prev'),
      ('time:timestamp_shifted', 'time:timestamp_prev'),
  ))
  renamed = df.rename(columns=shifted_to_prev)
  return renamed

def get_next_events(df):
  """Sort the log and attach, to every row, the values of the row that follows it.

  Rows are ordered by case, activity, resource and timestamp; the '*_next'
  columns hold the following row's values (NaN/NaT on the last row). The shift
  is done over the whole sorted frame, not per case.

  Returns:
    A new, sorted DataFrame; ``df`` itself is not modified.
  """
  ordered = df.sort_values(['Case ID','Activity','org:resource','time:timestamp'])
  next_columns = (
      ('lifecycle:transition_next', 'lifecycle:transition'),
      ('Activity_next', 'Activity'),
      ('org:resource_next', 'org:resource'),
      ('time:timestamp_next', 'time:timestamp'),
      ('Duration(s)_next', 'Duration(s)'),
  )
  for target, source in next_columns:
    ordered[target] = ordered[source].shift(-1)
  ordered['time:timestamp_next'] = pd.to_datetime(ordered['time:timestamp_next'], format="%Y/%m/%d %H:%M:%S")
  return ordered

### Carregando dados

In [None]:
# Load the duration model, the mean-duration table, the model's feature list and the test log.
duration_predictor = get_duration_predictor()
duration_predictor_mean = get_duration_mean()
features = get_duration_predictor_features()
data_test = get_test_df()
# `base` is a one-row, all-zero frame with one column per feature; predict_duration
# copies it to build the input passed to duration_predictor.predict.
base = pd.DataFrame(columns = [col for col in features])
base.loc[0] = 0

In [None]:
data_test['Duration(s)'].sum()/60/60/24

24.464027777777776

In [None]:
# Per-resource activity count used by apply_pi to seed the workload counter
# (per the name, the count at the end of training).
LAST_TRAINING_WL = {10138: 0, 10609: 0, 10629: 0, 10809: 0, 10861: 0, 10881: 0, 10889: 0, 
                      10899: 0, 10909: 0, 10910: 1, 10913: 0, 10929: 0, 10932: 0, 10972: 0, 
                      10982: 0, 11000: 1, 11003: 1, 11009: 0, 11049: 0, 11119: 0, 11121: 0, 
                      11122: 0, 11169: 0, 11179: 0, 11180: 0, 11181: 0, 11189: 0, 11201: 0, 
                      11203: 0, 11259: 0}

# Resource ids; apply_pi uses them as the columns of the workload counter, so this
# order is also the key order of every 'Workload' dict and of the state strings.
RESOURCES = [11180,10982,11121,10609,10899,10629,11049,11201,10889,11119,11179,11169,10809,11122,
             11181,11009,11189,10138,10881,10909,10972,11203,10913,11000,10861,11259,10932,10910,
             10929,11003]

In [None]:
data_test.head()

Unnamed: 0,Case ID,Activity,AMOUNT_REQ,REG_DATE,org:resource,lifecycle:transition,time:timestamp,lifecycle:transition_prev,Activity_prev,org:resource_prev,time:timestamp_prev,duration,Duration(s),lifecycle:transition_next,Activity_next,org:resource_next,time:timestamp_next,Duration(s)_next
0,197509,W_Calling after sent offers,2500,03/01/2012 12:52:17,10909.0,COMPLETE,2012-02-11 09:16:04,COMPLETE,W_Calling after sent offers,10909.0,2012-01-26 11:17:51,,,COMPLETE,W_Calling after sent offers,11201.0,2012-02-06 09:15:06,
1,197581,W_Calling after sent offers,10000,03/01/2012 15:29:50,11201.0,COMPLETE,2012-02-06 09:15:06,COMPLETE,W_Calling after sent offers,11201.0,2012-01-30 16:35:50,,,COMPLETE,W_Calling after sent offers,11000.0,2012-02-04 09:15:36,
2,197680,W_Calling after sent offers,4000,03/01/2012 20:07:15,11000.0,COMPLETE,2012-02-04 09:15:36,COMPLETE,W_Calling after sent offers,11000.0,2012-02-02 13:02:08,,,START,W_Calling after sent offers,10909.0,2012-02-11 14:49:25,
3,197701,W_Calling after sent offers,6500,03/01/2012 20:47:09,10909.0,START,2012-02-11 14:49:25,COMPLETE,W_Calling after sent offers,10909.0,2012-01-26 11:24:55,,,COMPLETE,W_Calling after sent offers,10909.0,2012-02-11 14:56:04,399.0
4,197701,W_Calling after sent offers,6500,03/01/2012 20:47:09,10909.0,COMPLETE,2012-02-11 14:56:04,START,W_Calling after sent offers,10909.0,2012-02-11 14:49:25,0 days 00:06:39,399.0,COMPLETE,W_Calling after sent offers,11180.0,2012-02-06 09:15:31,


In [None]:
data_test['lifecycle:transition'].value_counts()

COMPLETE    3078
START       3006
Name: lifecycle:transition, dtype: int64

In [None]:
data_test.shape

(6084, 18)

In [None]:
# Keep every COMPLETE event; keep a START event only when the row after it has a duration.
# The filter overwrites data_test, but re-running the cell gives the same frame.
data_test = data_test[((data_test['lifecycle:transition'] == 'START') & (~data_test['Duration(s)_next'].isnull())) | 
                      (data_test['lifecycle:transition'] == 'COMPLETE')]
data_test.shape

(6083, 18)

### Carregando resultados

In [None]:
# (Google Drive file id, model name) pairs. Each id is downloaded as a CSV of Q-values
# and passed to process_q together with the model name.
results = [('1M0wD8lsRAKUwFBufJfeid77TYi-pnY2A', 'Regressão Linear'),
           ('1-1ipoKUqD6lP1qaD-JK-aiInwhNy_kiC', 'Random Forest - 5'),
           ('1-4ZSsOBsDpE35bmFEGMZkc-LCFk50sJ-', 'Random Forest - 10'),
           ('1-4yFL-z8EhVCWSUSmBpJp3YvhhxJco3Y', 'Random Forest - 20'),
           ('1-98yguJIuLUpAEf412DizIwCm0KM2nZt', 'Random Forest - 100'),
           ('1-EynA-s8So8CW8o4d8ojk00Z40y8MRZP', 'MLP - 10,10_relu'),
           ('1-HIUVV-XIpBZ_kU4xhOhFWu4-GDmkFs5', 'MLP - 50,50_relu'),
           ('1-IZUp5jSg9Pq3KrcGSU8WI5HBGkdhOP1', 'MLP - 100_relu'),
           ('1-_RrBUdxCaCnpj_yu-9ux8C6OEAq-SGD', 'MLP - 50,50_sigmoide'),
           ('1-YYkvxn-HvqSM1QGePWcn2cxhvKfBOr1', 'MLP - 100_sigmoide'),
           ('1-beS5nmVNjvPzXj7v6b_t5PfL76UVhTC', 'RPROP - 50,50_relu'),
           ('1-eSmFcFTl2VjORoMVrhd_613YnVdyJxx', 'RPROP - 100_sigmoide'),
           ('1-dL6U7hYb3XVzjmF1EEtQZlNUsrqR8sL', 'RPROP - 500,500_sigmoide'),]

In [None]:
# Global accumulators; process_q appends one batch of rows per model to each of them.
# Re-running this cell resets them to empty frames.
QS = pd.DataFrame()            # melted Q-values per iteration
STATS = pd.DataFrame()         # per-iteration summary statistics
DELTAS = pd.DataFrame()        # summed absolute Q change between consecutive iterations
CASES_OTIMIZ = pd.DataFrame()  # per-case totals for the last iteration of each model

# Simulação para avaliação da aplicação da política

## Métodos

In [None]:
def process_q(q, alg_name):
  """Evaluate the policies derived from one Q-table and accumulate their statistics.

  For every iteration ``n`` stored in ``q`` the policy is extracted with the
  module-level ``get_pi``, applied to the test log with ``apply_pi`` and
  summarised. Policies and simulated logs are cached as CSV files on Google
  Drive and are reused when the files already exist.

  Args:
    q: DataFrame with the id columns 'case', 'state', 'action', 'cost' and
      'next_state' plus one Q-value column per iteration, named '0', '1', ...
    alg_name: label of the model that produced ``q``; used in the cache file
      names and in the 'Modelo' column of the accumulated frames.

  Side effects:
    Appends rows to the global frames QS, DELTAS, STATS and CASES_OTIMIZ,
    prints each iteration number, and may write cache files under
    '/gdrive/My Drive/ppar_results/'.
  """
  global QS, DELTAS, STATS, CASES_OTIMIZ

  pis_dir = '/gdrive/My Drive/ppar_results/pis/'
  applied_dir = '/gdrive/My Drive/ppar_results/pi_applied_data/'
  applied_cols = ['Case ID', 'Activity', 'Duration(s)_next','pi_action','pred_duration',
                  'pi_action_dur_source', 'Workload', 'time:timestamp']

  def load_or_compute_pi(n, filename):
    """Return the policy of iteration ``n``, reading the cached CSV when it exists."""
    pi_path = pis_dir + filename
    if os.path.isfile(pi_path):
      return pd.read_csv(pi_path)
    # This helper used to be named get_pi as well, so the call below resolved to
    # the helper itself (which takes no arguments) instead of the module-level
    # get_pi(states_set, n), and failed whenever the cache file was missing.
    pi = get_pi(q, n)
    pi.to_csv(pi_path, index=False)
    return pi

  def get_data_pi(n):
    """Return the test log simulated under the policy of iteration ``n`` (cached on Drive)."""
    filename = '%s-n_%s.csv'%(alg_name,str(n))
    data_path = applied_dir + filename
    if os.path.isfile(data_path):
      return pd.read_csv(data_path)
    data_pi = apply_pi(load_or_compute_pi(n, filename))
    data_pi[applied_cols].to_csv(data_path, index=False)
    return data_pi

  # Long format: one row per (Q-table row, iteration).
  d_melted_n = q.melt(id_vars=['case','state','action','cost', 'next_state'])
  d_melted_n = d_melted_n.rename(columns={'variable':'N', 'value':'Q'})
  d_melted_n['N'] = d_melted_n['N'].astype(int)
  d_melted_n['Q'] = d_melted_n['Q']/3600 # in hours
  d_melted_n['Modelo'] = alg_name
  QS = pd.concat([QS, d_melted_n])

  diffs = {'N':[], 'Delta':[]}
  q_stats = {'Total(d)': [], 'Otimizados (%)': [], 'N':[], 'Modelo': [], 
             '#Original_dur':[], '#Dur_model':[], '#Dur_avg_wl':[], '#Dur_avg_act':[], '#Events Otimiz':[]}
  # Stays an empty frame when q holds no iteration beyond '0', so the code after the loop still runs.
  cases_otimiz = pd.DataFrame()
  last_n = d_melted_n.N.max()
  for n in range(1, last_n+1):
    print(n)
    diffs['N'].append(n)
    # Summed absolute change of the Q-values between consecutive iterations, in hours.
    diffs['Delta'].append(abs(q[str(n)]-q[str(n-1)]).sum()/3600)

    data_test_pi = get_data_pi(n)

    # Per-case totals: original duration vs. duration under the policy.
    cases_otimiz = pd.DataFrame(data_test_pi.groupby('Case ID')['Duration(s)_next'].sum().reset_index())
    cases_otimiz['Pi'] = data_test_pi.groupby('Case ID')['pred_duration'].sum().reset_index()['pred_duration']
    cases_otimiz['Otimizacao'] = cases_otimiz['Pi']-cases_otimiz['Duration(s)_next']

    sources = data_test_pi['pi_action_dur_source']
    event_diff = data_test_pi['pred_duration'] - data_test_pi['Duration(s)_next']

    q_stats['Total(d)'].append(data_test_pi.pred_duration.sum()/3600/24)
    # NOTE(review): both ratios below count rows where the policy duration is LARGER
    # than the original one (difference > 0); confirm this is the intended direction.
    q_stats['Otimizados (%)'].append((cases_otimiz['Otimizacao'] > 0).sum()/len(cases_otimiz['Case ID'].unique()))
    q_stats['#Original_dur'].append(int((sources == 'original').sum()))
    q_stats['#Dur_model'].append(int((sources == 'ml_model').sum()))
    q_stats['#Dur_avg_wl'].append(int((sources == 'avg_wl').sum()))
    q_stats['#Dur_avg_act'].append(int((sources == 'avg_act').sum()))
    q_stats['#Events Otimiz'].append((event_diff > 0).sum()/data_test_pi.shape[0])
    q_stats['N'].append(n)
    q_stats['Modelo'].append(alg_name)

  STATS = pd.concat([STATS, pd.DataFrame(q_stats)])

  diffs = pd.DataFrame(diffs)
  diffs['Modelo'] = alg_name
  DELTAS = pd.concat([DELTAS, diffs])

  # Only the per-case totals of the last iteration are kept. The column keeps the
  # name 'Duration(s)_next': the earlier rename to 'Custo original' discarded its
  # result, and later cells read the column under the original name.
  cases_otimiz['Modelo'] = alg_name
  CASES_OTIMIZ = pd.concat([CASES_OTIMIZ, cases_otimiz])

In [None]:
def get_pi(states_set, n):
  """Extract the policy of iteration ``n``: for each state, the action with the lowest mean Q.

  Args:
    states_set: DataFrame with 'state' and 'action' columns and a Q-value
      column named ``str(n)``.
    n: iteration number.

  Returns:
    DataFrame with columns 'state', 'action' and 0 (the mean Q-value), holding
    one row per state.
  """
  mean_q = states_set.groupby(['state','action'])[str(n)].mean().rename(0).reset_index()
  best_rows = mean_q.groupby('state')[0].idxmin()
  return mean_q.loc[best_rows]

def apply_pi(pi):
  """Replay the test log, choosing the resource of every START event with policy ``pi``.

  Events are consumed in timestamp order from the global queue DATA2PROCESS
  (initialised from ``data_test``). For each START event the current workload
  is recorded, the policy picks a resource, a duration is predicted, a
  synthetic 'SINT_COMPLETE' event is queued at the predicted end time and the
  timestamps of later events of the same case are shifted by the difference
  between the original and the predicted duration.

  Args:
    pi: DataFrame with at least the columns 'state' and 'action'.

  Returns:
    DataFrame with the processed events (synthetic completion events removed)
    and the extra columns 'Workload', 'pi_action', 'pred_duration' and
    'pi_action_dur_source' filled for START events.

  Behaviour changes with respect to the previous implementation:
    * the resource, activity and workload values are written into the model
      input row; before, they were added as extra rows, so the model always
      scored an all-zero row;
    * the last queued event is processed too;
    * the 'avg_act' fallback reads the 'mean' column (it raised AttributeError).
    CSV files cached by process_q with the previous implementation do not
    reflect these changes.
  """
  def inicialize_workload_count():
    # One column per resource; the single row 0 holds the number of running activities.
    WORKLOAD_COUNT = pd.DataFrame(columns=RESOURCES)
    WORKLOAD_COUNT.loc[0] = 0
    # Start from the workload count reached at the end of training.
    for r, count in LAST_TRAINING_WL.items():
      # A single .loc call: the chained form `df[r].loc[0] = ...` may write to a temporary copy.
      WORKLOAD_COUNT.loc[0, r] = count
    return WORKLOAD_COUNT

  def process_event(event):

    def set_event_wl():
      # Store the workload level of every resource on the event and return the level frame.
      wl = get_wl(WORKLOAD_COUNT)
      event['Workload'] = get_wl_dict(wl)
      return wl


    def get_wl(wl_count):
      # Turn the per-resource counts into levels: 0 (FREE) for no running activity,
      # 1 (LOW) for at most the average count per resource, 2 (HIGH) otherwise.
      activities_being_exec = wl_count.T.sum().loc[0]
      AVG_R = (activities_being_exec/len(wl_count.columns)) 

      def get_scale_int(x):
        if x[0] < 1:
          x[0] = 0#'FREE'
          return x
        if x[0] <= AVG_R:
          x[0] = 1#'LOW'
          return x
        x[0] = 2#'HIGH'
        return x
      
      wl = wl_count.copy().T
      wl = wl.apply(get_scale_int, axis=1)
      return wl.T


    def get_wl_dict(wl):
      return wl.to_dict('records')[0]


    def get_original_duration():
      # The logged duration is only used when the next row is the COMPLETE of
      # this same activity and resource; otherwise None is returned.
      original_duration = None
      if (event['lifecycle:transition_next'] == 'COMPLETE' and 
          event['Activity_next'] == event['Activity'] and
          event['org:resource_next'] == event['org:resource']):
        # verified that the dilemma of having two activity instances started for the same case (and executed by the same resource), 
        #  described in Process Mining book (by W. van der Aalst), p.132, does not happen in this log
        original_duration = event['Duration(s)_next'] 
      return original_duration


    def set_action_and_duration(wl, original_duration):
      # The state key is '<activity>-<workload dict rendered as text>'.
      s = event['Activity'] + '-' + get_wl_str(event['Workload'])
      a_pi, duration, source = get_pi_action_and_duration(s, event['org:resource'], original_duration, 
                                                  event['Activity'], wl)
      event['pi_action'] = a_pi
      event['pred_duration'] = duration
      event['pi_action_dur_source'] = source


    def get_wl_str(wl):
      return json.dumps(wl).replace('"','').replace(': 0',": 'FREE'").replace(': 1',": 'LOW'").replace(': 2',": 'HIGH'")


    def get_pi_action_and_duration(state, original_action, original_duration, activity, wl):
      # Use the policy's action when the state is known; otherwise keep the
      # logged resource and duration. Either way the chosen resource gets busier.
      selection = pi[pi['state'] == state].action
      if selection.shape[0]>0:
        a_pi = int(selection.iloc[0])
        WORKLOAD_COUNT[a_pi] = WORKLOAD_COUNT[a_pi] + 1
        duration, source = predict_duration(a_pi, activity, wl)
        return a_pi, duration, source
      WORKLOAD_COUNT[int(original_action)] = WORKLOAD_COUNT[int(original_action)] + 1
      return original_action, original_duration, 'original'


    def predict_duration(action, activity, wl):
      # Duration for (resource, activity, workload level): the model prediction
      # when it is positive, else the mean for the same workload level, else the
      # mean over all workload levels.
      resource_wl = wl[action].iloc[0]
      x_predictor = base.copy()
      # Flag the resource and the activity and set the workload in the single
      # input row. `x.loc[label] = 1` would add a new ROW instead and leave
      # row 0 (the one that is scored) all zeros.
      # NOTE(review): assumes the feature labels are the float resource id, the
      # activity name and 'workload'; labels absent from `base` are skipped — confirm.
      for feature, value in ((float(action), 1), (activity, 1), ('workload', resource_wl)):
        if feature in x_predictor.columns:
          x_predictor.loc[0, feature] = value
      pred = duration_predictor.predict(x_predictor)[0]
      if pred > 0:
        return pred, 'ml_model'
      same_res_act = duration_predictor_mean[(duration_predictor_mean['org:resource']==action) &
                            (duration_predictor_mean['Activity']==activity)]
      select_mean = same_res_act[same_res_act['Workload-esp_resource']==resource_wl]
      if select_mean.shape[0]>0:
        return select_mean['mean'].iloc[0], 'avg_wl'
      # `same_res_act.mean` is the DataFrame method, not the column, so the column is indexed by name.
      return same_res_act['mean'].mean(), 'avg_act'


    def create_sint_complete_event():
      # Queue a synthetic completion event at the predicted end time; it frees
      # the resource when it is reached in the main loop.
      global DATA2PROCESS
      end_event = event.copy()
      event_complete_ts = event['time:timestamp'] + pd.to_timedelta(event['pred_duration'], unit = 'seconds')
      end_event['time:timestamp'] = event_complete_ts
      end_event['lifecycle:transition'] = 'SINT_COMPLETE' 
      # DataFrame.append was removed in pandas 2.0; this concat of a one-row frame is its replacement.
      DATA2PROCESS = pd.concat([DATA2PROCESS, end_event.to_frame().T.infer_objects()])

      return event_complete_ts


    def update_case_events_ts(event_complete_ts, original_duration):
      # Shift the queued events of the same case that happen after the predicted
      # completion by (original - predicted) seconds.
      case_mask = (DATA2PROCESS['Case ID'] == event['Case ID'])
      ts_mask = (DATA2PROCESS['time:timestamp'] > event_complete_ts)

      durat_diff = original_duration - event['pred_duration']
      if durat_diff != 0:
        DATA2PROCESS.loc[case_mask & ts_mask, 'time:timestamp'] = DATA2PROCESS.loc[case_mask & ts_mask, 'time:timestamp'] - pd.to_timedelta(durat_diff, unit = 'seconds')

    # MAIN OF PROCESS_EVENT
    if event['lifecycle:transition'] == 'START':
      wl = set_event_wl()
      original_duration = get_original_duration()
      set_action_and_duration(wl, original_duration) #from pi
      event_complete_ts = create_sint_complete_event()
      update_case_events_ts(event_complete_ts, original_duration)

    if event['lifecycle:transition'] == 'SINT_COMPLETE':
      # The activity finished: release the resource that executed it.
      WORKLOAD_COUNT[int(event['pi_action'])] = WORKLOAD_COUNT[int(event['pi_action'])] - 1
    
    return event

  #MAIN OF APPLY_PI METHOD
  global DATA2PROCESS
  WORKLOAD_COUNT = inicialize_workload_count()
  DATA2PROCESS = data_test.sort_values('time:timestamp').copy()
  processed_events = []

  # Run until the queue is empty; the previous `> 1` condition left the last event unprocessed.
  while DATA2PROCESS.shape[0] > 0:
    processed_events.append(process_event(DATA2PROCESS.iloc[0].copy()))
    # process_event may have queued a synthetic event, so re-sort after dropping the head.
    DATA2PROCESS = DATA2PROCESS.iloc[1:].sort_values('time:timestamp')

  # Build the result once instead of appending row by row.
  PROCESSED = pd.DataFrame(processed_events).infer_objects()
  if PROCESSED.empty:
    return PROCESSED
  return PROCESSED[PROCESSED['lifecycle:transition']!= 'SINT_COMPLETE']

## Aplicar política e coletar estatísticas

In [None]:
# For each model: download its Q-value table and feed it to process_q, which fills the
# global QS / STATS / DELTAS / CASES_OTIMIZ frames (re-running appends the rows again).
for q_g_id, alg in results:
    print(alg)
    gdd.download_file_from_google_drive(file_id=q_g_id, 
                                    dest_path = './%s.csv'%alg, 
                                    showsize = True)
    q = pd.read_csv('./%s.csv'%alg)
    process_q(q, alg)

Regressão Linear
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
Random Forest - 5
1
2
3
4
5
6
7
8
9
10
11
12
Random Forest - 10
1
2
3
4
5
6
7
8
9
10
11
Random Forest - 20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
Random Forest - 100
1
2
3
4
5
6
7
8
9
10
11
12
13
14
MLP - 10,10_relu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
MLP - 50,50_relu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
MLP - 100_relu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
MLP - 50,50_sigmoide
1
2
3
4
5
MLP - 100_sigmoide
1
2
3
4
5
6
7
RPROP - 50,50_relu
1
2
3
4
5
6
7
8
9
10
11
12
13
RPROP - 100_sigmoide
1
2
3
4
5
6
7
8
9
10
11
12
RPROP - 500,500_sigmoide
1
2
3
4
5
6
7
8
9
10
11
12


## Gráficos



In [None]:
STATS.head()

Unnamed: 0,Total(d),Otimizados (%),N,Modelo,#Original_dur,#Dur_model,#Dur_avg_wl,#Dur_avg_act
0,21.733502,0.45338,1,Regressão Linear,1799,0,1206,0
1,19.208203,0.479021,2,Regressão Linear,1740,0,1265,0
2,19.15276,0.474359,3,Regressão Linear,1768,0,1237,0
3,19.15276,0.474359,4,Regressão Linear,1768,0,1237,0
4,18.32107,0.473193,5,Regressão Linear,1721,0,1284,0


In [None]:
DELTAS.Modelo.unique()

array(['Regressão Linear', 'Random Forest - 5', 'Random Forest - 10',
       'Random Forest - 20', 'Random Forest - 100', 'MLP - 10,10_relu',
       'MLP - 50,50_relu', 'MLP - 100_relu', 'MLP - 50,50_sigmoide',
       'MLP - 100_sigmoide', 'RPROP - 50,50_relu', 'RPROP - 100_sigmoide',
       'RPROP - 500,500_sigmoide'], dtype=object)

In [None]:
# Internal model names -> labels used in the figures. 'Random Forest - 5' has no
# label and keeps its name. Whole-value replacement makes the cell safe to re-run:
# the previous chain of substring replacements prefixed already-renamed labels
# again on a second run.
MODEL_LABELS = {
    'Regressão Linear': 'FQI / Linear Regression',
    'Random Forest - 100': 'FQI / Random Forest - 100',
    'Random Forest - 20': 'FQI / Random Forest - 20',
    'Random Forest - 10': 'FQI / Random Forest - 10',
    'MLP - 10,10_relu': 'NFQ / MLP - (10,10), relu',
    'MLP - 50,50_relu': 'NFQ / MLP - (50,50), relu',
    'MLP - 100_relu': 'NFQ / MLP - (100), relu',
    'MLP - 50,50_sigmoide': 'NFQ / MLP - (50,50), sigmoide',
    'MLP - 100_sigmoide': 'NFQ / MLP - (100), sigmoide',
    'RPROP - 50,50_relu': 'NFQ / RPROP - (50,50), relu',
    'RPROP - 100_sigmoide': 'NFQ / RPROP - (100), sigmoide',
    'RPROP - 500,500_sigmoide': 'NFQ / RPROP - (500,500), sigmoide',
}
DELTAS['Modelo'] = DELTAS['Modelo'].replace(MODEL_LABELS)
graph_delta = px.line(DELTAS[DELTAS.Modelo != 'Random Forest - 5'].rename(columns={'Delta':'∆', 'N':'Iteration'}), 
                      x = 'Iteration', y = '∆', color = 'Modelo',
                      template = 'plotly_white', color_discrete_sequence = px.colors.qualitative.Light24)
graph_delta.show()

In [None]:
# Delta figure with every model (built here but not displayed by this cell).
graph_delta = px.line(DELTAS.rename(columns={'Delta':'∆', 'N':'Iteration'}), 
                      x = 'Iteration', y = '∆', color = 'Modelo',
                      template = 'plotly_white', color_discrete_sequence = px.colors.qualitative.Light24)

# Total Q per iteration and model. Only the Q column is summed: the other QS
# columns are not plotted and include text columns such as 'state'.
q_totals = (QS[QS.Modelo != 'Random Forest - 100']
            .groupby(['N','Modelo'])['Q']
            .sum()
            .reset_index())
graph_q = px.line(q_totals, 
                  x='N', y = 'Q', color = 'Modelo',
                  template = 'plotly_white')

# Total simulated duration (days) and share of "optimised" cases per iteration.
stats_to_plot = STATS[STATS.Modelo != 'Random Forest - 100'].sort_values(by=['Modelo','N'])

graph_total = px.line(stats_to_plot, x='N', y = 'Total(d)', color = 'Modelo',
                  template = 'plotly_white')

graph_improv = px.line(stats_to_plot, x='N', y = 'Otimizados (%)', color = 'Modelo',
                  template = 'plotly_white')

graph_q.show()
graph_total.show()
graph_improv.show()

AttributeError: ignored

## Índices

In [None]:
pd.set_option('display.max_columns', None)


In [None]:
# Statistics of the last iteration of every model, ordered by model name.
# The per-model maximum is no longer stored in a variable called `max`,
# which hid the builtin for the rest of the session.
last_iteration = STATS.groupby('Modelo')['N'].max()
res = pd.concat([STATS[(STATS['Modelo'] == model) & (STATS['N'] == last_n)]
                 for model, last_n in last_iteration.items()])
res

Unnamed: 0,Total(d),Otimizados (%),N,Modelo,#Original_dur,#Dur_model,#Dur_avg_wl,#Dur_avg_act,#Events Otimiz
21,16.893555,0.468531,22,"MLP - 10,10_relu",1479,0,1526,0,0.163571
23,18.71696,0.486014,24,MLP - 100_relu,1668,0,1337,0,0.151406
6,26.812177,0.451049,7,MLP - 100_sigmoide,2040,0,965,0,0.126911
25,17.098168,0.438228,26,"MLP - 50,50_relu",1637,0,1368,0,0.145159
4,30.51109,0.432401,5,"MLP - 50,50_sigmoide",2052,0,953,0,0.118856
11,15.92832,0.392774,12,RPROP - 100_sigmoide,1443,0,1562,0,0.152228
12,19.11568,0.376457,13,"RPROP - 50,50_relu",1567,0,1438,0,0.144008
11,15.365122,0.409091,12,"RPROP - 500,500_sigmoide",1431,0,1574,0,0.152721
10,21.79998,0.470862,11,Random Forest - 10,1793,0,1212,0,0.137761
13,19.692339,0.47669,14,Random Forest - 100,1740,0,1265,0,0.14483


In [None]:
# Share of the START events whose resource was chosen by the policy, i.e. whose
# duration did not come from the original log. The denominator is the row's own
# event total instead of the hard-coded 3005, and every policy-driven duration
# source is counted (in the results shown only '#Dur_avg_wl' is non-zero).
policy_events = res[['#Dur_model', '#Dur_avg_wl', '#Dur_avg_act']].sum(axis=1)
all_events = policy_events + res['#Original_dur']
res['Pi applied'] = policy_events / all_events

In [None]:
res[['Modelo', 'N', 'Total(d)', 'Pi applied','#Events Otimiz', 'Otimizados (%)']].sort_values('Modelo', ascending=False)

Unnamed: 0,Modelo,N,Total(d),Pi applied,#Events Otimiz,Otimizados (%)
29,Regressão Linear,30,15.893267,0.501165,0.143515,0.390443
11,Random Forest - 5,12,21.87777,0.44193,0.153214,0.475524
14,Random Forest - 20,15,23.074513,0.421963,0.143515,0.469697
13,Random Forest - 100,14,19.692339,0.420965,0.14483,0.47669
10,Random Forest - 10,11,21.79998,0.403328,0.137761,0.470862
11,"RPROP - 500,500_sigmoide",12,15.365122,0.523794,0.152721,0.409091
12,"RPROP - 50,50_relu",13,19.11568,0.478536,0.144008,0.376457
11,RPROP - 100_sigmoide,12,15.92832,0.5198,0.152228,0.392774
4,"MLP - 50,50_sigmoide",5,30.51109,0.317138,0.118856,0.432401
25,"MLP - 50,50_relu",26,17.098168,0.455241,0.145159,0.438228


In [None]:
# NOTE(review): this cell is not idempotent — each run halves 'Duration(s)_next' again,
# so rebuild CASES_OTIMIZ before re-running it.
# NOTE(review): the reason for dividing by 2 is not visible in this notebook; the output
# shows original-duration events (Pi = 399.0) against a halved cost (199.5) — confirm the
# division is still wanted.
CASES_OTIMIZ['Duration(s)_next'] = CASES_OTIMIZ['Duration(s)_next']/2 
# Recompute the per-case difference with the adjusted original cost.
CASES_OTIMIZ.Otimizacao = CASES_OTIMIZ['Pi'] - CASES_OTIMIZ['Duration(s)_next']
# Cases where the policy total is lower than the (adjusted) original cost.
CASES_OTIMIZ[CASES_OTIMIZ.Otimizacao < 0].groupby(['Case ID','Modelo']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Duration(s)_next,Pi,Otimizacao
Case ID,Modelo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
197701.0,RPROP - 100_sigmoide,199.5,169.672414,-29.827586
200085.0,"RPROP - 50,50_relu",350.5,169.672414,-180.827586
200578.0,"MLP - 10,10_relu",203.5,169.672414,-33.827586
200578.0,RPROP - 100_sigmoide,203.5,169.672414,-33.827586
200578.0,"RPROP - 500,500_sigmoide",203.5,169.672414,-33.827586
...,...,...,...,...
214319.0,"MLP - 50,50_relu",504.5,265.460227,-239.039773
214319.0,RPROP - 100_sigmoide,504.5,265.460227,-239.039773
214319.0,"RPROP - 50,50_relu",504.5,265.460227,-239.039773
214319.0,"RPROP - 500,500_sigmoide",504.5,265.460227,-239.039773


In [None]:
CASES_OTIMIZ[CASES_OTIMIZ.Otimizacao == 0].groupby(['Case ID','Modelo']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Duration(s)_next,Pi,Otimizacao
Case ID,Modelo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
197509.0,"MLP - 10,10_relu",0.0,0.0,0.0
197509.0,MLP - 100_relu,0.0,0.0,0.0
197509.0,MLP - 100_sigmoide,0.0,0.0,0.0
197509.0,"MLP - 50,50_relu",0.0,0.0,0.0
197509.0,"MLP - 50,50_sigmoide",0.0,0.0,0.0
...,...,...,...,...
204808.0,Random Forest - 10,0.0,0.0,0.0
204808.0,Random Forest - 100,0.0,0.0,0.0
204808.0,Random Forest - 20,0.0,0.0,0.0
204808.0,Random Forest - 5,0.0,0.0,0.0


In [None]:
CASES_OTIMIZ[CASES_OTIMIZ.Otimizacao > 0].groupby(['Case ID','Modelo']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Duration(s)_next,Pi,Otimizacao
Case ID,Modelo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
197701.0,"MLP - 10,10_relu",199.5,399.000000,199.500000
197701.0,MLP - 100_relu,199.5,528.693182,329.193182
197701.0,MLP - 100_sigmoide,199.5,399.000000,199.500000
197701.0,"MLP - 50,50_relu",199.5,399.000000,199.500000
197701.0,"MLP - 50,50_sigmoide",199.5,399.000000,199.500000
...,...,...,...,...
214376.0,Random Forest - 10,27.5,55.000000,27.500000
214376.0,Random Forest - 100,27.5,55.000000,27.500000
214376.0,Random Forest - 20,27.5,592.460000,564.960000
214376.0,Random Forest - 5,27.5,592.460000,564.960000


# Verificar em cada política a distribuição de recursos nos estados

In [None]:
# Folder with the cached policy CSVs; list its files in lexicographic order.
PIS_PATH = '/gdrive/MyDrive/ppar_results/pis/'
filenames = [filename for filename in os.listdir(PIS_PATH)]
filenames = sorted(filenames)

In [None]:
# Folder with the simulated logs; the files listed are still the policy files in PIS_PATH
# (the loop below reads both folders with the same file name).
APPLIED_DATA_PATH = '/gdrive/MyDrive/ppar_results/pi_applied_data/'
filenames = [filename for filename in os.listdir(PIS_PATH)]
filenames = sorted(filenames)
pis_df = pd.DataFrame({'filename':  filenames})
# Keep one file per model: the key is the text before '-n' and later entries of the
# sorted list overwrite earlier ones.
# NOTE(review): the sort is lexicographic, so '...-n_9.csv' comes after '...-n_30.csv';
# the file kept is not necessarily the highest iteration — confirm this is intended.
filenames = {filename.split('-n')[0]: filename for filename in filenames}

import ast
def check_workloads(x,n):
  """Count, for one event, how many resources have a workload level above ``n``.

  Args:
    x: row (Series or dict) whose 'Workload' entry is the text form of a
      {resource: level} dict, or a missing value.
    n: level threshold.

  Returns:
    ``x`` with the count stored under the key '>n?' (e.g. '>1?'); rows with a
    missing 'Workload' are returned unchanged.
  """
  # pd.isna recognises any missing value; the previous `is not np.nan` identity
  # test only recognised the np.nan object itself.
  if not pd.isna(x['Workload']):
    wl = ast.literal_eval(x['Workload'])
    x['>%s?'%str(n)] = sum(level > n for level in wl.values())
  return x

# One row per model: number of distinct resources (actions) the policy uses per activity.
model_rows = []
for model in filenames:
  filename = filenames[model]
  pi = pd.read_csv(PIS_PATH+filename)
  # The state key is '<activity>-<workload text>'; keep the activity part.
  pi['activity'] = pi.state.apply(lambda x: x.split('-')[0])
  applied_data = pd.read_csv(APPLIED_DATA_PATH+filename)
  # Adds the '>1?' and '>2?' columns to applied_data. This row-wise pass is slow
  # and its result is not used yet.
  # TODO: add an 'Overload' column to `count` from the value counts of these columns.
  for n in [1,2]:
    applied_data = applied_data.apply(lambda x: check_workloads(x,n), axis=1)
  # The count does not depend on n, so it is computed once per model; inside the
  # n loop it produced two identical rows for every model.
  count = pd.DataFrame(pi.groupby('activity').action.nunique()).T.reset_index(drop=True)
  count['Modelo'] = model
  model_rows.append(count)

# Most recently processed model first, as when each row was prepended to the result.
res_in_pis = pd.concat(model_rows[::-1]) if model_rows else pd.DataFrame()

KeyboardInterrupt: ignored

In [None]:
res_in_pis

# Verificar ocupação dos recursos depois da aplicação da política

## Gráfico de ocupação de recursos

Fazer o mesmo gráfico de ocupação dos recursos só que durante os eventos do conjunto de teste após a aplicação da política

In [None]:
# Load the simulated log of one model/iteration (linear regression, iteration 30).
APPLIED_DATA_PATH = '/gdrive/MyDrive/ppar_results/pi_applied_data/'
applied_data_filename = 'Regressão Linear-n_30.csv'
applied_data = pd.read_csv(APPLIED_DATA_PATH + applied_data_filename)
# NOTE(review): the format uses '/' separators while the timestamps displayed from this
# file use '-' and may carry fractional seconds — confirm it parses on the pandas version in use.
applied_data['time:timestamp'] = pd.to_datetime(applied_data['time:timestamp'], 
                                                format="%Y/%m/%d %H:%M:%S")

In [None]:
applied_data.head()

Unnamed: 0,Case ID,Activity,Duration(s)_next,pi_action,pred_duration,pi_action_dur_source,Workload,time:timestamp
0,202185.0,W_Calling to add missing information to the ap...,133.0,10629.0,272.461538,avg_wl,"{11180: 0, 10982: 0, 11121: 0, 10609: 0, 10899...",2012-02-03 16:40:30.000000000
1,202185.0,W_Calling to add missing information to the ap...,,,,,,2012-02-03 16:42:43.000000000
2,206053.0,W_Filling in information for the application,548.0,10861.0,248.460227,avg_wl,"{11180: 0, 10982: 0, 11121: 0, 10609: 0, 10899...",2012-02-03 16:47:24.000000000
3,206053.0,W_Filling in information for the application,,,,,,2012-02-03 16:51:32.460227273
4,202947.0,W_Calling to add missing information to the ap...,1028.0,10629.0,272.461538,avg_wl,"{11180: 0, 10982: 0, 11121: 0, 10609: 0, 10899...",2012-02-03 16:51:59.000000000


In [None]:
from ast import literal_eval

#na representacao vetorial
def def_workload_status(x):
  """Replace the numeric workload levels of a row by their names.

  ``x['Workload']`` must be a {resource: level} dict; level 0 becomes 'FREE',
  level 1 becomes 'LOW' and anything else becomes 'HIGH'. The row is updated
  in place and returned.
  """
  named_levels = {}
  for resource, level in x['Workload'].items():
    if level == 0:
      named_levels[resource] = 'FREE'
    else:
      named_levels[resource] = 'LOW' if level == 1 else 'HIGH'
  x['Workload'] = named_levels
  return x
# Drop every row that has a missing value in any column.
applied_data = applied_data.dropna()
# Parse the 'Workload' text back into a dict, then replace its numeric levels by names.
# NOTE(review): this cell overwrites applied_data; re-running it would pass dicts to
# literal_eval — reload applied_data first.
applied_data['Workload'] = applied_data['Workload'].apply(lambda x: literal_eval(x))
applied_data = applied_data.apply(lambda x: def_workload_status(x), axis=1)

In [None]:
def get_res_wl(x):
  """Store in 'Resource Workload' the workload entry of the resource in 'pi_action'.

  The resource id is converted to int before it is looked up in the
  ``x['Workload']`` dict. The row is updated in place and returned.
  """
  chosen_resource = int(x['pi_action'])
  workload_by_resource = x['Workload']
  x['Resource Workload'] = workload_by_resource[chosen_resource]
  return x
applied_data = applied_data.apply(get_res_wl, axis=1)

In [None]:
applied_data['Resource Workload'].value_counts()

FREE    2646
HIGH     359
Name: Resource Workload, dtype: int64

In [None]:
# Number of events per (chosen resource, workload level of that resource).
grouped = pd.DataFrame(applied_data.reset_index()
                             .groupby(['pi_action','Resource Workload'])['index']
                             .count())
grouped = (grouped.reset_index()
                 .rename(columns = {'index':'Count'})
                 .sort_values(['Resource Workload','Count']))

# Events per resource. Only 'Count' is summed: aggregating the whole frame would
# also "sum" the text column 'Resource Workload', which then clashes with the
# column of the same name in the join below.
total_per_resource_test = grouped.groupby('pi_action')[['Count']].sum().rename(columns={'Count':'Sum'})
grouped_relative = total_per_resource_test.join(grouped.set_index('pi_action')).reset_index()
# Share of each workload level within the resource's events.
grouped_relative['Count relative'] = grouped_relative['Count'] / grouped_relative['Sum']

In [None]:
grouped

Unnamed: 0,pi_action,Resource Workload,Count
19,10929.0,FREE,2
12,10889.0,FREE,12
6,10809.0,FREE,32
30,11049.0,FREE,32
41,11180.0,FREE,32
20,10932.0,FREE,34
45,11201.0,FREE,34
35,11122.0,FREE,42
34,11121.0,FREE,48
17,10913.0,FREE,50


In [None]:
# Grouped bars: number of events per resource, one bar per workload level.
fig = px.bar(grouped.sort_values(by = ['pi_action','Resource Workload'], ascending =[True,False]).round(2), x = 'pi_action', y = 'Count', 
             color = 'Resource Workload', text = 'Count', template = 'plotly_white', barmode='group',
             category_orders={'pi_action': grouped_relative.sort_values('pi_action').pi_action})
# Treat the resource ids as categories rather than as a numeric axis.
fig.update_xaxes(type='category')
fig

In [None]:
# Same breakdown as a share of each resource's events.
fig = px.bar(grouped_relative.sort_values('pi_action').round(2), x = 'pi_action', y = 'Count relative', 
             color = 'Resource Workload', text = 'Count relative', template = 'plotly_white', height = 400, 
             category_orders={'pi_action': grouped_relative.sort_values('pi_action').pi_action})
# Treat the resource ids as categories rather than as a numeric axis.
fig.update_xaxes(type='category', tickangle=45)

fig

In [None]:
px.box(applied_data, y = 'pred_duration', color = 'Resource Workload', width = 400, height = 600, template = 'plotly_white')

## Contagem de recursos 

In [None]:
pd.DataFrame(applied_data.groupby('Activity')['pi_action'].value_counts())

Unnamed: 0_level_0,Unnamed: 1_level_0,pi_action
Activity,pi_action,Unnamed: 2_level_1
W_Assessing the application,10138.0,266
W_Assessing the application,10609.0,81
W_Assessing the application,10972.0,69
W_Assessing the application,10629.0,45
W_Assessing the application,10809.0,22
...,...,...
W_Fixing incoming lead,11122.0,5
W_Fixing incoming lead,11180.0,2
W_Fixing incoming lead,10609.0,1
W_Fixing incoming lead,10929.0,1


In [None]:
applied_data.pi_action.value_counts()

11181.0    389
10861.0    299
10138.0    271
10881.0    256
10629.0    235
10982.0    137
11169.0    137
11203.0    132
10609.0    112
11179.0    101
11009.0     98
11119.0     88
10899.0     87
10972.0     78
11000.0     68
11259.0     63
10909.0     63
10913.0     52
11003.0     51
11121.0     48
11122.0     43
11201.0     40
10932.0     37
11049.0     36
11180.0     35
10809.0     34
10889.0     13
10929.0      2
Name: pi_action, dtype: int64

# Montar log a partir do resultado da simulação


Passar o resultado da aplicação da política ao conjunto de teste para o formato de log

### Conjunto de teste original

In [None]:
# Original test set in log format: one row per START event with its logged duration.
filter_rows = (data_test['lifecycle:transition']=='START')&(~data_test['Duration(s)_next'].isnull())
filter_cols = ['Case ID', 'Activity', 'time:timestamp', 'org:resource', 'Duration(s)_next']
# A single .loc selection plus an explicit copy, so the assignments below work on
# an independent frame rather than on the result of chained indexing.
data_test_select = data_test.loc[filter_rows, filter_cols].copy()
data_test_select['Duration(s)_next'] = pd.to_timedelta(data_test_select['Duration(s)_next'], unit='s')
data_test_select['end_ts'] = data_test_select['time:timestamp'] + data_test_select['Duration(s)_next']

data_test_select = data_test_select.rename(columns={'org:resource': 'Resource',
                                                    'Duration(s)_next':'Duration(s)'})
data_test_select.to_csv('data_test-log_format.csv')
files.download('data_test-log_format.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Show the log-format rows that belong to a single case.
data_test_select.loc[data_test_select['Case ID'].eq(206639)]

Unnamed: 0,Case ID,Activity,time:timestamp,Resource,Duration(s),end_ts
1927,206639,W_Filling in information for the application,2012-02-06 18:33:55,11181.0,0 days 00:25:17,2012-02-06 18:59:12
1929,206639,W_Fixing incoming lead,2012-02-06 09:32:53,11179.0,0 days 00:02:05,2012-02-06 09:34:58


### Conjunto de teste simulado

In [None]:
# Build the simulated test set in event-log format from the policy-application result.
# NOTE(review): dropna() without `subset` discards a row when ANY column is null,
# including columns that are not exported below -- confirm this is intended.
ap_log = applied_data.dropna().copy()  # explicit copy: the assignments below never touch applied_data
ap_log['Case ID'] = ap_log['Case ID'].astype(int)

# Vectorised seconds -> Timedelta conversion (replaces a per-row pd.Timedelta call).
ap_log['Duration'] = pd.to_timedelta(ap_log['pred_duration'], unit='s')
ap_log['timestamp_end'] = ap_log['time:timestamp'] + ap_log['Duration']

ap_log = ap_log.rename(columns={'time:timestamp': 'timestamp_start',
                                'pi_action': 'Resource'})
ap_log = ap_log[['Case ID', 'Activity', 'Resource', 'Resource Workload',
                 'Duration', 'timestamp_start', 'timestamp_end']]

In [None]:
# Derive the log-format filename by inserting '-logformat' before the extension.
# str.replace('.csv', ...) rewrites every '.csv' occurrence in the path and, when
# the name contains no '.csv', returns it unchanged -- the export below would then
# overwrite the applied-data file itself. splitext only touches the final extension.
logfilename = os.path.splitext(applied_data_filename)[0] + '-logformat.csv'
ap_log.to_csv(logfilename)
files.download(logfilename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Checar se o timestamp foi devidamente alterado

Quando a duração de uma atividade é alterada, o timestamp de início de todas as próximas atividades do case deveria ser alterado.

In [None]:
# Simulated events of one case, ordered by start timestamp.
applied_data.loc[applied_data['Case ID'].eq(198310)].sort_values(by='time:timestamp')

Unnamed: 0,Case ID,Activity,Duration(s)_next,pi_action,pred_duration,pi_action_dur_source,Workload,time:timestamp,Resource Workload
553,198310,W_Assessing the application,18.0,10138.0,886.71875,avg_wl,"{11180: 'FREE', 10982: 'FREE', 11121: 'FREE', ...",2012-02-07 14:02:55.000000,FREE
690,198310,W_Assessing the application,45.0,10609.0,45.0,original,"{11180: 'FREE', 10982: 'FREE', 11121: 'FREE', ...",2012-02-08 09:30:50.718750,FREE
1122,198310,W_Assessing the application,15.0,10138.0,886.71875,avg_wl,"{11180: 'FREE', 10982: 'FREE', 11121: 'FREE', ...",2012-02-09 16:24:29.718750,FREE
2571,198310,W_Assessing the application,374.0,10138.0,374.0,original,"{11180: 'FREE', 10982: 'FREE', 11121: 'FREE', ...",2012-02-15 16:51:21.437500,FREE
2630,198310,W_Assessing the application,113.0,10138.0,886.71875,avg_wl,"{11180: 'FREE', 10982: 'FREE', 11121: 'FREE', ...",2012-02-16 08:43:13.437500,FREE


In [None]:
# Original START events of the same case, ordered by start timestamp, for comparison.
data_test.loc[
    data_test['Case ID'].eq(198310) & data_test['lifecycle:transition'].eq('START')
].sort_values(by='time:timestamp')

Unnamed: 0,Case ID,Activity,AMOUNT_REQ,REG_DATE,org:resource,lifecycle:transition,time:timestamp,lifecycle:transition_prev,Activity_prev,org:resource_prev,time:timestamp_prev,duration,Duration(s),lifecycle:transition_next,Activity_next,org:resource_next,time:timestamp_next,Duration(s)_next
16,198310,W_Assessing the application,25000,05/01/2012 14:24:08,11259.0,START,2012-02-07 14:02:55,COMPLETE,W_Assessing the application,10809.0,2012-01-24 14:40:21,,,COMPLETE,W_Assessing the application,11259.0,2012-02-07 14:03:13,18.0
9,198310,W_Assessing the application,25000,05/01/2012 14:24:08,10609.0,START,2012-02-08 09:16:22,COMPLETE,W_Assessing the application,10138.0,2012-02-15 16:28:35,,,COMPLETE,W_Assessing the application,10609.0,2012-02-08 09:17:07,45.0
11,198310,W_Assessing the application,25000,05/01/2012 14:24:08,10609.0,START,2012-02-09 16:10:01,COMPLETE,W_Assessing the application,10609.0,2012-02-08 09:17:07,,,COMPLETE,W_Assessing the application,10609.0,2012-02-09 16:10:16,15.0
7,198310,W_Assessing the application,25000,05/01/2012 14:24:08,10138.0,START,2012-02-15 16:22:21,COMPLETE,W_Fixing incoming lead,11003.0,2012-01-05 14:00:02,,,COMPLETE,W_Assessing the application,10138.0,2012-02-15 16:28:35,374.0
13,198310,W_Assessing the application,25000,05/01/2012 14:24:08,10629.0,START,2012-02-16 08:14:13,COMPLETE,W_Assessing the application,10629.0,2012-01-20 16:57:24,,,COMPLETE,W_Assessing the application,10629.0,2012-02-16 08:16:06,113.0


**Conclusão:** comparando os timestamps de um dos casos no log original e depois da aplicação da política, parece que os timestamps não estão sendo alterados corretamente :(

# TO DO

- Descartar casos "quebrados" no meio pela divisão de treinamento e teste
- Verificar se o Q que estou escolhendo para determinar a política não é igual para todas as ações (nesse caso, eu não estaria de fato escolhendo com base no Q estimado, mas sim aleatoriamente)

# Hipóteses/riscos

1. Determinar o tempo que um recurso leva para executar uma atividade

 - O que foi feito: modelo de regressão linear que leva em conta o recurso, a atividade e o workload de todos os recursos naquele momento.
 - Oportunidade: a função pode levar em conta somente o workload daquele recurso naquele momento; pode ser, inclusive, simplificada para a média do tempo que aquele recurso leva dado o workload dele (somente dele) naquele momento.

2. Determinar a próxima demanda (próxima atividade a ser executada)
  - O que foi feito: a próxima demanda não depende do recurso (nem do tempo despendido), pois é inerente às características do case (características de negócio) e, assim, a próxima atividade será a mesma que consta originalmente no log.

  - Oportunidade: determinar as probabilidades de transição (a depender da atividade, do recurso e/ou até da duração da atividade) + algo como Monte Carlo para determinar a média de vários cenários?


3. Como a alteração da duração da atividade (que pode ser alterada a partir da designação da execução a um certo recurso) afeta o restante do log

  - O que foi feito: somente o momento de início das próximas atividades do case relacionado é afetado, mas o workload para todos os próximos eventos pode ser afetado.

4. E se o estado s não for encontrado na política pi (aquele cenário de workload na execução daquela atividade)?
  - O que foi feito: mantenho o mesmo recurso (e a mesma duração) originalmente utilizado - nesse caso, a pergunta respondida pela avaliação é: "quanto a política ajuda quando pode ajudar?".
  - Oportunidade: usar uma medida de distância para encontrar o workload mais próximo que existe na política.

Próximos experimentos:
- Combinações de parâmetros dos algoritmos de regressão.
- Testes com diferentes quantidades de casos (resultados bons com menos dados).
- Testes com diferentes filtros de casos (quais tipos de casos são necessários).
