# ML Pipeline testing
---
---

# Imports

#### Standard library imports

In [1]:
import sys
import pickle
import os

sys.path.append("../../")

#### Third party imports

In [2]:
import pandas as pd
pd.set_option('display.max_columns', 500)

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Extraction

In [None]:
## Run the extraction step through `extract_pipeline_func`, a project helper brought in
## by the star imports of this notebook (its definition is not visible here).
extract_pipeline_func()

## Results

In [None]:
## Full path of the pickled extract output, built from the project config values
pkl_obj = "/".join((pipeline_pkl_extract_path, pipeline_pkl_extract_name))

In [None]:
## Load the pickled object at `pkl_obj` into `dfx`
with open(pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

# Transformation

## Steps

### Query

In [None]:
## Appointment-level query: `cita` joined to doctor (`usuario`), service, specialty,
## clinic (`sucursal`) and status log (`citaestatus`), with patient data via LEFT JOIN.
## Restricted to citafecha between 2023-12-15 and 2023-12-16 (inclusive), ordered by citaid.
sql_string = """
SELECT cita.citaid as appointment_id,
       citafecha as appointment_date,
       citahorad as appointment_start_time,
       citahorah as appointment_end_time,
       c.cestatusobs as metadata,
       citaestado as appointment_status,
       u.usuarionomfull as doctor,
       e.especialidadnom as medical_specialty,
       su.sucursalnom as clinic,
       se.servicionom as service,
       p.pacienteid as patient_id,
       p.pacientefnac as patient_birth_date

FROM cita


 INNER JOIN usuario u ON cita.citadoctorid = u.usuarioid
 INNER JOIN servicio se ON cita.servicioid = se.servicioid
 INNER JOIN especialidad e ON u.usuarioespecialidadid = e.especialidadid
 INNER JOIN sucursal su ON cita.citasucursalid = su.sucursalid
 LEFT JOIN paciente p ON cita.pacienteid = p.pacienteid
 INNER JOIN citaestatus c ON cita.citaid = c.citaid AND cita.citaanio = c.citaanio


WHERE citafecha >= '2023-12-15'
  AND citafecha <= '2023-12-16'

ORDER BY
    cita.citaid

;

"""

In [None]:
## Execute `sql_string` through the project helper `sql_string_to_df` (star-imported;
## definition not visible here) and display the result.
## 'pc_db_prod' is presumably a database credentials key — confirm against the helper.
dfx = sql_string_to_df('pc_db_prod', sql_string)

dfx

## Results

In [None]:
## Run the transformation step through `transform_pipeline_func`, a project helper brought
## in by the star imports of this notebook (its definition is not visible here).
transform_pipeline_func()

## Results

In [None]:
## Full path of the pickle to inspect, built from the project config values.
## NOTE(review): this cell sits in the Transformation section but still points at the
## *extract* path/name — confirm whether transform-specific config values were intended.
pkl_obj = "/".join((pipeline_pkl_extract_path, pipeline_pkl_extract_name))

In [None]:
## Load the pickled object at `pkl_obj` into `dfx`
with open(pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

In [None]:
dfx

# Notes

## All appointment status

In [None]:
## Distinct values of `citaestado` for appointments dated 2022-01-01 through 2023-12-31
## (inclusive), returned under the alias `appointment_status`.
sql_string = """
SELECT 
       DISTINCT citaestado as appointment_status

FROM cita


WHERE citafecha >= '2022-01-01'
  AND citafecha <= '2023-12-31'
;
"""

In [None]:
## Execute the distinct-status query through the project helper `sql_string_to_df`
## (star-imported; definition not visible here) and display the result.
dfx = sql_string_to_df('pc_db_prod', sql_string)

dfx

## Metadata actions

### Extracting from SQL

In [None]:
## Status-log metadata per appointment, split into parts with SQL regex SUBSTRING:
##   metadata_usr         -> leading run of non-space characters
##   metadata_datetime    -> text between the first pair of parentheses
##   metadata_action      -> text after ' :' up to the next double space
##   metadata_action_full -> everything after ' :'
## Raw string literal: the regex contains backslashes (`\(`, `\)`) that are invalid
## escape sequences in a normal Python string and trigger a warning there.
sql_string = r"""
SELECT cita.citaid as appointment_id,
       c.cestatusobs as metadata,
       SUBSTRING(c.cestatusobs, '^[^ ]*') as metadata_usr,
       SUBSTRING(c.cestatusobs, '\((.+?)\)') as metadata_datetime,
       SUBSTRING(c.cestatusobs, ' :(.*?)  ') as metadata_action,
       SUBSTRING(c.cestatusobs, ' :(.*)') as metadata_action_full

FROM cita

 INNER JOIN citaestatus c ON cita.citaid = c.citaid AND cita.citaanio = c.citaanio


WHERE citafecha >= '2022-01-01'
  AND citafecha <= '2023-12-31'

;
"""

In [None]:
## Execute the metadata query through the project helper `sql_string_to_df`
## (star-imported; definition not visible here) and keep the result in `dfx`.
dfx = sql_string_to_df('pc_db_prod', sql_string)

In [None]:
dfx

In [None]:
## Give the positional (0..5) columns of the query result readable names
metadata_column_names = [
    "appointment_id",
    "metadata_full",
    "user",
    "date",
    "action",
    "action_full",
]

dfx.rename(columns=dict(enumerate(metadata_column_names)), inplace=True)

### Pickle save/load

In [None]:
## Saving df as pickle and storing it locally
path = '../../pkg_dir/data/pickles/robs'
name = 'metadata_raw.pkl'

## Context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(path, name), 'wb') as pkl_file:
    pickle.dump(dfx, pkl_file)

In [None]:
## Params: location of the raw-metadata pickle
path = '../../pkg_dir/data/pickles/robs'
name = 'metadata_raw.pkl'

## Reading the locally stored pickle back into `dfx`
pkl_obj = "/".join((path, name))

with open(pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

### Cleaning actions fields

In [None]:
## Work on an independent (deep) copy so `dfx` stays untouched
dfz = dfx.copy(deep=True)

In [None]:
## Default label; rows keep it unless one of the following cells recognises their action
dfz = dfz.assign(action_clean='irrelevant')

In [None]:
### Confirmed ('Confirmado')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Confirmado', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'confirmed'

In [None]:
### Created ('Creado')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Creado', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'created'

In [None]:
### In waiting room ('En sala de espera')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('En sala de espera', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'in_waiting_room'

In [None]:
### Completed ('Completado' / 'Completada')
## na=False keeps both masks strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the labels as literal text.
mr1 = dfz['action_full'].str.contains('Completado', na=False, regex=False)
mr2 = dfz['action_full'].str.contains('Completada', na=False, regex=False)
mrs = mr1 | mr2

dfz.loc[mrs, 'action_clean'] = 'completed'

In [None]:
### With doctor ('En consulta')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('En consulta', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'with_doctor'

In [None]:
### No show ('No se presentó')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('No se presentó', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'no_show'

In [None]:
### Canceled by the patient ('Cancelada por el paciente')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Cancelada por el paciente', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'canceled_by_patient'

In [None]:
### Canceled by employee ('Cancelada por empledo')
## NOTE(review): 'empledo' looks like a typo for 'empleado', but the literal is kept as-is:
## the column listing echoed in this notebook contains `meta__canceled_by_employee_*`
## columns, which suggests the source text is spelled this way — confirm against the data.
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Cancelada por empledo', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'canceled_by_employee'

In [None]:
### Date/time updated ('Se actualiza la fecha/hora')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Se actualiza la fecha/hora', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'update_date_hour'

In [None]:
### Available ('Disponible')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Disponible', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'available'

In [None]:
### Message left ('Mensaje dejado')
## na=False keeps the mask strictly boolean when `action_full` is missing (a NaN in the
## mask makes `.loc` raise); regex=False matches the label as literal text.
mr1 = dfz['action_full'].str.contains('Mensaje dejado', na=False, regex=False)

dfz.loc[mr1, 'action_clean'] = 'message_left'

### Formatting and adjusting

In [None]:
dfz[dfz['action_clean'] == 'completed']

In [None]:
## Keep only rows whose action was recognised, restricted to the columns used by the pivot
rc = ['appointment_id', 'user', 'date', 'action_clean']
mr1 = dfz['action_clean'].ne('irrelevant')

dfz = dfz.loc[mr1, rc].copy()

In [None]:
## Parse the metadata timestamp (day/month/year hour:minute:second);
## values that do not match the format are coerced to NaT
parsed_dates = pd.to_datetime(dfz['date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
dfz['date'] = parsed_dates

## Keep only the rows whose timestamp parsed successfully
dfz = dfz.loc[dfz['date'].notna(), :].copy()

In [None]:
## Descending sort so the most recent record of each (appointment, action) pair comes first
dfz = dfz.sort_values(by=['appointment_id', 'action_clean', 'date'], ascending=False)

In [None]:
## One row per (appointment, action): the first row after the descending sort, i.e. the latest
dfz = dfz.drop_duplicates(subset=['appointment_id', 'action_clean'], keep='first')

### Pivot

In [None]:
## Wide layout: one row per appointment, one (field, action) column pair per user/date value
dfp = pd.pivot(
    dfz,
    index='appointment_id',
    columns='action_clean',
    values=['user', 'date'],
)

In [None]:
## Flatten the two-level (field, action) column labels into `meta__<action>_$<field>`
dfp.columns = [f"meta__{action}_${field}" for field, action in dfp.columns]

In [None]:
## Reorder the columns alphabetically
cols = sorted(dfp.columns.tolist())

dfp = dfp.loc[:, cols].copy()

### Pickle save/load

In [None]:
## Saving df as pickle and storing it locally
path = '../../pkg_dir/data/pickles/robs'
name = 'pivot_df_v2.pkl'

## Context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(path, name), 'wb') as pkl_file:
    pickle.dump(dfp, pkl_file)

In [None]:
## Params: location of the pivoted-metadata pickle
path = '../../pkg_dir/data/pickles/robs'
name = 'pivot_df_v2.pkl'

## Reading the locally stored pickle back into `dfp`
pkl_obj = "/".join((path, name))

with open(pkl_obj, mode='rb') as pkl_file:
    dfp = pickle.load(pkl_file)


### Base dataset with no metadata

In [None]:
## Function parameters
## `db_crds` is the identifier handed to `sql_string_to_df` in the next cell
## (presumably a database credentials key — confirm against the helper).
## `query` selects appointment attributes from `cita` joined to doctor (`usuario`),
## service, specialty and clinic (`sucursal`), with patient data via LEFT JOIN, for
## citafecha between 2022-01-01 and 2023-12-31 (inclusive). It does not join `citaestatus`.
db_crds = 'pc_db_prod'
query = '''
SELECT cita.citaid as appointment_id,
       citafecha as appointment_date,
       citahorad as appointment_start_time,
       citahorah as appointment_end_time,
       citaestado as appointment_status,
       u.usuarionomfull as doctor,
       e.especialidadnom as medical_specialty,
       su.sucursalnom as clinic,
       se.servicionom as service,
       p.pacienteid as patient_id,
       p.pacientefnac as patient_birth_date

FROM cita

 INNER JOIN usuario u ON cita.citadoctorid = u.usuarioid
 INNER JOIN servicio se ON cita.servicioid = se.servicioid
 INNER JOIN especialidad e ON u.usuarioespecialidadid = e.especialidadid
 INNER JOIN sucursal su ON cita.citasucursalid = su.sucursalid
 LEFT JOIN paciente p ON cita.pacienteid = p.pacienteid

WHERE citafecha >= '2022-01-01'
  AND citafecha <= '2023-12-31'

;
'''


In [None]:
## Run the base query, then label the result columns in the same order as the SELECT list
dfx = sql_string_to_df(db_crds, query)
dfx.columns = [
    'appointment_id',
    'appointment_date',
    'appointment_start_time',
    'appointment_end_time',
    'appointment_status',
    'doctor',
    'medical_specialty',
    'clinic',
    'service',
    'patient_id',
    'patient_birth_date',
]

### Merging pivot dataframe and main

In [None]:
## Turn the `appointment_id` index back into a regular column so it can be used as a merge key
dfp = dfp.reset_index()

In [None]:
## Left join: every base appointment is kept, with metadata columns attached where available
dataset = dfx.merge(dfp, how='left', on='appointment_id')

### Formatting date columns

In [None]:
dataset

In [None]:
## Convert the appointment date (year-month-day) to datetime
appointment_dates = pd.to_datetime(dataset['appointment_date'], format='%Y-%m-%d')
dataset['appointment_date'] = appointment_dates

### Appointment status simplification

In [None]:
## Raw appointment statuses grouped by the simplified label they collapse into
_raw_statuses_by_label = {
    'completed': ['COMPLETADA', 'SALA_ESPERA', 'CONSULTA', 'CONFIRMADO_PAGO'],
    'no_show': [
        'NO_PRESENTO',
        'MENSAJE_DEJADO',
        'NO_CONFIRMADO',
        'CONFIRMADO',
        'LLEGA_TARDE_ESPERA',
        'REAGENDAR',
        'TRIAGE',
        'TRIAGE_COMPLETO',
        'LISTA_ESPERA',
        'VALIDA_DATOS',
    ],
    'unused': ['DISPONIBLE'],
    'blocked': ['BLOQUEADO'],
    'cancel_patient': ['CANCELA_PACIENTE'],
    'cancel_employee': ['CANCELA_EMPLEADO'],
}

## Reference to simplify labels in appointment status: raw status -> simplified label
values_map = {
    raw_status: label
    for label, raw_statuses in _raw_statuses_by_label.items()
    for raw_status in raw_statuses
}

In [None]:
dataset['appointment_status'].value_counts()

In [None]:
## Translate each raw status through `values_map`; statuses missing from the map keep their raw value
raw_status = dataset['appointment_status']
dataset['appointment_status_simplified'] = raw_status.map(values_map).fillna(raw_status)

In [None]:
dataset['appointment_status_simplified'].value_counts()

In [None]:
## The raw status column is no longer needed once the simplified one exists
dataset = dataset.drop(columns=['appointment_status'])

### Saving data

In [None]:
## Saving df as pickle and storing it locally
path = '../../pkg_dir/data/pickles/robs'
name = 'dataset.pkl'

## Context manager so the file handle is flushed and closed even if pickling fails
with open(os.path.join(path, name), 'wb') as pkl_file:
    pickle.dump(dataset, pkl_file)

In [5]:
## Params: location of the final dataset pickle
path = '../../pkg_dir/data/pickles/robs'
name = 'dataset.pkl'

## Reading the locally stored pickle back into `dataset`
pkl_obj = "/".join((path, name))

with open(pkl_obj, mode='rb') as pkl_file:
    dataset = pickle.load(pkl_file)

In [None]:
## Saving as csv, next to the pickles.
## No `index` argument is passed, so pandas' default applies and the dataframe index is
## written as an extra leading column — NOTE(review): confirm that is wanted.
dataset.to_csv('../../pkg_dir/data/pickles/robs/dataset.csv')

### Data description

In [11]:
list(dataset.columns)

['appointment_id',
 'appointment_date',
 'appointment_start_time',
 'appointment_end_time',
 'doctor',
 'medical_specialty',
 'clinic',
 'service',
 'patient_id',
 'patient_birth_date',
 'meta__available_$date',
 'meta__available_$user',
 'meta__canceled_by_employee_$date',
 'meta__canceled_by_employee_$user',
 'meta__canceled_by_patient_$date',
 'meta__canceled_by_patient_$user',
 'meta__completed_$date',
 'meta__completed_$user',
 'meta__confirmed_$date',
 'meta__confirmed_$user',
 'meta__created_$date',
 'meta__created_$user',
 'meta__in_waiting_room_$date',
 'meta__in_waiting_room_$user',
 'meta__message_left_$date',
 'meta__message_left_$user',
 'meta__no_show_$date',
 'meta__no_show_$user',
 'meta__update_date_hour_$date',
 'meta__update_date_hour_$user',
 'meta__with_doctor_$date',
 'meta__with_doctor_$user',
 'appointment_status_simplified']

## Generate data schema

In [None]:
## Arguments for `generate_data_dictionary` (called in the next cell): the column names
## of `dfx`, plus what are presumably the output folder and file name.
## NOTE(review): the folder has no '../../' prefix, unlike the other paths in this
## notebook — confirm how the helper resolves it.
## NOTE(review): columns are taken from `dfx`, not from `dataset` — confirm which
## dataframe the schema should describe.
df_cols = dfx.columns.tolist()
data_dicts_loc = 'pkg_dir/src/parameters/'
data_dict_filename = 'data_schema.txt'

In [None]:
## Call the project helper `generate_data_dictionary` (star-imported; definition not
## visible here) with the column list, location and file name prepared in the previous cell.
generate_data_dictionary(df_cols, data_dicts_loc, data_dict_filename)

## Final dataset

### No matches

In [None]:
## Appointment ids present in the merged dataset and in the pivoted metadata
dataset_ai = dataset.loc[:, 'appointment_id'].to_list()
pivot_ai = dfp.loc[:, 'appointment_id'].to_list()

In [None]:
## Ids that appear in the dataset but have no pivoted metadata row
non_matches = list(set(dataset_ai).difference(pivot_ai))

In [None]:
## Dataset rows whose appointment id has no metadata match
df_nm = dataset.loc[dataset['appointment_id'].isin(non_matches)]

In [None]:
## Status distribution of the unmatched appointments.
## The raw `appointment_status` column is dropped from `dataset` once the simplified status
## is computed, so the simplified column is the one available here.
df_nm['appointment_status_simplified'].value_counts()

### Other

In [None]:
## Reading the locally stored final dataset pickle into `dfx`
pkl_obj = "/".join(("../../pkg_dir/data/pickles/robs", "dataset.pkl"))

with open(pkl_obj, mode='rb') as pkl_file:
    dfx = pickle.load(pkl_file)

In [None]:
## Completed appointments that have no "created" metadata timestamp.
## The saved dataset has no raw `appointment_status` column (it is dropped after mapping)
## and the pivoted metadata columns are named `meta__<action>_$<field>`, so the simplified
## status and the prefixed column name are used here.
## NOTE(review): 'completed' also covers SALA_ESPERA, CONSULTA and CONFIRMADO_PAGO, so this
## is broader than a raw 'COMPLETADA' filter — confirm that is acceptable.
mr1 = dfx['appointment_status_simplified'] == 'completed'
mr2 = dfx['meta__created_$date'].isna()
mrs = mr1 & mr2

dfx.loc[mrs, :]

In [None]:
## Appointments whose slot was never used.
## The saved dataset has no raw `appointment_status` column (it is dropped after mapping);
## 'DISPONIBLE' is the only raw status mapped to 'unused', so this filter is equivalent.
mr1 = dfx['appointment_status_simplified'] == 'unused'

dfx.loc[mr1, :]

---

---