# Merge Datasets

# Imports

#### Standard library imports

In [1]:
import sys
import os
sys.path.append("../")

#### Third party imports

In [2]:
import pandas as pd
import pickle

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Query

In [4]:
## Inputs for the extraction: `db_crds` is the first argument handed to
## sql_string_to_df, `query` is the SQL text it runs.
## The query pulls every appointment dated 2022-01-01 through 2023-12-31 with its
## doctor, specialty, clinic and service; the patient is LEFT JOINed, so
## appointments without a patient row are kept.
db_crds = "pc_db_prod"
query = """
SELECT cita.citaid as appointment_id,
       citafecha as appointment_date,
       citahorad as appointment_start_time,
       citahorah as appointment_end_time,
       citaestado as appointment_status,
       u.usuarionomfull as doctor,
       e.especialidadnom as medical_specialty,
       su.sucursalnom as clinic,
       se.servicionom as service,
       p.pacienteid as patient_id,
       p.pacientefnac as patient_birth_date

FROM cita

 INNER JOIN usuario u ON cita.citadoctorid = u.usuarioid
 INNER JOIN servicio se ON cita.servicioid = se.servicioid
 INNER JOIN especialidad e ON u.usuarioespecialidadid = e.especialidadid
 INNER JOIN sucursal su ON cita.citasucursalid = su.sucursalid
 LEFT JOIN paciente p ON cita.pacienteid = p.pacienteid

WHERE citafecha >= '2022-01-01'
  AND citafecha <= '2023-12-31'

;
"""


In [5]:
## Run the query, then label the result columns positionally.
## The names are listed in the same order as the aliases in the SELECT list.
df = sql_string_to_df(db_crds, query)
df.columns = [
    'appointment_id',
    'appointment_date',
    'appointment_start_time',
    'appointment_end_time',
    'appointment_status',
    'doctor',
    'medical_specialty',
    'clinic',
    'service',
    'patient_id',
    'patient_birth_date',
]

In [6]:
## Preview of the query result (the rendering is truncated to the first and last rows).
## NOTE(review): the saved output shows doctor names and patient birth dates;
## consider clearing outputs before sharing this notebook.
df

Unnamed: 0,appointment_id,appointment_date,appointment_start_time,appointment_end_time,appointment_status,doctor,medical_specialty,clinic,service,patient_id,patient_birth_date
0,2023001456120,2023-12-21,09:00,09:30,COMPLETADA,DR. FERNANDO ENRIQUE TOVAR GARCIA,GASTROENTEROLOGIA,COAPA,CONSULTA DE GASTROENTEROLOGÍA,283070.0,1986-10-07
1,2023001456156,2023-12-22,19:00,19:30,BLOQUEADO,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,DEL VALLE,CONSULTA DE ENDOCRINOLOGÍA,,
2,2023000431657,2023-12-21,10:30,11:00,COMPLETADA,DRA ANA BELEN CLAVIJO GAIBOR,ENDOCRINOLOGIA,COAPA,CONSULTA DE ENDOCRINOLOGÍA,264741.0,1999-05-15
3,2023001456255,2023-12-21,19:00,19:30,DISPONIBLE,DR. EDGAR FLORES GONZAGA,ALERGOLOGIA,DEL VALLE,CONSULTA DE ALERGOLOGIA,,
4,2023001456285,2023-12-21,13:20,14:00,DISPONIBLE,DR. HECTOR ARTURO VILLAFAN ESCOBAR,MEDICINA INTERNA,TLALPAN,CONSULTA DE MEDICINA INTERNA (40 MIN),,
...,...,...,...,...,...,...,...,...,...,...,...
544085,2023000881983,2023-12-21,10:00,10:30,CANCELA_PACIENTE,DRA MONICA DAVALOS TANAKA,DERMATOLOGIA,CUAJIMALPA,CONSULTA DE DERMATOLOGÍA,275322.0,2016-04-20
544086,2023001451480,2023-12-20,17:30,18:00,COMPLETADA,DRA CINDY MARBELLA VARGAS GUERRERO,ANGIOLOGIA,DEL VALLE,CONSULTA DE ANGIOLOGIA,282939.0,1953-12-20
544087,2022871299,2023-12-20,11:30,12:00,COMPLETADA,DR. EDUARDO VALENCIA GONZALEZ,ORTOPEDIA,TLALPAN,CONSULTA DE ORTOPEDIA,282465.0,1971-12-16
544088,2023001033979,2023-12-21,10:00,10:30,COMPLETADA,DR. FERNANDO ENRIQUE TOVAR GARCIA,GASTROENTEROLOGIA,COAPA,CONSULTA DE GASTROENTEROLOGÍA,27215.0,1970-08-12


# Pickle Extraction

In [7]:
## Load the object stored in the local pickle file into `dfp`
path = '../pkg_dir/data/pickles'
name = 'pivot_df.pkl'
pkl_obj = f"{path}/{name}"

## pickle.load can execute arbitrary code, so only point this at files you trust
with open(pkl_obj, mode='rb') as obj_content:
    dfp = pickle.load(obj_content)

In [8]:
## Move appointment_id out of the index into a regular column so it can be used
## as the merge key.
## The guard keeps this cell idempotent: an unguarded in-place reset_index adds a
## stray "index" column on a second run and raises ValueError on the one after.
if 'appointment_id' not in dfp.columns:
    dfp = dfp.reset_index()

In [9]:
## Preview of the pickled frame after the index reset: one row per appointment_id
## with <event>__date / <event>__user column pairs.
## NOTE(review): the saved output shows user names in the *__user columns;
## consider clearing outputs before sharing this notebook.
dfp

Unnamed: 0,appointment_id,available__date,available__user,canceled_by_employee__date,canceled_by_employee__user,canceled_by_patient__date,canceled_by_patient__user,confirmed__date,confirmed__user,created__date,created__user,in_waiting_room__date,in_waiting_room__user,message_left__date,message_left__user,no_show__date,no_show__user,with_doctor__date,with_doctor__user
0,2021000657,NaT,,NaT,,2022-04-04 08:23:12,karina.carballo,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,
1,2021001443,NaT,,NaT,,NaT,,NaT,,2022-05-24 17:05:24,yaselin.zurita,2022-08-16 08:22:05,marisol.santiag,NaT,,NaT,,2022-08-17 14:21:29,sandra.ruelas
2,2021001765,NaT,,NaT,,2021-11-06 12:37:11,ivonne.gonzalez,NaT,,2021-07-08 08:53:37,marisol.santiag,NaT,,2022-01-08 11:18:47,valeria.flores,2022-01-10 08:23:29,marisol.santiag,NaT,
3,2021002526,NaT,,2021-12-31 08:10:15,valeria.flores,NaT,,NaT,,2021-10-04 11:18:12,yaselin,NaT,,NaT,,NaT,,NaT,
4,2021002563,NaT,,NaT,,NaT,,2022-01-08 09:09:33,valeria.flores,2021-10-28 11:43:48,yaselin,2022-01-10 10:15:09,marisol.santiag,NaT,,NaT,,2022-01-10 10:35:21,sandra.ruelas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386618,2023241701,NaT,,NaT,,NaT,,2023-01-04 11:02:38,cristopher.tinajero_call,2023-01-04 11:02:38,cristopher.tinajero_call,2023-01-04 11:59:51,veronica.vargas,NaT,,NaT,,2023-01-04 12:18:00,norma.ramirez
386619,2023241702,NaT,,NaT,,NaT,,2023-01-04 15:05:40,yafte.trejo_call,2023-01-04 15:05:40,yafte.trejo_call,2023-01-04 18:18:30,erika.flores,NaT,,NaT,,2023-01-04 18:19:17,miguel.villanueva
386620,2023241703,NaT,,NaT,,NaT,,NaT,,2023-01-04 10:53:49,yesica.cano,2023-01-04 10:54:04,yesica.cano,NaT,,NaT,,2023-01-04 11:04:16,edgar.medrano
386621,2023241705,NaT,,NaT,,NaT,,2023-01-10 08:18:55,veronica.vargas,2023-01-03 22:02:58,ivonne.gonzalez_call,2023-01-11 13:13:01,veronica.vargas,NaT,,NaT,,2023-01-11 13:29:57,monica.castillo


# Performing a Left Join on appointment_id

In [13]:
## Keep every appointment from the query result and attach the columns from dfp
## where the appointment_id matches; unmatched rows get NaT/NaN in those columns.
left_join_df = df.merge(dfp, how='left', on='appointment_id')

In [14]:
## Preview of the merged frame (the rendering is truncated in both rows and columns).
## NOTE(review): the saved output shows doctor names and user names;
## consider clearing outputs before sharing this notebook.
left_join_df

Unnamed: 0,appointment_id,appointment_date,appointment_start_time,appointment_end_time,appointment_status,doctor,medical_specialty,clinic,service,patient_id,...,created__date,created__user,in_waiting_room__date,in_waiting_room__user,message_left__date,message_left__user,no_show__date,no_show__user,with_doctor__date,with_doctor__user
0,2023001456120,2023-12-21,09:00,09:30,COMPLETADA,DR. FERNANDO ENRIQUE TOVAR GARCIA,GASTROENTEROLOGIA,COAPA,CONSULTA DE GASTROENTEROLOGÍA,283070.0,...,2023-12-20 18:26:04,lourdes.martinez_call,2023-12-21 08:53:54,araceli.gonzale,NaT,,NaT,,2023-12-21 09:01:38,fernando.tovar
1,2023001456156,2023-12-22,19:00,19:30,BLOQUEADO,DRA MONICA MARCELA MACIAS ORTEGA,ENDOCRINOLOGIA,DEL VALLE,CONSULTA DE ENDOCRINOLOGÍA,,...,NaT,,NaT,,NaT,,NaT,,NaT,
2,2023000431657,2023-12-21,10:30,11:00,COMPLETADA,DRA ANA BELEN CLAVIJO GAIBOR,ENDOCRINOLOGIA,COAPA,CONSULTA DE ENDOCRINOLOGÍA,264741.0,...,2023-10-14 08:53:46,veronica.anaya_bot,2023-12-21 10:13:18,araceli.gonzale,NaT,,NaT,,2023-12-21 10:14:15,ana.clavijo
3,2023001456255,2023-12-21,19:00,19:30,DISPONIBLE,DR. EDGAR FLORES GONZAGA,ALERGOLOGIA,DEL VALLE,CONSULTA DE ALERGOLOGIA,,...,NaT,,NaT,,NaT,,NaT,,NaT,
4,2023001456285,2023-12-21,13:20,14:00,DISPONIBLE,DR. HECTOR ARTURO VILLAFAN ESCOBAR,MEDICINA INTERNA,TLALPAN,CONSULTA DE MEDICINA INTERNA (40 MIN),,...,NaT,,NaT,,NaT,,NaT,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544085,2023000881983,2023-12-21,10:00,10:30,CANCELA_PACIENTE,DRA MONICA DAVALOS TANAKA,DERMATOLOGIA,CUAJIMALPA,CONSULTA DE DERMATOLOGÍA,275322.0,...,2023-11-23 10:17:34,gabriela.rivera,NaT,,2023-12-20 14:24:44,citas.online,NaT,,NaT,
544086,2023001451480,2023-12-20,17:30,18:00,COMPLETADA,DRA CINDY MARBELLA VARGAS GUERRERO,ANGIOLOGIA,DEL VALLE,CONSULTA DE ANGIOLOGIA,282939.0,...,2023-12-20 12:20:40,fernando.maya_call,2023-12-20 17:30:05,maria.leyva,NaT,,NaT,,2023-12-20 17:38:56,cindy.marbella
544087,2022871299,2023-12-20,11:30,12:00,COMPLETADA,DR. EDUARDO VALENCIA GONZALEZ,ORTOPEDIA,TLALPAN,CONSULTA DE ORTOPEDIA,282465.0,...,2023-12-18 20:00:29,nancy.lopez_call,2023-12-20 11:26:34,elizabeth.flores,NaT,,NaT,,2023-12-20 11:43:47,eduardo.valencia
544088,2023001033979,2023-12-21,10:00,10:30,COMPLETADA,DR. FERNANDO ENRIQUE TOVAR GARCIA,GASTROENTEROLOGIA,COAPA,CONSULTA DE GASTROENTEROLOGÍA,27215.0,...,2023-11-23 09:51:41,lesly.mendoza,2023-12-21 09:42:46,estela.garcia,NaT,,NaT,,2023-12-21 09:51:32,fernando.tovar


In [15]:
## Save the merged frame as a pickle on local disk
path = '../pkg_dir/data/pickles/'
name = 'dataset.pkl'

## The `with` block closes (and therefore flushes) the file even if pickling
## raises; the original left the handle from open() unclosed.
with open(os.path.join(path, name), 'wb') as pkl_file:
    pickle.dump(left_join_df, pkl_file)