# Pharmacy Department - Merged Data Cleaning & EDA results

Includes PEH, GEH, MEH, MNH

## Step 1: Import necessary packages

In [None]:
import pandas as pd
import numpy as np

## Step 2: Read all cleaned csv files

In [None]:
# Load the cleaned extract for each hospital, in the order PEH, GEH, MEH, MNH.
# No date parsing is requested, so every non-numeric column is loaded as text.
PEH, GEH, MEH, MNH = (
    pd.read_csv(cleaned_csv)
    for cleaned_csv in (
        '../../Data/Pharmacy Dept/Data Cleaning/PEH_Data_Cleaned.csv',
        '../../Data/Pharmacy Dept/Data Cleaning/GEH_Data_Cleaned.csv',
        '../../Data/Pharmacy Dept/Data Cleaning/MEH_Data_Cleaned.csv',
        '../../Data/Pharmacy Dept/Data Cleaning/MNH_Data_Cleaned.csv',
    )
)

## Step 3: Add a 'Hospital' column identifying the source hospital of each row

In [None]:
# PEH: tag every row with its source hospital so rows stay identifiable
# after the per-hospital frames are merged.
PEH = PEH.assign(Hospital='PEH')

In [None]:
# Preview the first five PEH rows (notebook cell output).
PEH.head(n=5)

In [None]:
# Print the PEH column names, dtypes and non-null counts.
PEH.info()

In [None]:
# GEH: tag every row with its source hospital so rows stay identifiable
# after the per-hospital frames are merged.
GEH = GEH.assign(Hospital='GEH')

In [None]:
# Preview the first five GEH rows (notebook cell output).
GEH.head(n=5)

In [None]:
# Print the GEH column names, dtypes and non-null counts.
GEH.info()

In [None]:
# Normalise GEH 'Date' to 'YYYY-MM-DD' text.
# strftime is used instead of .dt.date so the column holds strings rather than
# datetime.date objects: the other hospitals' 'Date' values are loaded by
# read_csv without date parsing and therefore stay text (presumably already
# 'YYYY-MM-DD' -- confirm), and mixing the two types in the merged frame would
# make sorting or comparing on 'Date' raise a TypeError.
# Unparseable/missing dates still end up as missing values.
GEH['Date'] = pd.to_datetime(GEH['Date']).dt.strftime('%Y-%m-%d')
GEH

In [None]:
# MEH: tag every row with its source hospital so rows stay identifiable
# after the per-hospital frames are merged.
MEH = MEH.assign(Hospital='MEH')

In [None]:
# Preview the first five MEH rows (notebook cell output).
MEH.head(n=5)

In [None]:
# Print the MEH column names, dtypes and non-null counts.
MEH.info()

In [None]:
# MNH: tag every row with its source hospital so rows stay identifiable
# after the per-hospital frames are merged.
MNH = MNH.assign(Hospital='MNH')

In [None]:
# Preview the first five MNH rows (notebook cell output).
MNH.head(n=5)

In [None]:
# Print the MNH column names, dtypes and non-null counts.
MNH.info()

In [None]:
# MNH labels its middle stage "Packing"; rename those columns to the "Checked"
# wording so MNH lines up column-for-column with the other hospitals when the
# frames are concatenated.
MNH = MNH.rename(
    columns={
        'DateTime TTO Packing': 'DateTime TTO Checked',
        'Time Taken (Received to Packing)': 'Time Taken (Received to Checked)',
        'Time Taken (Packing to Dispensed)': 'Time Taken (Checked to Dispensed)',
        'Avg Time Taken / Day (Received to Packing)': 'Avg Time Taken / Day (Received to Checked)',
        'Avg Time Taken / Day (Packing to Dispensed)': 'Avg Time Taken / Day (Checked to Dispensed)',
        'Avg Time Taken / Month (Received to Packing)': 'Avg Time Taken / Month (Received to Checked)',
        'Avg Time Taken / Month (Packing to Dispensed)': 'Avg Time Taken / Month (Checked to Dispensed)',
    }
)

In [None]:
# Normalise MNH 'Date' to 'YYYY-MM-DD' text.
# strftime is used instead of .dt.date so the column holds strings rather than
# datetime.date objects: the other hospitals' 'Date' values are loaded by
# read_csv without date parsing and therefore stay text (presumably already
# 'YYYY-MM-DD' -- confirm), and mixing the two types in the merged frame would
# make sorting or comparing on 'Date' raise a TypeError.
# Unparseable/missing dates still end up as missing values.
MNH['Date'] = pd.to_datetime(MNH['Date']).dt.strftime('%Y-%m-%d')

In [None]:
# Display MNH after the column rename and date conversion (notebook cell output).
MNH

## Step 4: Merge ALL relevant CSV(s)

In [None]:
# Lift the column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

In [None]:
# Stack the four hospital frames row-wise (PEH, GEH, MEH, MNH order) and
# renumber the rows 0..n-1 so index values from different files do not repeat.
df = pd.concat(objs=(PEH, GEH, MEH, MNH), axis=0, ignore_index=True)

df

In [None]:
# Print the merged frame's column names, dtypes and non-null counts.
df.info()

In [None]:
# Show the merged rows that have no 'Date' value.
df.loc[df['Date'].isna()]

In [None]:
# Count missing values per column in the merged frame.
df.isna().sum()

## Step 5: Replace zero durations ('0 days 00:00:00') in 'Time Taken (Received to Checked)' and 'Time Taken (Checked to Dispensed)' with missing values (NaN)


In [None]:
# Replace the literal zero duration in the received-to-checked stage with NaN.
df['Time Taken (Received to Checked)'] = df['Time Taken (Received to Checked)'].replace(
    to_replace='0 days 00:00:00',
    value=np.nan,
)

In [None]:
# Replace the literal zero duration in the checked-to-dispensed stage with NaN.
df['Time Taken (Checked to Dispensed)'] = df['Time Taken (Checked to Dispensed)'].replace(
    to_replace='0 days 00:00:00',
    value=np.nan,
)

In [None]:
# Add additional columns to analyse those cases that did not meet KPI

In [None]:

#def f(row):
#    if (row['Meet KPI'] == 'No') | (row['Meet KPI'] == ''):
#        if (row['Time Taken (Received to Checked)']  != '') & (row['Time Taken (Checked to Dispensed)']  != ''):
#            if (row['Time Taken (Received to Checked)'] >= row['Avg Time Taken / Day (Received to Checked)']) & (row['Time Taken (Checked to Dispensed)'] >= row['Avg Time Taken / Day (Checked to Dispensed)']):
#                val = 'Yes'
#            else:
#                val = 'No'
#        elif (row['Time Taken (Received to Checked)']  != '') | (row['Time Taken (Checked to Dispensed)']  != ''):
#            if (row['Time Taken (Received to Checked)'] >= row['Avg Time Taken / Day (Received to Checked)']) | (row['Time Taken (Checked to Dispensed)'] >= row['Avg Time Taken / Day (Checked to Dispensed)']):
#                val = 'Yes'
#            else:
#                val = 'No'
#        else:
#            val = ''
#    else:
#        val = 'No'
#    return val


In [None]:
#def f2(row):
#    if (row['Time Taken (Received to Checked)'] == '') & (row['Time Taken (Checked to Dispensed)']  == ''):
#        val = ''
#    elif (row['Time Taken (Received to Checked)']  != '') | (row['Time Taken (Checked to Dispensed)']  != ''):
#        if (row['Time Taken (Received to Checked)'] >= row['Avg Time Taken / Day (Received to Checked)']) & (row['Time Taken (Checked to Dispensed)'] >= row['Avg Time Taken / Day (Checked to Dispensed)']):
#            val = 'Yes'
#        else:
#            val = 'No'
#    elif (row['Time Taken (Received to Checked)']  != '') & (row['Time Taken (Checked to Dispensed)']  != ''):
#        if (row['Time Taken (Received to Checked)'] >= row['Avg Time Taken / Day (Received to Checked)']) & (row['Time Taken (Checked to Dispensed)'] >= row['Avg Time Taken / Day (Checked to Dispensed)']):
#            val = 'Yes'
#        else:
#            val = 'No'
#    else:
#        val = 'No'
#    return val

In [None]:
#df['Process Time Issue'] = df.apply(f, axis=1)
#df

In [None]:

#df['Time Taken (Received to Checked)'].notnull() >= df['Avg Time Taken / Day (Received to Checked)']

In [None]:
# 'Process Time Issue' is 'Yes' when a case did not meet the KPI (or has no KPI
# value) AND at least one stage took as long as, or longer than, that stage's
# daily average; otherwise it is 'No'.

# Blank 'Meet KPI' cells are loaded by read_csv as NaN, not '', so test for
# missing values as well as the literal strings.
kpi_not_met = df['Meet KPI'].isna() | df['Meet KPI'].isin(['No', ''])

# The duration columns hold text such as '0 days 00:12:30'. Convert them to
# timedeltas for the comparison only (the stored columns are left untouched) so
# ordering is by elapsed time rather than character order -- as text,
# '10 days ...' would sort before '9 days ...'. Missing or unparseable values
# become NaT, and any comparison involving NaT is False.
received_to_checked_slow = (
    pd.to_timedelta(df['Time Taken (Received to Checked)'], errors='coerce')
    >= pd.to_timedelta(df['Avg Time Taken / Day (Received to Checked)'], errors='coerce')
)
checked_to_dispensed_slow = (
    pd.to_timedelta(df['Time Taken (Checked to Dispensed)'], errors='coerce')
    >= pd.to_timedelta(df['Avg Time Taken / Day (Checked to Dispensed)'], errors='coerce')
)

df['Process Time Issue'] = np.where(
    kpi_not_met & (received_to_checked_slow | checked_to_dispensed_slow),
    'Yes',
    'No',
)


In [None]:
#df

## Step 6: Export to CSV

In [None]:
# Write the merged frame to disk with a header row and without the row index.
df.to_csv(
    path_or_buf="../../Data/Pharmacy Dept/Data Cleaning/Merged_Pharmacy_Dept_CLEANED.csv",
    header=True,
    index=False,
)