# PEH Pharmacy Department

In [None]:
import pandas as pd
import numpy as np

## Combining Excel Tabs and Basic Cleaning

In [None]:
# Load every worksheet of the workbook and stack them into one DataFrame.
# NOTE: assumes all sheets share the same headers.

f = '../PEH TTO Data.xlsx'
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}
df = pd.read_excel(f, sheet_name=None)
# The sheet names are not needed because the row index is rebuilt from scratch
df2 = pd.concat(list(df.values()), ignore_index=True)

df2.head()

In [None]:
# Remove every column whose header contains 'Unnamed'
unnamed_mask = df2.columns.str.contains('Unnamed')
df2.drop(columns=df2.columns[unnamed_mask], inplace=True)
df2.head()

In [None]:
# Summarise the merged frame (columns, dtypes, non-null counts)
df2.info()

In [None]:
# Discard rows in which every column is empty, then re-check the summary
df2.dropna(axis='index', how='all', inplace=True)
df2.info()

In [None]:
# Look at the last rows of the merged frame
df2.tail()

In [None]:
# Remove the final three rows of the merged frame.
# NOTE(review): this is not idempotent -- every re-run of the cell removes three
# more rows. Confirm what the trailing rows are before executing it again.
df2.drop(index=df2.index[-3:], inplace=True)

In [None]:
# Re-check the last rows after the trailing rows were dropped
df2.tail()

### Export to CSV from Excel

In [None]:
# Write the merged sheets to CSV: column names are kept, the row index is omitted
df2.to_csv('merged_PEH_TTO_Data.csv', index=False)

## Data Cleaning

In [None]:
#read csv for further cleaning
# Loads the file 'merged_PEH_TTO_Data.csv'; the bare name on the last line displays the frame.
df_csv = pd.read_csv('merged_PEH_TTO_Data.csv')
df_csv

In [None]:
# Summarise the reloaded frame (columns, dtypes, non-null counts)
df_csv.info()

### Handle Missing Values

In [None]:
# Number of non-missing values in each column
df_csv.count()

In [None]:
# Number of missing values in each column
df_csv.isna().sum()

### Empty Case Number

In [None]:
# Open question: what does an empty 'Case Number' mean when the system still
# tracked a dispensed-medication record?

missing_case_number = df_csv['Case Number'].isna()
df_csv[missing_case_number]

In [None]:
# Keep only the rows that have a 'Date' value
df_csv = df_csv.dropna(subset=['Date'])

# Recount the missing values per column after the drop
df_csv.isnull().sum()

In [None]:
# Rows that still have no 'Case Number' after the rows without a 'Date' were removed
df_csv[df_csv['Case Number'].isnull()]

In [None]:
#Affix "EMPTY" if Case Number = NaN
# The column is cast to object first and then replaced as a whole. Writing the
# string "EMPTY" through .loc into a numeric column relies on an implicit upcast,
# which pandas has deprecated for setitem-like operations (PDEP-6).
# NOTE(review): this assumes 'Case Number' may have been read as a numeric column;
# if it is already object dtype the cast is a no-op.
case_number = df_csv['Case Number'].astype(object)
case_number[case_number.isna()] = "EMPTY"
df_csv['Case Number'] = case_number

In [None]:
# Rows whose 'Case Number' is the placeholder "EMPTY"
df_csv[df_csv['Case Number'] == "EMPTY"]

### Data Standardization

#### Rename Columns

In [None]:
# Map the raw sheet headers (keys) to the standardized column names (values)
new_cols_name = {
    'Verified': 'Time TTO Received',
    'Meds Sent Down': 'Time (Meds Sent Down)',
    'Checked': 'Time TTO Checked',
    'Counselled': 'Time (Counselled)',
    'Remarks\n(input paeds if it is a 4P patient in 4A)': 'Remarks',
    'Checked by Pharmacist': 'Checked By (Pharmacist)',
    'Time to Ready for Collection': 'Time Taken (Ready for Collection)',
    'Time to Counselling': 'Time Taken (to Counselling)',
    'No. of Items': 'No. of Drugs',
}

# Headers that are not keys of the mapping are left unchanged
df_csv = df_csv.rename(columns=new_cols_name)

df_csv.head()

#### Fix Data Type(s) for Time

In [None]:
#Change 'Date' Column to datetime
# Entries that cannot be parsed become NaT (errors='coerce').
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where format inference is the default, so it only produced a warning.
df_csv['Date'] = pd.to_datetime(df_csv['Date'], errors='coerce')

In [None]:
#Change 'Date' Column to datetime.dt.date
# Keeps only the calendar-date part of each timestamp (the time-of-day is dropped)
df_csv['Date'] = df_csv['Date'].dt.date

In [None]:
#FIXED with latest dataset

#Drop invalid Time (Case Number: 3021022234, 3021022185, 3021026234, 3021032879, 3021032837, 3021000560)
#Drop invalid Date (Date: 'dfdfdf', Case Number: 3021031272)
#Dropped externally as the formats do not fit

In [None]:
# Convert the clock-time and duration columns from 'HH:MM:SS' text to
# datetime.time values. A value that does not match the format raises an error.
time_columns = [
    'Time TTO Received',
    'Time (Meds Sent Down)',
    'Time TTO Checked',
    'Time (Counselled)',
    'Time Taken (Ready for Collection)',
    'Time Taken (to Counselling)',
]

for time_column in time_columns:
    parsed = pd.to_datetime(df_csv[time_column], format='%H:%M:%S')
    df_csv[time_column] = parsed.dt.time

In [None]:
#Change 'NaT/NaN' to '00:00:00'

#df_csv['Time TTO Received'] = df_csv['Time TTO Received'].fillna(pd.Timedelta(seconds=0))
#df_csv['Time (Meds Sent Down)'] = df_csv['Time (Meds Sent Down)'].fillna(pd.Timedelta(seconds=0))
#df_csv['Time TTO Checked'] = df_csv['Time TTO Checked'].fillna(pd.Timedelta(seconds=0))
#df_csv['Time (Counselled)'] = df_csv['Time (Counselled)'].fillna(pd.Timedelta(seconds=0))
#df_csv['Time Taken (Ready for Collection)'] = df_csv['Time Taken (Ready for Collection)'].fillna(pd.Timedelta(seconds=0))
#df_csv['Time Taken (to Counselling)'] = df_csv['Time Taken (to Counselling)'].fillna(pd.Timedelta(seconds=0))

In [None]:
#Remove '0 days' in all Time data type

#df_csv['Time TTO Received'] = df_csv['Time TTO Received'].astype(str).str.split('0 days ').str[-1]
#df_csv['Time (Meds Sent Down)'] = df_csv['Time (Meds Sent Down)'].astype(str).str.split('0 days ').str[-1]
#df_csv['Time TTO Checked'] = df_csv['Time TTO Checked'].astype(str).str.split('0 days ').str[-1]
#df_csv['Time (Counselled)'] = df_csv['Time (Counselled)'].astype(str).str.split('0 days ').str[-1]
#df_csv['Time Taken (Ready for Collection)'] = df_csv['Time Taken (Ready for Collection)'].astype(str).str.split('0 days ').str[-1]
#df_csv['Time Taken (to Counselling)'] = df_csv['Time Taken (to Counselling)'].astype(str).str.split('0 days ').str[-1]

In [None]:
#reinstate into Time format again

#df_csv['Time TTO Received'] = pd.to_datetime(df_csv['Time TTO Received'], format='%H:%M:%S').dt.time
#df_csv['Time (Meds Sent Down)'] = pd.to_datetime(df_csv['Time (Meds Sent Down)'], format='%H:%M:%S').dt.time
#df_csv['Time TTO Checked'] = pd.to_datetime(df_csv['Time TTO Checked'], format='%H:%M:%S').dt.time
#df_csv['Time (Counselled)'] = pd.to_datetime(df_csv['Time (Counselled)'], format='%H:%M:%S').dt.time
#df_csv['Time Taken (Ready for Collection)'] = pd.to_datetime(df_csv['Time Taken (Ready for Collection)'], format='%H:%M:%S').dt.time
#df_csv['Time Taken (to Counselling)'] = pd.to_datetime(df_csv['Time Taken (to Counselling)'], format='%H:%M:%S').dt.time

In [None]:
# Preview the frame after the time columns were converted
df_csv.head()

#### Fix NaN values in non-Time Data Type

In [None]:
#No. of Drugs 
# Missing counts are replaced by the text "NA", so the column then mixes
# the original values with strings.
df_csv['No. of Drugs'] = df_csv['No. of Drugs'].fillna("NA")

In [None]:
# Bedside Counselling Candidate
# Step 1: a missing flag defaults to 'No'.
# Step 2: the flag is then overridden from the counselling duration:
#         exactly 00:00:00 -> "No", anything above 00:00:00 -> "Yes".
midnight = pd.to_datetime('00:00:00').time()
counselling_duration = df_csv['Time Taken (to Counselling)']

df_csv['Bedside Counselling Candidate'] = df_csv['Bedside Counselling Candidate'].fillna('No')

no_counselling_time = counselling_duration == midnight
df_csv.loc[no_counselling_time, 'Bedside Counselling Candidate'] = "No"

has_counselling_time = counselling_duration > midnight
df_csv.loc[has_counselling_time, 'Bedside Counselling Candidate'] = "Yes"

In [None]:
# Missing values remaining in each column after the fills above
df_csv.isna().sum()

In [None]:
# Fill all NA with empty string

#dfinal = df_csv.fillna('')
#dfinal

In [None]:
# Plain assignment: dfinal is another name for the same object as df_csv (no copy
# is made), so columns added to dfinal later also appear on df_csv.
dfinal = df_csv

## EDA

### Day of Week

In [None]:
# Name of the weekday for each 'Date'; rows without a usable date get an empty string
weekday_names = pd.to_datetime(dfinal['Date']).dt.day_name()
dfinal['Weekday'] = weekday_names.fillna('')

dfinal.head()

### Time TTO Dispensed 

PEH does not have a Time TTO Dispensed indicator in their original raw dataset

In [None]:
# Derive the dispensed time:
#   counselled later than checked -> dispensed at the counselled time
#   otherwise                     -> dispensed at the checked time
# NOTE(review): whenever the comparison is not True the checked time is used, even
# if that checked time is itself missing -- confirm this is intended.
counselled_at = dfinal['Time (Counselled)']
checked_at = dfinal['Time TTO Checked']
counselled_after_check = counselled_at > checked_at

dfinal['Time TTO Dispensed'] = np.where(counselled_after_check, counselled_at, checked_at)

dfinal.head()

### Overall Time Taken

In [None]:
# Overall time taken, in whole minutes: use 'Time Taken (to Counselling)' when it
# is present, otherwise fall back to 'Time Taken (Ready for Collection)'.
#
# Both columns hold datetime.time values, so they are converted through their
# text form into timedeltas. The full duration is used, hours included: reading
# only the minute field would report 01:10:00 as 10 minutes instead of 70.
# Seconds are truncated; values that cannot be converted (missing) become NaN.
ready_minutes = pd.to_timedelta(
    dfinal['Time Taken (Ready for Collection)'].astype(str), errors='coerce'
).dt.total_seconds() // 60
counselling_minutes = pd.to_timedelta(
    dfinal['Time Taken (to Counselling)'].astype(str), errors='coerce'
).dt.total_seconds() // 60

dfinal['Overall Time Taken (mins)'] = np.where(
    dfinal['Time Taken (to Counselling)'].isnull(),
    ready_minutes,
    counselling_minutes)

dfinal.head()

In [None]:
#pd.set_option('display.max_columns', None)
# Rows that have a 'Time Taken (to Counselling)' value
dfinal[dfinal['Time Taken (to Counselling)'].notnull()]

In [None]:
#Time TTO Dispensed - Time (Counselled) - Time TTO Checked - Time (Meds Sent Down) - Time TTO Received

#IF Time (Counselled) != 00:00:00 & IF Time (Meds Sent Down) != 00:00:00, Subtract all from Time TTO Dispensed
#IF Time (Counselled) == 00:00:00 & Time (Meds Sent Down) == 00:00:00, Time TTO Dispensed - Time TTO Checked - Time TTO Received
#IF Time (Counselled) == 00:00:00 & Time (Meds Sent Down) != 00:00:00, Time TTO Dispensed - Time TTO Checked - Time (Meds Sent Down) - Time TTO Received
#IF Time (Counselled) != 00:00:00 & Time (Meds Sent Down) == 00:00:00, Time TTO Dispensed - Time (Counselled) - Time TTO Checked - Time TTO Received

#df_test = dfinal
#
#df_test['Overall Time Taken to Dispensed'] = np.where(  
#                                                    (df_test['Time (Counselled)'] != pd.to_datetime('00:00:00').time()) & 
#                                                    (df_test['Time (Meds Sent Down)'] != pd.to_datetime('00:00:00').time()),
#                                                        (pd.to_timedelta(df_test['Time TTO Dispensed'].astype(str)) 
#                                                        - pd.to_timedelta(df_test['Time (Counselled)'].astype(str)) 
#                                                        - pd.to_timedelta(df_test['Time TTO Checked'].astype(str)) 
#                                                        - pd.to_timedelta(df_test['Time (Meds Sent Down)'].astype(str)) 
#                                                        - pd.to_timedelta(df_test['Time TTO Received'].astype(str))) ,
#                                                        
#                                                    np.where(
#                                                        (df_test['Time (Counselled)'] == pd.to_datetime('00:00:00').time()) & 
#                                                        (df_test['Time (Meds Sent Down)'] == pd.to_datetime('00:00:00').time()),
#                                                            (pd.to_timedelta(df_test['Time TTO Dispensed'].astype(str))                                     
#                                                            - pd.to_timedelta(df_test['Time TTO Checked'].astype(str))                                                            
#                                                            - pd.to_timedelta(df_test['Time TTO Received'].astype(str))) ,
#
#                                                    np.where(
#                                                        (df_test['Time (Counselled)'] == pd.to_datetime('00:00:00').time()) & 
#                                                        (df_test['Time (Meds Sent Down)'] != pd.to_datetime('00:00:00').time()),
#                                                            (pd.to_timedelta(df_test['Time TTO Dispensed'].astype(str))                                                             
#                                                            - pd.to_timedelta(df_test['Time TTO Checked'].astype(str)) 
#                                                            - pd.to_timedelta(df_test['Time (Meds Sent Down)'].astype(str)) 
#                                                            - pd.to_timedelta(df_test['Time TTO Received'].astype(str))) ,
#
#                                                    np.where(
#                                                        (df_test['Time (Counselled)'] != pd.to_datetime('00:00:00').time()) & 
#                                                        (df_test['Time (Meds Sent Down)'] == pd.to_datetime('00:00:00').time()),
#                                                            (pd.to_timedelta(df_test['Time TTO Dispensed'].astype(str)) 
#                                                            - pd.to_timedelta(df_test['Time (Counselled)'].astype(str)) 
#                                                            - pd.to_timedelta(df_test['Time TTO Checked'].astype(str))                                                             
#                                                            - pd.to_timedelta(df_test['Time TTO Received'].astype(str))) , 
#
#                                                            np.NaN))))
#
#                                                            
#df_test

#dfinal['Overall Time Taken to Dispensed'] = (pd.to_timedelta(dfinal['Time TTO Dispensed'].astype(str)) 
#                                            - pd.to_timedelta(dfinal['Time (Counselled)'].astype(str)) 
#                                            - pd.to_timedelta(dfinal['Time TTO Checked'].astype(str)) 
#                                            - pd.to_timedelta(dfinal['Time (Meds Sent Down)'].astype(str)) 
#                                            - pd.to_timedelta(dfinal['Time TTO Received'].astype(str)))


In [None]:
#def td_to_hmsstr(td):
#    """
#    convert a timedelta object td to a string in HH:MM:SS format.
#    """
#    hours, remainder = divmod(td.total_seconds(), 3600)
#    minutes, seconds = divmod(remainder, 60)
#    return f'{int(hours):02}:{int(minutes):02}:{int(seconds):02}'
#
#
#df_test['Overall Time Taken to Dispensed'] = df_test['Overall Time Taken to Dispensed'].apply(td_to_hmsstr)
#df_test

### Meet KPI?

GEH: Total Time Taken < 45mins = Yes, else: No.

In [None]:
# KPI flag per row:
#   "NA"  - no overall time available
#   "Yes" - overall time is at most KPI_LIMIT_MINS minutes
#   "No"  - anything longer
# The limit is written as a plain number of minutes instead of being obtained by
# parsing '00:45:00' as a timestamp and reading its minute field.
# NOTE(review): the section text says "< 45mins" while this check also accepts
# exactly 45 minutes -- confirm which boundary is intended.
KPI_LIMIT_MINS = 45
overall_minutes = dfinal['Overall Time Taken (mins)']

dfinal['Meet KPI'] = np.where(
    overall_minutes.isnull(),
    "NA",
    np.where(overall_minutes <= KPI_LIMIT_MINS, 'Yes', 'No'))

dfinal

In [None]:
# Rows flagged as not meeting the KPI
dfinal[dfinal['Meet KPI']=="No"]

### Office Hours

Standard Office Hours: 8:30AM to 5:00PM

In [None]:
# Office-hours flag per row:
#   'NA'  - received or dispensed time is missing
#   'Yes' - dispensed no later than 17:00:00
#   'No'  - dispensed after 17:00:00
# NOTE(review): only the closing time is checked. The opening-time condition
# below is disabled, so early-morning records are also flagged 'Yes'.
#(dfinal['Time TTO Received'] >= pd.to_datetime('08:30:00').time())
closing_time = pd.to_datetime('17:00:00').time()
time_missing = dfinal['Time TTO Received'].isnull() | dfinal['Time TTO Dispensed'].isnull()
dispensed_by_close = dfinal['Time TTO Dispensed'] <= closing_time

within_hours = np.where(dispensed_by_close, 'Yes', 'No')
dfinal['Office Hours'] = np.where(time_missing, 'NA', within_hours)

dfinal

In [None]:
# Rows flagged as outside office hours
dfinal[dfinal['Office Hours'] == 'No']

## Export CLEANED to csv

In [None]:
# Write the cleaned frame to CSV: column names are kept, the row index is omitted
dfinal.to_csv('PEH_CLEANED_Data.csv', index=False)