# MNH Pharmacy Department

## Step 1: Import the necessary packages and make changes to any display settings

In [None]:
import pandas as pd
import numpy as np
from datetime import *

In [None]:
# Show every column when a DataFrame is rendered (no column-count limit).
pd.options.display.max_columns = None

## Data Cleaning

### Step 2: Read the data 

In [None]:
# Load the merged MNH TTO extract; a cell holding a single space is read as NaN.
df = pd.read_csv(
    '../../Data/Pharmacy Dept/MNH TTO Data/MNH TTO Data - MERGED.csv',
    na_values=' ',
)

In [None]:
df

In [None]:
df.info()

In [None]:
list(df.columns)

### Step 3: Drop columns that should be ignored

In [None]:
# Remove the columns that are not carried forward in this notebook.
# ('Weekday' is recomputed from Date in Step 9.)
columns_to_drop = [
    'Can close',
    'Time TTO received (24 hr format) (eg 1340)',
    'Timing of Packing (24 hr format / PBH) (eg 1340 / PBH)',
    'Timing of Dispensing (24 hr format)',
    'Weekday',
    'Weekday Count',
    'Weekend',
    'Weekend Count',
    'NOT Ward 9',
    '20 min and below',
    'Between 20 to 30 min',
    '30min and below',
    'More than 30 min',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
#df.drop(df.iloc[:, 18:23], inplace = True, axis = 1)

### Step 4: Rename common columns

In [None]:
# Give the shared columns the names used by the rest of this notebook.
renamed_columns = {
    'Case no.': 'Case Number',
    'Bed no.': 'Room/Bed',
    'No. of drugs': 'No. of Drugs',
    'Time TTO received': 'DateTime TTO Received',
    'Timing of packing': 'DateTime TTO Packing',
    'Timing of Dispensing': 'DateTime TTO Dispensed',
}
df.rename(columns=renamed_columns, inplace=True)

### Step 5: Fix date time type

In [None]:
# Combine the calendar date with each time-of-day column into one timestamp.
# Anything that cannot be parsed becomes NaT (errors='coerce').
columns = ['DateTime TTO Received',
           'DateTime TTO Packing',
           'DateTime TTO Dispensed']
for column in columns:
    combined = df['Date'].astype(str) + ' ' + df[column].astype(str)
    # Assign through df[column] so the column is replaced and takes the
    # datetime64 dtype. On pandas 2.x `df.loc[:, column] = ...` writes into the
    # existing column in place and leaves it as object, which breaks later
    # `.dt` access.
    # `infer_datetime_format` is deprecated since pandas 2.0, so it is omitted.
    # NOTE(review): the dates look day-first (the notebook fills missing dates
    # with '15-07-21'); confirm whether dayfirst=True is needed here.
    df[column] = pd.to_datetime(combined, errors='coerce')

In [None]:
df.info()

### Step 6: Handle missing values

In [None]:
df.isnull().sum()

In [None]:
#df.iloc[5750:5770]

#### Missing date

In [None]:
# Rows whose Date is missing. This section deals with missing dates, so the
# filter is on 'Date' (the previous version filtered on 'Case Number').
missing_date = df[df['Date'].isnull()]
missing_date

In [None]:
# Fill missing dates with 15-07-21 (value taken from the supplied CSV).
# NOTE(review): the DateTime TTO columns were already built from Date in
# Step 5, so rows that had no Date at that point presumably still carry NaT
# timestamps after this fill - confirm the intended order of these steps.
df['Date'] = df['Date'].fillna('15-07-21')

#### Missing Time TTO received

In [None]:
# Treat the '#VALUE!' spreadsheet error marker as missing everywhere, then drop
# rows that have no received timestamp (the author notes this is one row).
# The 'blank' placeholder is dealt with separately in Step 7.
df = df.replace('#VALUE!', np.nan).dropna(subset=['DateTime TTO Received'])

#### Missing case number

In [None]:
# A case number that is two spaces, or missing altogether, becomes 'EMPTY'.
case_numbers = df['Case Number'].replace('  ', 'EMPTY')
df['Case Number'] = case_numbers.fillna('EMPTY')

#### Missing Time Taken To Pack (mins)

In [None]:
df[df['Time Taken To Pack (mins)'].isnull()]

In [None]:
# Assume that rows with no 'Time Taken To Pack (mins)' are PBH.
df['Time Taken To Pack (mins)'] = df['Time Taken To Pack (mins)'].fillna('PBH')

#### Missing Time TTO packing

In [None]:
# fill up missing time with naT
df[df['DateTime TTO Packing'].isnull()].head(3)

In [None]:
#df[['Time TTO Packing']] = df[['Time TTO Packing']].fillna('NA')

#### Missing Time TTO Dispensed

In [None]:
#df[df['Time TTO Dispensed'].isnull()].head(3)

In [None]:
# handle row with missing Time TTO packing, Time TTO dispensing with blank because it is empty but dont want to drop
#df[['Time TTO Dispensed']] = df[['Time TTO Dispensed']].fillna('NA')
#df[df['Time TTO packing'].isnull()].head()

In [None]:
df.isnull().sum()

In [None]:
df

In [None]:
#no need to handle missing values for comments since they can leave comments or not

### Step 7: Handle rows with special characters

#### Handle non numeric value in No. of drugs

In [None]:
# Show the rows whose 'No. of Drugs' is not a plain number.
# The values are cast to str first: a missing cell is a float NaN, which has no
# .isnumeric() method and made the per-element lambda raise AttributeError.
# Missing cells are therefore listed as non-numeric.
non_numeric = ~df['No. of Drugs'].astype(str).str.isnumeric()
df[non_numeric]

In [None]:
# Replace the '-' and 'relabeling' placeholders with the text 'NA'.
# Both are literal substring replacements, applied one after the other.
df['No. of Drugs'] = (
    df['No. of Drugs']
    .str.replace('-', 'NA', regex=False)
    .str.replace('relabeling', 'NA', regex=False)
)

#### Handle #### value in Time Taken To Pack (mins) / Time Taken to Reach Pt (mins)

In [None]:
# Rows whose 'Time Taken To Pack (mins)' text contains a '#' character.
# Cells where the match is undefined (non-text values) are excluded.
has_hash = df['Time Taken To Pack (mins)'].str.contains('#', na=False)
df_special = df[has_hash]
df_special.head()

In [None]:
# Replace runs of '#' characters with 'PBH'.
# The previous version searched for one fixed-width literal of '#', so a run of
# any other length was left behind (or only partly replaced). The pattern now
# matches any run of two or more '#'.
df['Time Taken To Pack (mins)'] = df['Time Taken To Pack (mins)'].str.replace(
    r'#{2,}', 'PBH', regex=True
)

In [None]:
# Remove the literal text 'blank' (it becomes an empty string, not NaN).
pack_minutes = df['Time Taken To Pack (mins)']
df['Time Taken To Pack (mins)'] = pack_minutes.str.replace('blank', '', regex=False)

In [None]:
# Replace runs of '#' characters with 'PBH'.
# The previous version searched for one fixed-width literal of '#', so a run of
# any other length was left behind (or only partly replaced). The pattern now
# matches any run of two or more '#'.
df['Time Taken to Reach Pt (mins)'] = df['Time Taken to Reach Pt (mins)'].str.replace(
    r'#{2,}', 'PBH', regex=True
)

In [None]:
# Remove the literal text 'blank' (it becomes an empty string, not NaN/'NA').
# Step 10 relies on this: it skips rows where this column equals ''.
reach_minutes = df['Time Taken to Reach Pt (mins)']
df['Time Taken to Reach Pt (mins)'] = reach_minutes.str.replace('blank', '', regex=False)

In [None]:
df.iloc[474:476]

### Step 8: Replace years other than 2021 with 2021 in Date

In [None]:
# Convert Date from text to datetime64 so the .dt accessors below work.
# NOTE(review): no format or dayfirst is given, and the fill value used earlier
# ('15-07-21') looks day-first - confirm that days and months are not swapped
# for dates where both readings are valid (e.g. 01-07-21).
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Rows whose Date is not in 2021 (rows without a Date are selected as well).
not_2021 = df['Date'].dt.year != 2021
df_filtered = df[not_2021]
df_filtered.count()

In [None]:
df_filtered

In [None]:
# Overwrite Date for index labels 8626 through 8673 (label-based, inclusive)
# with 2021-07-22.
# NOTE(review): the label range is hard-coded; presumably it is the set of rows
# shown in df_filtered above - confirm it still matches if the input changes.
df.loc[8626:8673, 'Date'] = '2021-07-22'
df['Date'] = pd.to_datetime(df['Date']) # re-parse in case the string assignment turned the column into object

In [None]:
df.info()

In [None]:
df.loc[8619:8669]

In [None]:
df.isnull().sum()

## EDA

### Step 9: Get Day of Week

In [None]:
# Day name for each Date (uses the .dt accessor, so Date must be datetime64).
# This recreates the 'Weekday' column that was dropped in Step 3.
df['Weekday'] = df['Date'].dt.day_name()

In [None]:
df.info()

### Step 10: Calculate Overall Time Taken

Compute the overall turnaround time (TAT) in minutes as DateTime TTO Dispensed minus DateTime TTO Received; rows whose Time Taken to Reach Pt (mins) is blank get no TAT.

In [None]:
#df_filtered = df[df['Time Taken to Reach Pt (mins)'] == 'PBH']
#df_filtered

In [None]:
# Turnaround time (TAT) in minutes: dispensed timestamp minus received timestamp.
# Rows whose 'Time Taken to Reach Pt (mins)' is an empty string are left out of
# the dispensed series, so index alignment gives them a NaN TAT.
has_reach_time = df['Time Taken to Reach Pt (mins)'] != ''
tto_received = pd.to_datetime(df['DateTime TTO Received'].astype(str))
tto_dispensed = pd.to_datetime(df.loc[has_reach_time, 'DateTime TTO Dispensed'].astype(str))

elapsed = tto_dispensed - tto_received
df['TAT'] = elapsed.dt.total_seconds() / 60
df

In [None]:
#df[df['Time Taken to Reach Pt (mins)'] != 'NA']['Time TTO Dispensed']

### Step 11: Create Meet KPI column with value yes/no

MNH: Total Time Taken (TAT) of 25 mins or less = Yes; more than 25 mins = No; missing, zero or negative TAT = NA

In [None]:
def f(row, kpi_minutes=25):
    """Classify one row against the turnaround-time KPI.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'TAT' entry
            holding the turnaround time in minutes.
        kpi_minutes: Upper limit, in minutes, for meeting the KPI.
            Defaults to 25.

    Returns:
        'No' when TAT is above ``kpi_minutes``; 'Yes' when TAT is greater
        than 0 and at most ``kpi_minutes``; 'NA' otherwise (zero, negative
        or NaN TAT).
    """
    tat = row['TAT']
    if tat > kpi_minutes:
        return 'No'
    if 0 < tat <= kpi_minutes:
        return 'Yes'
    # NaN fails both comparisons above, so missing TAT ends up here as well.
    return 'NA'

In [None]:
# Apply f to every row (axis=1) and store its 'Yes'/'No'/'NA' result.
df['Meet KPI'] = df.apply(f, axis=1)

In [None]:
df

In [None]:
df.info()

In [None]:
df['Meet KPI'].value_counts()

### Step 12: Create Office Hours column to check if case handled during office hours

Standard Office Hours: 8:00AM to 8:00PM

In [None]:
#df["Time TTO Received"]  = pd.to_datetime(df["Time TTO Received"] , infer_datetime_format=True, errors='coerce')
#df["Time TTO Packing"]  = pd.to_datetime(df["Time TTO Packing"] , infer_datetime_format=True, errors='coerce')
#df["Time TTO Dispensed"]  = pd.to_datetime(df["Time TTO Dispensed"] , infer_datetime_format=True, errors='coerce')

In [None]:
#df["Time TTO Received"]  = df['Time TTO Received'].dt.time
#df["Time TTO Packing"]  = df['Time TTO Packing'].dt.time
#df["Time TTO Dispensed"]  = df['Time TTO Dispensed'].dt.time

In [None]:
# Flag whether a TTO was handled within standard office hours (08:00-20:00).
#   'NA'  - the received or the dispensed timestamp is missing
#   'Yes' - received at or after 08:00 AND dispensed at or before 20:00
#   'No'  - anything else
# The 08:00 lower bound had been commented out (it referred to a column name
# that no longer exists), so any TTO dispensed before 20:00 - including one
# handled in the early morning - was marked 'Yes'.
received_at = df['DateTime TTO Received']
dispensed_at = df['DateTime TTO Dispensed']
opening = pd.to_datetime('08:00:00').time()
closing = pd.to_datetime('20:00:00').time()

missing = received_at.isnull() | dispensed_at.isnull()
within_hours = (received_at.dt.time >= opening) & (dispensed_at.dt.time <= closing)
df['Office Hours'] = np.where(missing, 'NA', np.where(within_hours, 'Yes', 'No'))

df

### Step 13: Create Average Time Taken from (Received to Packing) & (Packing to Dispensed) columns

In [None]:
#Get Duration Difference of Received to Checked
#df['Time Taken (Received to Packing)'] = (pd.to_datetime(df['DateTime TTO Packing'].astype(str)) - pd.to_datetime(df['DateTime TTO Received'].astype(str))).dt.total_seconds().div(60)
#Get Duration Difference of Checked to Dispensed
#df['Time Taken (Packing to Dispensed)'] = (pd.to_datetime(df['DateTime TTO Dispensed'].astype(str)) - pd.to_datetime(df['DateTime TTO Packing'].astype(str))).dt.total_seconds().div(60)

In [None]:
# Duration of each stage, computed as a difference of the timestamp columns.
received_at = df['DateTime TTO Received']
packed_at = df['DateTime TTO Packing']
dispensed_at = df['DateTime TTO Dispensed']

# Received -> Packing
df['Time Taken (Received to Packing)'] = packed_at.sub(received_at)
# Packing -> Dispensed
df['Time Taken (Packing to Dispensed)'] = dispensed_at.sub(packed_at)

In [None]:
# Working frame for the per-day averages of the two stage durations.
# .copy() makes it independent of df: the next cell writes into this frame, and
# writing into a plain column selection triggers SettingWithCopyWarning.
df_avg_time = df[['Date', 'Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)']].copy()
df_avg_time.head()

In [None]:
# Replace missing durations (NaT) with a zero-length timedelta.
# NOTE(review): zeros take part in the daily mean computed next, whereas the
# monthly average later divides by the count of non-missing values - confirm
# that counting missing durations as 0 is intended here.
duration_columns = ['Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)']
df_avg_time[duration_columns] = df_avg_time[duration_columns].fillna(pd.Timedelta(0))

In [None]:
df_avg_time.info()

In [None]:
# Average per calendar day. The grouping key is a derived Series (the date part
# of 'Date'), not the 'Date' column label, and numeric_only=False asks mean()
# to include non-numeric columns.
# NOTE(review): because the key is a separate Series, the 'Date' column itself
# presumably stays in the result and is averaged too - confirm; the later merge
# on 'Date' depends on it.
df_avg_time = df_avg_time.groupby(pd.to_datetime(df_avg_time['Date']).dt.date).mean(numeric_only=False)
df_avg_time

In [None]:
#Rename Columns
#df_avg_time = df_avg_time.rename({'Time Taken (Received to Packing)' : 'Avg Time Taken / Day (Received to Packing)', 'Time Taken (Packing to Dispensed)' : 'Avg Time Taken / Day (Packing to Dispensed)'}, axis=1)

In [None]:
# Discard the group-key index (drop=True: it is not turned into a column).
# NOTE(review): the merge on 'Date' further down needs a 'Date' column to still
# be present after this step - verify.
df_avg_time = df_avg_time.reset_index(drop=True)
df_avg_time

In [None]:
# Truncate the averaged durations to whole seconds (drops sub-second parts).
for duration_column in ('Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)'):
    df_avg_time[duration_column] = df_avg_time[duration_column].dt.floor('s')

df_avg_time

#### Avg Time Taken / Day

In [None]:
# Label the per-day averages so they do not clash with df's per-row columns.
daily_avg_names = {
    'Time Taken (Received to Packing)': 'Avg Time Taken / Day (Received to Packing)',
    'Time Taken (Packing to Dispensed)': 'Avg Time Taken / Day (Packing to Dispensed)',
}
df_avg_time = df_avg_time.rename(columns=daily_avg_names)

In [None]:
# Attach the per-day averages from the groupby to every row sharing that Date.
df = pd.merge(df, df_avg_time, how='left', on='Date')
df

In [None]:
# Turn exact-zero durations (00:00:00) in the time columns into missing values.
columns = ['Time Taken (Received to Packing)', 
               'Time Taken (Packing to Dispensed)', 
               "Avg Time Taken / Day (Received to Packing)", 
               "Avg Time Taken / Day (Packing to Dispensed)"]


for column in columns:
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    df[column] = df[column].replace(pd.Timedelta(0), np.nan)

#### Avg Time Taken / Month

In [None]:
# Month name of each Date; used as the grouping and merge key for the monthly averages.
df['Month']= pd.to_datetime(df['Date']).dt.month_name()

In [None]:
# Monthly average of 'Time Taken (Received to Packing)':
# (sum of the durations in the month) / (number of non-missing durations).
# .copy() because a column is added to this frame right below; adding it to a
# plain column selection of df triggers SettingWithCopyWarning.
dfinal_avg_time = df[['Date', 'Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)']].copy()

dfinal_avg_time['Month'] = pd.to_datetime(dfinal_avg_time['Date']).dt.month_name()

# Total duration per month. The string alias 'sum' is used because passing
# np.sum to agg is deprecated in recent pandas.
dfinal_avg_time = dfinal_avg_time.groupby('Month').agg({'Time Taken (Received to Packing)': 'sum', 'Time Taken (Packing to Dispensed)': 'sum'})

x = dfinal_avg_time['Time Taken (Received to Packing)'] / np.timedelta64(1, 'm')  # total minutes per month
y = df.groupby('Month')['Time Taken (Received to Packing)'].count()               # non-missing rows per month

# Store the average as a timedelta truncated to whole seconds.
# The previous version went through pd.to_datetime(...).strftime('%H:%M:%S'),
# which raises on NaT (a month with no values) and wraps negative or >= 24h
# averages around the clock. A later cell converts this column with
# pd.to_timedelta, which accepts timedelta values unchanged.
dfinal_avg_time['Avg Time Taken / Month (Received to Packing)'] = pd.to_timedelta(x / y, unit='m').dt.floor('s')

In [None]:
# Monthly average of 'Time Taken (Packing to Dispensed)':
# (sum of the durations in the month) / (number of non-missing durations).
a = dfinal_avg_time['Time Taken (Packing to Dispensed)'] / np.timedelta64(1, 'm')  # total minutes per month
b = df.groupby('Month')['Time Taken (Packing to Dispensed)'].count()               # non-missing rows per month

# Store the average as a timedelta truncated to whole seconds.
# The previous version went through pd.to_datetime(...).strftime('%H:%M:%S'),
# which raises on NaT (a month with no values) and wraps negative or >= 24h
# averages around the clock. A later cell converts this column with
# pd.to_timedelta, which accepts timedelta values unchanged.
dfinal_avg_time['Avg Time Taken / Month (Packing to Dispensed)'] = pd.to_timedelta(a / b, unit='m').dt.floor('s')

In [None]:
# Drop the two monthly-sum columns (the first two columns of this frame),
# keeping only the 'Avg Time Taken / Month ...' columns.
monthly_sum_columns = ['Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)']
dfinal_avg_time = dfinal_avg_time.drop(columns=monthly_sum_columns)

In [None]:
# Attach the monthly averages to every row of the matching Month.
df = pd.merge(df, dfinal_avg_time, how='left', on='Month')
df

In [None]:
# Make sure both monthly-average columns are stored as timedeltas.
for monthly_column in ('Avg Time Taken / Month (Received to Packing)',
                       'Avg Time Taken / Month (Packing to Dispensed)'):
    df[monthly_column] = pd.to_timedelta(df[monthly_column])

In [None]:
# Minutes from TTO received to TTO packing.
# NOTE(review): 'DateTime TTO Packing' was built with pd.to_datetime, so the
# comparison with '' presumably keeps every row - confirm whether the filter is
# still needed.
packing_present = df['DateTime TTO Packing'] != ''
tto_received = pd.to_datetime(df['DateTime TTO Received'].astype(str))
tto_packing = pd.to_datetime(df.loc[packing_present, 'DateTime TTO Packing'].astype(str))

df['Time Taken to Pack'] = (tto_packing - tto_received).dt.total_seconds() / 60

#### Replace negative values with empty (NaN), since those rows are PBH

In [None]:
# Keep values >= 0; negative (and already-missing) values become NaN.
pack_minutes = df['Time Taken to Pack']
df['Time Taken to Pack'] = pack_minutes.where(pack_minutes >= 0)

In [None]:
# Write 'PBH' into 'Time Taken to Pack' wherever the source column says PBH.
# The result is assigned back explicitly: mask(..., inplace=True) on
# df['col'] is a chained in-place update, which does not modify df under
# pandas Copy-on-Write. The column is cast to object first because it will
# hold both numbers and the text 'PBH'.
is_pbh = df['Time Taken To Pack (mins)'] == 'PBH'
df['Time Taken to Pack'] = df['Time Taken to Pack'].astype(object).mask(is_pbh, 'PBH')

In [None]:
df

In [None]:
df.info()

In [None]:
# Fill missing stage durations with the text 'PBH'
# (Received -> Packing and Packing -> Dispensed).
for duration_column in ('Time Taken (Received to Packing)', 'Time Taken (Packing to Dispensed)'):
    df[duration_column] = df[duration_column].fillna('PBH')

In [None]:
#df[["Time TTO Received", "Time TTO Dispensed"]] = df[["Time TTO Received", "Time TTO Dispensed"]].apply(pd.to_datetime(format='%H:%M') - pd.to_datetime(format='%H:%M').dt.normalize())


In [None]:
#df["Time TTO Received"] = pd.to_datetime(df["Time TTO Received"],format= '%H:%M:%S')
#df["Time TTO Received"] =df["Time TTO Received"].strptime('%d-%b-%y %I.%M.%S.%f %p')
#df["Time TTO Received"] = pd.to_timedelta(df["Time TTO Received"].dt.strftime('%H:%M:%S'))

In [None]:
#df["Time TTO Received"]  = pd.to_datetime(df["Time TTO Received"], format='%H:%M') - pd.to_datetime(df["Time TTO Received"], format='%H:%M').dt.normalize()

In [None]:
#df["Time TTO Received"] = pd.Series([pd.Timedelta(minutes=i) for i in range(0,100,5)])
#df["Time TTO Received"]= df["Time TTO Received"].astype(str).str.split('0 days ').str[-1]

In [None]:
#df["Time TTO Received"]  = pd.to_datetime(df["Time TTO Received"] , errors='coerce')

### Step 14: Map Room/Bed No. to Ward No.

Match Room/Bed No. to the corresponding Ward No.

#### Combining Excel Tabs for MNH Ward Dataset

In [None]:
# Read every tab of the wards workbook (sheet_name=None loads all sheets)
# and stack them into one frame with a fresh 0..n-1 index.
f_ward = '../../Data/Pharmacy Dept/MNH TTO Data/MNH Wards.xlsx'
df_ward = pd.read_excel(f_ward, sheet_name=None)
dfinal_ward = pd.concat(df_ward, ignore_index=True)

dfinal_ward.head()

In [None]:
dfinal_ward

In [None]:
dfinal_ward.info()

In [None]:
# concat with the original ward df
#dfinal_ward = pd.concat([dfinal_ward, df_invalid_ward], ignore_index=True, sort=False)
#dfinal_ward

In [None]:
# Save the combined ward table as CSV, with a header row and no index column.
dfinal_ward.to_csv(
    "../../Data/Pharmacy Dept/MNH TTO Data/MNH Wards Merged.csv",
    index=False,
    header=True,
)

#### Merge Ward No. based on Room/Bed No.

In [None]:
# Reload the combined ward table written above; df_ward is now a single DataFrame.
df_ward = pd.read_csv("../../Data/Pharmacy Dept/MNH TTO Data/MNH Wards Merged.csv")

In [None]:
# Look up each row's ward by matching 'Room/Bed' against the ward table's
# 'Room No'. Left join: rows without a match are kept, with missing ward columns.
dfinal = df.merge(df_ward, how='left', left_on='Room/Bed', right_on='Room No')

dfinal.head()

In [None]:
# Rows for which the ward lookup found no match.
ward_missing = dfinal['Ward'].isna()
df_x = dfinal.loc[ward_missing]

In [None]:
#fixing values in Bed no. that are actually referring to the Ward, and match its corresponding in the Ward column
#dfinal.loc[(dfinal['Room/Bed'].str.contains('eye', case=False, na=False)) | (dfinal['Room/Bed'].str.contains('edw', case=False)), 'Ward'] = 'Eyedayward'
#dfinal.loc[(dfinal['Room/Bed'].str.contains('DS', case=False, na=False)), 'Ward'] = 'DS'
#dfinal.loc[(dfinal['Room/Bed'].str.contains('endo', case=False, na=False)), 'Ward'] = 'Endo'

In [None]:
# Load the bed-number corrections file and keep only the lookup columns.
invalid_ward_raw = pd.read_csv("../../Data/Pharmacy Dept/MNH TTO Data/MNH_invalid_BedNo_edit.csv")
df_invalid_ward = invalid_ward_raw[['Case Number', 'Room No', 'Ward']]
df_invalid_ward

In [None]:
df_invalid_ward.info()

In [None]:
#dfinal_1['X'] = np.where(dfinal['Case Number'] == df_invalid_ward['Case Number'], 'True', 'False')

In [None]:
# For the rows with no ward match, take 'Room No' and 'Ward' from the
# corrections table, joined on 'Case Number' (inner join). The empty '_x'
# columns from the failed lookup are dropped and the '_y' columns from the
# corrections table take over their names.
dfinal_1 = (
    df_x.merge(df_invalid_ward, on='Case Number')
    .drop(columns=['Room No_x', 'Ward_x'])
    .rename(columns={'Room No_y': 'Room No', 'Ward_y': 'Ward'})
)
dfinal_1

In [None]:
# Keep only the rows whose ward lookup succeeded.
dfinal = dfinal.dropna(subset=['Ward'])
dfinal

In [None]:
# Append the corrected rows (dfinal_1) to the rows already in dfinal, with a
# fresh 0..n-1 index; sort=False leaves the column order unsorted.
dfinal = pd.concat([dfinal, dfinal_1], ignore_index=True, sort=False)
dfinal

In [None]:
# 'Room No' was only the join key from the ward tables; remove it.
dfinal.drop(columns=['Room No'], inplace=True)

In [None]:
dfinal

## Step 15: Export CLEANED to csv

In [None]:
# Write the cleaned dataset to CSV, with a header row and no index column.
dfinal.to_csv(
    "../../Data/Pharmacy Dept/Data Cleaning/MNH_Data_Cleaned.csv",
    index=False,
    header=True,
)