### Import libraries

In [38]:
# import necessary libraries
import os
import pandas as pd
import numpy as np


### Define functions and the file path for collecting Excel files

In [39]:
# define the directory holding the Excel files to ingest
# NOTE(review): this is an absolute, user-specific Windows path — update it before running elsewhere
directory = r"C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24"

In [40]:
def combine_excel_files(input_directory, sheet_name, skiprows, nrows, usecols):
    """Function to combine data from Excel files into a single dataframe

    Every ``.xlsx`` file in ``input_directory`` is read with the same
    sheet/row/column selection, tagged with a 'File Name' column holding the
    name of the file it came from, and the per-file dataframes are stacked.
    Files that cannot be read are reported with ``print`` and skipped.

    Parameters:
    input_directory (string): defines where to retrieve the files from
    sheet_name (string): defines which sheet in the files to retrieve data from
    skiprows (int): defines how many rows to skip before retrieving data
    nrows (int): defines how many rows to retrieve data for
    usecols (list of int): defines which column positions to retrieve data from
        (passed straight through to ``pd.read_excel``)

    Returns:
    DataFrame: the result of combined dataframes for each excel file; an empty
    DataFrame when no file in the directory could be read"""
    # define empty list for storing data
    all_data = []
    # iterate over each file; sorted because os.listdir returns entries in
    # arbitrary order and a stable order keeps the combined rows reproducible
    for file in sorted(os.listdir(input_directory)):
        # retrieve only excel files in the directory (extension compared
        # case-insensitively so '.XLSX' is picked up too)
        if not file.lower().endswith('.xlsx'):
            continue
        # '~$' prefixed files are the temporary lock files Excel leaves next
        # to an open workbook; they are not readable workbooks
        if file.startswith('~$'):
            continue

        # join file name with directory
        file_path = os.path.join(input_directory, file)
        # print file name currently being processed
        print(f"Processing file: {file_path}")

        try:
            # assign file to a dataframe
            data = pd.read_excel(file_path,
                                 sheet_name=sheet_name,
                                 skiprows=skiprows,
                                 nrows=nrows,
                                 usecols=usecols
                                 ).assign(**{'File Name': file})  # add file name column
        except ValueError as e:
            # print error message and file name
            print(f"ValueError in file: {file} - {e}")
        except Exception as e:
            # print exception message and file name
            print(f"Error in file: {file} - {e}")
        else:
            # append dataframe to list only when the read succeeded
            all_data.append(data)

    # pd.concat raises "No objects to concatenate" on an empty list, so
    # report the situation and hand back an empty dataframe instead
    if not all_data:
        print(f"No Excel data could be read from: {input_directory}")
        return pd.DataFrame()

    # concatenate all of the dataframes into a single dataframe
    return pd.concat(all_data, ignore_index=True)



# function to extract the health board from the file name
def extract_hb(data):
    """Return the second space-separated token of a file name.

    Parameters:
    data (string): the file name, e.g. 'NHS Fife SPAN 2023-24.xlsx'

    Returns:
    string: the token after the first space ('Fife' in the example above),
    or 'Unknown' when the name contains no space"""
    tokens = data.split(' ')
    # a name without any space has no second token to return
    if len(tokens) < 2:
        return 'Unknown'
    return tokens[1]


### Ingest data from 1. Workload & Production sheet

#### Histopathology

In [None]:
# read the histopathology rows of the workload sheet from every file
histopathology_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=8,
    nrows=7,
    usecols=[1, 2, 3, 4],
)

In [42]:
# reshape from one column per activity to one row per Type / File Name / Category
histopathology_melted_df = histopathology_df.melt(
    id_vars=['Type', 'File Name'],
    value_vars=['General Activity', 'Neuropathology Activity', 'Paediatric Activity'],
    var_name='Category',
    value_name='Value',
)
    

In [43]:
# label every row with the area and the source sheet/table for this dataframe
histopathology_melted_df = histopathology_melted_df.assign(
    **{'Area': 'Histopathology', 'Source Sheet': 'Sheet1_1_Histo'}
)


#### Cytopathology

In [None]:
# read the cytopathology rows of the workload sheet from every file
cytopathology_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=20,
    nrows=13,
    usecols=[1, 2, 3, 4],
)

In [45]:
# reshape from one column per activity to one row per Type / File Name / Category
cytopathology_melted_df = cytopathology_df.melt(
    id_vars=['Type', 'File Name'],
    value_vars=[
        'Non-Cervical/ Diagnostic Activity (incorporating any Paediatric Pathology and Neuropathology)',
        'Andrology Activity',
        'Cervical Screening Activity',
    ],
    var_name='Category',
    value_name='Value',
)

In [46]:
# label every row with the area and the source sheet/table for this dataframe
cytopathology_melted_df = cytopathology_melted_df.assign(
    **{'Area': 'Cytopathology', 'Source Sheet': 'Sheet1_2_Cyto'}
)

#### Electron Microscopy

###### Health boards collect different information for this section, so they are separated into two groups for ingestion

In [None]:
# read the electron microscopy rows as laid out for the group 1 boards
electron_1_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=37,
    nrows=3,
    usecols=[1, 2],
)

In [48]:
# keep only the rows that came from the group 1 board files
electron_1_df = electron_1_df[electron_1_df['File Name'].isin([
    'NHS D&G SPAN 2023-24.xlsx',
    'NHS Fife SPAN 2023-24.xlsx',
    'NHS Forth Valley SPAN 2023-24.xlsx',
    'NHS Highland SPAN 2023-24.xlsx',
    'NHS Lanarkshire SPAN 2023-24.xlsx',
])]

In [49]:
# remove the columns that are not wanted in the group 1 dataframe
electron_1_df = electron_1_df.drop(
    labels=[
        '*General Histopathology EM figures should exclude Neuro/Paediatric activity that should be recorded in adjacent columns unless otherwise stated\n*Where Neuro/Paediatric cannot be identified separately please include in General Histopathology Electron Microscopy but flag that this is the case',
        'Unnamed: 2',
    ],
    axis='columns',
)

In [50]:
# every group 1 row belongs to the same category
electron_1_df = electron_1_df.assign(Category='General Activity')

In [51]:
# the single activity column becomes the generic 'Value' column
electron_1_df = electron_1_df.rename({'General Activity': 'Value'}, axis='columns')

In [None]:
# read the electron microscopy rows as laid out for the group 2 boards
electron_2_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=38,
    nrows=3,
    usecols=[1, 2, 3, 4],
)

In [53]:
# keep only the rows that came from the group 2 board files
electron_2_df = electron_2_df[electron_2_df['File Name'].isin([
    'NHS A&A SPAN 2023-24.xlsx',
    'NHS GG&C SPAN 2023-24.xlsx',
    'NHS Grampian SPAN 2023-24.xlsx',
    'NHS Lothian SPAN 2023-24.xlsx',
    'NHS Tayside SPAN 2023-24.xlsx',
])]

In [54]:
# remove the columns that are not wanted in the group 2 dataframe
# (note that one of the labels is the integer 0, not a string)
electron_2_df = electron_2_df.drop(
    labels=['Requests', 0, 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis='columns',
)

In [56]:
# reshape from one column per activity to one row per Type / File Name / Category
electron_2_melted_df = electron_2_df.melt(
    id_vars=['Type', 'File Name'],
    value_vars=['General Activity', 'Neuropathology Activity', 'Paediatric Activity'],
    var_name='Category',
    value_name='Value',
)
    

In [57]:
# stack the group 2 rows on top of the group 1 rows
electron_melted_df = pd.concat(objs=[electron_2_melted_df, electron_1_df], axis=0)

In [58]:
# label every row with the area and the source sheet/table for this dataframe
electron_melted_df = electron_melted_df.assign(
    **{'Area': 'Electron Microscopy', 'Source Sheet': 'Sheet1_3_Electron'}
)

#### Autopsy

###### Because electron microscopy is collected differently across boards, the autopsy table placed directly beneath it is also affected.
###### Boards are therefore ingested in the same two groups again.

In [None]:
# read the autopsy rows as laid out for the group 1 boards
autopsy_1_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=44,
    nrows=3,
    usecols=[1, 2],
)

In [None]:
# read the autopsy rows as laid out for the group 2 boards (one row further down)
autopsy_2_df = combine_excel_files(
    input_directory=directory,
    sheet_name='1. Workload & Production',
    skiprows=45,
    nrows=3,
    usecols=[1, 2],
)

In [61]:
# keep only the rows that came from the group 1 board files
autopsy_1_df = autopsy_1_df[autopsy_1_df['File Name'].isin([
    'NHS D&G SPAN 2023-24.xlsx',
    'NHS Fife SPAN 2023-24.xlsx',
    'NHS Forth Valley SPAN 2023-24.xlsx',
    'NHS Highland SPAN 2023-24.xlsx',
    'NHS Lanarkshire SPAN 2023-24.xlsx',
])]


# keep only the rows that came from the group 2 board files
autopsy_2_df = autopsy_2_df[autopsy_2_df['File Name'].isin([
    'NHS A&A SPAN 2023-24.xlsx',
    'NHS GG&C SPAN 2023-24.xlsx',
    'NHS Grampian SPAN 2023-24.xlsx',
    'NHS Lothian SPAN 2023-24.xlsx',
    'NHS Tayside SPAN 2023-24.xlsx',
])]

In [62]:
# remove the columns that are not wanted in the group 1 dataframe
autopsy_1_df = autopsy_1_df.drop(
    labels=[
        'ALL AUTOPSY (incorporating General Pathology, Neuropathology, Paediatric Pathology and Fiscal) ',
        'Unnamed: 2',
    ],
    axis='columns',
)


# remove the columns that are not wanted in the group 2 dataframe
# (several of the labels are integers, not strings)
autopsy_2_df = autopsy_2_df.drop(
    labels=[
        'Non Fiscal Adult Post Mortems (incl. BRAIN only)',
        2,
        7,
        4,
        8,
        'Unnamed: 2',
    ],
    axis='columns',
)

In [63]:
# every group 1 row belongs to the same category
autopsy_1_df = autopsy_1_df.assign(Category='Autopsy Activity')


# every group 2 row belongs to the same category
autopsy_2_df = autopsy_2_df.assign(Category='Autopsy Activity')

In [64]:
# the activity column becomes the generic 'Value' column (group 1)
autopsy_1_df = autopsy_1_df.rename({'Autopsy Activity': 'Value'}, axis='columns')


# the activity column becomes the generic 'Value' column (group 2)
autopsy_2_df = autopsy_2_df.rename({'Autopsy Activity': 'Value'}, axis='columns')

In [65]:
# stack the group 1 rows on top of the group 2 rows
autopsy_df = pd.concat(objs=[autopsy_1_df, autopsy_2_df], axis=0)

In [66]:
# label every row with the area and the source sheet for this dataframe
autopsy_df = autopsy_df.assign(
    **{'Area': 'Autopsy', 'Source Sheet': 'Sheet1_4_Autopsy'}
)

#### Merge the dataframes

In [67]:
# stack the four area dataframes into one
merged_df = pd.concat(
    objs=[
        histopathology_melted_df,
        cytopathology_melted_df,
        electron_melted_df,
        autopsy_df,
    ],
    axis=0,
)

In [68]:
# every row carries the same year label
merged_df = merged_df.assign(Year='23/24')

In [69]:
# derive the health board for each row from its file name
merged_df['Health Board'] = [extract_hb(file_name) for file_name in merged_df['File Name']]

In [70]:
# rename health boards with short code
# Only the 'Health Board' column is recoded: a dataframe-wide replace would also
# rewrite any cell in another column that happens to hold one of these exact strings.
# 'Forth' is listed because only the second word of the file name is extracted,
# so 'NHS Forth Valley ...' arrives here as 'Forth'.
merged_df['Health Board'] = merged_df['Health Board'].replace({'Fife': 'FIFE',
                                                               'Forth': 'FV',
                                                               'Grampian': 'GRAM',
                                                               'Highland': 'HIGH',
                                                               'Lanarkshire': 'LAN',
                                                               'Lothian': 'LOTH',
                                                               'Tayside': 'TAY'})

In [71]:
# the file name column is published as 'Source File'
merged_df = merged_df.rename({'File Name': 'Source File'}, axis='columns')

In [72]:
# the columns to keep, in the order they should appear in the output
column_order = [
    'Source File', 'Source Sheet', 'Health Board', 'Year',
    'Area', 'Category', 'Type', 'Value',
]


# select the columns in that order
merged_df = merged_df.loc[:, column_order]

In [73]:
# discard rows whose value is 0, then any remaining row with a missing entry in any column
merged_df = merged_df.loc[merged_df['Value'].ne(0.0)].dropna()

In [74]:
# write the cleaned data to csv without the index column
# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('cleaned_data', exist_ok=True)
merged_df.to_csv('cleaned_data/workload_production.csv', index=False)