### Import libraries

In [78]:
import os
import pandas as pd
import numpy as np


### Define functions and file pathway for collecting excel files

In [79]:
# define the directory for the files
directory = r"C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24"

In [80]:
# function to combine data from Excel files
def combine_excel_files(input_directory, sheet_name, skiprows, nrows, usecols):
    all_data = []
    for file in os.listdir(input_directory):
        if file.endswith('.xlsx'):
            try:
                file_path = os.path.join(input_directory, file)
                print(f"Processing file: {file_path}")
                data = pd.read_excel(file_path, sheet_name=sheet_name, 
                                     skiprows=skiprows, nrows=nrows, usecols=usecols).assign(**{'File Name': file})
                all_data.append(data)
            except ValueError as e:
                print(f"ValueError in file: {file} - {e}")
            except Exception as e:
                print(f"Error in file: {file} - {e}")
    return pd.concat(all_data, ignore_index=True)



# function to extract the health baord  from the file name
def extract_hb(data):
    try:
        parts = data.split(' ')
        health_board = parts[1]
        return health_board
    except IndexError:
        # Handle cases where the format is not as expected
        return 'Unknown'


### Ingest data from 2. Turnaround sheet

#### General Histopathology Specimen Specific Turnaround Times

In [81]:
# assign the workload data to dataframe using the defined function
specimen_specific_df = combine_excel_files(directory, '2. Turnaround', skiprows=9, nrows=6, usecols=[1,2,3,4,5,6,7,8,9,10,11])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [82]:
specimen_specific_df = specimen_specific_df.rename(columns={'Unnamed: 1': 'Category'})

In [83]:
# unpivot the dataframe to have a single value column

melted_specimen_specific_df = pd.melt(specimen_specific_df,
                                      id_vars=['Category', 'File Name'],
                                      value_vars=['Diagnostic Breast Core Needle Biopsies',
                                                  'Diagnostic Colorectal Endoscopy Biopsies',
                                                  'Diagnostic Prostate Needle Core Biopsies', 'Gallbladder Resections',
                                                  'Rectal Anterior Resections',
                                                  'Breast Wide Local Excisions (incl nodes)', 
                                                  'Lung Biopsies',
                                                  'Duodenal Biopsies', 
                                                  'Vocal Cord  Biopsy', 
                                                  'Laryngectomy Resections'],
                                      var_name='Type',
                                      value_name='Value')
                                      

In [84]:
# add area column
melted_specimen_specific_df['Area'] = 'General Histopathology Specimen Specific Turnaround Times'

In [85]:
# drop na and 0 rows
melted_specimen_specific_df = melted_specimen_specific_df[melted_specimen_specific_df['Value'] != 0]
melted_specimen_specific_df = melted_specimen_specific_df.dropna(subset='Value')

#### General Histopathology Percentage Turnaround Times

In [86]:
# assign the workload data to dataframe using the defined function
percentage_based_df = combine_excel_files(directory, '2. Turnaround', skiprows=20, nrows=6, usecols=[1,2,3,4,5])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [87]:
percentage_based_df = percentage_based_df.rename(columns={'               ': 'Category',
                                                          'Cervical Screening: \n336 hours (14 Calendar Days)': 'Cervical Screening:336 hours (14 Calendar Days)',
                                                          'Cervical Cytology:\n336 hours (14 Calendar Days)': 'Cervical Cytology:336 hours (14 Calendar Days)'})

In [88]:
# unpivot the dataframe to have a single value column
melted_percentage_based_df = pd.melt(percentage_based_df,
                              id_vars=['Category', 'File Name'],
                              value_vars=['72 hours (3 Calendar Days)',
                                         '168 hours (7 Calendar Days)',
                                         'Cervical Screening:336 hours (14 Calendar Days)' ,
                                         'Cervical Cytology:336 hours (14 Calendar Days)'],
                              var_name='Type',
                              value_name='Value')
                                      

In [89]:
# add area column
melted_percentage_based_df['Area'] = 'General Histopathology Percentage Turnaround Times'

In [90]:
# drop na and 0 rows
melted_percentage_based_df = melted_percentage_based_df[melted_percentage_based_df['Value'] != 0]
melted_percentage_based_df = melted_percentage_based_df.dropna(subset='Value')

#### Non-Cervical/Diagnostic Cytopathology Percentage Turnaround Times

In [91]:
# assign the workload data to dataframe using the defined function
non_cerv_percentage_based_1_df = combine_excel_files(directory, '2. Turnaround', skiprows=31, nrows=2, usecols=[1,2,3])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [92]:
# assign the workload data to dataframe using the defined function
non_cerv_percentage_based_2_df = combine_excel_files(directory, '2. Turnaround', skiprows=31, nrows=2, usecols=[4])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [93]:
# rename column appropriately
non_cerv_percentage_based_1_df = non_cerv_percentage_based_1_df.rename(columns={'Unnamed: 1': 'Category'})

In [94]:
# unpivot the dataframe to have a single value column
melted_non_cerv_percentage_based_1_df = pd.melt(non_cerv_percentage_based_1_df,
                                              id_vars=['Category', 'File Name'],
                                              value_vars=['24 hours (1 Calendar Day)',
                                                          '72 hours (3 Calendar Days)'],
                                              var_name='Type',
                                              value_name='Value')
                                      

In [95]:
# drop na values
non_cerv_percentage_based_2_df = non_cerv_percentage_based_2_df.dropna(subset='Total Number of Requests')

In [96]:
# unpivot the dataframe to have a single value column
melted_non_cerv_percentage_based_2_df = pd.melt(non_cerv_percentage_based_2_df,
                                              id_vars=['File Name'],
                                              value_vars=['Total Number of Requests'],
                                              var_name='Type',
                                              value_name='Value')
                                      

In [97]:
# assign category columnn
melted_non_cerv_percentage_based_2_df['Category'] = 'Total number of requests reported within'

In [98]:
# merge the two dataframes into one
melted_non_cerv_df = pd.concat([melted_non_cerv_percentage_based_1_df,
                                melted_non_cerv_percentage_based_2_df])

In [99]:
# drop na and values of 0
melted_non_cerv_df = melted_non_cerv_df.dropna(subset='Value')
melted_non_cerv_df = melted_non_cerv_df[melted_non_cerv_df['Value'] != 0]

In [100]:
# add area column
melted_non_cerv_df['Area'] = 'Non-Cervical/Diagnostic Cytopathology Percentage Turnaround Times'

#### Autopsy/Post Mortem QIS Turnaround Times

In [101]:
# assign the workload data to dataframe using the defined function
autopsy_initial_df = combine_excel_files(directory, '2. Turnaround', skiprows=40, nrows=3, usecols=[1,2])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [102]:
autopsy_final_df = combine_excel_files(directory, '2. Turnaround', skiprows=44, nrows=3, usecols=[1,2])

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [103]:
autopsy_initial_df = autopsy_initial_df.rename(columns={'Initial Reports': 'Type',
                                                'Unnamed: 2': 'Value'})


autopsy_final_df = autopsy_final_df.rename(columns={'Final Reports': 'Type',
                                              'Unnamed: 2': 'Value'})

In [104]:
autopsy_initial_df['Category'] = 'Initial Reports'
autopsy_final_df['Category'] = 'Final Reports'

In [105]:
autopsy_initial_df['Source Sheet'] = 'Sheet2_4_AutopsyInitial' 
autopsy_final_df['Source Sheet'] = 'Sheet2_5_AutopsyFinal' 

In [106]:
autopsy_df = pd.concat([autopsy_initial_df,
                        autopsy_final_df])

In [107]:
autopsy_df['Area'] = 'Autopsy/Post Mortem QIS Turnaround Times'

In [108]:
autopsy_df = autopsy_df[autopsy_df['Value'] != 0]
autopsy_df = autopsy_df.dropna(subset='Value')

In [109]:
melted_specimen_specific_df['Source Sheet'] = 'Sheet2_1_HistoSpecimenSpecific' 
melted_percentage_based_df['Source Sheet'] = 'Sheet2_2_HistoPercent'
melted_non_cerv_df['Source Sheet'] = 'Sheet2_3_NonCervical'

### Merge the dataframes

In [110]:
merged_df = pd.concat([melted_specimen_specific_df,
                      melted_percentage_based_df,
                      melted_non_cerv_df,
                      autopsy_df])

In [111]:
merged_df = merged_df.rename(columns={'File Name': 'Source File'}) 

In [112]:
# extract health board from file name
merged_df['Health Board'] = merged_df['Source File'].apply(extract_hb)

In [113]:
merged_df['Year'] = '23/23'

In [114]:
# rename health boards with short code
merged_df = merged_df.replace({'Fife': 'FIFE',
                               'Forth': 'FV',
                               'Grampian': 'GRAM',
                               'Highland': 'HIGH',
                               'Lanarkshire': 'LAN',
                               'Lothian': 'LOTH',
                               'Tayside': 'TAY'})

In [115]:
# define new order for columns
column_order = ['Source File',
               'Source Sheet',
               'Health Board',
               'Year',
               'Area',
               'Category',
               'Type',
               'Value']


# apply new column order
merged_df = merged_df[column_order]

In [116]:
merged_df

Unnamed: 0,Source File,Source Sheet,Health Board,Year,Area,Category,Type,Value
0,NHS A&A SPAN 2023-24.xlsx,Sheet2_1_HistoSpecimenSpecific,A&A,23/23,General Histopathology Specimen Specific Turna...,Mean turnaround time (hours),Diagnostic Breast Core Needle Biopsies,121.46
1,NHS A&A SPAN 2023-24.xlsx,Sheet2_1_HistoSpecimenSpecific,A&A,23/23,General Histopathology Specimen Specific Turna...,Total number on which the mean turnaround is b...,Diagnostic Breast Core Needle Biopsies,967.0
2,NHS A&A SPAN 2023-24.xlsx,Sheet2_1_HistoSpecimenSpecific,A&A,23/23,General Histopathology Specimen Specific Turna...,Number reported within 7 calendar days,Diagnostic Breast Core Needle Biopsies,867.0
6,NHS D&G SPAN 2023-24.xlsx,Sheet2_1_HistoSpecimenSpecific,D&G,23/23,General Histopathology Specimen Specific Turna...,Mean turnaround time (hours),Diagnostic Breast Core Needle Biopsies,111.6
7,NHS D&G SPAN 2023-24.xlsx,Sheet2_1_HistoSpecimenSpecific,D&G,23/23,General Histopathology Specimen Specific Turna...,Total number on which the mean turnaround is b...,Diagnostic Breast Core Needle Biopsies,241.0
...,...,...,...,...,...,...,...,...
19,NHS Highland SPAN 2023-24.xlsx,Sheet2_5_AutopsyFinal,HIGH,23/23,Autopsy/Post Mortem QIS Turnaround Times,Final Reports,Total number of requests,8.0
20,NHS Highland SPAN 2023-24.xlsx,Sheet2_5_AutopsyFinal,HIGH,23/23,Autopsy/Post Mortem QIS Turnaround Times,Final Reports,% available within 30 Working Days (Target 90%),0.375
24,NHS Lothian SPAN 2023-24.xlsx,Sheet2_5_AutopsyFinal,LOTH,23/23,Autopsy/Post Mortem QIS Turnaround Times,Final Reports,Total number of requests within 30 Working Days,17.0
25,NHS Lothian SPAN 2023-24.xlsx,Sheet2_5_AutopsyFinal,LOTH,23/23,Autopsy/Post Mortem QIS Turnaround Times,Final Reports,Total number of requests,169.0


In [117]:
merged_df.to_csv('cleaned_data/turnaround.csv', index=False)