### Import libraries

In [50]:
import os
import pandas as pd
import numpy as np


### Define functions and file path for collecting Excel files

In [51]:
# Folder that is scanned for the return workbooks. The default is a
# machine-specific OneDrive path; set the SPAN_RETURNS_DIR environment
# variable to point the notebook at the files on another machine.
directory = os.environ.get(
    "SPAN_RETURNS_DIR",
    r"C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24",
)

In [52]:
# function to combine data from Excel files
def combine_excel_files(input_directory, sheet_name, skiprows, nrows, usecols):
    """Read the same cell range from every .xlsx workbook in a folder and stack the results.

    Parameters
    ----------
    input_directory : str or path-like
        Folder scanned (non-recursively) for files ending in '.xlsx'.
    sheet_name, skiprows, nrows, usecols
        Passed unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Rows from every workbook that could be read, with a fresh 0..n-1
        index and an extra 'File Name' column holding the workbook's name.

    Raises
    ------
    ValueError
        If no workbook in the folder could be read.
    """
    all_data = []
    failed_files = []
    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order of the combined frame reproducible between runs.
    for file in sorted(os.listdir(input_directory)):
        # Skip non-workbooks and the '~$...' lock files Excel leaves next to
        # a workbook that is currently open (they are not readable workbooks).
        if not file.endswith('.xlsx') or file.startswith('~$'):
            continue
        file_path = os.path.join(input_directory, file)
        print(f"Processing file: {file_path}")
        try:
            data = pd.read_excel(file_path, sheet_name=sheet_name,
                                 skiprows=skiprows, nrows=nrows,
                                 usecols=usecols).assign(**{'File Name': file})
        except ValueError as e:
            # Best effort: report the problem and carry on with the other files.
            print(f"ValueError in file: {file} - {e}")
            failed_files.append(file)
        except Exception as e:
            print(f"Error in file: {file} - {e}")
            failed_files.append(file)
        else:
            all_data.append(data)
    if not all_data:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No Excel data could be read from {input_directory} "
            f"(sheet {sheet_name!r}); files that failed: {failed_files}"
        )
    return pd.concat(all_data, ignore_index=True)



# function to extract the health board from the file name
def extract_hb(data):
    """Return the second whitespace-separated word of a file name.

    For a name such as 'NHS Fife SPAN 2023-24.xlsx' this is 'Fife'. Only one
    word is returned, so 'NHS Forth Valley SPAN 2023-24.xlsx' gives 'Forth'.

    Returns 'Unknown' when `data` is not a string (e.g. NaN or None) or has
    fewer than two words.
    """
    if not isinstance(data, str):
        # Missing values would otherwise fail with AttributeError on .split.
        return 'Unknown'
    # split() with no argument collapses repeated and leading whitespace, so
    # a doubled space cannot yield an empty token.
    parts = data.split()
    if len(parts) < 2:
        # Handle cases where the format is not as expected
        return 'Unknown'
    return parts[1]


### Ingest data from 5. Medical Staffing sheet

#### Consultants

In [53]:
# Load the Consultants range of the '5. Medical Staffing' sheet from every
# workbook: skip the first 8 rows, read 11 rows, keep column positions 1-8.
consultants_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=8,
    nrows=11,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [54]:
# Label the unnamed first column and rename the file-name column.
consultants_df = consultants_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [55]:
# Remove rows whose Sub-Category is exactly 'Consultants'
# (presumably the table's heading row - confirm against the workbook).
consultants_df = consultants_df.loc[consultants_df['Sub-Category'].ne('Consultants')]

In [56]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_consultants_df = consultants_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [57]:
# Tag every row with the staffing category this table represents.
melted_consultants_df = melted_consultants_df.assign(Category='Consultants')

In [58]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_consultants_df = (
    melted_consultants_df
    .loc[melted_consultants_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [59]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_consultants_df = melted_consultants_df.assign(**{'Source Sheet': 'Sheet5_1_ConsultantsNHS'})

#### Non-Consultants (SAS)

In [60]:
# Load the Non-Consultants (SAS) range of the '5. Medical Staffing' sheet from
# every workbook: skip the first 20 rows, read 11 rows, keep column positions 1-8.
non_consultants_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=20,
    nrows=11,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [61]:
# Label the unnamed first column and rename the file-name column.
non_consultants_df = non_consultants_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [62]:
# Remove rows whose Sub-Category is exactly 'Non-Consultant (SAS)'
# (presumably the table's heading row - confirm against the workbook).
non_consultants_df = non_consultants_df.loc[non_consultants_df['Sub-Category'].ne('Non-Consultant (SAS)')]

In [63]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_non_consultants_df = non_consultants_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [64]:
# Tag every row with the staffing category this table represents.
melted_non_consultants_df = melted_non_consultants_df.assign(Category='Non-Consultants (SAS)')

In [65]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_non_consultants_df = (
    melted_non_consultants_df
    .loc[melted_non_consultants_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [66]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_non_consultants_df = melted_non_consultants_df.assign(**{'Source Sheet': 'Sheet5_1_NonConsultantsNHS'})

#### University Consultants

In [67]:
# Load the University Consultants range of the '5. Medical Staffing' sheet from
# every workbook: skip the first 41 rows, read 11 rows, keep column positions 1-8.
university_consultants_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=41,
    nrows=11,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [68]:
# Label the unnamed first column and rename the file-name column.
university_consultants_df = university_consultants_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [69]:
# Remove rows whose Sub-Category is exactly 'Consultants'
# (presumably the table's heading row - confirm against the workbook).
university_consultants_df = university_consultants_df.loc[
    university_consultants_df['Sub-Category'].ne('Consultants')
]

In [70]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_university_consultants_df = university_consultants_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [71]:
# Tag every row with the staffing category this table represents.
melted_university_consultants_df = melted_university_consultants_df.assign(Category='University Consultant')

In [72]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_university_consultants_df = (
    melted_university_consultants_df
    .loc[melted_university_consultants_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [73]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_university_consultants_df = melted_university_consultants_df.assign(
    **{'Source Sheet': 'Sheet5_3_ConsultantsAcademic'}
)

#### External Funding

In [74]:
# Load the External Funding range of the '5. Medical Staffing' sheet from
# every workbook: skip the first 57 rows, read 2 rows, keep column positions 1-8.
external_funding_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=57,
    nrows=2,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [75]:
# Label the unnamed first column and rename the file-name column.
external_funding_df = external_funding_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [76]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_external_funding_df = external_funding_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [77]:
# Tag every row with the category this table represents.
melted_external_funding_df = melted_external_funding_df.assign(Category='External Funding')

In [78]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_external_funding_df = (
    melted_external_funding_df
    .loc[melted_external_funding_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [79]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_external_funding_df = melted_external_funding_df.assign(**{'Source Sheet': 'Sheet5_4_ExternalFunding'})

In [80]:
# Display the cleaned External Funding rows as a visual check.
# NOTE(review): this renders the whole dataframe, not a .head() preview.
melted_external_funding_df

Unnamed: 0,Sub-Category,Source File,Area,Value,Category,Source Sheet
8,Number of PA's funded from sources external to...,NHS GG&C SPAN 2023-24.xlsx,General Histopathology,11.0,External Funding,Sheet5_4_ExternalFunding


#### Excess Activity

In [81]:
# Load the Excess Activity range of the '5. Medical Staffing' sheet from
# every workbook: skip the first 64 rows, read 4 rows, keep column positions 1-8.
excess_activity_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=64,
    nrows=4,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [82]:
# Label the unnamed first column and rename the file-name column.
excess_activity_df = excess_activity_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [83]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_excess_activity_df = excess_activity_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [84]:
# Tag every row with the category this table represents.
melted_excess_activity_df = melted_excess_activity_df.assign(Category='Excess Activity')

In [85]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_excess_activity_df = (
    melted_excess_activity_df
    .loc[melted_excess_activity_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [86]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_excess_activity_df = melted_excess_activity_df.assign(**{'Source Sheet': 'Sheet5_5_ExcessActivity'})

#### Expert Opinion Referral Activity

In [87]:
# Load the Expert Opinion range of the '5. Medical Staffing' sheet from
# every workbook: skip the first 72 rows, read 2 rows, keep column positions 1-8.
expert_opinion_df = combine_excel_files(
    input_directory=directory,
    sheet_name='5. Medical Staffing',
    skiprows=72,
    nrows=2,
    usecols=list(range(1, 9)),
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [88]:
# Label the unnamed first column and rename the file-name column.
expert_opinion_df = expert_opinion_df.rename(
    {'Unnamed: 1': 'Sub-Category', 'File Name': 'Source File'},
    axis='columns',
)

In [89]:
# Reshape wide -> long: one row per Sub-Category / Source File / Area,
# with the figure in a single 'Value' column.
melted_expert_opinion_df = expert_opinion_df.melt(
    id_vars=['Sub-Category', 'Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [90]:
# Tag every row with the category this table represents.
melted_expert_opinion_df = melted_expert_opinion_df.assign(Category='Expert Opinion')

In [91]:
# Discard rows whose Value is exactly 0, then rows whose Value is missing.
melted_expert_opinion_df = (
    melted_expert_opinion_df
    .loc[melted_expert_opinion_df['Value'].ne(0)]
    .dropna(subset='Value')
)

In [92]:
# Record which source table these rows came from. assign() returns a new
# dataframe; adding a column in place to the row-filtered frame left by the
# previous cell can trigger pandas' SettingWithCopyWarning.
melted_expert_opinion_df = melted_expert_opinion_df.assign(**{'Source Sheet': 'Sheet5_6_ExpertOpinion'})

### Merge the dataframes

In [93]:
# Stack the six melted tables into one long dataframe. ignore_index=True
# gives the result a fresh 0..n-1 index; without it every table keeps its
# own row labels and the combined index contains duplicates.
merged_df = pd.concat(
    [
        melted_expert_opinion_df,
        melted_excess_activity_df,
        melted_external_funding_df,
        melted_university_consultants_df,
        melted_non_consultants_df,
        melted_consultants_df,
    ],
    ignore_index=True,
)


In [94]:
# Stamp every row with the reporting year label.
merged_df = merged_df.assign(Year='23/24')

In [95]:
# Derive the health board from each row's source file name.
merged_df = merged_df.assign(**{'Health Board': merged_df['Source File'].map(extract_hb)})

In [96]:
# Replace health-board names with their short codes. The replacement is
# limited to the 'Health Board' column; a dataframe-wide replace would also
# rewrite any other cell that happens to equal one of these names.
# ('Forth' is the second word of the 'NHS Forth Valley ...' file names.)
health_board_codes = {'Fife': 'FIFE',
                      'Forth': 'FV',
                      'Grampian': 'GRAM',
                      'Highland': 'HIGH',
                      'Lanarkshire': 'LAN',
                      'Lothian': 'LOTH',
                      'Tayside': 'TAY'}
merged_df = merged_df.assign(
    **{'Health Board': merged_df['Health Board'].replace(health_board_codes)}
)

In [97]:
# Column layout of the exported table.
column_order = [
    'Source File', 'Source Sheet', 'Health Board', 'Year',
    'Area', 'Category', 'Sub-Category', 'Value',
]

# Keep exactly these columns, in this order (a missing column raises KeyError).
merged_df = merged_df.loc[:, column_order]

In [98]:
# Write the tidy table without the index column. Create the output folder
# first: to_csv does not create missing directories and raises OSError.
os.makedirs('cleaned_data', exist_ok=True)
merged_df.to_csv('cleaned_data/medical_staffing.csv', index=False)