### Import libraries

In [16]:
import os
import pandas as pd
import numpy as np


### Define functions and file path for collecting Excel files

In [17]:
# Directory holding the SPAN return workbooks to ingest.
# If the SPAN_RETURNS_DIR environment variable is set it takes precedence, so
# the notebook can be run on another machine without editing this cell;
# otherwise the original location below is used.
directory = os.environ.get(
    "SPAN_RETURNS_DIR",
    r"C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24",
)

In [18]:
# function to combine data from Excel files
def combine_excel_files(input_directory, sheet_name, skiprows, nrows, usecols):
    """Read the same range from every ``.xlsx`` workbook in a directory and stack the results.

    Each workbook is read with ``pd.read_excel`` using the supplied sheet and
    range arguments, given an extra ``File Name`` column holding the workbook's
    file name, and the per-file frames are concatenated with a fresh index.
    A workbook that fails to load is reported on stdout and skipped.

    Parameters
    ----------
    input_directory : str
        Directory to scan (top level only).
    sheet_name, skiprows, nrows, usecols
        Passed unchanged to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        One frame containing the rows from every workbook that loaded.

    Raises
    ------
    ValueError
        If no workbook in ``input_directory`` could be read.
    """
    all_data = []
    # os.listdir returns names in arbitrary order; sort for a reproducible row order
    for file in sorted(os.listdir(input_directory)):
        # names starting with "~$" are the temporary lock files Excel leaves
        # next to an open workbook; they are not readable workbooks
        if file.startswith('~$') or not file.lower().endswith('.xlsx'):
            continue
        try:
            file_path = os.path.join(input_directory, file)
            print(f"Processing file: {file_path}")
            data = pd.read_excel(file_path, sheet_name=sheet_name,
                                 skiprows=skiprows, nrows=nrows, usecols=usecols).assign(**{'File Name': file})
            all_data.append(data)
        except ValueError as e:
            print(f"ValueError in file: {file} - {e}")
        except Exception as e:
            # best effort: report the failure and carry on with the remaining files
            print(f"Error in file: {file} - {e}")
    if not all_data:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError(f"No Excel data could be read from: {input_directory}")
    return pd.concat(all_data, ignore_index=True)



# function to extract the health board from the file name
def extract_hb(data):
    """Return the second space-separated token of ``data``.

    ``data`` is split on single spaces; the token at position 1 is returned.
    When there are fewer than two tokens, ``'Unknown'`` is returned instead.
    """
    tokens = data.split(' ')
    if len(tokens) < 2:
        # the name does not follow the expected "<prefix> <board> ..." layout
        return 'Unknown'
    return tokens[1]


### Ingest data from 6. Consumables Costs sheet

In [19]:
# load the '6. Consumables Costs' range from every workbook into one dataframe
consumables_df = combine_excel_files(
    directory,
    '6. Consumables Costs',
    skiprows=6,
    nrows=1,
    usecols=list(range(1, 9)),  # column positions 1 through 8
)

Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS A&A SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS D&G SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Fife SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Forth Valley SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS GG&C SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Grampian SPAN 2023-24.xlsx
Processing file: C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAN UNI\Returns\23_24\NHS Highland SPAN 2023-24.xlsx
Processing fi

In [20]:
# relabel the column that records which workbook each row came from
consumables_df = consumables_df.rename({'File Name': 'Source File'}, axis='columns')

In [21]:
# reshape wide -> long: one row per (source file, area) pair with its value
melted_consumables_df = consumables_df.melt(
    id_vars=['Source File'],
    value_vars=[
        'General Histopathology',
        'Non-Cervical/ Diagnostic Cytopathology',
        'Cervical Screening Cytopathology',
        'Mortuary & Autopsy (Adult)',
        'Mortuary & Autopsy (Perinatal)',
        'Paediatric Pathology',
        'Neuropathology',
    ],
    var_name='Area',
    value_name='Value',
)
                                      

In [22]:
# give the adult and perinatal mortuary rows the same 'Mortuary & Autopsy' label
# (rows are relabelled only; they are not summed here)
melted_consumables_df['Area'] = melted_consumables_df['Area'].replace(
    ['Mortuary & Autopsy (Adult)', 'Mortuary & Autopsy (Perinatal)'],
    'Mortuary & Autopsy',
)

In [23]:
# tag every row with the reporting year
melted_consumables_df = melted_consumables_df.assign(Year='23/24')

In [24]:
# derive the health board for each row from its source file name
melted_consumables_df['Health Board'] = melted_consumables_df['Source File'].map(extract_hb)

In [25]:
# rename health boards with short code
# The replacement is limited to the 'Health Board' column: running it over the
# whole dataframe would also rewrite any exact match in the other columns.
# Boards not listed here keep the value extracted from the file name.
melted_consumables_df['Health Board'] = melted_consumables_df['Health Board'].replace({
    'Fife': 'FIFE',
    'Forth': 'FV',
    'Grampian': 'GRAM',
    'Highland': 'HIGH',
    'Lanarkshire': 'LAN',
    'Lothian': 'LOTH',
    'Tayside': 'TAY',
})

In [26]:
# column layout for the exported table
column_order = ['Source File', 'Health Board', 'Year', 'Area', 'Value']

# keep exactly those columns, in that order
melted_consumables_df = melted_consumables_df.loc[:, column_order]

In [27]:
# keep only the rows that actually carry a value
melted_consumables_df = melted_consumables_df.loc[melted_consumables_df['Value'].notna()]

In [28]:
# total of the 'Value' column, displayed as a quick sanity check
# (named total_value rather than `sum` so the built-in sum() is not shadowed
# for the rest of the kernel session)
total_value = melted_consumables_df['Value'].sum()
total_value

11033245.530000001

In [29]:
# create the output folder if needed so the export works on a fresh checkout
os.makedirs('cleaned_data', exist_ok=True)
melted_consumables_df.to_csv('cleaned_data/consumables.csv', index=False)