# SPAH Postcode Mapping - Initial cleaning of data

In [25]:
import pandas as pd

### Import data

In [26]:
# Folder locations for this project. Raw strings (r'...') keep the Windows
# backslashes literal instead of treating them as escape sequences.
raw_folder_pathway = r'C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAH\Postcode Mapping\raw_data'
cleaned_folder_pathway = r'C:\Users\scotth07\OneDrive - NHS Scotland\Documents\Networks & Projects\SPAH\Postcode Mapping\cleaned_data'

# Full path of the raw workbook, which sits directly inside the raw-data folder.
patient_data_pathway = rf'{raw_folder_pathway}\2024-08-01 SPAH Raw Tables.xlsx'

In [27]:
# Load the 'Demographics_rpt' worksheet of the raw workbook into a DataFrame.
patient_df = pd.read_excel(
    io=patient_data_pathway,
    sheet_name='Demographics_rpt',
)

### Clean data

In [28]:
# Keep only the rows with no recorded date of death (i.e. drop deceased patients).
date_of_death = patient_df['Date Of Death']
patient_df = patient_df.loc[date_of_death.isna()]

In [29]:
# Columns retained for the rest of the notebook; every other column is dropped.
# Names are the headers as they appear in the source worksheet.
columns_to_keep = [
    'Chi Number',
    'Date Of Birth',
    'Health Board Of Residence Description',
    'Patient Treatment Centre Health Board',
    'Patient Treatment Centre',
    'Patient Status',
    'Patients Postcode',
]

# Select all rows but only the listed columns (raises KeyError if one is missing).
patient_df = patient_df.loc[:, columns_to_keep]

In [30]:
# Map each source column header to the short snake_case name used from here on.
column_renaming = {
    'Chi Number': 'chi',
    'Date Of Birth': 'dob',
    'Health Board Of Residence Description': 'hb_residence',
    'Patient Treatment Centre Health Board': 'hb_treatment_centre',
    'Patient Treatment Centre': 'treatment_centre',
    'Patient Status': 'patient_status',
    'Patients Postcode': 'patient_postcode',
}

# Apply the mapping to the column labels; row labels are left untouched.
patient_df = patient_df.rename(column_renaming, axis='columns')

In [31]:
# Active patients: status is exactly 'FOLLOW UP', or the status is missing.
# NOTE(review): rows with no status are deliberately kept here - confirm that a
# blank status should count as active.
status = patient_df['patient_status']
is_follow_up = status.eq('FOLLOW UP')
has_no_status = status.isna()
patient_df = patient_df[is_follow_up | has_no_status]

In [32]:
# Drop patients whose health board of residence is the rest-of-UK category.
# Every other value is kept, including rows where hb_residence is missing.
not_rest_of_uk = patient_df['hb_residence'].ne('ENGLAND/WALES/NORTHERN IRELAND')
patient_df = patient_df[not_rest_of_uk]

In [33]:
# Renumber the rows 0..n-1 after the filters above; drop=True discards the old
# labels rather than keeping them as an extra column.
patient_df = (
    patient_df
    .reset_index(drop=True)
)

### Export data

In [34]:
# Single-column DataFrame holding only the postcodes, with no other patient fields.
# Rows stay in the same order as patient_df.
postcode_column = patient_df['patient_postcode']
patient_postcodes_df = postcode_column.to_frame()

In [35]:
# Write the cleaned patient table to the cleaned-data folder, without the index column.
patient_data_export_path = cleaned_folder_pathway + r'\patient_data.xlsx'
patient_df.to_excel(excel_writer=patient_data_export_path, index=False)

In [36]:
# Write the postcode-only table to the cleaned-data folder, without the index column.
postcodes_export_path = cleaned_folder_pathway + r'\patient_postcodes_data.xlsx'
patient_postcodes_df.to_excel(excel_writer=postcodes_export_path, index=False)