In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the merged NHS A&E extract (path is relative to the current working directory)
file_path = "nhs_ae_merged_fixed.csv"
nhs_data = pd.read_csv(filepath_or_buffer=file_path)

# Overview of the frame: dimensions, column labels, and the leading rows
print("Dataset Shape:", nhs_data.shape)
print("\nColumn Names:", nhs_data.columns)
print("\nFirst 5 Rows:", nhs_data.head(), sep="\n")

# Per-column count of missing entries; only columns with at least one gap are shown
missing_values = nhs_data.isna().sum()
print("\nMissing Values:", missing_values.loc[missing_values.gt(0)], sep="\n")

# Store 'year' as text and 'month' as an ordered categorical whose category
# order is the calendar order (January first, December last).
# NOTE: any 'month' value that is not one of the names listed below
# (different casing, abbreviations, ...) becomes NaN in the converted column.
month_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
nhs_data["year"] = nhs_data["year"].astype(str)
nhs_data["month"] = pd.Categorical(
    nhs_data["month"],
    categories=month_order,
    ordered=True,
)

# Feature Engineering: total A&E attendances across the three department types.
# The component columns are summed row-wise with missing values counted as 0.
# Only components that are actually present in the frame take part in the sum:
# the earlier `nhs_data.get(col, 0).fillna(0)` form raised AttributeError for an
# absent column, because `.get` then returns the plain integer default, which
# has no `.fillna` method.
attendance_columns = [
    "a&e_attendances_type_1",
    "a&e_attendances_type_2",
    "a&e_attendances_other_a&e_department",
]
if "total_a&e_attendances" not in nhs_data.columns:
    present_columns = [col for col in attendance_columns if col in nhs_data.columns]
    # With no component present this yields 0 for every row.
    nhs_data["total_a&e_attendances"] = nhs_data[present_columns].fillna(0).sum(axis=1)

# Drop the component columns (this happens whether or not the total was
# computed above). errors="ignore" skips columns that are already absent,
# so the cell can be re-run without raising.
columns_to_drop = list(attendance_columns)
nhs_data = nhs_data.drop(columns=columns_to_drop, errors="ignore")

# Persist the processed frame for the following steps; the row index is not written
preprocessed_file = "nhs_ae_preprocessed.csv"
nhs_data.to_csv(path_or_buf=preprocessed_file, index=False)
print("\n📁 Preprocessed dataset saved as {}".format(preprocessed_file))


Dataset Shape: (12546, 31)

Column Names: Index(['period', 'org_code', 'parent_org', 'org_name',
       'a&e_attendances_type_1', 'a&e_attendances_type_2',
       'a&e_attendances_other_a&e_department',
       'a&e_attendances_booked_appointments_type_1',
       'a&e_attendances_booked_appointments_type_2',
       'a&e_attendances_booked_appointments_other_department',
       'attendances_over_4hrs_type_1', 'attendances_over_4hrs_type_2',
       'attendances_over_4hrs_other_department',
       'attendances_over_4hrs_booked_appointments_type_1',
       'attendances_over_4hrs_booked_appointments_type_2',
       'attendances_over_4hrs_booked_appointments_other_department',
       'patients_who_have_waited_4-12_hs_from_dta_to_admission',
       'patients_who_have_waited_12+_hrs_from_dta_to_admission',
       'emergency_admissions_via_a&e_-_type_1',
       'emergency_admissions_via_a&e_-_type_2',
       'emergency_admissions_via_a&e_-_other_a&e_department',
       'other_emergency_admission