In [21]:
import pandas as pd
import numpy as np

# Load Dataset
file_path = "nhs_ae_merged_fixed.csv"
nhs_data = pd.read_csv(file_path)

# Convert 'year' to string and 'month' to categorical
nhs_data["year"] = nhs_data["year"].astype(str)
nhs_data["month"] = pd.Categorical(nhs_data["month"], categories=[
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
], ordered=True)

# Handle missing months
if nhs_data["month"].isna().sum() > 0:
    nhs_data.loc[nhs_data["month"].isna(), "month"] = nhs_data["period"].str.extract(r'-(\w+)-')[0]

# Drop rows where 'month' is still missing
nhs_data.dropna(subset=["month"], inplace=True)

# 🔍 Fix Inf and NaN Values in 'total_a&e_attendances'
nhs_data["total_a&e_attendances"].replace([np.inf, -np.inf], np.nan, inplace=True)
nhs_data["total_a&e_attendances"].fillna(nhs_data["total_a&e_attendances"].median(), inplace=True)

# 🔍 Fill Missing Attendance Columns Using Yearly Averages, then Median
attendance_columns = [
    "number_of_a&e_attendances_type_1",
    "number_of_a&e_attendances_type_2",
    "number_of_a&e_attendances_other_a&e_department",
    "number_of_attendances_over_4hrs_type_1",
    "number_of_attendances_over_4hrs_type_2",
    "number_of_attendances_over_4hrs_other_a&e_department"
]

for col in attendance_columns:
    if col in nhs_data.columns:
        nhs_data[col] = nhs_data.groupby("year")[col].transform("mean")
        nhs_data[col] = nhs_data[col].fillna(nhs_data[col].median())  # Final fallback

# 🔍 Check for Abnormal Attendance Data (All Same Values)
for col in attendance_columns:
    if nhs_data[col].nunique() == 1:  # If all values are identical
        print(f"⚠ Warning: {col} might have been filled incorrectly. Check imputation logic.")

# Drop Unnecessary Columns
columns_to_drop = ["a&e_attendances_type_1", "a&e_attendances_type_2", "a&e_attendances_other_a&e_department"]
nhs_data.drop(columns=[col for col in columns_to_drop if col in nhs_data.columns], inplace=True)

# 🔍 Drop Any Remaining Rows with Too Many Missing Values
nhs_data.dropna(inplace=True)

# Print Only Essential Information
print("\n✅ Data Cleaning Completed.")
print("🔍 Remaining Missing Values:\n", nhs_data.isna().sum()[nhs_data.isna().sum() > 0])

# Save Cleaned Data
cleaned_file = "nhs_ae_cleaned.csv"
nhs_data.to_csv(cleaned_file, index=False)
print(f"\n📁 Cleaned dataset saved as {cleaned_file}")



✅ Data Cleaning Completed.
🔍 Remaining Missing Values:
 Series([], dtype: int64)

📁 Cleaned dataset saved as nhs_ae_cleaned.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  nhs_data["total_a&e_attendances"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  nhs_data["total_a&e_attendances"].fillna(nhs_data["total_a&e_attendances"].median(), inplace=True)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram for 'total_a&e_attendances'
plt.figure(figsize=(10, 6))
sns.histplot(nhs_data['total_a&e_attendances'], kde=True)
plt.title("Distribution of total A&E Attendances")
plt.xlabel("Total A&E Attendances")
plt.ylabel("Frequency")
plt.show()

# Bar plot for 'month' to check distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=nhs_data, x='month', order=nhs_data['month'].cat.categories)
plt.title("Distribution of A&E Data by Month")
plt.xlabel("Month")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()

# Bar plot for 'year' to check distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=nhs_data, x='year')
plt.title("Distribution of A&E Data by Year")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.show()

# Heatmap to visualize missing values
plt.figure(figsize=(12, 8))
sns.heatmap(nhs_data.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.show()

# Correlation Matrix (if there are many numeric features)
# Filter out non-numeric columns
numeric_data = nhs_data.select_dtypes(include=[np.number])

plt.figure(figsize=(12, 8))
corr = numeric_data.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Print the first 5 rows to check structure
print("\n🔍 First 5 Rows:")
print(nhs_data.head())

# Check the column names and data types
print("\n🔍 Data Types and Missing Values:")
print(nhs_data.info())

# Summary statistics of numerical columns
print("\n🔍 Summary Statistics (Numerical Data):")
print(nhs_data.describe())

# Check unique values in categorical columns
print("\n🔍 Unique Years:")
print(nhs_data["year"].unique())

print("\n🔍 Unique Months:")
print(nhs_data["month"].unique())

# Check for missing values
print("\n🔍 Missing Values Count:")
print(nhs_data.isnull().sum())

# Check for duplicate rows
print("\n🔍 Duplicate Rows:", nhs_data.duplicated().sum())

# Sample 10 random rows to manually inspect
print("\n🔍 Random Sample of 10 Rows:")
print(nhs_data.sample(10))



🔍 First 5 Rows:
              period org_code                             parent_org  \
0  MSitAE-APRIL-2020   Y02572  NHS ENGLAND NORTH EAST AND YORKSHIRE    
1  MSitAE-APRIL-2020      RY8                  NHS ENGLAND MIDLANDS    
2  MSitAE-APRIL-2020    NTV0W                NHS ENGLAND SOUTH EAST    
3  MSitAE-APRIL-2020   Y02532                NHS ENGLAND NORTH WEST    
4  MSitAE-APRIL-2020    NTV0B                NHS ENGLAND SOUTH EAST    

                                            org_name  \
0                            PARK COMMUNITY PRACTICE   
1  DERBYSHIRE COMMUNITY HEALTH SERVICES NHS FOUND...   
2                              WOKING WALK IN CENTRE   
3                             MIRIAM MINOR EMERGENCY   
4                             ASHFORD WALK-IN-CENTRE   

   a&e_attendances_booked_appointments_type_1  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                

  sqr = _ensure_numeric((avg - values) ** 2)
