In [None]:
import pandas as pd
import numpy as np

# Read the processed utilization CSV. The path is relative, so it resolves
# against the notebook's working directory.
file_path = "../../data/processed/medical_equipment_utilization_synthetic_cleaned(in).csv"

# Normalize headers while loading: trim surrounding whitespace and replace
# spaces with underscores, so later cells can refer to columns by stable names.
raw_df = pd.read_csv(file_path).rename(
    columns=lambda label: label.strip().replace(" ", "_")
)

# Working copy without the columns that contain no data at all; raw_df itself
# is left as loaded (apart from the header normalization above).
df = raw_df.dropna(axis="columns", how="all").copy()

# Clean currency-like columns.
# Each listed column that exists in df is converted from text to numbers:
# the "KES" marker, thousands separators and whitespace are stripped, then the
# remainder is parsed with pd.to_numeric. Columns missing from df are skipped.
currency_cols = [
    "Cost_per_procedure",
    "Daily_Operating_Cost",
    "Procedure_Revenue",
    "Net-Profit_(daily)",
]
for col in currency_cols:
    if col not in df.columns:
        continue

    # Remember which cells held a value, so that parse failures can be told
    # apart from cells that were already missing.
    had_value = df[col].notna()

    cleaned = (
        df[col]
        .astype(str)
        # Single case-insensitive pass: drop "KES", commas and every
        # whitespace character (tabs and non-breaking spaces included, not
        # just plain spaces).
        .str.replace(r"kes|[,\s]", "", case=False, regex=True)
        # Accounting notation: "(1234.50)" stands for -1234.50.
        .str.replace(r"^\((.+)\)$", r"-\1", regex=True)
    )
    parsed = pd.to_numeric(cleaned, errors="coerce")

    # errors="coerce" turns anything unparseable into NaN without complaint;
    # report how many real values were lost so the problem is visible.
    n_unparsed = int((had_value & parsed.isna()).sum())
    if n_unparsed:
        print(
            f"Warning: {n_unparsed} value(s) in {col!r} could not be parsed "
            "as numbers and were set to NaN"
        )

    df[col] = parsed

# Columns that should hold plain numbers. Entries that cannot be parsed are
# replaced by NaN (errors="coerce") instead of raising.
numeric_cols = [
    "Available_Hours",
    "Actual_Operating_Hours",
    "Utilization_Rate",
    "Number_of_Procedures",
]
for col in numeric_cols:
    if col not in df.columns:
        continue  # column absent from this dataset; nothing to convert
    df[col] = df[col].pipe(pd.to_numeric, errors="coerce")

# Parse the date column when present; unparseable entries become NaT.
# NOTE(review): the date format is inferred by pandas - confirm whether the
# source uses day-first dates.
if "Date" in df:
    df["Date"] = df["Date"].pipe(pd.to_datetime, errors="coerce")

# Quick sanity check of the cleaned frame: row count, column names, and the
# first rows (rendered as the cell's output).
print(f"Rows: {df.shape[0]}")
print(f"Columns: {df.columns.tolist()}")
df.head()


In [None]:
# --- Data overview ---
# Three plain-text summaries of df: structure, missing values, statistics.

# Structure: df.info() prints its report directly.
print("=== Info ===")
df.info()

# Number of missing entries in each column.
print("\n=== Missing values per column ===")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# describe() output, transposed so each described column becomes one row.
print("\n=== Basic statistics for numeric columns ===")
summary_stats = df.describe().transpose()
print(summary_stats)


In [None]:
# --- Categorical distributions ---
# Print a frequency table for every listed categorical column that exists in
# df; columns missing from the dataset are skipped.

for col in ("Equipment_Type", "Department", "Day_of_Week", "Month"):
    if col not in df.columns:
        continue
    header = f"\n=== Value counts for {col} ==="
    print(header, df[col].value_counts(), sep="\n")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# set_theme() is the preferred spelling; sns.set() is only a legacy alias for it.
sns.set_theme(style="whitegrid")

# --- Distribution plots for key numeric variables ---
# One histogram with a KDE overlay per numeric column that exists in df.

num_cols = [
    "Available_Hours",
    "Actual_Operating_Hours",
    "Utilization_Rate",
    "Number_of_Procedures",
]

for col in num_cols:
    if col not in df.columns:
        continue
    # Explicit figure/axes instead of the implicit "current figure" state, so
    # each plot is self-contained.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    fig.tight_layout()
    plt.show()
    # Release the figure so figures do not pile up in memory across the loop.
    plt.close(fig)


In [None]:
# --- Categorical bar charts ---
# One bar chart per categorical column present in df, showing how many
# records fall into each category.

cat_cols = ["Equipment_Type", "Department", "Day_of_Week", "Month"]

for col in cat_cols:
    if col not in df.columns:
        continue
    # Two-column frame: the category label (named after the column) and the
    # number of records carrying it.
    counts = df[col].value_counts().rename_axis(col).reset_index(name="count")

    fig, ax = plt.subplots(figsize=(7, 4))
    sns.barplot(data=counts, x=col, y="count", ax=ax)
    ax.set_title(f"Count of records by {col}")
    # Slanted, right-aligned tick labels keep long category names readable.
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()
    plt.show()
