In [18]:
import sys, pandas as pd
print(sys.executable)
print("pandas:", pd.__version__)


/opt/homebrew/opt/python@3.12/bin/python3.12
pandas: 2.3.2


In [19]:
## Load Raw Data from CSV

csv_path = Path("../data/raw/medical_data.csv")

df = pd.read_csv(csv_path)

print(f"Rows: {len(df):,}  |  Cols: {df.shape[1]}")
df.head()

Rows: 1,000  |  Cols: 13


Unnamed: 0,Patient_ID,Age,Gender,Medical_Condition,Treatment,Outcome,Insurance_Type,Income,Region,Smoking_Status,Admission_Type,Hospital_ID,Length_of_Stay
0,1,77,Female,Chronic Obstructive,Dialysis,Stable,Public,77444,North,Former smoker,Urgent,3173,20
1,2,62,Female,Obesity,Physical therapy,Improved,Public,19367,West,Non-smoker,Urgent,65671,4
2,3,77,Male,Hypertension,Inhaler therapy,Improved,Medicare,16054,North,Non-smoker,Urgent,96914,3
3,4,41,Female,Alzheimer's Disease,Medication C,Worsened,Medicare,54371,West,Non-smoker,Emergency,15732,11
4,5,82,Male,Alzheimer's Disease,Chemotherapy,Stable,Private,55489,West,Non-smoker,Emergency,98232,2


In [20]:
## Load Data into PostgreSQL (Raw Schema)

import pandas as pd
import numpy as np

df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace(' ', '_')
              .str.replace('-', '_')
)

# Convert numeric columns
for c in ["patient_id", "age", "hospital_id", "length_of_stay"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

if "income" in df.columns:
    df["income"] = pd.to_numeric(df["income"], errors="coerce")

# Clean categorical text
cat_cols = [
    "gender","medical_condition","treatment","outcome","insurance_type",
    "region","smoking_status","admission_type"
]
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype("string").str.strip()

# Basic validity rules
if "age" in df.columns:
    df.loc[(df["age"].notna()) & ((df["age"] < 0) | (df["age"] > 120)), "age"] = pd.NA

if "length_of_stay" in df.columns:
    df.loc[(df["length_of_stay"].notna()) & (df["length_of_stay"] < 0), "length_of_stay"] = pd.NA
    df.loc[(df["length_of_stay"].notna()) & (df["length_of_stay"] > 365), "length_of_stay"] = pd.NA

df = df.drop_duplicates()

df.head()

Unnamed: 0,patient_id,age,gender,medical_condition,treatment,outcome,insurance_type,income,region,smoking_status,admission_type,hospital_id,length_of_stay
0,1,77,Female,Chronic Obstructive,Dialysis,Stable,Public,77444,North,Former smoker,Urgent,3173,20
1,2,62,Female,Obesity,Physical therapy,Improved,Public,19367,West,Non-smoker,Urgent,65671,4
2,3,77,Male,Hypertension,Inhaler therapy,Improved,Medicare,16054,North,Non-smoker,Urgent,96914,3
3,4,41,Female,Alzheimer's Disease,Medication C,Worsened,Medicare,54371,West,Non-smoker,Emergency,15732,11
4,5,82,Male,Alzheimer's Disease,Chemotherapy,Stable,Private,55489,West,Non-smoker,Emergency,98232,2


In [21]:
## Load Data into PostgreSQL (Raw Schema)

from sqlalchemy import create_engine, text
from sqlalchemy.types import Integer, Float, Text as SqlText

engine = create_engine("postgresql+psycopg2://dquser:dqpass@localhost:5432/healthcare")

df.to_sql(
    "patients_raw",
    engine,
    schema="raw",
    if_exists="replace",
    index=False,
    dtype={
        "patient_id": Integer(),
        "age": Integer(),
        "hospital_id": Integer(),
        "length_of_stay": Integer(),
        "income": Float(),
        "gender": SqlText(),
        "medical_condition": SqlText(),
        "treatment": SqlText(),
        "outcome": SqlText(),
        "insurance_type": SqlText(),
        "region": SqlText(),
        "smoking_status": SqlText(),
        "admission_type": SqlText(),
    }
)

# verify table exists
with engine.connect() as conn:
    rows = conn.execute(text("""
        SELECT table_schema, table_name
        FROM information_schema.tables
        WHERE table_schema='raw' AND table_name='patients_raw';
    """)).fetchall()
rows


[('raw', 'patients_raw')]

In [22]:
## Clean and Load into Staging Schema

from sqlalchemy import text

create_staging_sql = text("""
DROP TABLE IF EXISTS staging.patients;

CREATE TABLE staging.patients AS
SELECT
    patient_id::int                                   AS patient_id,
    NULLIF(age::text,'')::int                         AS age,
    gender,
    medical_condition,
    treatment,
    outcome,
    insurance_type,
    NULLIF(income::text,'')::numeric                  AS income,
    region,
    smoking_status,
    admission_type,
    NULLIF(hospital_id::text,'')::int                 AS hospital_id,
    NULLIF(length_of_stay::text,'')::int              AS length_of_stay
FROM raw.patients_raw
WHERE
    (age IS NULL OR (age BETWEEN 0 AND 120)) AND
    (length_of_stay IS NULL OR (length_of_stay BETWEEN 0 AND 365));
""")

with engine.begin() as conn:
    conn.execute(create_staging_sql)

In [23]:
## Validate Staging Data

import pandas as pd

pd.read_sql("SELECT COUNT(*) AS rows FROM raw.patients_raw;", engine)
pd.read_sql("SELECT COUNT(*) AS rows FROM staging.patients;", engine)
pd.read_sql("SELECT * FROM staging.patients ORDER BY patient_id LIMIT 5;", engine)

pd.read_sql("""
SELECT
  SUM(CASE WHEN age IS NULL OR age < 0 OR age > 120 THEN 1 ELSE 0 END) AS bad_age,
  SUM(CASE WHEN length_of_stay IS NULL OR length_of_stay < 0 OR length_of_stay > 365 THEN 1 ELSE 0 END) AS bad_los,
  SUM(CASE WHEN patient_id IS NULL THEN 1 ELSE 0 END) AS null_patient_id
FROM staging.patients;
""", engine)


Unnamed: 0,bad_age,bad_los,null_patient_id
0,0,0,0


In [26]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

df.head()

Unnamed: 0,patient_id,age,gender,medical_condition,treatment,outcome,insurance_type,income,region,smoking_status,admission_type,hospital_id,length_of_stay
0,1,77,Female,Chronic Obstructive,Dialysis,Stable,Public,77444,North,Former smoker,Urgent,3173,20
1,2,62,Female,Obesity,Physical therapy,Improved,Public,19367,West,Non-smoker,Urgent,65671,4
2,3,77,Male,Hypertension,Inhaler therapy,Improved,Medicare,16054,North,Non-smoker,Urgent,96914,3
3,4,41,Female,Alzheimer's Disease,Medication C,Worsened,Medicare,54371,West,Non-smoker,Emergency,15732,11
4,5,82,Male,Alzheimer's Disease,Chemotherapy,Stable,Private,55489,West,Non-smoker,Emergency,98232,2


In [27]:
## Schema & quick summaries

display(df.dtypes.to_frame("dtype"))
# 5 random rows
display(df.sample(5, random_state=42))

print("\n--- .info() ---")
df.info()

print("\n--- Numeric describe ---")
display(df.describe().T)   # removed numeric_only=True for compatibility

# Count unique values in each column (cardinality)
print("\n--- Cardinality per column ---")
display(df.nunique().sort_values(ascending=False).to_frame("nunique"))


Unnamed: 0,dtype
patient_id,Int64
age,Int64
gender,string[python]
medical_condition,string[python]
treatment,string[python]
outcome,string[python]
insurance_type,string[python]
income,int64
region,string[python]
smoking_status,string[python]


Unnamed: 0,patient_id,age,gender,medical_condition,treatment,outcome,insurance_type,income,region,smoking_status,admission_type,hospital_id,length_of_stay
521,522,61,Male,Rheumatoid Arthritis,Immunosuppressants,Stable,Medicare,91295,East,Non-smoker,Elective,93904,4
737,738,17,Female,Obesity,Medication B,Stable,Medicare,62951,North,Non-smoker,Urgent,93523,13
740,741,37,Female,Cancer,Physical therapy,Stable,Medicare,94400,South,Non-smoker,Elective,13518,17
660,661,86,Female,Hypertension,Immunosuppressants,Stable,Public,36174,West,Former smoker,Urgent,83976,7
411,412,83,Male,Obesity,Memory exercises,Stable,Private,74137,East,Former smoker,Urgent,13025,20



--- .info() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   patient_id         1000 non-null   Int64 
 1   age                1000 non-null   Int64 
 2   gender             1000 non-null   string
 3   medical_condition  1000 non-null   string
 4   treatment          1000 non-null   string
 5   outcome            1000 non-null   string
 6   insurance_type     1000 non-null   string
 7   income             1000 non-null   int64 
 8   region             1000 non-null   string
 9   smoking_status     1000 non-null   string
 10  admission_type     1000 non-null   string
 11  hospital_id        1000 non-null   Int64 
 12  length_of_stay     1000 non-null   Int64 
dtypes: Int64(4), int64(1), string(8)
memory usage: 105.6 KB

--- Numeric describe ---


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_id,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
age,1000.0,45.71,26.797612,0.0,22.75,45.0,70.0,90.0
income,1000.0,50181.756,29178.022872,169.0,24079.75,51972.5,74972.5,99980.0
hospital_id,1000.0,51709.297,29653.627294,37.0,25293.0,52220.5,78788.0,99978.0
length_of_stay,1000.0,10.041,6.066145,0.0,5.0,10.0,15.0,20.0



--- Cardinality per column ---


Unnamed: 0,nunique
patient_id,1000
hospital_id,993
income,989
age,91
length_of_stay,21
medical_condition,15
treatment,13
gender,8
region,4
outcome,3


In [31]:
# Audit categorical columns
cat_cols = [
    "gender", "medical_condition", "treatment", "outcome",
    "insurance_type", "region", "smoking_status", "admission_type"
]

for c in cat_cols:
    print(f"\n--- {c} (nunique={df[c].nunique()}) ---")
    print(df[c].value_counts(dropna=False).head(20))



--- gender (nunique=8) ---
gender
Male           462
Female         442
Polygender      20
Non-binary      19
Bigender        18
Genderfluid     14
Agender         13
Genderqueer     12
Name: count, dtype: Int64

--- medical_condition (nunique=15) ---
medical_condition
Chronic Kidney Disease    81
Chronic Obstructive       79
Rheumatoid Arthritis      77
Depression                77
Diabetes                  76
Alzheimer's Disease       72
Asthma                    71
Hypertension              68
Cancer                    63
Stroke                    60
Anxiety                   59
Obesity                   58
Osteoporosis              56
Heart Disease             53
Arthritis                 50
Name: count, dtype: Int64

--- treatment (nunique=13) ---
treatment
Medication B          93
Therapy               85
Physical therapy      82
Immunosuppressants    80
Dietary counseling    77
Memory exercises      77
Inhaler therapy       75
Medication C          74
Chemotherapy          74
D

In [33]:
import numpy as np
import pandas as pd

df_clean = df.copy()

# Normalize all text columns
text_cols = df_clean.select_dtypes(include=["object", "string"]).columns
for c in text_cols:
    df_clean[c] = (df_clean[c]
                   .astype("string")
                   .str.strip()
                   .str.replace(r"\s+", " ", regex=True)
                   .str.title())

# Standardize specific categorical columns
title_cols = [
    "medical_condition", "treatment", "outcome",
    "insurance_type", "region", "smoking_status", "admission_type"
]
for c in title_cols:
    if c in df_clean.columns:
        df_clean[c] = (df_clean[c]
                       .astype("string")
                       .str.strip()
                       .str.replace(r"\s+", " ", regex=True)
                       .str.title())

# Gender normalization
if "gender" in df_clean.columns:
    g_raw = df_clean["gender"].astype("string").str.lower().str.replace(r"[^a-z\- ]", "", regex=True)
    gender_map = {
        "m": "Male", "male": "Male", "man": "Male",
        "f": "Female", "female": "Female", "woman": "Female"
    }
    df_clean["gender"] = g_raw.map(gender_map).fillna(df_clean["gender"].astype("string").str.title())

# Smoking status normalization: Non-Smoker / Former Smoker / Current Smoker
if "smoking_status" in df_clean.columns:
    smoke = df_clean["smoking_status"].astype("string").str.lower().str.replace(r"\s+", " ", regex=True)
    smoke_map = {
        "non-smoker": "Non-Smoker", "nonsmoker": "Non-Smoker", "non smoker": "Non-Smoker",
        "former smoker": "Former Smoker", "ex-smoker": "Former Smoker", "ex smoker": "Former Smoker",
        "current smoker": "Current Smoker", "smoker": "Current Smoker"
    }
    df_clean["smoking_status"] = smoke.map(smoke_map).fillna(df_clean["smoking_status"].str.title())
    # Normalize common capitalization variants that may slip through
    df_clean["smoking_status"] = df_clean["smoking_status"].replace({
        "Former smoker": "Former Smoker",
        "Current smoker": "Current Smoker",
        "Non-smoker": "Non-Smoker"
    })

display(df_clean.head())
for c in ["gender", "smoking_status", "admission_type", "outcome"]:
    if c in df_clean.columns:
        print(f"\n{c} value_counts():")
        print(df_clean[c].value_counts(dropna=False))


Unnamed: 0,patient_id,age,gender,medical_condition,treatment,outcome,insurance_type,income,region,smoking_status,admission_type,hospital_id,length_of_stay
0,1,77,Female,Chronic Obstructive,Dialysis,Stable,Public,77444,North,Former Smoker,Urgent,3173,20
1,2,62,Female,Obesity,Physical Therapy,Improved,Public,19367,West,Non-Smoker,Urgent,65671,4
2,3,77,Male,Hypertension,Inhaler Therapy,Improved,Medicare,16054,North,Non-Smoker,Urgent,96914,3
3,4,41,Female,Alzheimer'S Disease,Medication C,Worsened,Medicare,54371,West,Non-Smoker,Emergency,15732,11
4,5,82,Male,Alzheimer'S Disease,Chemotherapy,Stable,Private,55489,West,Non-Smoker,Emergency,98232,2



gender value_counts():
gender
Male           462
Female         442
Polygender      20
Non-Binary      19
Bigender        18
Genderfluid     14
Agender         13
Genderqueer     12
Name: count, dtype: int64

smoking_status value_counts():
smoking_status
Former Smoker    500
Non-Smoker       500
Name: count, dtype: int64

admission_type value_counts():
admission_type
Urgent       350
Emergency    345
Elective     305
Name: count, dtype: Int64

outcome value_counts():
outcome
Improved    363
Worsened    324
Stable      313
Name: count, dtype: Int64


In [35]:
## Checks

int_cols = ["Patient_ID", "Age", "Income", "Hospital_ID", "Length_of_Stay"]
for c in int_cols:
    df_clean[c] = pd.to_numeric(df_clean[c], errors="coerce").astype("Int64")

issues = {}

# Patient_ID
issues["dup_patient_id"] = df_clean[df_clean["Patient_ID"].duplicated(keep=False)]

# Ages (flag)
issues["age_out_of_range"] = df_clean[(df_clean["Age"] < 0) | (df_clean["Age"] > 100)]

# Length_of_Stay < 0 (invalid) or == 0 (review)
issues["los_negative"] = df_clean[df_clean["Length_of_Stay"] < 0]
issues["los_zero"] = df_clean[df_clean["Length_of_Stay"] == 0]

# Income nonpositive or suspiciously low (< 1,000) for flagging
issues["income_suspicious"] = df_clean[df_clean["Income"] < 1000]

# Show issue counts
for k, v in issues.items():
    print(f"{k}: {len(v)} rows")



dup_patient_id: 0 rows
age_out_of_range: 0 rows
los_negative: 0 rows
los_zero: 55 rows
income_suspicious: 16 rows


In [37]:
## Staging Copy

ordered_cols = [
    "Patient_ID","Age","Gender","Medical_Condition","Treatment","Outcome",
    "Insurance_Type","Income","Region","Smoking_Status","Admission_Type",
    "Hospital_ID","Length_of_Stay"
]
df_staging = df_clean[ordered_cols].copy()

# Cast to plain ints where safe (replace <NA> with np.nan then backfill if needed)
df_staging["Patient_ID"] = df_staging["Patient_ID"].astype(int)
df_staging["Age"] = df_staging["Age"].astype(int)
df_staging["Income"] = df_staging["Income"].astype(int)
df_staging["Hospital_ID"] = df_staging["Hospital_ID"].astype(int)
df_staging["Length_of_Stay"] = df_staging["Length_of_Stay"].astype(int)

df_staging.head

<bound method NDFrame.head of      Patient_ID  Age  Gender    Medical_Condition           Treatment   Outcome Insurance_Type  Income Region  \
0             1   77  Female  Chronic Obstructive            Dialysis    Stable         Public   77444  North   
1             2   62  Female              Obesity    Physical Therapy  Improved         Public   19367   West   
2             3   77    Male         Hypertension     Inhaler Therapy  Improved       Medicare   16054  North   
3             4   41  Female  Alzheimer'S Disease        Medication C  Worsened       Medicare   54371   West   
4             5   82    Male  Alzheimer'S Disease        Chemotherapy    Stable        Private   55489   West   
..          ...  ...     ...                  ...                 ...       ...            ...     ...    ...   
995         996   72    Male  Alzheimer'S Disease  Immunosuppressants  Worsened         Public    6239  North   
996         997   58  Female  Alzheimer'S Disease            Dialy

In [38]:
## Load to Postgre

from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://dquser:dqpass@localhost:5432/healthcare")

# Write to 'staging' schema
df_staging.to_sql(
    "patients_staging",
    engine,
    schema="staging",
    if_exists="replace",
    index=False
)

# Verify table exists & row count
with engine.connect() as conn:
    print(conn.execute(text("""
        SELECT table_schema, table_name
        FROM information_schema.tables
        WHERE table_schema='staging' AND table_name='patients_staging'
    """)).fetchall())

    print(conn.execute(text("SELECT COUNT(*) FROM staging.patients_staging")).scalar(), "rows")

[('staging', 'patients_staging')]
1000 rows
