In [0]:
%sql
CREATE SCHEMA IF NOT EXISTS mimic

In [0]:
patients_df = spark.read.option("header", "true").option("inferSchema", "true").csv("dbfs:/mnt/mimic/PATIENTS.csv")

In [0]:
patients_df.write.format("delta").mode("overwrite").saveAsTable("mimic.patients")

In [0]:
%sql
SELECT 
    COUNT(*) AS total_patients,
    COUNT(DISTINCT subject_id) AS unique_patients,
    SUM(CASE WHEN gender = 'M' THEN 1 ELSE 0 END) AS male_count,
    SUM(CASE WHEN gender = 'F' THEN 1 ELSE 0 END) AS female_count,
    ROUND(AVG(CASE WHEN gender = 'M' THEN 1.0 ELSE 0.0 END) * 100, 2) AS male_percentage,
    ROUND(AVG(CASE WHEN gender = 'F' THEN 1.0 ELSE 0.0 END) * 100, 2) AS female_percentage
FROM mimicdata_2567611159492892.mimic.patients;

total_patients,unique_patients,male_count,female_count,male_percentage,female_percentage
100,100,45,55,45.0,55.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    MIN(DATEDIFF(dod, dob) / 365.25) AS min_age_at_death,
    MAX(DATEDIFF(dod, dob) / 365.25) AS max_age_at_death,
    ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 2) AS avg_age_at_death,
    ROUND(STDDEV(DATEDIFF(dod, dob) / 365.25), 2) AS stddev_age_at_death,
    COUNT(CASE WHEN DATEDIFF(dod, dob) / 365.25 >= 100 THEN 1 END) AS centenarians_count
FROM mimicdata_2567611159492892.mimic.patients
WHERE dod IS NOT NULL;

-- Age distribution by gender
SELECT 
    gender,
    COUNT(*) AS patient_count,
    ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 1) AS avg_age_at_death,
    ROUND(MIN(DATEDIFF(dod, dob) / 365.25), 1) AS min_age_at_death,
    ROUND(MAX(DATEDIFF(dod, dob) / 365.25), 1) AS max_age_at_death,
    ROUND(STDDEV(DATEDIFF(dod, dob) / 365.25), 1) AS stddev_age
FROM  mimicdata_2567611159492892.mimic.patients
WHERE dod IS NOT NULL
GROUP BY gender
ORDER BY gender;

gender,patient_count,avg_age_at_death,min_age_at_death,max_age_at_death,stddev_age
F,55,94.2,20.0,302.4,67.9
M,45,83.6,27.0,301.1,60.9


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    CASE 
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 0 AND 20 THEN '0-20'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 21 AND 40 THEN '21-40'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 41 AND 60 THEN '41-60'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 61 AND 80 THEN '61-80'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 81 AND 100 THEN '81-100'
        WHEN DATEDIFF(dod, dob) / 365.25 > 100 THEN '100+'
        ELSE 'Unknown'
    END AS age_group,
    COUNT(*) AS patient_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients WHERE dod IS NOT NULL), 2) AS percentage
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod IS NOT NULL
GROUP BY 
    CASE 
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 0 AND 20 THEN '0-20'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 21 AND 40 THEN '21-40'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 41 AND 60 THEN '41-60'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 61 AND 80 THEN '61-80'
        WHEN DATEDIFF(dod, dob) / 365.25 BETWEEN 81 AND 100 THEN '81-100'
        WHEN DATEDIFF(dod, dob) / 365.25 > 100 THEN '100+'
        ELSE 'Unknown'
    END
ORDER BY 
    CASE age_group
        WHEN '0-20' THEN 1
        WHEN '21-40' THEN 2
        WHEN '41-60' THEN 3
        WHEN '61-80' THEN 4
        WHEN '81-100' THEN 5
        WHEN '100+' THEN 6
        ELSE 7
    END;

age_group,patient_count,percentage
0-20,1,1.0
21-40,4,4.0
41-60,15,15.0
61-80,34,34.0
81-100,34,34.0
100+,8,8.0
Unknown,4,4.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    CONCAT(FLOOR(YEAR(dob) / 10) * 10, 's') AS birth_decade,
    COUNT(*) AS patient_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients), 2) AS percentage
FROM mimicdata_2567611159492892.mimic.patients
GROUP BY FLOOR(YEAR(dob) / 10) * 10
ORDER BY FLOOR(YEAR(dob) / 10) * 10;

-- Death decade distribution
SELECT 
    CONCAT(FLOOR(YEAR(dod) / 10) * 10, 's') AS death_decade,
    COUNT(*) AS death_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients WHERE dod IS NOT NULL), 2) AS percentage
FROM mimicdata_2567611159492892.mimic.patients
WHERE dod IS NOT NULL
GROUP BY FLOOR(YEAR(dod) / 10) * 10
ORDER BY FLOOR(YEAR(dod) / 10) * 10;

-- Year-over-year death trends (for line charts)
SELECT 
    YEAR(dod) AS death_year,
    COUNT(*) AS deaths_per_year,
    SUM(CASE WHEN gender = 'M' THEN 1 ELSE 0 END) AS male_deaths,
    SUM(CASE WHEN gender = 'F' THEN 1 ELSE 0 END) AS female_deaths
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod IS NOT NULL
GROUP BY YEAR(dod)
ORDER BY YEAR(dod);

death_year,deaths_per_year,male_deaths,female_deaths
2105,3,2,1
2107,3,2,1
2108,1,0,1
2111,1,0,1
2112,1,1,0
2114,1,1,0
2115,1,1,0
2117,1,0,1
2119,1,0,1
2120,2,1,1


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    'Total Records' AS metric,
    COUNT(*) AS count,
    '100%' AS completeness
FROM mimicdata_2567611159492892.mimic.patients

UNION ALL

SELECT 
    'Records with DOB' AS metric,
    COUNT(*) AS count,
    CONCAT(ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients), 2), '%') AS completeness
FROM mimicdata_2567611159492892.mimic.patients
WHERE dob IS NOT NULL

UNION ALL

SELECT 
    'Records with DOD' AS metric,
    COUNT(*) AS count,
    CONCAT(ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients), 2), '%') AS completeness
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod IS NOT NULL

UNION ALL

SELECT 
    'Records with Hospital DOD' AS metric,
    COUNT(*) AS count,
    CONCAT(ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients), 2), '%') AS completeness
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod_hosp IS NOT NULL

UNION ALL

SELECT 
    'Records with SSN DOD' AS metric,
    COUNT(*) AS count,
    CONCAT(ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients), 2), '%') AS completeness
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod_ssn IS NOT NULL;

metric,count,completeness
Records with DOB,100,100.00%
Records with DOD,100,100.00%
Records with Hospital DOD,70,70.00%
Records with SSN DOD,77,77.00%
Total Records,100,100%


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    CASE 
        WHEN YEAR(dob) BETWEEN 1800 AND 1849 THEN '1800-1849'
        WHEN YEAR(dob) BETWEEN 1850 AND 1899 THEN '1850-1899'
        WHEN YEAR(dob) BETWEEN 1900 AND 1949 THEN '1900-1949'
        WHEN YEAR(dob) BETWEEN 1950 AND 1999 THEN '1950-1999'
        WHEN YEAR(dob) BETWEEN 2000 AND 2049 THEN '2000-2049'
        WHEN YEAR(dob) BETWEEN 2050 AND 2099 THEN '2050-2099'
        WHEN YEAR(dob) BETWEEN 2100 AND 2149 THEN '2100-2149'
        WHEN YEAR(dob) >= 2150 THEN '2150+'
        ELSE 'Other'
    END AS birth_cohort,
    COUNT(*) AS patient_count,
    ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 1) AS avg_lifespan,
    MIN(DATEDIFF(dod, dob) / 365.25) AS min_lifespan,
    MAX(DATEDIFF(dod, dob) / 365.25) AS max_lifespan
FROM mimicdata_2567611159492892.mimic.patients
WHERE dod IS NOT NULL AND dob IS NOT NULL
GROUP BY 
    CASE 
        WHEN YEAR(dob) BETWEEN 1800 AND 1849 THEN '1800-1849'
        WHEN YEAR(dob) BETWEEN 1850 AND 1899 THEN '1850-1899'
        WHEN YEAR(dob) BETWEEN 1900 AND 1949 THEN '1900-1949'
        WHEN YEAR(dob) BETWEEN 1950 AND 1999 THEN '1950-1999'
        WHEN YEAR(dob) BETWEEN 2000 AND 2049 THEN '2000-2049'
        WHEN YEAR(dob) BETWEEN 2050 AND 2099 THEN '2050-2099'
        WHEN YEAR(dob) BETWEEN 2100 AND 2149 THEN '2100-2149'
        WHEN YEAR(dob) >= 2150 THEN '2150+'
        ELSE 'Other'
    END
ORDER BY avg_lifespan DESC;


birth_cohort,patient_count,avg_lifespan,min_lifespan,max_lifespan
1800-1849,2,300.9,300.629706,301.127995
1850-1899,6,300.5,299.997262,302.392882
2000-2049,15,81.0,68.884326,89.002053
2050-2099,56,72.5,27.022587,91.132101
2100-2149,19,64.2,29.322382,86.557153
2150+,2,20.3,19.989049,20.553046


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    'Extremely Long Lifespan (>120 years)' AS anomaly_type,
    COUNT(*) AS count
FROM mimicdata_2567611159492892.mimic.patients 
WHERE DATEDIFF(dod, dob) / 365.25 > 120

UNION ALL

SELECT 
    'Future Birth Dates' AS anomaly_type,
    COUNT(*) AS count
FROM mimicdata_2567611159492892.mimic.patients 
WHERE YEAR(dob) > 2025

UNION ALL

SELECT 
    'Death Before Birth' AS anomaly_type,
    COUNT(*) AS count
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod < dob

UNION ALL

SELECT 
    'Mismatched Death Dates' AS anomaly_type,
    COUNT(*) AS count
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod_hosp IS NOT NULL 
    AND dod_ssn IS NOT NULL 
    AND ABS(DATEDIFF(dod_hosp, dod_ssn)) > 30;



anomaly_type,count
Extremely Long Lifespan (>120 years),8
Future Birth Dates,91
Death Before Birth,0
Mismatched Death Dates,1


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    'Male vs Female Longevity Comparison' AS analysis_type,
    (SELECT ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 2) FROM mimicdata_2567611159492892.mimic.patients WHERE gender = 'M' AND dod IS NOT NULL) AS male_avg_age,
    (SELECT ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 2) FROM mimicdata_2567611159492892.mimic.patients WHERE gender = 'F' AND dod IS NOT NULL) AS female_avg_age,
    (SELECT ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 2) FROM mimicdata_2567611159492892.mimic.patients WHERE gender = 'F' AND dod IS NOT NULL) - 
    (SELECT ROUND(AVG(DATEDIFF(dod, dob) / 365.25), 2) FROM mimicdata_2567611159492892.mimic.patients WHERE gender = 'M' AND dod IS NOT NULL) AS female_advantage_years;

analysis_type,male_avg_age,female_avg_age,female_advantage_years
Male vs Female Longevity Comparison,83.58,94.17,10.59


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    DATE_FORMAT(dod, 'MMMM') AS death_month,
    MONTH(dod) AS month_number,
    COUNT(*) AS death_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM mimicdata_2567611159492892.mimic.patients WHERE dod IS NOT NULL), 2) AS percentage
FROM mimicdata_2567611159492892.mimic.patients 
WHERE dod IS NOT NULL
GROUP BY MONTH(dod), DATE_FORMAT(dod, 'MMMM')
ORDER BY MONTH(dod);

death_month,month_number,death_count,percentage
January,1,8,8.0
February,2,11,11.0
March,3,12,12.0
April,4,5,5.0
May,5,10,10.0
June,6,2,2.0
July,7,4,4.0
August,8,8,8.0
September,9,11,11.0
October,10,7,7.0


Databricks visualization. Run in Databricks to view.