In [24]:
%load_ext autoreload
%autoreload 2
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import countDistinct,col ,avg, abs
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
spark = SparkSession.builder \
    .appName("Intensive Care Unit Data Analysis") \
    .getOrCreate()

In [11]:
# Read the datasets
df_admissions = spark.read.csv("data/ADMISSIONS.csv", header=True, inferSchema=True).drop("ROW_ID")
df_diagnoses = spark.read.csv("data/DIAGNOSES_ICD.csv", header=True, inferSchema=True).drop("ROW_ID")
df_icustays = spark.read.csv("data/ICUSTAYS.csv", header=True, inferSchema=True).drop("ROW_ID")
df_patients = spark.read.csv("data/PATIENTS.csv", header=True, inferSchema=True).drop("ROW_ID")
df_icd_diagnoses = spark.read.csv("data/D_ICD_DIAGNOSES.csv", header=True, inferSchema=True).drop("ROW_ID")
# df_chartevents = spark.read.csv("data/CHARTEVENTS.csv", header=True, inferSchema=True).drop("ROW_ID")

In [12]:
final_df = df_patients.join(df_admissions, df_patients["SUBJECT_ID"] == df_admissions["SUBJECT_ID"], how="left").drop(df_admissions["SUBJECT_ID"])
final_df = final_df.join(df_icustays, final_df["HADM_ID"] == df_icustays["HADM_ID"], how="left").drop(df_icustays["SUBJECT_ID"]).drop(df_icustays["HADM_ID"])
final_df = final_df.join(df_diagnoses, final_df["SUBJECT_ID"] == df_diagnoses["SUBJECT_ID"], how="left").drop(df_diagnoses["SUBJECT_ID"]).drop(df_diagnoses["HADM_ID"])
final_df = final_df.join(df_icd_diagnoses,final_df["ICD9_CODE"] == df_icd_diagnoses["ICD9_CODE"], how="left").drop(df_icd_diagnoses["ICD9_CODE"])
# final_df = final_df.join(df_chartevents,final_df["ICUSTAY_ID"] == df_chartevents["ICUSTAY_ID"], how="left").drop(df_chartevents["SUBJECT_ID"]).drop(df_chartevents["HADM_ID"]).drop(df_chartevents["ICUSTAY_ID"])


# Drop ids columns because they are not useful for the analysis
columns_to_remove = [
    "SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "SEQ_NUM", "ICD9_CODE"
]

final_df=final_df.drop(*columns_to_remove)
final_df = final_df.dropna().drop_duplicates()

In [13]:
final_df.show()

+------+-------------------+-------------------+-------------------+-------------------+-----------+-------------------+-------------------+-------------------+--------------+--------------------+------------------+----------+--------+-----------------+--------------+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+----------+--------------+-------------+------------+-----------+-------------------+-------------------+-------+--------------------+--------------------+
|GENDER|                DOB|                DOD|           DOD_HOSP|            DOD_SSN|EXPIRE_FLAG|          ADMITTIME|          DISCHTIME|          DEATHTIME|ADMISSION_TYPE|  ADMISSION_LOCATION|DISCHARGE_LOCATION| INSURANCE|LANGUAGE|         RELIGION|MARITAL_STATUS|           ETHNICITY|          EDREGTIME|          EDOUTTIME|           DIAGNOSIS|HOSPITAL_EXPIRE_FLAG|HAS_CHARTEVENTS_DATA|  DBSOURCE|FIRST_CAREUNIT|LAST_CAREUNIT|FIRST_WARDID|LAST_WA

                                                                                

In [14]:
unique_value_columns = []

# Check each column for the number of distinct values
for col_name in final_df.columns:
    # Count distinct values in the column
    distinct_count = final_df.agg(countDistinct(col(col_name)).alias("count")).collect()[0]["count"]
    # If only one unique value exists, add to the list
    if distinct_count == 1:
        unique_value_columns.append(col_name)

final_df = final_df.drop(*unique_value_columns)

                                                                                

In [15]:
final_df.show()

[Stage 1390:>                                                       (0 + 8) / 8]

+------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+----------+--------+-----------------+--------------+--------------------+-------------------+-------------------+--------------------+--------------------+----------+--------------+-------------+------------+-----------+-------------------+-------------------+-------+--------------------+--------------------+
|GENDER|                DOB|                DOD|           DOD_HOSP|            DOD_SSN|          ADMITTIME|          DISCHTIME|          DEATHTIME|  ADMISSION_LOCATION| INSURANCE|LANGUAGE|         RELIGION|MARITAL_STATUS|           ETHNICITY|          EDREGTIME|          EDOUTTIME|           DIAGNOSIS|HAS_CHARTEVENTS_DATA|  DBSOURCE|FIRST_CAREUNIT|LAST_CAREUNIT|FIRST_WARDID|LAST_WARDID|             INTIME|            OUTTIME|    LOS|         SHORT_TITLE|          LONG_TITLE|
+------+-------------------+----------

                                                                                

# Pre processing

In [None]:
from pyspark.sql.functions import col, year, month, dayofmonth, hour
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

label_column = "LOS"

# Identify columns by data type
timestamp_columns = [col_name for col_name, dtype in final_df.dtypes if dtype == 'timestamp']
string_columns = [col_name for col_name, dtype in final_df.dtypes if dtype == 'string']

# Process Timestamp columns - converting timestamps to multiple numerical columns
for col_name in timestamp_columns:
    final_df = final_df.withColumn(col_name + "_year", year(col(col_name))) \
                       .withColumn(col_name + "_month", month(col(col_name))) \
                       .withColumn(col_name + "_day", dayofmonth(col(col_name))) \
                       .withColumn(col_name + "_hour", hour(col(col_name)))

# Drop original timestamp columns if no longer needed
final_df = final_df.drop(*timestamp_columns)

# Process String columns - using StringIndexer and optionally OneHotEncoder
indexers = [StringIndexer(inputCol=c, outputCol=c+"_index") for c in string_columns]
encoders = [OneHotEncoder(inputCol=c+"_index", outputCol=c+"_vec") for c in string_columns]

# Combine stages and create the pipeline
pipeline_stages = indexers + encoders
pipeline = Pipeline(stages=pipeline_stages)

# Fit and transform
final_df = pipeline.fit(final_df).transform(final_df)

# Optionally, remove original string columns if they are not needed
final_df = final_df.drop(*string_columns)

# Assemble all features into a single vector column
assembled_features = [c for c in final_df.columns if c.endswith("_year") or c.endswith("_month") or c.endswith("_day") or c.endswith("_hour") or c.endswith("_vec")]
assembler = VectorAssembler(inputCols=assembled_features, outputCol="features")
assembled_df = assembler.transform(final_df)

# Prediction

In [17]:
lr = LinearRegression(featuresCol='features', labelCol=label_column)
lr_model = lr.fit(assembled_df)

24/06/05 11:45:43 WARN DAGScheduler: Broadcasting large task binary with size 1117.1 KiB
24/06/05 11:45:45 WARN DAGScheduler: Broadcasting large task binary with size 1118.3 KiB
24/06/05 11:45:46 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/06/05 11:45:46 WARN DAGScheduler: Broadcasting large task binary with size 1117.6 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1118.8 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1117.6 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1118.8 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1117.6 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1118.8 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1117.6 KiB
24/06/05 11:45:47 WARN DAGScheduler: Broadcasting large task binary with size 1118.8 KiB
24/

In [18]:
# Define and fit the Random Forest model
rf = RandomForestRegressor(featuresCol='features', labelCol=label_column)
rf_model = rf.fit(assembled_df)

24/06/05 11:46:00 WARN DAGScheduler: Broadcasting large task binary with size 1114.2 KiB
24/06/05 11:46:00 WARN DAGScheduler: Broadcasting large task binary with size 1114.3 KiB
24/06/05 11:46:01 WARN DAGScheduler: Broadcasting large task binary with size 1118.5 KiB
24/06/05 11:46:01 WARN DAGScheduler: Broadcasting large task binary with size 1406.9 KiB
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_2 in memory! (computed 44.6 MiB so far)
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_3 in memory! (computed 44.6 MiB so far)
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_1 in memory! (computed 44.6 MiB so far)
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_4 in memory! (computed 44.6 MiB so far)
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_0 in memory! (computed 44.6 MiB so far)
24/06/05 11:46:02 WARN MemoryStore: Not enough space to cache rdd_3687_5 in memory! (comp

In [19]:
# Evaluate models
evaluator = RegressionEvaluator(labelCol=label_column, predictionCol="prediction", metricName="rmse")

# Generate predictions
lr_predictions = lr_model.transform(assembled_df)
rf_predictions = rf_model.transform(assembled_df)



In [26]:
# Redefine the calculate_mape function to use the correct abs function
def calculate_accuracy(predictions, label_column):
    # Calculate the Mean Absolute Error (MAE)
    mae = predictions.withColumn('abs_error', abs(col(label_column) - col("prediction"))) \
                     .select(avg('abs_error')).first()[0]

    # Convert MAE to a pseudo-accuracy metric (assuming MAE is less than 100)
    accuracy = 100 - mae  # This assumes that lower error corresponds to higher accuracy

    return accuracy

# Assume you have label_column set and predictions DataFrames ready
lr_mape = calculate_accuracy(lr_predictions, label_column)
rf_mape = calculate_accuracy(rf_predictions, label_column)

# Print the results
print("Mean Absolute Percentage Error (MAPE) on Linear Regression =", lr_mape)
print("Mean Absolute Percentage Error (MAPE) on Random Forest Regression =", rf_mape)

24/06/05 11:54:07 WARN DAGScheduler: Broadcasting large task binary with size 1148.9 KiB
24/06/05 11:54:09 WARN DAGScheduler: Broadcasting large task binary with size 1103.7 KiB


Mean Absolute Percentage Error (MAPE) on Linear Regression = 96.53916278576196
Mean Absolute Percentage Error (MAPE) on Random Forest Regression = 95.55042571391218


                                                                                