In [None]:
pip install pyspark

In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName('SBP Features Importance').getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

data = spark.read.csv('mimic_2425.csv', inferSchema=True, header=True)
data = data.withColumn(
    'SBP_Classification',
    when(col('SBP') > 80, lit(3))  # very high
    .when(col('SBP') > 50, lit(2))  # high
    .otherwise(lit(1))  # low
)
data = data.withColumn("label", data["SBP_Classification"].cast("integer"))

# Print the structure of columns in data
data.printSchema()

# Put features into a feature vector column
feature_columns = ["RR","SPO2","MAP","HR","PP","CO"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

final_data = assembler.transform(data)

final_data.show(1)

my_data = final_data.select('features', 'SBP_Classification')

# Normalization In Preprocessing
normalizer = Normalizer().setInputCol("features").setOutputCol("nor_features").setP(1.0)
l1NormData = normalizer.transform(my_data)
my_data = l1NormData.select('nor_features', 'SBP_Classification')

# Split the data
train_data, test_data = my_data.randomSplit([0.7, 0.3])

rfc = RandomForestClassifier(labelCol='SBP_Classification', featuresCol='nor_features')

rfc_model = rfc.fit(train_data)

print("RandomForestClassifier features importance:")
print("Feature indexes:  RR->0, SPO2->1, MAP->2, HR->3, PP->4, CO->5")
print(rfc_model.featureImportances)

# Stop the session
spark.stop()


root
 |-- DATETIME: timestamp (nullable = true)
 |-- RR: double (nullable = true)
 |-- SPO2: double (nullable = true)
 |-- MAP: double (nullable = true)
 |-- SBP: double (nullable = true)
 |-- DBP: double (nullable = true)
 |-- HR: double (nullable = true)
 |-- PP: double (nullable = true)
 |-- CO: double (nullable = true)
 |-- SBP_Classification: integer (nullable = false)
 |-- label: integer (nullable = false)

+-------------------+----+----+---+---+---+-----+---+---+------------------+-----+--------------------+
|           DATETIME|  RR|SPO2|MAP|SBP|DBP|   HR| PP| CO|SBP_Classification|label|            features|
+-------------------+----+----+---+---+---+-----+---+---+------------------+-----+--------------------+
|2020-10-18 15:24:25|35.0|99.9|0.0|0.0|0.0|106.9|0.0|0.0|                 1|    1|[35.0,99.9,0.0,10...|
+-------------------+----+----+---+---+---+-----+---+---+------------------+-----+--------------------+
only showing top 1 row

RandomForestClassifier features importa

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Initialize Spark session
spark = SparkSession.builder.appName('BloodPressureAnomalyDetection').getOrCreate()
spark.sparkContext.setLogLevel('ERROR')

# Read data from CSV file
csv_path = 'mimic_2425.csv'  # Replace with the actual path to your CSV file
data = spark.read.csv(csv_path, inferSchema=True, header=True)

# Feature engineering
feature_columns = ["RR", "SPO2", "MAP", "HR", "PP", "CO", "SBP", "DBP"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
final_data = assembler.transform(data)

# Normalization
normalizer = Normalizer().setInputCol("features").setOutputCol("nor_features").setP(1.0)
l1NormData = normalizer.transform(final_data)

# Train Random Forest model
rfc = RandomForestClassifier(labelCol='label', featuresCol='nor_features')
rfc_model = rfc.fit(l1NormData)

# Show predictions on the dataset
predictions = rfc_model.transform(l1NormData)
predictions.select("features", "label", "prediction", "probability").show()

# Stop the session
spark.stop()


IllegalArgumentException: label does not exist. Available: DATETIME, RR, SPO2, MAP, SBP, DBP, HR, PP, CO, features, nor_features