In [0]:
# Copy the two uploaded zip archives out of DBFS onto the local filesystem
# (file:/tmp) so they can be extracted with a shell command.
zip_transfers = [
    (
        "dbfs:/FileStore/shared_uploads/reddy.pranav.gaddam@gmail.com/health_data_parquet_1.zip",
        "file:/tmp/part1.zip",
    ),
    (
        "dbfs:/FileStore/shared_uploads/reddy.pranav.gaddam@gmail.com/health_data_parquet_2.zip",
        "file:/tmp/part2.zip",
    ),
]
for dbfs_source, local_target in zip_transfers:
    dbutils.fs.cp(dbfs_source, local_target)



In [0]:
!unzip /tmp/part1.zip -d /tmp/health_part1
!unzip /tmp/part2.zip -d /tmp/health_part2

In [0]:
# Push the extracted directories from the local filesystem back into DBFS.
# recurse=True copies each directory together with everything inside it.
extracted_to_dbfs = {
    "file:/tmp/health_part1": "dbfs:/FileStore/tables/health_data_parquet_part1",
    "file:/tmp/health_part2": "dbfs:/FileStore/tables/health_data_parquet_part2",
}
for local_dir, dbfs_dir in extracted_to_dbfs.items():
    dbutils.fs.cp(local_dir, dbfs_dir, recurse=True)


In [0]:
# Show the contents of the DBFS tables directory.
tables_listing = dbutils.fs.ls("/FileStore/tables/")
display(tables_listing)


In [0]:
# Show what each copied part directory contains, one listing per directory.
part_dirs = (
    "/FileStore/tables/health_data_parquet_part1",
    "/FileStore/tables/health_data_parquet_part2",
)
for part_dir in part_dirs:
    display(dbutils.fs.ls(part_dir))


In [0]:
# The .parquet files live in a nested folder inside each copied directory,
# so read from that inner folder rather than the directory root.
part1_path = "/FileStore/tables/health_data_parquet_part1/health_data_parquet_1/"
part2_path = "/FileStore/tables/health_data_parquet_part2/health_data_parquet_2/"

df1 = spark.read.parquet(part1_path)
df2 = spark.read.parquet(part2_path)

# Stack the two parts into one DataFrame, aligning columns by name.
df = df1.unionByName(df2)

# Print the combined schema, then preview the first five rows.
df.printSchema()
df.show(n=5)


root
 |-- user_id: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- datestamp: date (nullable = true)
 |-- height_inches: double (nullable = true)
 |-- weight_kg: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- heart_rate: double (nullable = true)
 |-- spo2: double (nullable = true)
 |-- ecg_avg: double (nullable = true)
 |-- body_temp_f: double (nullable = true)
 |-- step_count: integer (nullable = true)
 |-- calories_burnt: integer (nullable = true)
 |-- sleep_duration_hr: double (nullable = true)
 |-- sleep_quality_score: double (nullable = true)
 |-- systolic_bp: integer (nullable = true)
 |-- diastolic_bp: integer (nullable = true)

+-----------+---+------+----------+-------------+---------+-----+----------+-----+-------+-----------+----------+--------------+-----------------+-------------------+-----------+------------+
|    user_id|age|gender| datestamp|height_inches|weight_kg|  bmi|heart_rate| spo2|ecg_avg|

In [0]:
from pyspark.sql.functions import col

# Build the synthetic "risk_score" label as a weighted sum of five scaled
# vitals. The weights (0.3, 0.2, 0.2, 0.15, 0.15) add up to 1.0.
# NOTE(review): the divisors 180 / 150 / 100 / 8 / 10 look like reference
# maxima for each measurement -- confirm with whoever defined the score.
# The sleep-duration term turns negative for sleep_duration_hr above 8.
bp_term = 0.3 * (col("systolic_bp") / 180)
heart_rate_term = 0.2 * (col("heart_rate") / 150)
spo2_term = 0.2 * ((100 - col("spo2")) / 100)
sleep_duration_term = 0.15 * ((8 - col("sleep_duration_hr")) / 8)
sleep_quality_term = 0.15 * ((10 - col("sleep_quality_score")) / 10)

df = df.withColumn(
    "risk_score",
    bp_term + heart_rate_term + spo2_term + sleep_duration_term + sleep_quality_term,
)


In [0]:
# Name of the target column and the input columns the model is given.
label_col = "risk_score"
selected_features = ["age", "calories_burnt", "step_count"]

# Keep only the label and feature columns (label first), then discard every
# row that has a null in any of them.
modelling_columns = [label_col] + selected_features
df_clean = df.select(*modelling_columns).dropna()


In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack the selected feature columns into a single vector column named
# "features", then keep just that vector and the label.
assembler = VectorAssembler(outputCol="features", inputCols=selected_features)
features_added = assembler.transform(df_clean)
assembled_df = features_added.select("features", label_col)


In [0]:
# Random 80/20 train/test split; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = assembled_df.randomSplit(split_weights, seed=42)


In [0]:
from pyspark.ml.regression import LinearRegression

# Configure a linear regression on the assembled feature vector and fit it
# to the training split.
lr_params = {
    "featuresCol": "features",
    "labelCol": label_col,
    "maxIter": 10,
}
lr = LinearRegression(**lr_params)
model = lr.fit(train_df)


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Run the fitted model over the held-out test split.
predictions = model.transform(test_df)

# Compare the "prediction" column against the label using root-mean-squared
# error, and report the result.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=label_col,
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")


RMSE: 0.04943707981196163
