In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

In [6]:
# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("DiabetesMLModel")
    .getOrCreate()
)

# Location of the tab-separated input data.
file_path = "dataset.tsv"  # Replace with the actual path to your TSV file

# The file carries a header row; column types are inferred by Spark on load.
df = spark.read.csv(
    file_path,
    sep="\t",
    header=True,
    inferSchema=True,
)


In [7]:
# Preview the first 20 rows (long cell values truncated), then list the
# column names and the types Spark inferred for them.
df.show(n=20, truncate=True)
df.printSchema()

+---+---+----+-----+---+-----+----+----+------+---+---+
|AGE|SEX| BMI|   BP| S1|   S2|  S3|  S4|    S5| S6|  Y|
+---+---+----+-----+---+-----+----+----+------+---+---+
| 59|  2|32.1|101.0|157| 93.2|38.0| 4.0|4.8598| 87|151|
| 48|  1|21.6| 87.0|183|103.2|70.0| 3.0|3.8918| 69| 75|
| 72|  2|30.5| 93.0|156| 93.6|41.0| 4.0|4.6728| 85|141|
| 24|  1|25.3| 84.0|198|131.4|40.0| 5.0|4.8903| 89|206|
| 50|  1|23.0|101.0|192|125.4|52.0| 4.0|4.2905| 80|135|
| 23|  1|22.6| 89.0|139| 64.8|61.0| 2.0|4.1897| 68| 97|
| 36|  2|22.0| 90.0|160| 99.6|50.0| 3.0|3.9512| 82|138|
| 66|  2|26.2|114.0|255|185.0|56.0|4.55|4.2485| 92| 63|
| 60|  2|32.1| 83.0|179|119.4|42.0| 4.0|4.4773| 94|110|
| 29|  1|30.0| 85.0|180| 93.4|43.0| 4.0|5.3845| 88|310|
| 22|  1|18.6| 97.0|114| 57.6|46.0| 2.0|3.9512| 83|101|
| 56|  2|28.0| 85.0|184|144.8|32.0| 6.0|3.5835| 77| 69|
| 53|  1|23.7| 92.0|186|109.2|62.0| 3.0|4.3041| 81|179|
| 50|  2|26.2| 97.0|186|105.4|49.0| 4.0|5.0626| 88|185|
| 61|  1|24.0| 91.0|202|115.4|72.0| 3.0|4.2905| 

In [8]:
# Model inputs: every column of the dataset except the target "Y".
feature_columns = [
    "AGE", "SEX", "BMI", "BP",
    "S1", "S2", "S3", "S4", "S5", "S6",
]

# Combine the individual input columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [9]:
# Standardise the assembled vector (centering and scaling both enabled) and
# write the result to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

In [10]:
# Randomly split the rows into training and test sets with 0.8 / 0.2 weights;
# the fixed seed keeps the split repeatable between runs.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [11]:
# Linear regression on the standardised feature vector, predicting column "Y".
lr = LinearRegression(
    featuresCol="scaledFeatures",
    labelCol="Y",
    maxIter=100,
    regParam=0.1,
)

# Chain assembling, scaling and regression so the identical steps are applied
# to both the training and the test data.
pipeline = Pipeline(stages=[assembler, scaler, lr])

# Fit every stage on the training split only.
model = pipeline.fit(train_data)

# Score the held-out split and preview actual vs. predicted target values.
predictions = model.transform(test_data)
predictions.select("Y", "prediction").show()

# Report the root mean squared error of the predictions on the test split.
evaluator = RegressionEvaluator(labelCol="Y", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

+---+------------------+
|  Y|        prediction|
+---+------------------+
|200|140.34554571372053|
| 55| 73.83848225762115|
|101|102.41351279842553|
| 97|106.94631030841333|
| 68|114.28126929607578|
| 49| 91.63709174839992|
|233|198.41895399793856|
|101|189.27474163905794|
|143| 70.76665046297568|
|202|146.08861528798616|
|214|  119.675030398686|
|101|100.04839697713273|
|258| 166.0158222672214|
| 96|113.84047247763486|
| 70| 65.09009333130471|
|172|139.37813989647447|
|243|277.27659610120645|
|259|235.47168032419415|
|128| 95.31569572801807|
| 67|131.82466328967936|
+---+------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE): 59.6307568418301


In [12]:
# Persist the fitted pipeline model to disk, replacing any model already
# stored at the same path.
model_save_path = "model"
(
    model.write()
    .overwrite()
    .save(model_save_path)
)