In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [2]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Model Training and Selection") \
    .getOrCreate()
spark

In [3]:
# Load data
data = spark.read.csv('C:\\Users\\Rishita\\Desktop\\Rishita Desktop Copy\\IoT_DA331_Project\\processed_data.csv', header=True, inferSchema=True)

# Display the first few rows of the dataframe
data.show()


+---+-------------------+-----------+-------------------+
|_c0|               time|temperature|optimal_temperature|
+---+-------------------+-----------+-------------------+
|  0|2019-01-01 00:59:59|        3.1|               21.9|
|  1|2019-01-01 01:59:59|        2.9| 21.799999999999997|
|  2|2019-01-01 02:59:59|        3.1| 21.699999999999996|
|  3|2019-01-01 03:59:59|        1.9| 21.599999999999994|
|  4|2019-01-01 04:59:59|        0.7| 21.599999999999994|
|  5|2019-01-01 05:59:59|        0.5| 21.499999999999993|
|  6|2019-01-01 06:59:59|        0.0|  21.39999999999999|
|  7|2019-01-01 07:59:59|       -0.8|  21.29999999999999|
|  8|2019-01-01 08:59:59|        0.0|  21.19999999999999|
|  9|2019-01-01 09:59:59|        2.2| 20.799999999999983|
| 10|2019-01-01 10:59:59|        3.1|  20.59999999999998|
| 11|2019-01-01 11:59:59|        5.5|  19.99999999999997|
| 12|2019-01-01 12:59:59|        7.8| 18.899999999999956|
| 13|2019-01-01 13:59:59|       10.1| 15.399999999999919|
| 14|2019-01-0

In [None]:
from pyspark.sql.functions import year, month, dayofmonth, hour, minute

# Assuming 'time' is the timestamp column, we can extract parts of the timestamp
data = data.withColumn("year", year("time"))
data = data.withColumn("month", month("time"))
data = data.withColumn("day", dayofmonth("time"))
data = data.withColumn("hour", hour("time"))
data = data.withColumn("minute", minute("time"))

# Now remove the original 'time' column if it's no longer needed
data = data.drop("time")
data.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `time` cannot be resolved. Did you mean one of the following? [`_c0`, `day`, `hour`, `minute`, `year`].;
'Project [_c0#17, temperature#19, optimal_temperature#20, year('time) AS year#95, month#53, day#60, hour#68, minute#77]
+- Project [_c0#17, temperature#19, optimal_temperature#20, year#47, month#53, day#60, hour#68, minute#77]
   +- Project [_c0#17, time#18, temperature#19, optimal_temperature#20, year#47, month#53, day#60, hour#68, minute(time#18, Some(Asia/Calcutta)) AS minute#77]
      +- Project [_c0#17, time#18, temperature#19, optimal_temperature#20, year#47, month#53, day#60, hour(time#18, Some(Asia/Calcutta)) AS hour#68]
         +- Project [_c0#17, time#18, temperature#19, optimal_temperature#20, year#47, month#53, dayofmonth(cast(time#18 as date)) AS day#60]
            +- Project [_c0#17, time#18, temperature#19, optimal_temperature#20, year#47, month(cast(time#18 as date)) AS month#53]
               +- Project [_c0#17, time#18, temperature#19, optimal_temperature#20, year(cast(time#18 as date)) AS year#47]
                  +- Relation [_c0#17,time#18,temperature#19,optimal_temperature#20] csv


In [7]:
data.show()

+---+-----------+-------------------+----+-----+---+----+------+
|_c0|temperature|optimal_temperature|year|month|day|hour|minute|
+---+-----------+-------------------+----+-----+---+----+------+
|  0|        3.1|               21.9|2019|    1|  1|   0|    59|
|  1|        2.9| 21.799999999999997|2019|    1|  1|   1|    59|
|  2|        3.1| 21.699999999999996|2019|    1|  1|   2|    59|
|  3|        1.9| 21.599999999999994|2019|    1|  1|   3|    59|
|  4|        0.7| 21.599999999999994|2019|    1|  1|   4|    59|
|  5|        0.5| 21.499999999999993|2019|    1|  1|   5|    59|
|  6|        0.0|  21.39999999999999|2019|    1|  1|   6|    59|
|  7|       -0.8|  21.29999999999999|2019|    1|  1|   7|    59|
|  8|        0.0|  21.19999999999999|2019|    1|  1|   8|    59|
|  9|        2.2| 20.799999999999983|2019|    1|  1|   9|    59|
| 10|        3.1|  20.59999999999998|2019|    1|  1|  10|    59|
| 11|        5.5|  19.99999999999997|2019|    1|  1|  11|    59|
| 12|        7.8| 18.8999

In [8]:
target_col = "optimal_temperature"
# Assuming the target/label column is named 'label'
# Assuming you've either transformed or removed the 'time' column
feature_columns = [col for col in data.columns if col != 'label' and col not in {'time'}]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform data
data = assembler.transform(data)
data.show()

+---+-----------+-------------------+----+-----+---+----+------+--------------------+
|_c0|temperature|optimal_temperature|year|month|day|hour|minute|            features|
+---+-----------+-------------------+----+-----+---+----+------+--------------------+
|  0|        3.1|               21.9|2019|    1|  1|   0|    59|[0.0,3.1,21.9,201...|
|  1|        2.9| 21.799999999999997|2019|    1|  1|   1|    59|[1.0,2.9,21.79999...|
|  2|        3.1| 21.699999999999996|2019|    1|  1|   2|    59|[2.0,3.1,21.69999...|
|  3|        1.9| 21.599999999999994|2019|    1|  1|   3|    59|[3.0,1.9,21.59999...|
|  4|        0.7| 21.599999999999994|2019|    1|  1|   4|    59|[4.0,0.7,21.59999...|
|  5|        0.5| 21.499999999999993|2019|    1|  1|   5|    59|[5.0,0.5,21.49999...|
|  6|        0.0|  21.39999999999999|2019|    1|  1|   6|    59|[6.0,0.0,21.39999...|
|  7|       -0.8|  21.29999999999999|2019|    1|  1|   7|    59|[7.0,-0.8,21.2999...|
|  8|        0.0|  21.19999999999999|2019|    1|  1|  

In [None]:
# Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Initialize models
lr = LogisticRegression(featuresCol='features', labelCol='label')
rf = RandomForestClassifier(featuresCol='features', labelCol='label')
svc = LinearSVC(featuresCol='features', labelCol='label')

models = {
    "Logistic Regression": lr,
    "Random Forest": rf,
    "SVM": svc
}

# Dictionary to hold accuracy results
accuracies = {}

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

for name, model in models.items():
    # Fit model
    model = model.fit(train_data)
    # Make predictions
    predictions = model.transform(test_data)
    # Evaluate model
    accuracy = evaluator.evaluate(predictions)
    accuracies[name] = accuracy

    print(f"{name} accuracy: {accuracy}")


In [None]:
# Find the model with the highest accuracy
best_model_name = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model_name]

print(f"Best model is {best_model_name} with an accuracy of {best_accuracy}")


In [None]:
# Assuming the best model is stored in a variable `best_model`
best_model = models[best_model_name].fit(data)  # Refit on the entire dataset if needed
best_model.write().overwrite().save(f"path_to_save/{best_model_name}_model")
