In [1]:
import pyspark
from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("YourAppName").getOrCreate()

df = spark.read.csv("C:/bdataset/building_energy_consumption_datasets.csv", header=True, inferSchema=True)



df.show()




+----------+------------+--------------------+-------------------+-------+---------------+---------+-----------+---------------+-------------------+----------------------------------+
|      Date|Floor Number|Units Consumed (kWh)|        Time of Day|Weather|Temperature (C)|Occupancy|HVAC Status|Lighting Status|Appliance Load (kW)|Renewable Energy Contribution (kW)|
+----------+------------+--------------------+-------------------+-------+---------------+---------+-----------+---------------+-------------------+----------------------------------+
|2024-09-22|           7|                  52|2025-01-19 11:00:00|  Sunny|             26|        4|        Off|            Off|  4.399613974107632|                26.179342532827864|
|2024-10-07|           6|                 197|2025-01-19 17:30:00|  Sunny|             29|        3|        Off|            Off|  2.570556710979072|                12.148311181697219|
|2024-10-03|           6|                  50|2025-01-19 17:30:00|  Sunny|      

In [2]:
from pyspark.ml.feature import VectorAssembler, Bucketizer, StringIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import hour, sin, cos, radians

df_sample = df.sample(fraction=0.1, seed=42)  


df_sample = df_sample.withColumn("Time of Day Hour", hour(df_sample["Time of Day"]))


df_sample = df_sample.withColumn("Time of Day Hour_sin", sin(radians(df_sample["Time of Day Hour"])))
df_sample = df_sample.withColumn("Time of Day Hour_cos", cos(radians(df_sample["Time of Day Hour"])))


string_columns = ["Weather", "HVAC Status", "Lighting Status"]  


indexers = [StringIndexer(inputCol=col, outputCol=col + "_indexed").fit(df_sample) for col in string_columns]


for indexer in indexers:
    df_sample = indexer.transform(df_sample)


feature_columns = [
    "Floor Number",
    "Time of Day Hour",
    "Time of Day Hour_sin",
    "Time of Day Hour_cos",
    "Weather_indexed",
    "Temperature (C)",
    "Occupancy",
    "HVAC Status_indexed",
    "Lighting Status_indexed",
    "Appliance Load (kW)",
    "Renewable Energy Contribution (kW)"
]


assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

splits = [-float("inf"), 15.0, 35.0, 55.0, 75.0, float("inf")]
bucketizer = Bucketizer(splits=splits, inputCol="Units Consumed (kWh)", outputCol="label")


df_binned = bucketizer.setHandleInvalid("keep").transform(df_sample)


df_transformed = assembler.transform(df_binned)


df_transformed = df_transformed.na.fill(0)  


scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaler_model = scaler.fit(df_transformed)
df_transformed = scaler_model.transform(df_transformed)

train_data, test_data = df_transformed.randomSplit([0.8, 0.2], seed=42)


rf_classifier = RandomForestClassifier(featuresCol="scaled_features", labelCol="label")


param_grid = (ParamGridBuilder()
              .addGrid(rf_classifier.numTrees, [100, 200]) 
              .addGrid(rf_classifier.maxDepth, [5, 10]) 
              .build())

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")


cross_validator = CrossValidator(estimator=rf_classifier,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 numFolds=5)  


cv_model = cross_validator.fit(train_data)


predictions = cv_model.transform(test_data)

accuracy = evaluator.evaluate(predictions)


print(f"Model Accuracy after Cross-Validation: {accuracy * 100:.2f}%")


best_rf_model = cv_model.bestModel


feature_importances = best_rf_model.featureImportances
formatted_importances = [(i, importance) for i, importance in enumerate(feature_importances)]
print("\nFeature Importance: ")
print(f"({len(feature_columns)}, {list(range(len(feature_columns)))}, {list(feature_importances)})")


print("\nDetailed Feature Importances:")
for i, col in enumerate(feature_columns):
    print(f"Feature: {col}, Importance: {feature_importances[i]:.4f}")


Model Accuracy after Cross-Validation: 94.44%

Feature Importance: 
(11, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [np.float64(0.13873122393443582), np.float64(0.10775445898305594), np.float64(0.08188140745793716), np.float64(0.04686561523144085), np.float64(0.03739804828760026), np.float64(0.11442245365112265), np.float64(0.05706200927164533), np.float64(0.0334652713866121), np.float64(0.02809729394253651), np.float64(0.1729392266655108), np.float64(0.18138299118810255)])

Detailed Feature Importances:
Feature: Floor Number, Importance: 0.1387
Feature: Time of Day Hour, Importance: 0.1078
Feature: Time of Day Hour_sin, Importance: 0.0819
Feature: Time of Day Hour_cos, Importance: 0.0469
Feature: Weather_indexed, Importance: 0.0374
Feature: Temperature (C), Importance: 0.1144
Feature: Occupancy, Importance: 0.0571
Feature: HVAC Status_indexed, Importance: 0.0335
Feature: Lighting Status_indexed, Importance: 0.0281
Feature: Appliance Load (kW), Importance: 0.1729
Feature: Renewable Energy Con