In [0]:
# Echo the `spark` session object as the cell output. The name is used here
# before any import, so it is presumably pre-defined by the notebook runtime.
spark

# Charger les données vers un Spark DataFrame

In [0]:
# Read the predictive-maintenance CSV into a Spark DataFrame.
# header=True  -> the first row supplies the column names.
# inferSchema=True -> column types are inferred from the data instead of
#                     defaulting to strings.
machines_data = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("dbfs:/FileStore/predictive_maintenance.csv")
)

In [0]:
# Show the loaded DataFrame with the notebook's display helper to inspect the raw rows.
display(machines_data)

UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
6,M14865,M,298.1,308.6,1425,41.9,11,0,No Failure
7,L47186,L,298.1,308.6,1558,42.4,14,0,No Failure
8,L47187,L,298.1,308.6,1527,40.2,16,0,No Failure
9,M14868,M,298.3,308.7,1667,28.6,18,0,No Failure
10,M14869,M,298.5,309.0,1741,28.0,21,0,No Failure


In [0]:
# Print column names and inferred types to check what inferSchema produced.
machines_data.printSchema()

root
 |-- UDI: integer (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Air temperature [K]: double (nullable = true)
 |-- Process temperature [K]: double (nullable = true)
 |-- Rotational speed [rpm]: integer (nullable = true)
 |-- Torque [Nm]: double (nullable = true)
 |-- Tool wear [min]: integer (nullable = true)
 |-- Target: integer (nullable = true)
 |-- Failure Type: string (nullable = true)



In [0]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
# Keep only the columns used for modelling. The schema recorded below this
# cell no longer contains "UDI", "Product ID" or "Target", yet no cell in the
# notebook removed them, so a fresh "Run All" would not reproduce that output.
# Dropping them explicitly restores the missing step. None of the three is
# listed in the feature columns, and the label used later is "Failure Type".
# DataFrame.drop ignores column names that are absent, so re-running is safe.
machines_data = machines_data.drop("UDI", "Product ID", "Target")
machines_data.printSchema()

root
 |-- Type: string (nullable = true)
 |-- Air temperature [K]: double (nullable = true)
 |-- Process temperature [K]: double (nullable = true)
 |-- Rotational speed [rpm]: integer (nullable = true)
 |-- Torque [Nm]: double (nullable = true)
 |-- Tool wear [min]: integer (nullable = true)
 |-- Failure Type: string (nullable = true)



# Prétraiter les données (caractéristiques numériques et catégorielles)


In [0]:
# Columns fed to the model, split by kind, plus the label column.
num_features = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]
cat_features = ["Type"]
target_variable = 'Failure Type'

# Label stage: map each 'Failure Type' string to a numeric index.
indexer = StringIndexer(
    inputCol=target_variable,
    outputCol="indexed_target",
)

# One StringIndexer per categorical feature, writing to "<column>_indexed".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_indexed")
    for name in cat_features
]

# Combine the numeric columns and the indexed categorical columns into a
# single vector column named "features".
assembler = VectorAssembler(
    inputCols=num_features + [f"{name}_indexed" for name in cat_features],
    outputCol="features",
)

# Scale the assembled vector into "scaled_features" (StandardScaler defaults).
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")


# Diviser les données en ensembles d'entraînement et de test

In [0]:
# Split the data 80/20 into training and test sets, with a fixed seed.
train_df, test_df = machines_data.randomSplit([0.8, 0.2], seed=42)

# Preprocessing pipeline: label indexer, categorical indexers, vector
# assembler, then scaler.
pipeline = Pipeline(stages=[indexer] + indexers + [assembler, scaler])

# Fit the preprocessing stages once, on the training data only, and reuse the
# fitted model for both sets. Fitting a second time for the test transform
# repeats the same Spark jobs for no benefit, and reusing one fitted model
# guarantees both sets go through exactly the same mappings and scaling.
preprocessing_model = pipeline.fit(train_df)
train_df_transformed = preprocessing_model.transform(train_df)
test_df_transformed = preprocessing_model.transform(test_df)


Downloading artifacts:   0%|          | 0/40 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/40 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# Définition du modèle RandomForest avec les hyperparamètres spécifiés

In [0]:
# Configure (but do not yet train) the random forest classifier.
random_forest_best_model = RandomForestClassifier(
    # Input / label columns produced by the preprocessing pipeline.
    featuresCol="scaled_features",
    labelCol="indexed_target",
    # Fixed hyperparameters for this run.
    numTrees=196,
    maxDepth=10,
    minInstancesPerNode=2,
    # Fixed seed for the forest's randomness.
    seed=42,
)

# Entraînement du modèle

In [0]:
# Train the Random Forest model on the preprocessed training set.
rf_model = random_forest_best_model.fit(train_df_transformed)

# Generate predictions on the preprocessed test set; the fitted model's
# transform() returns the input DataFrame with prediction columns added.
predictions = rf_model.transform(test_df_transformed)


Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# Évaluer le modèle

In [0]:
# Compute the F1 score on the test-set predictions by comparing the
# "prediction" column against the indexed label.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="indexed_target",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(predictions)
print(f"F1-Score sur l'ensemble de test: {f1_score}")


F1-Score sur l'ensemble de test: 0.9687088562777277


# Sauvegarder le modèle Random Forest

In [0]:
# Persist the trained random forest. write().overwrite() replaces any model
# already stored at this path, so the cell can be re-run; a plain save()
# raises an error when the target path already exists.
# NOTE(review): only the classifier is saved. It reads the "scaled_features"
# column, so whoever loads it must reproduce the indexing/assembling/scaling
# steps first. Consider persisting the fitted preprocessing model as well.
rf_model.write().overwrite().save("/FileStore/models")