Step 1: Add More Models
We'll train and compare the following classifiers:

Random Forest Classifier - Handles non-linear relationships and works well for tabular data.

Logistic Regression (Multinomial) - Simple linear model for comparison.

In [1]:
from pyspark.sql import SparkSession

# Step 1: Initialize Spark Session
# getOrCreate() returns the already-running session if there is one, so this
# cell can be re-run without creating a second session.
spark = (
    SparkSession.builder
    .appName("Multiclass Failure Prediction")
    .getOrCreate()
)

# Load the dataset
# The CSV location can be overridden with the PREDICTIVE_MAINTENANCE_CSV
# environment variable so the notebook is not tied to one machine's directory
# layout. When the variable is unset, the original path is used unchanged.
import os

file_path = os.environ.get(
    "PREDICTIVE_MAINTENANCE_CSV",
    "C:/Users/Zahra/Desktop/ProjectBigDataAnalytics/data/predictive_maintenance_data_all_columns.csv",
)
data = spark.read.csv(file_path, header=True, inferSchema=True)

# Display schema and first few rows
data.printSchema()
data.show(5)

# Count rows and columns
print(f"Total rows: {data.count()}")
print(f"Total columns: {len(data.columns)}")

root
 |-- UDI: integer (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Air temperature [K]: double (nullable = true)
 |-- Process temperature [K]: double (nullable = true)
 |-- Rotational speed [rpm]: integer (nullable = true)
 |-- Torque [Nm]: double (nullable = true)
 |-- Tool wear [min]: integer (nullable = true)
 |-- Target: integer (nullable = true)
 |-- Failure Type: string (nullable = true)
 |-- failure_type_id: double (nullable = true)
 |-- type_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- day_of_week: string (nullable = true)

+---+----------+----+-------------------+-----------------------+----------------------+-----------+---------------+------+------------+---------------+-------+-

In [5]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer

# Step 2.1: Choose the columns this analysis needs
# Identifier and descriptive columns (e.g. 'UDI', 'manufacturer', 'timestamp')
# are left out.
selected_features = ['Type', 'Air temperature [K]', 'Process temperature [K]', 
                     'Rotational speed [rpm]', 'Torque [Nm]', 
                     'Tool wear [min]', 'Failure Type']

# Step 2.2: Drop rows with missing values (or use .fillna() for filling missing data)
# Only the selected columns are checked for nulls, so a missing value in a
# column that is never used (for example 'product_name') does not discard an
# otherwise usable row. data_cleaned still carries every original column.
data_cleaned = data.na.drop(subset=selected_features)
data_selected = data_cleaned.select(selected_features)

# Step 2.3: Convert 'Failure Type' to numerical values for classification
# StringIndexer writes the encoded label to 'Failure_Type_Index' as a double.
indexer = StringIndexer(inputCol="Failure Type", outputCol="Failure_Type_Index")
data_prepared = indexer.fit(data_selected).transform(data_selected)

# Show prepared data
data_prepared.show(5)

+----+-------------------+-----------------------+----------------------+-----------+---------------+------------+------------------+
|Type|Air temperature [K]|Process temperature [K]|Rotational speed [rpm]|Torque [Nm]|Tool wear [min]|Failure Type|Failure_Type_Index|
+----+-------------------+-----------------------+----------------------+-----------+---------------+------------+------------------+
|   M|              298.1|                  308.6|                  1551|       42.8|              0|  No Failure|               0.0|
|   L|              298.2|                  308.7|                  1408|       46.3|              3|  No Failure|               0.0|
|   L|              298.1|                  308.5|                  1498|       49.4|              5|  No Failure|               0.0|
|   L|              298.2|                  308.6|                  1433|       39.5|              7|  No Failure|               0.0|
|   L|              298.2|                  308.7|            

In [6]:
# Step 3: 80/20 train/test split with a fixed seed
train_data, test_data = data_prepared.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each partition (training first, then test).
n_train_rows = train_data.count()
print(f"Training set: {n_train_rows} rows")
n_test_rows = test_data.count()
print(f"Test set: {n_test_rows} rows")

Training set: 8079 rows
Test set: 1921 rows


In [8]:
from pyspark.ml.feature import VectorAssembler
# Step 4.1: Assemble features into a vector
# NOTE(review): 'Type' is selected earlier in the notebook but is not part of
# the feature vector here — confirm whether leaving it out is intended.
feature_columns = ['Air temperature [K]', 'Process temperature [K]', 
                   'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# This cell overwrites train_data/test_data, so a second run would otherwise
# fail because the output column already exists. Dropping it first makes the
# cell safe to re-run; DataFrame.drop() is a no-op when the column is absent.
train_data = assembler.transform(train_data.drop(assembler.getOutputCol()))
test_data = assembler.transform(test_data.drop(assembler.getOutputCol()))

In [10]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression

# Step 1.1: Random Forest Classifier
# A fixed seed makes the forest reproducible across kernel restarts, matching
# the seed already used for the train/test split.
rf = RandomForestClassifier(labelCol="Failure_Type_Index", featuresCol="features", numTrees=50, seed=42)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

# Step 1.2: Gradient-Boosted Trees Classifier
# Left disabled: Spark ML's GBTClassifier supports binary labels only, while
# 'Failure_Type_Index' has more than two classes (presumably why it was
# commented out — confirm with the author).
# gbt = GBTClassifier(labelCol="Failure_Type_Index", featuresCol="features", maxIter=20)
# gbt_model = gbt.fit(train_data)
# gbt_predictions = gbt_model.transform(test_data)

# Step 1.3: Logistic Regression (Multinomial)
lr = LogisticRegression(labelCol="Failure_Type_Index", featuresCol="features", maxIter=10, family="multinomial")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Display predictions from each model (example: Random Forest)
rf_predictions.select("features", "Failure_Type_Index", "prediction").show(5)

+--------------------+------------------+----------+
|            features|Failure_Type_Index|prediction|
+--------------------+------------------+----------+
|[295.6,306.1,1256...|               0.0|       0.0|
|[295.6,306.2,1632...|               0.0|       0.0|
|[295.7,306.2,1458...|               0.0|       0.0|
|[296.2,307.0,1542...|               0.0|       0.0|
|[296.4,307.5,1403...|               0.0|       0.0|
+--------------------+------------------+----------+
only showing top 5 rows



In [14]:
# Step 5: Evaluate the Random Forest predictions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator is configured for accuracy; the other metrics are requested
# per call through a param-map override, so its own setting stays "accuracy".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Failure_Type_Index",
)

accuracy = evaluator.evaluate(rf_predictions)
print(f"Accuracy: {accuracy}")

# Optional: weighted precision and recall, computed in that order
precision, recall = (
    evaluator.evaluate(rf_predictions, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall")
)
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9672045809474232
Precision: 0.9567811903522471
Recall: 0.9672045809474232


In [15]:
# Step 5: Evaluate the Logistic Regression predictions
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy is the evaluator's configured metric; weighted precision and recall
# are obtained by overriding metricName for a single evaluate() call.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Failure_Type_Index",
)

accuracy = evaluator.evaluate(lr_predictions)
print(f"Accuracy: {accuracy}")

# Optional: weighted precision and recall, computed in that order
precision, recall = (
    evaluator.evaluate(lr_predictions, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall")
)
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9729307652264445
Precision: 0.9637389628811835
Recall: 0.9729307652264445


In [17]:
# Generate a confusion matrix
confusion_matrix = lr_predictions.select("Failure_Type_Index", "prediction") \
                              .groupBy("Failure_Type_Index", "prediction") \
                              .count() \
                              .orderBy("Failure_Type_Index", "prediction")

confusion_matrix.show()


+------------------+----------+-----+
|Failure_Type_Index|prediction|count|
+------------------+----------+-----+
|               0.0|       0.0| 1842|
|               0.0|       1.0|    5|
|               0.0|       2.0|    1|
|               1.0|       0.0|   18|
|               1.0|       1.0|    8|
|               2.0|       0.0|    3|
|               2.0|       2.0|   15|
|               2.0|       3.0|    1|
|               3.0|       0.0|   13|
|               3.0|       3.0|    4|
|               4.0|       0.0|    6|
|               5.0|       0.0|    5|
+------------------+----------+-----+

