In [None]:
# Data source: UCI AI4I 2020 Predictive Maintenance Dataset
# https://archive.ics.uci.edu/dataset/601/ai4i+2020+predictive+maintenance+dataset

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [None]:
# Location of the raw CSV. The default is an absolute path on the original
# author's machine; set the PREDICTIVE_MAINTENANCE_CSV environment variable to
# point the notebook at the file on any other machine without editing this cell.
file_path = os.environ.get(
    "PREDICTIVE_MAINTENANCE_CSV",
    "/Users/jengjava/Desktop/my-new-repo/ml-fp/data/raw/predictive_maintenance.csv",
)

In [None]:
# Load the raw dataset. read_csv already returns a fresh DataFrame that nothing
# else references, so no defensive .copy() is needed before mutating it later.
df = pd.read_csv(file_path)
df.head(5)

In [None]:
# Show the dtype pandas inferred for every column of the loaded frame.
print(df.dtypes)

In [None]:
# Column groups of the dataset, kept as module-level lists.

# String-valued columns.
categorical_cols = [
    "Type",
    "Product ID",
    "Failure Type",
]

# Numeric sensor / process measurement columns.
numerical_cols = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]


In [None]:
def define_data_columns():
    """Return the categorical and numerical column names of the dataset.

    Returns:
        tuple: ``(categorical_cols, numerical_cols)``, two lists of column
        names. New list objects are created on every call, so callers may
        modify them freely.
    """
    return (
        ["Type", "Product ID", "Failure Type"],
        [
            "Air temperature [K]",
            "Process temperature [K]",
            "Rotational speed [rpm]",
            "Torque [Nm]",
            "Tool wear [min]",
        ],
    )


cat_cols, num_cols = define_data_columns()
print("Categorical Columns:", cat_cols)
print("Numerical Columns:", num_cols)

In [None]:
# Label encoding: add an integer-coded "<column>_encoded" column to df for each
# categorical column, and keep the fitted encoder per column in le_dict so the
# codes can be mapped back to the original labels later.
le_dict = {}

for col in cat_cols:
    le = LabelEncoder()
    encoded_name = f"{col}_encoded"
    df[encoded_name] = le.fit_transform(df[col])
    le_dict[col] = le

encoded_col_names = [f"{name}_encoded" for name in cat_cols]
print("encoded columns:")
print(df[encoded_col_names].head(5))

In [None]:
# Build the model input matrix from the numeric columns plus the encoded
# categorical columns.
#
# "Failure Type" is deliberately excluded: its encoded form is used as the
# multiclass label (y_multiclass), so feeding it in as a feature would leak the
# answer into both the mutual-information ranking and the trained models.
# NOTE(review): "Product ID" looks like a per-item identifier rather than a
# meaningful predictor -- confirm whether it should stay in the feature set.
TARGET_SOURCE_COLS = {"Failure Type"}

feature_cols = numerical_cols + [
    f"{col}_encoded" for col in categorical_cols if col not in TARGET_SOURCE_COLS
]
assert "Failure Type_encoded" not in feature_cols, "target column leaked into features"

features_array = df[feature_cols].values
features_array

In [None]:
# Targets: the binary "Target" column and the label-encoded "Failure Type".
y_binary = df["Target"].values
y_multiclass = df["Failure Type_encoded"].values

# Score every candidate feature against the binary target.
# mutual_info_classif adds small random noise to continuous features, so the
# seed is fixed to keep the scores (and everything derived from them) identical
# across Restart & Run All.
# NOTE(review): the scores are computed on the full dataset, before any
# cross-validation split, so feature selection based on them sees the
# validation labels -- consider moving it inside the CV loop.
mi_scores = mutual_info_classif(features_array, y_binary, random_state=42)

# One row per feature, strongest dependency on the target first.
feature_importance = pd.DataFrame({
    "feature": feature_cols,
    "mi_scores": mi_scores
}).sort_values(by="mi_scores", ascending=False)

print("Feature Importance:")
print(feature_importance)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mutual information score of each feature, in the order the
# rows appear in feature_importance.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(feature_importance["feature"], feature_importance["mi_scores"], color="skyblue")
ax.set_xlabel("Features")
ax.set_ylabel("Mutual Information Score")
ax.set_title("Feature Importance Based on Mutual Information")
# Slant the feature names so long column names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()


In [None]:
# Keep only the features whose mutual information score is strictly above the
# average score of all candidate features.
threshold = mi_scores.mean()

above_threshold = feature_importance["mi_scores"] > threshold
selected_features = feature_importance.loc[above_threshold, "feature"].tolist()

# Feature matrix restricted to the selected columns.
X = df[selected_features].values

# Report which features survived and the resulting matrix shape.
print("Selected Features:", selected_features)
print("Feature matrix shape:", X.shape)
X

In [None]:
# Cross-validation setup: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold prediction buffers with one slot per row of df, zero-initialised.
n_rows = len(df)
binary_predictions = np.zeros(n_rows)
multiclass_predictions = np.zeros(n_rows)

# Collects one binary F1 score per fold.
f1_scores = []


In [None]:

# Start from empty score lists so that re-running this cell does not append
# duplicate fold results to scores left over from a previous run.
f1_scores = []
multiclass_f1_scores = []

# NOTE(review): verify that X contains no column derived from either target
# before trusting the scores printed below.
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    # Split the data into training and validation sets
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y_binary[train_idx], y_binary[val_idx]
    y_train_multi, y_val_multi = y_multiclass[train_idx], y_multiclass[val_idx]

    # --- Binary Classification Model ---
    binary_model = XGBClassifier(random_state=42, eval_metric="logloss")
    binary_model.fit(X_train, y_train)
    binary_pred = binary_model.predict(X_val)
    fold_f1 = f1_score(y_val, binary_pred)
    f1_scores.append(fold_f1)
    print(f"Fold {fold + 1} - Binary F1 Score: {fold_f1:.4f}")
    # Store the out-of-fold predictions at the validation rows' positions.
    binary_predictions[val_idx] = binary_pred

    # --- Multiclass Classification Model ---
    multiclass_model = XGBClassifier(random_state=42, eval_metric="mlogloss")
    multiclass_model.fit(X_train, y_train_multi)
    multiclass_pred = multiclass_model.predict(X_val)
    # Macro-average so every class counts equally; classes that are never
    # predicted in a fold score 0 instead of raising a warning.
    fold_f1_multi = f1_score(
        y_val_multi, multiclass_pred, average="macro", zero_division=0
    )
    multiclass_f1_scores.append(fold_f1_multi)
    print(f"Fold {fold + 1} - Multiclass Macro F1 Score: {fold_f1_multi:.4f}")
    multiclass_predictions[val_idx] = multiclass_pred

# Summarise cross-validated performance across all folds.
print(f"Mean Binary F1 Score: {np.mean(f1_scores):.4f}")
print(f"Mean Multiclass Macro F1 Score: {np.mean(multiclass_f1_scores):.4f}")


In [None]:
# ---------------------------------------------------
# Step 8: Create Submission DataFrame
# ---------------------------------------------------

# Binary predictions were stored as floats; cast them to integers.
predicted_targets = binary_predictions.astype(int)

# Map the integer multiclass predictions back to the original
# "Failure Type" labels with the encoder fitted on that column.
failure_type_encoder = le_dict["Failure Type"]
predicted_failure_types = failure_type_encoder.inverse_transform(
    multiclass_predictions.astype(int)
)

# One row per original record:
# - "UDI": identifier column taken from the original dataset.
# - "Target": integer binary predictions.
# - "Failure_Type": multiclass predictions as original labels.
submission = pd.DataFrame({
    "UDI": df["UDI"],
    "Target": predicted_targets,
    "Failure_Type": predicted_failure_types,
})

# Show the first few rows of the submission frame.
print("Submission preview:")
print(submission.head())
