In [1]:
# **Kernel 1: Load Required Libraries**
import boto3
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

print("✅ Required Libraries Loaded!")



✅ Required Libraries Loaded!


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [3]:
import boto3
import pandas as pd
import numpy as np
import io
from sklearn.preprocessing import LabelEncoder

print("✅ Required Libraries Loaded!")

# **🔹 Kernel 2: Load Data from AWS S3**
bucket_name = "hackathon-predictive-maintenance"

# **File Paths for Training, Sample, and Test Data**
train_files = [
    "belt_1_9_months_negative_data.xlsx", 
    "belt_2_9_months_negative_data.xlsx",
    "belt_3_9_months_negative_data.xlsx",
    "belt_4_9_months_negative_data.xlsx",
    "belt_5_9_months_negative_data.xlsx"
]
test_files = [
    "belt_1_test_data.xlsx",
    "belt_2_test_data.xlsx",
    "belt_3_test_data.xlsx",
    "belt_4_test_data.xlsx",
"belt_5_test_data.xlsx"
]

sample_data_file = "belt_sample_data.xlsx"
sample_result_file = "belt_sample_result_sheet.xlsx"

# **Initialize S3 Client**
s3_client = boto3.client("s3")

# **Load Training Data**
train_dfs = []
for file in train_files:
    obj = s3_client.get_object(Bucket=bucket_name, Key=file)
    df = pd.read_excel(io.BytesIO(obj["Body"].read()))
    train_dfs.append(df)

train_data = pd.concat(train_dfs, ignore_index=True)

# **Load Sample Data**
sample_data = pd.read_excel(io.BytesIO(s3_client.get_object(Bucket=bucket_name, Key=sample_data_file)["Body"].read()))

# **Expected Column Names (Since Test Data Has No Headers)**
expected_columns = [
    "Name", "Timestamp", "Status", "Description", "Vibration Frequency",
    "Vibration Amplitude", "Bearing Temperature", "Motor Temperature",
    "Belt Load", "Torque", "Noise Levels", "Current and Voltage",
    "Hydraulic Pressure", "Belt Thickness", "Roller Condition"
]

# **Load Test Data & Assign Column Names**
test_dfs = []
for file in test_files:
    obj = s3_client.get_object(Bucket=bucket_name, Key=file)
    df = pd.read_excel(io.BytesIO(obj["Body"].read()), header=None)  # No headers
    df.columns = expected_columns  # Assign correct column names
    test_dfs.append(df)

test_data = pd.concat(test_dfs, ignore_index=True)

print("✅ Training, Sample, and Test Data Loaded Successfully!")

# **🔹 Kernel 3: Data Preprocessing**
# **Convert 'Timestamp' Column to Datetime**
for df in [train_data, sample_data, test_data]:
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

# **Custom Encoding for 'Name' (Belt ID)**
name_mapping = {
    "Conveyor Belt 1": 1,
    "Conveyor Belt 2": 2,
    "Conveyor Belt 3": 3,
    "Conveyor Belt 4": 4,
    "Conveyor Belt 5": 5
}
for df in [train_data, sample_data, test_data]:
    df["Name"] = df["Name"].map(name_mapping)

# **Custom Encoding for 'Status'**
status_mapping = {
    "Running": 1,
    "Maintenance": 2,
}
for df in [train_data, sample_data, test_data]:
    df["Status"] = df["Status"].map(status_mapping)

# **Handle Missing Values in 'Description' & Encode it**
label_encoder = LabelEncoder()
for df in [train_data, sample_data, test_data]:
    df["Description"] = df["Description"].astype(str).fillna("Unknown")
    df["Description"] = label_encoder.fit_transform(df["Description"])

# **Extract Time-Based Features**
for df in [train_data, sample_data, test_data]:
    df["Hour"] = df["Timestamp"].dt.hour
    df["Day"] = df["Timestamp"].dt.day
    df["Month"] = df["Timestamp"].dt.month
    df["Weekday"] = df["Timestamp"].dt.weekday

print("✅ Data Preprocessing Complete for Train, Sample, and Test Data!")


✅ Required Libraries Loaded!
✅ Training, Sample, and Test Data Loaded Successfully!
✅ Data Preprocessing Complete for Train, Sample, and Test Data!


In [4]:
# **Kernel 4: Detect Anomalies in Train, Sample, and Test Data**

def detect_anomalies(df):
    """ Flags rows where sensor values exceed expected ranges. """
    return (
        (df["Vibration Frequency"] < 1490) | (df["Vibration Frequency"] > 1510) |
        (df["Vibration Amplitude"] < 0.04) | (df["Vibration Amplitude"] > 0.06) |
        (df["Bearing Temperature"] < 60) | (df["Bearing Temperature"] > 80) |
        (df["Motor Temperature"] < 80) | (df["Motor Temperature"] > 100) |
        (df["Belt Load"] < 1.0) | (df["Belt Load"] > 1.4) |
        (df["Torque"] < 280) | (df["Torque"] > 320) |
        (df["Noise Levels"] < 55) | (df["Noise Levels"] > 65) |
        (df["Current and Voltage"] < 14) | (df["Current and Voltage"] > 16) |
        (df["Hydraulic Pressure"] < 375) | (df["Hydraulic Pressure"] > 385) |
        (df["Belt Thickness"] < 1.5) | (df["Belt Thickness"] > 1.7) |
        (df["Roller Condition"] < 65)
    )

# **Apply Anomaly Detection to All Datasets**
train_data["Anomaly"] = detect_anomalies(train_data)
sample_data["Anomaly"] = detect_anomalies(sample_data)
test_data["Anomaly"] = detect_anomalies(test_data)  # ✅ Added Test Data Anomaly Detection

# **Check the Number of Anomalies Detected**
print(f"✅ Total Anomalies in Training Data: {train_data['Anomaly'].sum()}")
print(f"✅ Total Anomalies in Sample Data: {sample_data['Anomaly'].sum()}")
print(f"✅ Total Anomalies in Test Data: {test_data['Anomaly'].sum()}")


✅ Total Anomalies in Training Data: 10592
✅ Total Anomalies in Sample Data: 53
✅ Total Anomalies in Test Data: 967


In [5]:
import numpy as np

# **Ensure 'Anomaly' Exists and Is Integer**
if "Anomaly" not in train_data.columns:
    raise ValueError("❌ ERROR: 'Anomaly' column is missing. Run anomaly detection first!")

train_data["Anomaly"] = train_data["Anomaly"].astype(int)

# **Step 1: Find First Anomaly Timestamp for Each Belt**
first_anomaly_times = train_data[train_data["Anomaly"] == 1].groupby("Name")["Timestamp"].min()

# **Step 2: Map Each Belt's First Anomaly Timestamp**
train_data["First_Anomaly_Timestamp"] = train_data["Name"].map(first_anomaly_times)

# **Step 3: Compute Time to First Anomaly**
train_data["Time_to_Anomaly"] = (train_data["First_Anomaly_Timestamp"] - train_data["Timestamp"]).dt.total_seconds()

# **Step 4: Drop Unnecessary Columns**
train_data.drop(columns=["First_Anomaly_Timestamp"], inplace=True)

# **Step 5: Remove Rows Without Valid Time_to_Anomaly**
train_data = train_data[train_data["Time_to_Anomaly"].notna()]

print("✅ Time_to_Anomaly Computed Successfully!")
print(train_data[["Name", "Timestamp", "Anomaly", "Time_to_Anomaly"]].head())


✅ Time_to_Anomaly Computed Successfully!
   Name           Timestamp  Anomaly  Time_to_Anomaly
0     4 2025-08-15 00:00:00        0         333900.0
1     4 2025-08-15 00:15:00        0         333000.0
2     4 2025-08-15 00:30:00        0         332100.0
3     4 2025-08-15 00:45:00        0         331200.0
4     4 2025-08-15 01:00:00        0         330300.0


In [6]:
# **Kernel 6: Train XGBoost Model for Predicting Time_to_Anomaly**
# Convert Timestamp to Numeric Feature
train_data["Timestamp_Num"] = train_data["Timestamp"].astype('int64') // 10**9

# Define Features & Target
X = train_data.drop(columns=['Time_to_Anomaly', 'Timestamp'])
y = train_data["Time_to_Anomaly"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate Model
y_pred_test = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_test)

print(f"✅ XGBoost Model Trained! MAE on Test Data: {mae:.2f} seconds")


✅ XGBoost Model Trained! MAE on Test Data: 5814.98 seconds


In [7]:
# **Kernel 7: Predict Anomaly Timestamp for Sample Data**
# Convert Sample Timestamp to Numeric Feature
sample_data["Timestamp_Num"] = sample_data["Timestamp"].astype('int64') // 10**9
X_sample = sample_data.drop(columns=['Timestamp'])

# Predict Time Until First Anomaly
y_pred_sample = xgb_model.predict(X_sample)

# Convert Predicted Seconds into DateTime
predicted_failure_timestamps = sample_data["Timestamp"] + pd.to_timedelta(y_pred_sample, unit="s")

# Create DataFrame with Results
predicted_sample_failures_df = pd.DataFrame({
    "Belt_ID": sample_data["Name"],
    "Predicted Failure Date": predicted_failure_timestamps.dt.strftime("%m/%d/%Y %H:%M")
})

print("✅ Predicted Failure Dates for Sample Data:\n", predicted_sample_failures_df)


✅ Predicted Failure Dates for Sample Data:
      Belt_ID Predicted Failure Date
0          4       08/18/2025 18:46
1          4       08/18/2025 19:03
2          4       08/18/2025 19:11
3          4       08/18/2025 19:29
4          4       08/18/2025 19:39
..       ...                    ...
285        4       08/18/2025 20:39
286        4       08/18/2025 21:00
287        4       08/18/2025 21:17
288        4       08/18/2025 20:01
289        4       08/18/2025 20:02

[290 rows x 2 columns]


In [8]:
# **Kernel 7: Predict Anomaly Timestamp for Test Data**
# Convert Test Timestamp to Numeric Feature
test_data["Timestamp_Num"] = test_data["Timestamp"].astype('int64') // 10**9
X_test = test_data.drop(columns=['Timestamp'])

# **Predict Time Until First Anomaly for Test Data**
y_pred_test = xgb_model.predict(X_test)

# **Convert Predicted Seconds into DateTime**
predicted_failure_timestamps = test_data["Timestamp"] + pd.to_timedelta(y_pred_test, unit="s")

# **Create DataFrame with Results**
predicted_test_failures_df = pd.DataFrame({
    "Belt_ID": test_data["Name"],  # Belt Number
    "Predicted Failure Date": predicted_failure_timestamps.dt.strftime("%m/%d/%Y %H:%M")
})

# **Output Predictions**
print("✅ Predicted Failure Dates for Test Data:\n", predicted_test_failures_df)


✅ Predicted Failure Dates for Test Data:
       Belt_ID Predicted Failure Date
0           4       08/19/2025 08:55
1           4       08/19/2025 09:01
2           4       08/19/2025 09:51
3           4       08/19/2025 09:55
4           4       08/19/2025 10:12
...       ...                    ...
7913        4       09/12/2025 23:54
7914        4       09/13/2025 01:54
7915        4       09/13/2025 02:00
7916        4       09/13/2025 00:28
7917        4       09/13/2025 01:34

[7918 rows x 2 columns]
