In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def identify_threshold(df):
    """Return a per-column threshold of mean + 2 * standard deviation.

    Only the numeric columns of ``df`` are considered; the result is whatever
    pandas yields for ``mean() + 2 * std()`` on that numeric subset (one
    entry per numeric column).
    """
    numeric_part = df.select_dtypes(include='number')
    column_means = numeric_part.mean()
    column_stds = numeric_part.std()
    return column_means + 2 * column_stds

def _health_status(major_cluster_share, accuracy):
    """Map the dominant-cluster share and classifier accuracy to a health label."""
    if major_cluster_share >= 0.8:
        return "Good Health"
    if major_cluster_share >= 0.5 or accuracy >= 0.7:
        return "Moderate Health"
    return "Bad Health"


dataset_path = "/content/drive/MyDrive/Assignment_ANTT/Pallet Washing Machine (2) (1).csv"
sheet_names = ['Sheet-1', 'Sheet-2', 'Sheet-3', 'Sheet-4', 'Sheet-5', 'Sheet-6']

# The file is read once and copied per label instead of being re-read from
# disk for every name.
# NOTE(review): a CSV has no sheets, so every entry below holds the same data
# and the six reports are necessarily identical. If the source is really a
# multi-sheet workbook, it presumably needs to be loaded per sheet (e.g. with
# pd.read_excel) -- confirm.
raw_data = pd.read_csv(dataset_path, skiprows=1)
sheets = {sheet_name: raw_data.copy() for sheet_name in sheet_names}

for sheet_name, sheet_data in sheets.items():
    # NOTE(review): computed but not used by anything below -- confirm whether
    # the threshold was meant to feed into the health decision.
    threshold = identify_threshold(sheet_data)

    # NOTE(review): these column names look like data values (a device id and
    # a timestamp), which suggests skiprows=1 turns a data row into the
    # header -- verify against the file.
    sheet_data.drop(columns=['esp32_001', '2022-11-08T09:34:57.8310000Z'], inplace=True)

    # Scale every remaining column into [0, 1] before clustering.
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(sheet_data)

    kmeans = KMeans(n_clusters=3, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)

    silhouette_avg = silhouette_score(scaled_data, clusters)

    # One split serves both models: the regressor and the classifier are
    # trained on the same features with the cluster labels as the target.
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_data, clusters, test_size=0.2, random_state=42
    )

    rf_regressor = RandomForestRegressor(random_state=42)
    rf_regressor.fit(X_train, y_train)

    # RMSE is taken as sqrt(MSE) explicitly; the `squared=False` keyword is
    # no longer accepted by recent scikit-learn releases.
    test_rmse_reg = np.sqrt(mean_squared_error(y_test, rf_regressor.predict(X_test)))

    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)

    y_pred_cls = rf_classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred_cls)
    precision = precision_score(y_test, y_pred_cls, average='weighted')
    recall = recall_score(y_test, y_pred_cls, average='weighted')

    # Share of samples that fall into the most populated cluster.
    cluster_distribution = pd.Series(clusters).value_counts()
    major_cluster = cluster_distribution.idxmax()
    major_cluster_percentage = cluster_distribution[major_cluster] / len(clusters)

    health_status = _health_status(major_cluster_percentage, accuracy)

    print(f"Sheet: {sheet_name}")
    print(f"Silhouette Score: {silhouette_avg}")
    print(f"RMSE (Regression): {test_rmse_reg}")
    print(f"Accuracy (Classification): {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Health Status: {health_status}\n")





Sheet: Sheet-1
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health





Sheet: Sheet-2
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health





Sheet: Sheet-3
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health





Sheet: Sheet-4
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health





Sheet: Sheet-5
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health





Sheet: Sheet-6
Silhouette Score: 0.9514184271908873
RMSE (Regression): 0.0
Accuracy (Classification): 1.0
Precision: 1.0
Recall: 1.0
Health Status: Good Health

