# Detailed Code

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import shap

def preprocess_data(df):
    """Clean raw HVAC meter readings and derive the model's input columns.

    Steps, in order:

    1. Keep rows whose ``Name`` contains ``'HVAC'`` (skipped if no ``Name``).
    2. Coerce ``Active_Energy_Delivered`` to numeric, drop unparseable
       values and keep readings in ``[0, 120)``.
    3. Parse ``Date``, drop unparseable rows and sort by it.
    4. Collapse ``Jumbo_Temp1/2/3`` into ``Jumbo_Temp1`` and
       ``Jumbo_Humidity/Jumbo_Humidity3`` into ``Jumbo_Humidity`` (first
       non-missing value wins, then forward/backward fill).
    5. Add ``Compressor_delta`` = return minus supply water temperature.
    6. Keep rows with ``0 < T2M < 50`` and ``0 < Operating_Hours < 1.1``.
    7. Forward/backward fill whatever is still missing.

    Progress is reported with ``print``.

    Args:
        df: Raw readings. ``Active_Energy_Delivered`` and ``Date`` are
            required; every other column is optional and its step is
            skipped when the column is absent.

    Returns:
        A new, cleaned DataFrame sorted by ``Date``. The input frame is left
        untouched.

    Raises:
        ValueError: If ``Active_Energy_Delivered`` or ``Date`` is missing.
    """
    print("Initial data shape:", df.shape)
    print("Initial columns:", df.columns.tolist())

    # Work on a private copy: the steps below assign columns, and doing that
    # on the caller's frame (or on a filtered slice of it) either mutates the
    # input or triggers pandas' chained-assignment warning.
    df = df.copy()

    # --- Name filtering ---
    if 'Name' in df.columns:
        df['Name'] = df['Name'].replace('AHU1.1', 'AHU.1')
        # astype(str) keeps the filter working when the column holds
        # non-string values (e.g. all-NaN or numeric placeholders).
        is_hvac = df['Name'].astype(str).str.contains('HVAC', na=False)
        df = df[is_hvac].copy()
        print(f"Shape after filtering 'HVAC' names: {df.shape}")
    else:
        print("⚠️ 'Name' column not found, skipping name-based filtering.")

    # --- Target variable check ---
    if 'Active_Energy_Delivered' not in df.columns:
        raise ValueError("Target column 'Active_Energy_Delivered' not found.")
    df['Active_Energy_Delivered'] = pd.to_numeric(df['Active_Energy_Delivered'], errors='coerce')
    df = df.dropna(subset=['Active_Energy_Delivered'])
    in_range = (df['Active_Energy_Delivered'] >= 0) & (df['Active_Energy_Delivered'] < 120)
    df = df[in_range].copy()
    print(f"Shape after filtering target: {df.shape}")

    # --- Date Handling ---
    if 'Date' not in df.columns:
        raise ValueError("Column 'Date' not found in dataset.")
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    invalid = df['Date'].isna().sum()
    df = df.dropna(subset=['Date'])
    print(f"Dropped {invalid} rows due to invalid 'Date'.")
    # Sorting first makes the ffill/bfill steps below follow time order.
    df = df.sort_values(by='Date')

    # --- Jumbo Temperature Imputation ---
    temp_cols = ['Jumbo_Temp1', 'Jumbo_Temp2', 'Jumbo_Temp3']
    if all(c in df.columns for c in temp_cols):
        for col in temp_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        # Prefer sensor 1, fall back to sensor 2, then sensor 3.
        df['Jumbo_Temp1'] = df['Jumbo_Temp1'].fillna(df['Jumbo_Temp2']).fillna(df['Jumbo_Temp3'])
        df = df.drop(columns=['Jumbo_Temp2', 'Jumbo_Temp3'])
        df['Jumbo_Temp1'] = df['Jumbo_Temp1'].ffill().bfill()
        print("✅ Imputed Jumbo_Temp1")
    else:
        print("⚠️ Missing Jumbo_Temp columns, skipped imputation.")

    # --- Jumbo Humidity Imputation ---
    humidity_cols = ['Jumbo_Humidity', 'Jumbo_Humidity3']
    if all(c in df.columns for c in humidity_cols):
        for col in humidity_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df['Jumbo_Humidity'] = df['Jumbo_Humidity'].fillna(df['Jumbo_Humidity3'])
        df = df.drop(columns=['Jumbo_Humidity3'])
        df['Jumbo_Humidity'] = df['Jumbo_Humidity'].ffill().bfill()
        print("✅ Imputed Jumbo_Humidity")
    else:
        print("⚠️ Missing Jumbo_Humidity columns, skipped imputation.")

    # --- Feature Engineering ---
    if {'Avg_Return_Water_Temp', 'Avg_Supply_water_Temp'} <= set(df.columns):
        # Store the coerced values so the columns themselves are numeric too,
        # not just the derived delta.
        df['Avg_Return_Water_Temp'] = pd.to_numeric(df['Avg_Return_Water_Temp'], errors='coerce')
        df['Avg_Supply_water_Temp'] = pd.to_numeric(df['Avg_Supply_water_Temp'], errors='coerce')
        df['Compressor_delta'] = df['Avg_Return_Water_Temp'] - df['Avg_Supply_water_Temp']
        print("✅ Created 'Compressor_delta'")

    # --- Range Filters ---
    # Coerce before comparing: text-typed columns would otherwise raise
    # TypeError. Unparseable values become NaN and fail both comparisons,
    # so those rows are dropped.
    if 'T2M' in df.columns:
        df['T2M'] = pd.to_numeric(df['T2M'], errors='coerce')
        df = df[(df['T2M'] > 0) & (df['T2M'] < 50)].copy()
    if 'Operating_Hours' in df.columns:
        df['Operating_Hours'] = pd.to_numeric(df['Operating_Hours'], errors='coerce')
        df = df[(df['Operating_Hours'] > 0) & (df['Operating_Hours'] < 1.1)]

    # --- Final Fill ---
    df = df.ffill().bfill()

    print("✅ Preprocessing complete.")
    print(f"Final cleaned data shape: {df.shape}")
    return df


def load_and_prepare(path):
    """Load the raw dataset, clean it and attach the encoded machine id.

    Args:
        path: Location of the dataset (``str`` or path-like). Files ending in
            ``.xlsx``/``.xls`` (any letter case) are read with
            ``pd.read_excel``; anything else is read as UTF-8 CSV with
            ``Date`` parsed as a date column.

    Returns:
        Tuple ``(df, features, le)``: the cleaned frame with an added
        ``Machine_Encoded`` column, the list of feature column names used by
        the model, and the fitted ``LabelEncoder`` for ``Name``.

    Raises:
        KeyError: If the cleaned data has no ``Name`` column.
        ValueError: Propagated from ``preprocess_data`` when a required
            column is missing.
    """
    # Auto-detect Excel or CSV (case-insensitive, accepts pathlib.Path)
    is_excel = str(path).lower().endswith((".xlsx", ".xls"))
    if is_excel:
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore", parse_dates=["Date"])

    # Preprocess BEFORE any blanket fill. Filling with 0 first would turn
    # missing sensor/target/date cells into literal zeros, so the NaN-based
    # imputation and dropna steps in preprocess_data would never fire.
    df = preprocess_data(df)

    if is_excel:
        # Residual fill for anything preprocessing could not impute (e.g. a
        # completely empty column), so Excel input still reaches the model
        # without NaN.
        df = df.fillna(0)

    # Encode Machine_Name (RandomForest cannot take text)
    le = LabelEncoder()
    df["Machine_Encoded"] = le.fit_transform(df["Name"].astype(str))

    # Independent variables
    features = [
        "Machine_Encoded", "T2M", "Operating_Hours", "Jumbo_Temp1", "Jumbo_Humidity",
        "Average_Voltage_Line_to_Line", "Average_Voltage_Line_to_Neutral",
        "Avg_Supply_water_Temp", "Avg_Return_Water_Temp", "Compressor_delta",
        "1st_Shift", "2nd_Shift", "common", "General", "hour", "Month", "Day"
    ]

    return df, features, le


# ---------------------------------------------
# 2️⃣ Split Train/Test by Date
# ---------------------------------------------
def split_by_date(df, test_days=30):
    """Split chronologically: the last ``test_days`` days become the test set.

    The cut-off is ``max(Date) - test_days``. Rows strictly before it go to
    train; rows on or after it go to test. Rows whose ``Date`` cannot be
    parsed land in neither set.

    Args:
        df: Frame with a ``Date`` column (datetime, or anything
            ``pd.to_datetime`` can parse).
        test_days: Length of the hold-out window in days. Defaults to 30.

    Returns:
        Tuple ``(train, test)`` of row subsets of ``df``.
    """
    # Coerce locally so text/object-typed dates do not break the arithmetic
    # below; the caller's column is left as-is.
    dates = pd.to_datetime(df["Date"], errors="coerce")
    test_start = dates.max() - pd.Timedelta(days=test_days)
    train = df[dates < test_start]
    test = df[dates >= test_start]
    return train, test


# ---------------------------------------------
# 3️⃣ Evaluate Model
# ---------------------------------------------
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted regressor on the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict``; its ``oob_score_``
            attribute is reported when present.
        X_train, y_train: Training features and target.
        X_test, y_test: Hold-out features and target.

    Returns:
        Tuple ``(metrics, y_pred_train, y_pred_test)``. ``metrics`` maps
        ``"Train"`` and ``"Test"`` to ``{"MSE", "RMSE", "R2"}`` and ``"OOB"``
        to ``{"R2": oob_score_ or None}``.
    """
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    def _scores(actual, predicted):
        # RMSE is derived from MSE so both always agree.
        mse = mean_squared_error(actual, predicted)
        return {
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2": r2_score(actual, predicted),
        }

    report = {
        "Train": _scores(y_train, train_pred),
        "Test": _scores(y_test, test_pred),
        "OOB": {"R2": getattr(model, "oob_score_", None)},
    }
    return report, train_pred, test_pred


# ---------------------------------------------
# 4️⃣ SHAP: Top N Features per Prediction
# ---------------------------------------------
def shap_feature_importance(model, X, top_n=5):
    """Name the most influential features for each row, ranked by |SHAP|.

    Args:
        model: Fitted tree-based model accepted by ``shap.TreeExplainer``.
        X: Feature DataFrame to explain; its column labels are used as the
            feature names.
        top_n: How many features to report per row. Values larger than the
            number of columns in ``X`` are capped at that number.

    Returns:
        DataFrame with one row per row of ``X`` and columns
        ``Top_Feature_1`` .. ``Top_Feature_k`` (``k = min(top_n, n_features)``),
        ordered from largest to smallest absolute SHAP value.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    feature_names = X.columns

    # Cap at the number of available features; otherwise each row yields
    # fewer names than there are column headers and the DataFrame
    # constructor raises on the mismatch.
    k = min(top_n, len(feature_names))

    # Extract top k impactful features for each row
    top_features = [
        [feature_names[i] for i in np.argsort(row)[::-1][:k]]
        for row in np.abs(shap_values)
    ]

    return pd.DataFrame(top_features, columns=[f"Top_Feature_{i+1}" for i in range(k)])


# ---------------------------------------------
# 5️⃣ Complete Pipeline
# ---------------------------------------------
def hvac_pipeline(data_path, output_path=None):
    """Train, evaluate and explain the HVAC energy model, then export results.

    Loads and cleans the data, holds out the most recent window as the test
    set, fits a ``RandomForestRegressor`` on ``Active_Energy_Delivered``,
    scores it, attaches the top-5 SHAP features per test row, tags anomalies
    and writes the combined test table to Excel.

    Args:
        data_path: Dataset location, passed to ``load_and_prepare``.
        output_path: Where to write the Excel report. Defaults to the
            project's original hard-coded location.

    Returns:
        Tuple ``(metrics, test_result)``: the metric dict from
        ``evaluate_model`` and the exported test DataFrame.

    Raises:
        KeyError: If any model feature column is missing after loading.
        ValueError: If the train or test split is empty.
    """
    if output_path is None:
        output_path = "D:/Study/Projects/TATA_TOP_AND_HVAC_Projects/HVAC_Project/test_predictions_with_shap.xlsx"

    target = "Active_Energy_Delivered"

    # Load and prepare
    df, features, _ = load_and_prepare(data_path)

    # Fail early with a readable message instead of deep inside indexing.
    missing = [col for col in features if col not in df.columns]
    if missing:
        raise KeyError(f"Feature columns missing from prepared data: {missing}")

    train, test = split_by_date(df)
    if train.empty or test.empty:
        raise ValueError(
            "Date-based split produced an empty train or test set "
            f"(train={len(train)} rows, test={len(test)} rows); "
            "the data must span more than the hold-out window."
        )

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    # Train model. random_state is deliberately not set here, so repeated
    # runs can differ slightly; other hyperparameters use library defaults.
    model = RandomForestRegressor(
        n_estimators=80,
        oob_score=True,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Evaluate
    metrics, _, y_pred_test = evaluate_model(model, X_train, y_train, X_test, y_test)

    # SHAP explainability
    top_features_df = shap_feature_importance(model, X_test, top_n=5)

    # Merge all data with predictions
    test_result = test.copy()
    test_result["Predicted_Power"] = y_pred_test
    # Signed percentage error relative to the actual reading:
    # negative => actual above prediction, positive => actual below it.
    # Rows with an actual reading of 0 yield inf/NaN here.
    test_result["Error_%"] = ((test_result["Predicted_Power"] - test_result[target])
                              / test_result[target]) * 100

    # Add anomaly tag (spike detection). A reading that EXCEEDS the
    # prediction by more than the threshold has a negative Error_% and is
    # overconsumption; the labels were previously attached to the opposite
    # signs.
    error_pct = test_result["Error_%"]
    test_result["Anomaly_Flag"] = np.where(
        np.abs(error_pct) > 20,
        np.where(error_pct < 0, "Overconsumption", "Underconsumption"),
        "Normal"
    )

    # Append top features (SHAP); reset the index so rows align positionally
    # with the 0..n-1 index of top_features_df.
    test_result = pd.concat([test_result.reset_index(drop=True), top_features_df], axis=1)

    # Save final combined results
    test_result.to_excel(output_path, index=False)

    print("\n✅ Test predictions with SHAP and full columns saved to:")
    print(output_path)

    return metrics, test_result


# ---------------------------------------------
# 6️⃣ Run the Pipeline
# ---------------------------------------------
if __name__ == "__main__":
    # NOTE: machine-specific absolute path; adjust before running elsewhere.
    data_path = "D:/Study/Projects/TATA_TOP_AND_HVAC_Projects/HVAC_Project/HVAC_data.xlsx"
    metrics, test_result = hvac_pipeline(data_path)

    # Header emoji restored from mis-decoded (mojibake) bytes.
    print("\n📊 Evaluation Metrics:")
    # Transpose so each split (Train/Test/OOB) is a row and each metric a column.
    print(pd.DataFrame(metrics).T)

Initial data shape: (317845, 32)
Initial columns: ['Time-Hourly', 'Name', 'Active_Energy_Delivered', 'Operating_Hours', 'Average_Voltage_Line_to_Neutral', 'Average_Voltage_Line_to_Line', 'Jumbo_Humidity3', 'Jumbo_Temp2', 'Jumbo_Temp1', 'Jumbo_Temp3', 'Jumbo_Humidity', 'T2M', 'Date_Time', 'Avg_Return_Water_Temp', 'Avg_Supply_water_Temp', 'TimeStamp', 'Year', 'Month', 'Day', 'hour', 'Date', 'Start Date', '1st_Shift', '2nd_Shift', '3rd_Shift', 'common', 'General', 'Left_Right_Name', 'Score', 'Right_Date_Time', 'Right_3rd_Shift', 'Result']
Shape after filtering 'HVAC' names: (109750, 32)
Shape after filtering target: (98663, 32)
Dropped 0 rows due to invalid 'Date'.
✅ Imputed Jumbo_Temp1
✅ Imputed Jumbo_Humidity
✅ Created 'Compressor_delta'
✅ Preprocessing complete.
Final cleaned data shape: (19011, 30)

✅ Test predictions with SHAP and full columns saved to:
D:/Study/Projects/TATA_TOP_AND_HVAC_Projects/HVAC_Project/test_predictions_with_shap.xlsx

📊 Evaluation Metrics:
      

## All testing

## Old Code

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import shap

# ---------------------------------------------
# 1️⃣ Load & Prepare Data
# ---------------------------------------------
def load_and_prepare(path):
    """Load the dataset and attach the encoded machine id.

    No cleaning is done here beyond replacing missing Excel cells with 0.

    Args:
        path: Location of the dataset (``str`` or path-like). Files ending in
            ``.xlsx``/``.xls`` (any letter case) are read with
            ``pd.read_excel``; anything else is read as UTF-8 CSV with
            ``Date`` parsed as a date column.

    Returns:
        Tuple ``(df, features, le)``: the loaded frame with an added
        ``Machine_Encoded`` column, the list of feature column names used by
        the model, and the fitted ``LabelEncoder`` for ``Name``.

    Raises:
        KeyError: If the data has no ``Name`` column.
    """
    # Auto-detect Excel or CSV (case-insensitive, accepts pathlib.Path)
    if str(path).lower().endswith((".xlsx", ".xls")):
        df = pd.read_excel(path)
        df = df.fillna(0)
    else:
        df = pd.read_csv(path, encoding="utf-8", encoding_errors="ignore", parse_dates=["Date"])

    # Encode Machine_Name (RandomForest cannot take text).
    # astype(str): after fillna(0) the column can mix int 0 with strings,
    # which LabelEncoder refuses to sort.
    le = LabelEncoder()
    df["Machine_Encoded"] = le.fit_transform(df["Name"].astype(str))

    # Independent variables
    features = [
        "Machine_Encoded", "T2M", "Operating_Hours", "Jumbo_Temp1", "Jumbo_Humidity",
        "Average_Voltage_Line_to_Line", "Average_Voltage_Line_to_Neutral",
        "Avg_Supply_water_Temp", "Avg_Return_Water_Temp", "Compressor_delta",
        "1st_Shift", "2nd_Shift", "common", "General", "hour", "Month", "Day"
    ]

    return df, features, le


# ---------------------------------------------
# 2️⃣ Split Train/Test by Date
# ---------------------------------------------
def split_by_date(df, test_days=30):
    """Split chronologically: the last ``test_days`` days become the test set.

    The cut-off is ``max(Date) - test_days``. Rows strictly before it go to
    train; rows on or after it go to test. Rows whose ``Date`` cannot be
    parsed land in neither set.

    Args:
        df: Frame with a ``Date`` column (datetime, or anything
            ``pd.to_datetime`` can parse).
        test_days: Length of the hold-out window in days. Defaults to 30.

    Returns:
        Tuple ``(train, test)`` of row subsets of ``df``.
    """
    # Coerce locally so text/object-typed dates do not break the arithmetic
    # below; the caller's column is left as-is.
    dates = pd.to_datetime(df["Date"], errors="coerce")
    test_start = dates.max() - pd.Timedelta(days=test_days)
    train = df[dates < test_start]
    test = df[dates >= test_start]
    return train, test


# ---------------------------------------------
# 3️⃣ Evaluate Model
# ---------------------------------------------
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Compute MSE, RMSE and R2 for both splits, plus the model's OOB R2.

    Args:
        model: Fitted estimator exposing ``predict``; ``oob_score_`` is
            reported when the attribute exists, otherwise ``None``.
        X_train, y_train: Training features and target.
        X_test, y_test: Hold-out features and target.

    Returns:
        Tuple ``(metrics, y_pred_train, y_pred_test)`` where ``metrics`` has
        the keys ``"Train"``, ``"Test"`` and ``"OOB"``.
    """
    predictions = {
        "Train": model.predict(X_train),
        "Test": model.predict(X_test),
    }
    targets = {"Train": y_train, "Test": y_test}

    metrics = {}
    for split, y_hat in predictions.items():
        squared_error = mean_squared_error(targets[split], y_hat)
        metrics[split] = {
            "MSE": squared_error,
            "RMSE": np.sqrt(squared_error),
            "R2": r2_score(targets[split], y_hat),
        }

    # Out-of-bag score only exists when the forest was built with oob_score=True.
    metrics["OOB"] = {"R2": getattr(model, "oob_score_", None)}

    return metrics, predictions["Train"], predictions["Test"]


# ---------------------------------------------
# 4️⃣ SHAP: Top N Features per Prediction
# ---------------------------------------------
def shap_feature_importance(model, X, top_n=5):
    """Name the most influential features for each row, ranked by |SHAP|.

    Args:
        model: Fitted tree-based model accepted by ``shap.TreeExplainer``.
        X: Feature DataFrame to explain; its column labels are used as the
            feature names.
        top_n: How many features to report per row. Values larger than the
            number of columns in ``X`` are capped at that number.

    Returns:
        DataFrame with one row per row of ``X`` and columns
        ``Top_Feature_1`` .. ``Top_Feature_k`` (``k = min(top_n, n_features)``),
        ordered from largest to smallest absolute SHAP value.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    feature_names = X.columns

    # Cap at the number of available features; otherwise each row yields
    # fewer names than there are column headers and the DataFrame
    # constructor raises on the mismatch.
    k = min(top_n, len(feature_names))

    # Extract top k impactful features for each row
    top_features = [
        [feature_names[i] for i in np.argsort(row)[::-1][:k]]
        for row in np.abs(shap_values)
    ]

    return pd.DataFrame(top_features, columns=[f"Top_Feature_{i+1}" for i in range(k)])


# ---------------------------------------------
# 5️⃣ Complete Pipeline
# ---------------------------------------------
def hvac_pipeline(data_path, output_path=None):
    """Train, evaluate and explain the HVAC energy model, then export results.

    Loads the data, holds out the most recent window as the test set, fits a
    ``RandomForestRegressor`` on ``Active_Energy_Delivered``, scores it,
    attaches the top-5 SHAP features per test row, tags anomalies and writes
    the combined test table to Excel.

    Args:
        data_path: Dataset location, passed to ``load_and_prepare``.
        output_path: Where to write the Excel report. Defaults to the
            project's original hard-coded location.

    Returns:
        Tuple ``(metrics, test_result)``: the metric dict from
        ``evaluate_model`` and the exported test DataFrame.

    Raises:
        KeyError: If any model feature column is missing after loading.
        ValueError: If the train or test split is empty.
    """
    if output_path is None:
        output_path = "D:/Study/Projects/TATA_TOP_AND_HVAC_Projects/HVAC_Project/test_predictions_with_shap.xlsx"

    target = "Active_Energy_Delivered"

    # Load and prepare
    df, features, _ = load_and_prepare(data_path)

    # Fail early with a readable message instead of deep inside indexing.
    missing = [col for col in features if col not in df.columns]
    if missing:
        raise KeyError(f"Feature columns missing from prepared data: {missing}")

    train, test = split_by_date(df)
    if train.empty or test.empty:
        raise ValueError(
            "Date-based split produced an empty train or test set "
            f"(train={len(train)} rows, test={len(test)} rows); "
            "the data must span more than the hold-out window."
        )

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    # Train model (seeded, so repeated runs give the same forest)
    model = RandomForestRegressor(
        n_estimators=100,
        min_samples_leaf=10,
        oob_score=True,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Evaluate
    metrics, _, y_pred_test = evaluate_model(model, X_train, y_train, X_test, y_test)

    # SHAP explainability
    top_features_df = shap_feature_importance(model, X_test, top_n=5)

    # Merge all data with predictions
    test_result = test.copy()
    test_result["Predicted_Power"] = y_pred_test
    # Signed percentage error relative to the actual reading:
    # negative => actual above prediction, positive => actual below it.
    # Rows with an actual reading of 0 yield inf/NaN here.
    test_result["Error_%"] = ((test_result["Predicted_Power"] - test_result[target])
                              / test_result[target]) * 100

    # Add anomaly tag (spike detection). A reading that EXCEEDS the
    # prediction by more than the threshold has a negative Error_% and is
    # overconsumption; the labels were previously attached to the opposite
    # signs.
    error_pct = test_result["Error_%"]
    test_result["Anomaly_Flag"] = np.where(
        np.abs(error_pct) > 20,
        np.where(error_pct < 0, "Overconsumption", "Underconsumption"),
        "Normal"
    )

    # Append top features (SHAP); reset the index so rows align positionally
    # with the 0..n-1 index of top_features_df.
    test_result = pd.concat([test_result.reset_index(drop=True), top_features_df], axis=1)

    # Save final combined results
    test_result.to_excel(output_path, index=False)

    print("\n✅ Test predictions with SHAP and full columns saved to:")
    print(output_path)

    return metrics, test_result


# ---------------------------------------------
# 6️⃣ Run the Pipeline
# ---------------------------------------------
if __name__ == "__main__":
    # NOTE: machine-specific absolute path; adjust before running elsewhere.
    data_path = "D:/Study/Projects/TATA_TOP_AND_HVAC_Projects/HVAC_Project/preprocessed_data.xlsx"
    metrics, test_result = hvac_pipeline(data_path)

    # Header emoji restored from mis-decoded (mojibake) bytes.
    print("\n📊 Evaluation Metrics:")
    # Transpose so each split (Train/Test/OOB) is a row and each metric a column.
    print(pd.DataFrame(metrics).T)


In [None]:
# print("Initial data shape:", df.shape)
# print("Initial columns:", df.columns.tolist())
# print("Initial data types:\n", df.dtypes)
# # --- Data Preprocessing ---
# print("\nStarting Data Preprocessing...")
# print("First, we will look on HVAC(Chiller)")
# # Rename inconsistent names
# if 'Name' in df.columns:
#     df['Name'] = df['Name'].replace('AHU1.1', 'AHU.1')
#     # Filter for relevant names (Still useful to filter rows, even if 'Name' isn't used as a feature)
#     data = df[df['Name'].str.contains('HVAC', na=False)]
#     print(f"Shape after filtering Name: {data.shape}")
# else:
#     print("Warning: 'Name' column not found for filtering.")
#
#
# # Filter target variable range
# if 'Active_Energy_Delivered' in df.columns:
#     # Ensure target is numeric before filtering
#     df['Active_Energy_Delivered'] = pd.to_numeric(
#         df['Active_Energy_Delivered'], errors='coerce')
#     # Drop rows where conversion failed
#     df.dropna(subset=['Active_Energy_Delivered'], inplace=True)
#     df = df[(df['Active_Energy_Delivered'] >= 0) &
#                 (df['Active_Energy_Delivered'] < 120)]  # Adjust upper limit if needed
#     print(f"Shape after filtering Active_Energy_Delivered: {df.shape}")
# else:
#     print("Error: Target column 'Active_Energy_Delivered' not found. Exiting.")
#     exit()
#
# # Convert Date and handle errors
# if 'Date' in df.columns:
#     df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
#     initial_rows = df.shape[0]
#     df = df.dropna(subset=['Date'])
#     print(f"Dropped {initial_rows - df.shape[0]} rows due to invalid Date format.")
#     print(f"Shape after handling Date: {df.shape}")
#     df = df.sort_values(by='Date')
# else:
#     print("Error: 'Date' column not found for train/test split. Exiting.")
#     exit()
#
# temp_cols_exist = all(col in df.columns for col in ['Jumbo_Temp1', 'Jumbo_Temp2', 'Jumbo_Temp3'])
# humidity_cols_exist = all(col in df.columns for col in ['Jumbo_Humidity', 'Jumbo_Humidity3'])
#
# if temp_cols_exist:
#     print("Imputing Jumbo Temperatures...")
#     # Convert to numeric first
#     for col in ['Jumbo_Temp1', 'Jumbo_Temp2', 'Jumbo_Temp3']:
#         df[col] = pd.to_numeric(df[col], errors='coerce')
#     df['Jumbo_Temp1'] = df['Jumbo_Temp1'].fillna(df['Jumbo_Temp2']).fillna(df['Jumbo_Temp3'])
# else:
#     print("Warning: One or more Jumbo Temperature columns missing, skipping imputation.")
#
# if humidity_cols_exist:
#     print("Imputing Jumbo Humidity...")
#     # Convert to numeric first
#     for col in ['Jumbo_Humidity', 'Jumbo_Humidity3']:
#         df[col] = pd.to_numeric(df[col], errors='coerce')
#     df['Jumbo_Humidity'] = df['Jumbo_Humidity'].fillna(df['Jumbo_Humidity3'])
# else:
#     print("Warning: One or more Jumbo Humidity columns missing, skipping imputation.")
#
# # Drop redundant columns if they exist AND imputation was attempted
# cols_to_drop = []
# if temp_cols_exist:
#     cols_to_drop.extend(['Jumbo_Temp2', 'Jumbo_Temp3'])
# if humidity_cols_exist:
#     cols_to_drop.append('Jumbo_Humidity3')
#
# df.drop(columns=[col for col in cols_to_drop if col in df.columns], inplace=True)
# print(f"Dropped redundant columns: {[col for col in cols_to_drop if col in df.columns]}")
#
#
# # Forward/Backward fill remaining NaNs for imputed columns
# if 'Jumbo_Temp1' in df.columns:
#     initial_nan = df['Jumbo_Temp1'].isna().sum()
#     data['Jumbo_Temp1'] = df['Jumbo_Temp1'].ffill().bfill()
#     print(f"Filled {initial_nan - data['Jumbo_Temp1'].isna().sum()} NaNs in Jumbo_Temp1.")
# if 'Jumbo_Humidity' in df.columns:
#     initial_nan = df['Jumbo_Humidity'].isna().sum()
#     df['Jumbo_Humidity'] = df['Jumbo_Humidity'].ffill().bfill()
#     print(f"Filled {initial_nan - data['Jumbo_Humidity'].isna().sum()} NaNs in Jumbo_Humidity.")
#
# # Filter based on T2M and Operating_Hours (Check column existence and convert to numeric)
# if 'T2M' in df.columns:
#    df = df[(df['T2M'] > 0) & (df['T2M'] < 50)]
#    print(f"Shape after filtering T2M: {df.shape}")
# if 'Operating_Hours' in df.columns:
#    df = df[(df['Operating_Hours'] > 0) &
#                (df['Operating_Hours'] < 1.1)]
#    print(f"Shape after filtering Operating_Hours: {df.shape}")
# else:
#     print("Warning: 'Operating_Hours' column not found for filtering.")
#
# if 'Avg_Return_Water_Temp' in df.columns and 'Avg_Supply_water_Temp' in df.columns:
#    # Ensure columns are numeric before subtraction
#    df['Avg_Return_Water_Temp'] = pd.to_numeric(df['Avg_Return_Water_Temp'], errors='coerce')
#    df['Avg_Supply_water_Temp'] = pd.to_numeric(df['Avg_Supply_water_Temp'], errors='coerce')
#    df['Compressor_delta'] = df['Avg_Return_Water_Temp'] - df['Avg_Supply_water_Temp']
#    print("Compressor_delta created.")
# df.isnull().sum()
# final_columns = [
#    'Name', 'T2M', 'Operating_Hours', 'Active_Energy_Delivered','Jumbo_Temp1','Jumbo_Humidity','Average_Voltage_Line_to_Line',
#    'Year', 'Month', 'Day', 'Date', 'hour', 'Avg_Supply_water_Temp','Current_Phase_Average', 'Average_Voltage_Line_to_Neutral',
#    'Avg_Return_Water_Temp', 'Compressor_delta', '1st_Shift', '2nd_Shift', 'common', 'General'
# ]
