In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Function to preprocess datasets
def preprocess_data(file_path, target_columns):
    """Load an Excel sheet and split it into numeric features and targets.

    The first data row of the sheet is promoted to the column header and then
    dropped. Every cell is converted to a number (unparseable cells become
    NaN), and each column's NaNs are replaced by that column's own mean.

    Imputation is done per column *position*, not per column label, so sheets
    with repeated column names are handled correctly: each duplicate column is
    filled with its own mean rather than the mean of the first column sharing
    its name.

    Note: the means are computed from the file being read, so a test file is
    imputed with its own statistics, not the training file's.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Excel workbook, passed to ``pd.read_excel``.
    target_columns : list of str
        Column labels to return as targets; all other columns are features.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns (everything except ``target_columns``).
    y : pandas.DataFrame
        Target columns, in the order given by ``target_columns``.

    Raises
    ------
    KeyError
        If any of ``target_columns`` is absent from the sheet's header row.
    """
    raw = pd.read_excel(file_path)
    header = raw.iloc[0]
    data = raw.iloc[1:].copy()  # Remove the header row from data
    data.columns = header
    data = data.apply(pd.to_numeric, errors='coerce')

    # Impute column by column using positions so duplicate labels cannot
    # cross-contaminate each other's fill values.
    imputed_columns = []
    for position in range(data.shape[1]):
        column = data.iloc[:, position]
        column_mean = column.mean()
        if pd.notna(column_mean):
            column = column.fillna(column_mean)
        imputed_columns.append(column)
    if imputed_columns:
        column_labels = data.columns
        data = pd.concat(imputed_columns, axis=1)
        data.columns = column_labels

    missing_targets = [name for name in target_columns if name not in data.columns]
    if missing_targets:
        raise KeyError(f"Target columns not found in {file_path!r}: {missing_targets}")

    X = data.drop(target_columns, axis=1)
    y = data[target_columns]
    return X, y


In [3]:
# Function to visualize distributions
def visualize_distributions(X_train, X_test, feature_names):
    """Overlay train and test histograms, one figure per feature.

    For every name in ``feature_names`` a new 8x4 figure is opened, the
    values of ``X_train[name]`` and ``X_test[name]`` are drawn as
    semi-transparent 30-bin histograms, and the figure is shown.

    Parameters
    ----------
    X_train, X_test : indexable by feature name
        Containers supporting ``obj[feature]`` lookups.
    feature_names : iterable
        Names of the features to plot.
    """
    import matplotlib.pyplot as plt

    labelled_splits = (("Train", X_train), ("Test", X_test))
    for feature in feature_names:
        plt.figure(figsize=(8, 4))
        for split_label, split_data in labelled_splits:
            plt.hist(split_data[feature], bins=30, alpha=0.5, label=split_label)
        plt.title(f"Feature Distribution: {feature}")
        plt.legend()
        plt.show()

In [4]:
# Prediction targets shared by the training and testing phases
target_columns = [
    'Cloud_Throughput',
    'Total_Energy_Consumption',
    'Total_Exec_Time',
]

# File paths
training_file = '/kaggle/input/heterogenous-dataset/Training Dataset.xlsx'
testing_file = '/kaggle/input/heterogenous-dataset/Testing Dataset.xlsx'


In [5]:
# ========================= TRAINING PHASE =========================
# Targets are kept under distinct names for each stage (raw / scaled) so the
# cell never rebinds `y_train` to values in a different unit system.
X_train_unscaled, y_train_raw = preprocess_data(training_file, target_columns)
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train = scaler_X.fit_transform(X_train_unscaled)
y_train_scaled = scaler_y.fit_transform(y_train_raw)

model_rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)
# The forest is fit on standardized targets so no single target dominates
# the multi-output split criterion because of its scale.
model_rf.fit(X_train, y_train_scaled)

# Feature Importance
feature_importances = pd.DataFrame(
    {'Feature': X_train_unscaled.columns, 'Importance': model_rf.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importances:\n", feature_importances)

# Training Metrics (reported in the targets' original units)
y_train = y_train_raw.to_numpy(dtype=float)
y_train_pred_rf = scaler_y.inverse_transform(model_rf.predict(X_train))

train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
train_mae_rf = mean_absolute_error(y_train, y_train_pred_rf)
print("\nRandom Forest Training Metrics:")
print(f"RMSE: {train_rmse_rf:.4f}, MAE: {train_mae_rf:.4f}")

# Cross-Validation
# Wrap the estimator so every fold standardizes its own training targets and
# predictions are mapped back to original units before scoring. This makes the
# cross-validated forests match how `model_rf` itself was trained, while the
# reported RMSE stays comparable to the training RMSE above.
cv_model = TransformedTargetRegressor(regressor=model_rf, transformer=StandardScaler())
cv_mse_scores = cross_val_score(cv_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print("\nCross-Validation Metrics:")
print(f"RMSE: {np.sqrt(-cv_mse_scores.mean()):.4f}")

# Only the model is persisted; `scaler_X` and `scaler_y` live in kernel memory.
joblib.dump(model_rf, 'random_forest_model_tuned.pkl')
print("Model saved successfully!")




Feature Importances:
                    Feature  Importance
32  Edge5_D2a_Task_Size_Kb    0.156413
33   Edge5_D3_Task_Size_Kb    0.129957
35  Edge5_D2b_Task_Size_Kb    0.118701
26   Edge3_D3_Task_Size_Kb    0.110273
29   Edge4_D3_Task_Size_Kb    0.109738
25   Edge3_D2_Task_Size_Kb    0.076960
28   Edge4_D2_Task_Size_Kb    0.073062
23   Edge2_D2_Task_Size_Kb    0.070814
36   Device_to_SBS_BW_Mbps    0.036222
12        D1a_Task_Size_Kb    0.013640
15        D1b_Task_Size_Kb    0.010400
10         D2_Task_Size_Kb    0.010015
18         D3_Task_Size_Kb    0.009377
8          D2_Task_Size_Kb    0.008116
6          D1_Task_Size_Kb    0.007785
14         D3_Task_Size_Kb    0.007470
37       SBS_to_ES_BW_Mbps    0.007093
9          D1_Task_Size_Kb    0.006824
19        D1b_Task_Size_Kb    0.006270
16        D1a_Task_Size_Kb    0.005950
20        D2a_Task_Size_Kb    0.005869
7          D1_Task_Size_Kb    0.005585
11         D3_Task_Size_Kb    0.004882
17        D2a_Task_Size_Kb    0.004709
13

In [6]:
# ========================= TESTING PHASE =========================
X_test_unscaled, y_test_raw = preprocess_data(testing_file, target_columns)
X_test = scaler_X.transform(X_test_unscaled)
# Ground truth is compared in original units, so it is used as loaded instead
# of being scaled and immediately inverse-scaled (which only adds round-off).
y_test = y_test_raw.to_numpy(dtype=float)

# NOTE: joblib.load unpickles the file; only load artifacts written by this notebook.
model_rf = joblib.load('random_forest_model_tuned.pkl')
# The model predicts standardized targets; map them back to original units.
y_test_pred_rf = scaler_y.inverse_transform(model_rf.predict(X_test))

test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)
print("\nRandom Forest Testing Metrics:")
print(f"RMSE: {test_rmse_rf:.4f}, MAE: {test_mae_rf:.4f}")

# Save predictions
# Prediction columns are derived from `target_columns` so the column order of
# the model output and the labels cannot drift apart.
y_test_combined = pd.DataFrame(y_test, columns=target_columns)
for target_idx, target_name in enumerate(target_columns):
    y_test_combined[f'Pred_{target_name}'] = y_test_pred_rf[:, target_idx]

y_test_combined.to_excel('testing_predictions_rf_refined.xlsx', index=False)
print("Testing predictions saved.")


Random Forest Testing Metrics:
RMSE: 21.7658, MAE: 7.0081
Testing predictions saved.
