In [8]:
# Step 1: Imports
import sys
import os
sys.path.append(os.path.abspath(".."))
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from src.preprocessing import preprocess

# Step 2: Load preprocessed data
# `preprocess` is a project helper (src.preprocessing). Here it is unpacked
# into the model inputs `X`, the targets `y`, and a scaler object that is
# persisted alongside the model in Step 6.
X, y, scaler = preprocess("../data/manufacturing_data.csv")

# Step 3: Train-test split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 4: Define and train model
# The multi-output wrapper fits one copy of the base forest per target column.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(estimator=base_forest)
model.fit(X_train, y_train)

# Step 5: Evaluate
# Score the held-out rows. `multioutput='raw_values'` yields one MSE per
# target column instead of a single averaged number.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')

# Report the number of outputs from the metric itself rather than a literal,
# so the header stays correct if the number of target columns changes.
print(f"\n📊 Mean Squared Error for each of the {len(mse)} outputs:")
for i, err in enumerate(mse):
    print(f"  Output {i+1}: {err:.2f}")

# Step 6: Save model and scaler
# The output directory is relative to the current working directory and is
# created on demand; an existing directory is not an error.
os.makedirs("results", exist_ok=True)

# Persist the trained model and the scaler, in that order.
for artifact, target_path in (
    (model, "results/model_primary.pkl"),
    (scaler, "results/scaler.pkl"),
):
    joblib.dump(artifact, target_path)
print("✅ Model and scaler saved.")

# Step 7: Predict on full data and save predictions
# NOTE: `X` is the complete dataset, so it includes the rows the model was
# trained on; these predictions are not a held-out evaluation.
full_preds = model.predict(X)

# One column per predicted target, numbered from 1.
n_targets = full_preds.shape[1]
target_columns = [f"Stage1_C_Actual_{idx}" for idx in range(1, n_targets + 1)]

predictions_df = pd.DataFrame(full_preds, columns=target_columns)
predictions_df.to_csv("results/predictions.csv", index=False)
print("📁 Predictions saved to results/predictions.csv")


  full_df = pd.read_csv(file_path, header=None)



📊 Mean Squared Error for each of the 15 outputs:
  Output 1: 0.20
  Output 2: 0.55
  Output 3: 0.18
  Output 4: 0.52
  Output 5: 1.15
  Output 6: 0.01
  Output 7: 0.02
  Output 8: 0.01
  Output 9: 0.50
  Output 10: 0.40
  Output 11: 0.08
  Output 12: 0.04
  Output 13: 0.00
  Output 14: 0.01
  Output 15: 0.34
✅ Model and scaler saved.
📁 Predictions saved to results/predictions.csv
