In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_excel('model_prep.xlsx')

# Extract input features and process parameters
X_initial = data[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']]
process_parameters = data.columns.difference(['Date', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', '> 35', '35 to 200', '< 200'])
X_process = data[process_parameters]
y_output = data[['> 35', '35 to 200', '< 200']]

# Split the data into training and testing sets for process parameter prediction
X_train_initial, X_test_initial, X_train_process, X_test_process = train_test_split(X_initial, X_process, test_size=0.2, random_state=42)

# Standardize the input features
scaler_initial = StandardScaler()
X_train_initial_scaled = scaler_initial.fit_transform(X_train_initial)
X_test_initial_scaled = scaler_initial.transform(X_test_initial)

# Train model to predict process parameters
model_process = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model_process.fit(X_train_initial_scaled, X_train_process)

# Predict process parameters on the test set
X_test_process_pred = model_process.predict(X_test_initial_scaled)

# Combine initial inputs and predicted process parameters for final model training
X_train_final = pd.concat([X_train_initial.reset_index(drop=True), pd.DataFrame(model_process.predict(X_train_initial_scaled), columns=process_parameters)], axis=1)
X_test_final = pd.concat([X_test_initial.reset_index(drop=True), pd.DataFrame(X_test_process_pred, columns=process_parameters)], axis=1)

# Standardize the final input features
scaler_final = StandardScaler()
X_train_final_scaled = scaler_final.fit_transform(X_train_final)
X_test_final_scaled = scaler_final.transform(X_test_final)

# Split the data into training and testing sets for final output prediction
y_train, y_test = train_test_split(y_output, test_size=0.2, random_state=42)

# Initialize and train the final model
final_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
final_model.fit(X_train_final_scaled, y_train)

# Predict on the test set
y_pred = final_model.predict(X_test_final_scaled)

# Evaluate the final model
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred)


# Output the results
print("Mean Squared Error for each output variable:", mse)
print("R-squared Score:", r2)

def accuracy_within_threshold(y_true, y_pred, threshold=0.1):
    return (abs(y_true - y_pred) / y_true < threshold).mean()

accuracy = accuracy_within_threshold(y_test, y_pred)

print(accuracy)

Mean Squared Error for each output variable: [ 1.17751534 16.98007002 18.56301127]
R-squared Score: 0.18915362940601374
> 35         0.168724
35 to 200    0.847737
< 200        0.843621
dtype: float64


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the data
data = pd.read_excel('model_data_selected_columns.xlsx')

# Extract input features and output labels
X = data.drop(columns=['> 35', '35 to 200', '< 200', 'Date'])  # Drop the output columns and 'Date'
y = data[['> 35', '35 to 200', '< 200']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the model
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred)

# Output the results
print("Mean Squared Error for each output variable:", mse)
print("R-squared Score:", r2)

Mean Squared Error for each output variable: [0.01530403 0.84704779 0.82254631]
R-squared Score: 0.9663669101772179
