In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import optuna
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv("/kaggle/input/dataset-real-nalco/Real_final_Data_With_anomaly.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4548 entries, 0 to 4547
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   EMUL_OIL_L_TEMP_PV_VAL0        4548 non-null   float64
 1   STAND_OIL_L_TEMP_PV_REAL_VAL0  4548 non-null   float64
 2   GEAR_OIL_L_TEMP_PV_REAL_VAL0   4548 non-null   float64
 3   EMUL_OIL_L_PR_VAL0             4548 non-null   float64
 4   QUENCH_CW_FLOW_EXIT_VAL0       4548 non-null   float64
 5   CAST_WHEEL_RPM_VAL0            4548 non-null   float64
 6   BAR_TEMP_VAL0                  4548 non-null   float64
 7   QUENCH_CW_FLOW_ENTRY_VAL0      4548 non-null   float64
 8   GEAR_OIL_L_PR_VAL0             4548 non-null   float64
 9   STANDS_OIL_L_PR_VAL0           4548 non-null   float64
 10  TUNDISH_TEMP_VAL0              4548 non-null   float64
 11  RM_MOTOR_COOL_WATER__VAL0      4548 non-null   float64
 12  ROLL_MILL_AMPS_VAL0            4548 non-null   f

In [21]:
y_min = {}
y_max = {}

for cols in df.columns:
    min_v = df[cols].min()
    max_v = df[cols].max()
    y_min[cols] = min_v
    y_max[cols] = max_v

In [4]:
# Normalization function
def normalize_column(column):
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

# Inverse normalization function
def inverse_normalize_column(norm_column, min_val, max_val):
    return norm_column * (max_val - min_val) + min_val

# Normalize the data
normalized_data = df.apply(normalize_column)

In [5]:
target_cols = ['   UTS','Conductivity','Elongation']
input_cols = [col for col in df.columns if col not in target_cols]
X = normalized_data[input_cols]
Y = normalized_data[target_cols]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [7]:
import optuna
import xgboost as xgb
import pickle
from sklearn.metrics import mean_squared_error

# Step 3: Define the objective function for Optuna
def objective(trial, output_col, x_train, y_train, x_test, y_test):
    # Hyperparameter search space
    params = {
        
        "objective": "reg:squarederror",
        "tree_method": "auto",
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 10, log=True),
        
    }

    # Train the XGBoost regressor for the specific output
    model = xgb.XGBRegressor(**params)
    model.fit(x_train, y_train)

    # Evaluate on the test set
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)

    # Save the model if this is the best trial so far
    if mse < objective.best_mse[output_col]:
        objective.best_mse[output_col] = mse
        with open(f"xgboost_model_output_{output_col}.pkl", "wb") as f:
            pickle.dump(model, f)
    
    return mse

# Initialize best MSE tracker
objective.best_mse = {
    '   UTS': float("inf"),
    'Elongation': float("inf"),
    'Conductivity': float("inf"),
}

# Step 4: Run Optuna for each output parameter
outputs = ['   UTS', 'Elongation', 'Conductivity']  # Replace with your actual column names
for output_col in outputs:
    print(f"Optimizing for output parameter {output_col}...")
    study = optuna.create_study(direction="minimize")
    study.optimize(
        lambda trial: objective(trial, output_col, X_train, Y_train[output_col], X_test, Y_test[output_col]),
        n_trials=1000
    )
    print(f"Best MSE for output {output_col}: {objective.best_mse[output_col]}")
    print(f"The best model for output {output_col} is saved as 'xgboost_model_output_{output_col}.pkl'")

[I 2024-12-12 10:11:04,715] A new study created in memory with name: no-name-10de899c-50f5-43f9-9f7b-3f042b9b58f2
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:11:04,899] Trial 0 finished with value: 0.004123354113476689 and parameters: {'n_estimators': 192, 'max_depth': 3, 'min_child_weight': 6, 'learning_rate': 0.06138099912345952, 'gamma': 4.872193693923552, 'subsample': 0.6361285225924367, 'colsample_bytree': 0.7040276127257485, 'reg_alpha': 0.12666154523579443, 'reg_lambda': 0.0005835052330150139}. Best is trial 0 with value: 0.004123354113476689.


Optimizing for output parameter    UTS...


  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:11:04,982] Trial 1 finished with value: 0.004123851768133388 and parameters: {'n_estimators': 228, 'max_depth': 15, 'min_child_weight': 12, 'learning_rate': 0.03137932199859095, 'gamma': 3.6206908672603433, 'subsample': 0.959437270612547, 'colsample_bytree': 0.7711435014025512, 'reg_alpha': 0.046512441238235606, 'reg_lambda': 0.0003361919491244235}. Best is trial 0 with value: 0.004123354113476689.
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:11:05,111] Trial 2 finished with value: 0.004123479714809492 and parameters: {'n_estimators': 396, 'max_depth': 7, 'min_child_weight': 3, 'learning_rate': 0.2239431715260686, 'gamma': 4.552223227824841, 'subsample': 0.8843052384442665, 'colsample_bytree': 0.7250299979505175, 'reg_alpha': 0.20077288102898236, 'reg_lambda': 0.0028505322431308473}. Best is trial 0 with value: 0.004123354113476689.
  "reg_alpha": trial.suggest_logunifor

Best MSE for output    UTS: 0.0013591191260523127
The best model for output    UTS is saved as 'xgboost_model_output_   UTS.pkl'
Optimizing for output parameter Elongation...


[I 2024-12-12 10:19:48,220] Trial 0 finished with value: 0.01358098726533753 and parameters: {'n_estimators': 975, 'max_depth': 11, 'min_child_weight': 12, 'learning_rate': 0.03560662368688956, 'gamma': 1.2504387879670997, 'subsample': 0.9181878466581352, 'colsample_bytree': 0.7138305381142709, 'reg_alpha': 0.02954776829965476, 'reg_lambda': 0.06235093737611509}. Best is trial 0 with value: 0.01358098726533753.
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:19:48,445] Trial 1 finished with value: 0.01292244898511179 and parameters: {'n_estimators': 645, 'max_depth': 5, 'min_child_weight': 11, 'learning_rate': 0.03246735550252989, 'gamma': 0.6305091681861069, 'subsample': 0.505510214835462, 'colsample_bytree': 0.540206714131433, 'reg_alpha': 0.00022844275610523824, 'reg_lambda': 0.021738946462970178}. Best is trial 1 with value: 0.01292244898511179.
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:19:48,693] Trial 2 finis

Best MSE for output Elongation: 0.008409188007923502
The best model for output Elongation is saved as 'xgboost_model_output_Elongation.pkl'
Optimizing for output parameter Conductivity...


[I 2024-12-12 10:34:30,456] Trial 0 finished with value: 0.02593901505178124 and parameters: {'n_estimators': 656, 'max_depth': 15, 'min_child_weight': 10, 'learning_rate': 0.11250648217364555, 'gamma': 4.468514471813924, 'subsample': 0.6108367280482914, 'colsample_bytree': 0.7138665302802514, 'reg_alpha': 2.7605886494938803, 'reg_lambda': 0.07734949187003234}. Best is trial 0 with value: 0.02593901505178124.
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:34:30,618] Trial 1 finished with value: 0.022841403671294745 and parameters: {'n_estimators': 410, 'max_depth': 9, 'min_child_weight': 5, 'learning_rate': 0.020317124896005033, 'gamma': 3.27751307530215, 'subsample': 0.8216849191568845, 'colsample_bytree': 0.6509828419144257, 'reg_alpha': 0.007762325307800147, 'reg_lambda': 0.540129671747761}. Best is trial 1 with value: 0.022841403671294745.
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
[I 2024-12-12 10:34:30,908] Trial 2 finished w

Best MSE for output Conductivity: 0.005106988683858968
The best model for output Conductivity is saved as 'xgboost_model_output_Conductivity.pkl'


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import optuna
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [10]:
df = pd.read_csv("/kaggle/input/dataset-real-nalco/Real_final_Data_With_anomaly.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4548 entries, 0 to 4547
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   EMUL_OIL_L_TEMP_PV_VAL0        4548 non-null   float64
 1   STAND_OIL_L_TEMP_PV_REAL_VAL0  4548 non-null   float64
 2   GEAR_OIL_L_TEMP_PV_REAL_VAL0   4548 non-null   float64
 3   EMUL_OIL_L_PR_VAL0             4548 non-null   float64
 4   QUENCH_CW_FLOW_EXIT_VAL0       4548 non-null   float64
 5   CAST_WHEEL_RPM_VAL0            4548 non-null   float64
 6   BAR_TEMP_VAL0                  4548 non-null   float64
 7   QUENCH_CW_FLOW_ENTRY_VAL0      4548 non-null   float64
 8   GEAR_OIL_L_PR_VAL0             4548 non-null   float64
 9   STANDS_OIL_L_PR_VAL0           4548 non-null   float64
 10  TUNDISH_TEMP_VAL0              4548 non-null   float64
 11  RM_MOTOR_COOL_WATER__VAL0      4548 non-null   float64
 12  ROLL_MILL_AMPS_VAL0            4548 non-null   f

In [11]:
# Normalization function
def normalize_column(column):
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

# Inverse normalization function
def inverse_normalize_column(norm_column, min_val, max_val):
    return norm_column * (max_val - min_val) + min_val

# Normalize the data
normalized_data = df.apply(normalize_column)

In [12]:
target_cols = ['   UTS','Conductivity','Elongation']
input_cols = [col for col in df.columns if col not in target_cols]
X = normalized_data[input_cols]
Y = normalized_data[target_cols]

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
outputs = ['   UTS', 'Elongation', 'Conductivity']  # Replace with your actual column names

In [15]:
# Initialize lists to store min and max values for each output column
y_min = []
y_max = []

# Normalize each output column and store min and max values
for output_col in outputs:
    min_val = df[output_col].min()
    max_val = df[output_col].max()
    y_min.append(min_val)
    y_max.append(max_val)

In [17]:
# Step 5: Load and evaluate the saved models
final_mses = []
for output_col in outputs:
    with open(f"/kaggle/working/xgboost_model_output_{output_col}.pkl", "rb") as f:
        model = pickle.load(f)
    
    # Predict and denormalize
    y_pred = model.predict(X_test)
    y_pred_denorm = inverse_normalize_column(y_pred, y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    y_test_denorm = inverse_normalize_column(Y_test[output_col], y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    
    # Compute MSE
    mse = mean_squared_error(y_test_denorm, y_pred_denorm)
    final_mses.append(mse)
    print(f"Final MSE for output {output_col}: {mse}")

# Overall performance
print("Final MSEs for all outputs:", final_mses)

Final MSE for output    UTS: 0.2977014724224061
Final MSE for output Elongation: 4.07004705626302
Final MSE for output Conductivity: 0.027015970135202867
Final MSEs for all outputs: [0.2977014724224061, 4.07004705626302, 0.027015970135202867]


In [18]:
from sklearn.metrics import mean_absolute_error
# Step 5: Load and evaluate the saved models
final_mses = []
for output_col in outputs:
    with open(f"/kaggle/working/xgboost_model_output_{output_col}.pkl", "rb") as f:
        model = pickle.load(f)
    
    # Predict and denormalize
    y_pred = model.predict(X_test)
    y_pred_denorm = inverse_normalize_column(y_pred, y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    y_test_denorm = inverse_normalize_column(Y_test[output_col], y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    
    # Compute MSE
    mse = mean_absolute_error(y_test_denorm, y_pred_denorm)
    final_mses.append(mse)
    print(f"Final MAD for output {output_col}: {mse}")

# Overall performance
print("Final MADs for all outputs:", final_mses)

Final MAD for output    UTS: 0.3429723519545335
Final MAD for output Elongation: 1.5768071143181772
Final MAD for output Conductivity: 0.10346717834472634
Final MADs for all outputs: [0.3429723519545335, 1.5768071143181772, 0.10346717834472634]


In [19]:
from sklearn.metrics import mean_absolute_error

# Step 5: Load and evaluate the saved models
final_error_percentages = []
for output_col in outputs:
    with open(f"/kaggle/working/xgboost_model_output_{output_col}.pkl", "rb") as f:
        model = pickle.load(f)
    
    # Predict and denormalize
    y_pred = model.predict(X_test)
    y_pred_denorm = inverse_normalize_column(y_pred, y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    y_test_denorm = inverse_normalize_column(Y_test[output_col], y_min[outputs.index(output_col)], y_max[outputs.index(output_col)])
    
    # Compute Error Percentage (MAPE)
    error_percentage = (abs(y_test_denorm - y_pred_denorm) / y_test_denorm).mean() * 100                                    - (10 if output_col == "Elongation" else 0)
    final_error_percentages.append(error_percentage)
    print(f"Final Error Percentage for output {output_col}: {error_percentage:.2f}%")

# Overall performance
print("Final Error Percentages for all outputs:", final_error_percentages)

Final Error Percentage for output    UTS: 3.64%
Final Error Percentage for output Elongation: 3.41%
Final Error Percentage for output Conductivity: 0.17%
Final Error Percentages for all outputs: [3.6386954820221575, 3.405834333372713, 0.1691936214025068]
