#Baseline LGBM Regression ( ±2σ Rule)
Problem Statement 

The objective is to automate quality control in injection moulding by predicting the mould opening distance from process signals. A regression model is trained on synthetic time-series data, and a statistical ±2σ acceptance rule is applied to classify products as valid or faulty. The trained model is then evaluated on real production data without retraining to assess generalization capability.
Dataset	    Shape	        Time Steps	Channels
Synthetic	(3147, 200, 3)	200	        3
Real	(438, 181, 7)	    181	        7 → reduced to 3

Only 3 signals overlap between datasets.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, confusion_matrix, classification_report

from lightgbm import LGBMRegressor


# Load Synthetic Dataset

In [2]:
X_dict = np.load("../dataset/synthetic/X_dict.npy", allow_pickle=True).item()
y_dict = np.load("../dataset/synthetic/y_dict.npy", allow_pickle=True).item()

X_syn = np.stack([
    X_dict["InjectionPressure"],
    X_dict["RamPosition"],
    X_dict["CavityPressure"]
], axis=-1)

y_opening = y_dict["opening"]

print(X_syn.shape)   # (3147, 200, 3)


(3147, 200, 3)


# Feature Extraction
Problem:
LGBM cannot consume raw time-series.
Solution:
Extract physically meaningful statistics per signal.

In [3]:
def extract_features(X):
    feats = []
    for i in range(X.shape[0]):
        row = []
        for c in range(X.shape[2]):
            s = X[i, :, c]
            row.extend([
                np.mean(s),
                np.std(s),
                np.min(s),
                np.max(s),
                np.ptp(s)
            ])
        feats.append(row)
    return np.array(feats)
X_feat = extract_features(X_syn)
print(X_feat.shape)  # (3147, 15)


(3147, 15)


## Train / Test Split

In [4]:
X_tr, X_te, y_tr, y_te = train_test_split(
    X_feat, y_opening, test_size=0.2, random_state=42
)


## Scaling (Synthetic Only)


In [5]:
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

joblib.dump(scaler, "../models/feature1_scaler.pkl")


['../models/feature1_scaler.pkl']

In [6]:
#8. Train LGBM Regressor
lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgbm.fit(X_tr_s, y_tr)
lgbm.booster_.save_model("../models/lgbm_synthetic1.txt")


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3315
[LightGBM] [Info] Number of data points in the train set: 2517, number of used features: 13
[LightGBM] [Info] Start training from score 52.254277


<lightgbm.basic.Booster at 0x2385c53a690>

In [7]:
# Regression Evaluation (Synthetic)
y_pred = lgbm.predict(X_te_s)

print("MAE:", mean_absolute_error(y_te, y_pred))
print("R² :", r2_score(y_te, y_pred))


MAE: 0.011310298766821303
R² : 0.7999168977501718




In [8]:
# ±2σ Quality Control Rule
mu = y_tr.mean()
sigma = y_tr.std()

lower = mu - 2 * sigma
upper = mu + 2 * sigma

qc_pred = ((y_pred >= lower) & (y_pred <= upper)).astype(int)
qc_true = ((y_te >= lower) & (y_te <= upper)).astype(int)

print("QC Accuracy:", accuracy_score(qc_true, qc_pred))
print(confusion_matrix(qc_true, qc_pred))


QC Accuracy: 0.9650793650793651
[[ 12  16]
 [  6 596]]


In [9]:
# Load Real Dataset (Signal Alignment)
X_real = np.load("../dataset/real/x_real.npy")
y_real = np.load("../dataset/real/y_real.npy")

# Select same 3 channels as synthetic
X_real = X_real[:, :, :3]   # mapping justified in methodology

print(X_real.shape)  # (438, 181, 3)


(438, 181, 3)


In [19]:
print("y_real shape:", y_real.shape)
print("y_real_pred shape:", y_real_pred.shape)
print("X_real_feat shape:", X_real_feat.shape)


y_real shape: (438, 2)
y_real_pred shape: (438,)
X_real_feat shape: (438, 15)


In [21]:
y_real_opening = y_real[:, 0]
print(y_real[:5])



[[ 0.    51.079]
 [ 0.    50.399]
 [ 0.    50.732]
 [ 1.    49.906]
 [ 1.    49.709]]


In [22]:
# Ensure 1D
y_real_opening = y_real_opening.reshape(-1)

qc_real_true = ((y_real_opening >= lower) & (y_real_opening <= upper)).astype(int)
qc_real_pred = ((y_real_pred >= lower) & (y_real_pred <= upper)).astype(int)


In [23]:
print("Shapes:")
print("qc_real_true:", qc_real_true.shape)
print("qc_real_pred:", qc_real_pred.shape)

print("QC true counts:", np.bincount(qc_real_true))
print("QC pred counts:", np.bincount(qc_real_pred))


Shapes:
qc_real_true: (438,)
qc_real_pred: (438,)
QC true counts: [438]
QC pred counts: [  0 438]


# The quality class label was determined based on the position of the opening distance with respect to the acceptance boundaries [x̄ − 2s, x̄ + 2s]. Both ground truth and prediction must be converted to QC labels using the SAME ±2σ rule. Regression outputs were converted into binary quality labels using the ±2σ acceptance boundaries derived from the training set. Classification metrics were computed only after both ground truth and predicted values were transformed into quality classes.

In [27]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load synthetic targets
y_dict_syn = np.load("../dataset/synthetic/y_dict.npy", allow_pickle=True).item()

# Opening distance ONLY
y_opening_syn = y_dict_syn["opening"].reshape(-1)

print("Synthetic opening shape:", y_opening_syn.shape)


Synthetic opening shape: (3147,)


In [28]:
y_train, y_test = train_test_split(
    y_opening_syn,
    test_size=0.2,
    random_state=42
)

print("y_train shape:", y_train.shape)
print("y_test shape :", y_test.shape)


y_train shape: (2517,)
y_test shape : (630,)


In [29]:
mean_train = y_train.mean()
std_train  = y_train.std()

lower = mean_train - 2 * std_train
upper = mean_train + 2 * std_train

print("Synthetic training stats:")
print("mean :", mean_train)
print("std  :", std_train)
print("lower:", lower)
print("upper:", upper)


Synthetic training stats:
mean : 52.25427685299961
std  : 0.03512299189922415
lower: 52.18403086920116
upper: 52.32452283679806


In [30]:
print("\nReal opening prediction stats:")
print("min :", y_real_pred.min())
print("max :", y_real_pred.max())
print("mean:", y_real_pred.mean())



Real opening prediction stats:
min : 52.263397844975984
max : 52.263397844975984
mean: 52.263397844975984
