# Manufacturing Log Anomaly Detection (Industry Grade) with Synthetic Data

This project detects abnormal patterns in manufacturing tool logs using
unsupervised learning techniques commonly used in semiconductor fabs.

## Techniques Used
- Isolation Forest
- One-Class SVM
- Autoencoder (Deep Learning)

## Why Unsupervised Learning?

Manufacturing failures are rare and unlabeled, so the models learn normal behavior and flag deviations from it.

In [1]:
import sys

# Record which interpreter and Python version executed this notebook,
# one value per line.
print(sys.executable, sys.version, sep="\n")


E:\anamoly\venv311\Scripts\python.exe
3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]


In [2]:
# Confirm which TensorFlow version this kernel imports before building the autoencoder.
import tensorflow as tf
print(tf.__version__)



2.15.0


In [21]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the synthetic data set is identical on every run.
np.random.seed(42)

n_samples = 10000
anomaly_ratio = 0.02
# Derive the anomaly count once, before it is needed, so that the number of
# rows selected below always matches the number of perturbation values drawn.
n_anomalies = int(n_samples * anomaly_ratio)

# Baseline sensor readings: four independent Gaussian channels.
data = {
    "temperature": np.random.normal(70, 5, n_samples),
    "pressure": np.random.normal(30, 3, n_samples),
    "voltage": np.random.normal(220, 10, n_samples),
    "throughput": np.random.normal(100, 8, n_samples)
}

df = pd.DataFrame(data)

# Choose distinct rows to corrupt. The size is n_anomalies rather than a
# literal 200: with a hard-coded count, changing anomaly_ratio or n_samples
# made the in-place additions below fail with a shape mismatch.
idx = np.random.choice(n_samples, n_anomalies, replace=False)

# Inject the anomalies: raise temperature and voltage, lower pressure.
# Throughput is left unchanged for the selected rows.
df.loc[idx, "temperature"] += np.random.normal(35, 8, n_anomalies)
df.loc[idx, "pressure"] -= np.random.normal(15, 5, n_anomalies)
df.loc[idx, "voltage"] += np.random.normal(40, 10, n_anomalies)


df.head()

Unnamed: 0,temperature,pressure,voltage,throughput
0,72.483571,27.964516,223.482862,84.155424
1,69.308678,29.083502,222.833236,91.560115
2,73.238443,28.207857,210.634802,95.303773
3,77.615149,30.331254,225.795842,101.197351
4,68.829233,33.591536,205.099173,108.193299


In [22]:
from sklearn.preprocessing import StandardScaler

# Scale only the raw sensor channels. Later cells append *_anomaly flag columns
# to df, so fitting on the whole frame would silently include those flags as
# features if this cell were re-run after the detectors.
feature_columns = ["temperature", "pressure", "voltage", "throughput"]

# Standardize each channel so that no single sensor's units dominate the
# distance- and reconstruction-based detectors that consume X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[feature_columns])
scaler

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True


In [23]:
# Inspect the standardized feature matrix produced by the scaler.
X_scaled

array([[ 0.24710198, -0.47553697,  0.24217218, -1.96428526],
       [-0.19505183, -0.17457591,  0.18598847, -1.04278748],
       [ 0.3522298 , -0.41008827, -0.86900742, -0.57689733],
       ...,
       [-0.58990679,  0.37720924, -0.49266066,  0.62131583],
       [ 0.24644145,  1.45453097, -1.69567423,  0.81856314],
       [ 0.34993185, -1.23515059, -0.25700045,  0.36193999]])

In [24]:
from sklearn.ensemble import IsolationForest

# Build and fit the forest in one expression; fit() returns the estimator itself.
iso_forest = IsolationForest(
    n_estimators=200, contamination=0.02, random_state=42
).fit(X_scaled)

# Recode the detector's +1 / -1 output into a 0 / 1 anomaly flag column.
iso_labels = pd.Series(iso_forest.predict(X_scaled), index=df.index)
df["iso_anomaly"] = iso_labels.map({1: 0, -1: 1})

In [25]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow RNGs so weight initialization and
# batch shuffling (and therefore the rows flagged downstream) repeat across
# runs. Without this the autoencoder results change on every execution.
set_random_seed(42)

input_dim = X_scaled.shape[1]

# Encoder: input -> 8 -> 4 units.
inputs = Input(shape=(input_dim,))
encoded = Dense(8, activation="relu")(inputs)
encoded = Dense(4, activation="relu")(encoded)

# Decoder: 4 -> 8 -> input_dim, with a linear output because the standardized
# targets take both positive and negative values.
decoded = Dense(8, activation="relu")(encoded)
decoded = Dense(input_dim, activation="linear")(decoded)

autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer=Adam(0.001), loss="mse")

# The network is trained to reproduce its own input. Keep the History object
# so the loss curves can be inspected later, rather than leaving it as an
# unnamed cell output.
history = autoencoder.fit(
    X_scaled, X_scaled,
    epochs=50,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2529b37a690>

In [26]:
# Score every row by how poorly the autoencoder reproduces it:
# mean squared residual across the feature axis.
reconstructions = autoencoder.predict(X_scaled)
residuals = X_scaled - reconstructions
reconstruction_error = np.square(residuals).mean(axis=1)

# Rows whose error lies above the 98th percentile are flagged as anomalies.
threshold = np.percentile(reconstruction_error, 98)
df["ae_anomaly"] = np.greater(reconstruction_error, threshold).astype(int)




In [28]:
from sklearn.svm import OneClassSVM

# Build and fit the RBF one-class SVM in one expression; fit() returns the estimator.
ocsvm = OneClassSVM(kernel="rbf", gamma=0.05, nu=0.02).fit(X_scaled)

# Recode the detector's +1 / -1 output into a 0 / 1 anomaly flag column.
svm_labels = pd.Series(ocsvm.predict(X_scaled), index=df.index)
df["svm_anomaly"] = svm_labels.map({1: 0, -1: 1})


In [29]:
# Rows flagged by each detector; the flag columns hold 0/1, so the sum is the count.
for label, column in (
    ("Isolation Forest:", "iso_anomaly"),
    ("One-Class SVM:", "svm_anomaly"),
    ("Autoencoder:", "ae_anomaly"),
):
    print(label, df[column].sum())


Isolation Forest: 200
One-Class SVM: 198
Autoencoder: 200


In [30]:
# List the columns now on df: the four sensor channels plus the detector flag columns.
print(df.columns)


Index(['temperature', 'pressure', 'voltage', 'throughput', 'iso_anomaly',
       'ae_anomaly', 'svm_anomaly'],
      dtype='str')


In [31]:
from pathlib import Path

# Write the labelled frame to disk. Create the target directory first, because
# to_csv does not create missing parent directories and would otherwise fail
# on a fresh checkout that has no data/ folder.
output_path = Path("data/output.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)


In [32]:
# Display the labelled frame; the rendered output abbreviates the middle rows.
df

Unnamed: 0,temperature,pressure,voltage,throughput,iso_anomaly,ae_anomaly,svm_anomaly
0,72.483571,27.964516,223.482862,84.155424,0,0,0
1,69.308678,29.083502,222.833236,91.560115,0,0,0
2,73.238443,28.207857,210.634802,95.303773,0,0,0
3,77.615149,30.331254,225.795842,101.197351,0,0,0
4,68.829233,33.591536,205.099173,108.193299,0,0,0
...,...,...,...,...,...,...,...
9995,76.505510,31.163482,214.227121,108.957208,0,0,0
9996,60.008275,34.132100,234.446040,102.120272,0,0,0
9997,66.473416,31.135062,214.986327,104.932008,0,0,0
9998,72.478828,35.140589,201.076432,106.516988,0,0,0


In [33]:
# Per-detector breakdown of normal (0) versus flagged (1) rows.
report_sections = [
    ("Isolation Forest anomalies:", "iso_anomaly"),
    ("\nOne-Class SVM anomalies:", "svm_anomaly"),
    ("\nAutoencoder anomalies:", "ae_anomaly"),
]

for heading, column in report_sections:
    print(heading)
    print(df[column].value_counts())

Isolation Forest anomalies:
iso_anomaly
0    9800
1     200
Name: count, dtype: int64

One-Class SVM anomalies:
svm_anomaly
0    9802
1     198
Name: count, dtype: int64

Autoencoder anomalies:
ae_anomaly
0    9800
1     200
Name: count, dtype: int64
