# 預測製造缺陷
#### 使用XGBboost模型
#### 資料來源:https://www.kaggle.com/datasets/rabieelkharoua/predicting-manufacturing-defects-dataset

In [88]:
import numpy as np 
import pandas as pd
df = pd.read_csv("Manufacturing Defects.csv")
df.head(10)

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,WorkerProductivity,SafetyIncidents,EnergyConsumption,EnergyEfficiency,AdditiveProcessTime,AdditiveMaterialCost,DefectStatus
0,202,13175.403783,86.648534,1,3.121492,63.463494,9,0.052343,8.630515,0.081322,85.042379,0,2419.616785,0.468947,5.551639,236.439301,1
1,535,19770.046093,86.310664,4,0.819531,83.697818,20,4.908328,9.296598,0.038486,99.657443,7,3915.566713,0.119485,9.080754,353.957631,1
2,960,19060.820997,82.132472,0,4.514504,90.35055,1,2.464923,5.097486,0.002887,92.819264,2,3392.385362,0.496392,6.562827,396.189402,1
3,370,5647.606037,87.335966,5,0.638524,67.62869,8,4.692476,3.577616,0.055331,96.887013,8,4652.400275,0.183125,8.097496,164.13587,1
4,206,7472.222236,81.989893,3,3.867784,82.728334,9,2.746726,6.851709,0.068047,88.315554,7,1581.630332,0.263507,6.406154,365.708964,1
5,171,6975.931602,95.331919,1,3.914574,92.568436,19,3.027324,7.930009,0.074069,87.079118,7,1238.994421,0.118021,7.279442,171.711804,1
6,800,15889.69865,99.325486,3,4.789,90.729911,10,3.559561,3.046889,0.040192,91.063158,8,3138.43115,0.333913,4.891669,188.727737,1
7,120,17266.779948,99.401489,4,0.743605,92.119681,13,1.604879,8.380972,0.009702,88.705569,3,1004.108554,0.293422,9.333835,312.526896,1
8,714,8202.670495,97.301422,5,3.185856,95.172937,2,3.49492,3.668747,0.058433,94.298961,4,4150.875773,0.366683,5.517451,215.680921,1
9,221,12587.790394,92.015843,2,2.425283,97.507284,0,2.63396,5.933418,0.032955,85.316362,6,3023.891555,0.317071,5.965972,364.638176,0


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3240 entries, 0 to 3239
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ProductionVolume      3240 non-null   int64  
 1   ProductionCost        3240 non-null   float64
 2   SupplierQuality       3240 non-null   float64
 3   DeliveryDelay         3240 non-null   int64  
 4   DefectRate            3240 non-null   float64
 5   QualityScore          3240 non-null   float64
 6   MaintenanceHours      3240 non-null   int64  
 7   DowntimePercentage    3240 non-null   float64
 8   InventoryTurnover     3240 non-null   float64
 9   StockoutRate          3240 non-null   float64
 10  WorkerProductivity    3240 non-null   float64
 11  SafetyIncidents       3240 non-null   int64  
 12  EnergyConsumption     3240 non-null   float64
 13  EnergyEfficiency      3240 non-null   float64
 14  AdditiveProcessTime   3240 non-null   float64
 15  AdditiveMaterialCost 

In [90]:
#檢查遺失值
missing_values = df.isnull().sum()
missing_values

ProductionVolume        0
ProductionCost          0
SupplierQuality         0
DeliveryDelay           0
DefectRate              0
QualityScore            0
MaintenanceHours        0
DowntimePercentage      0
InventoryTurnover       0
StockoutRate            0
WorkerProductivity      0
SafetyIncidents         0
EnergyConsumption       0
EnergyEfficiency        0
AdditiveProcessTime     0
AdditiveMaterialCost    0
DefectStatus            0
dtype: int64

In [91]:
import plotly.express as px
import numpy as np

# 計算相關矩陣
correlation_matrix = df.corr()

# 使用 Plotly 繪製熱圖，調整顏色方案和標籤
fig = px.imshow(correlation_matrix,
                labels=dict(x="Features", y="Features", color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale=px.colors.diverging.RdBu,
                zmin=-1, zmax=1,
                text_auto=False)  # 不顯示數字

fig.update_layout(title='Correlation Matrix Heatmap', title_x=0.5, width=800, height=800)
fig.show()

In [92]:
from sklearn.preprocessing import StandardScaler

# 選擇所有數值特徵進行縮放
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 排除標籤列
numerical_features.remove('DefectStatus')

# 初始化 StandardScaler
scaler = StandardScaler()

# 對選擇的數值特徵進行縮放
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 查看縮放後的數據
print(df.head(10))

   ProductionVolume  ProductionCost  SupplierQuality  DeliveryDelay  \
0         -1.320785        0.174673        -0.553077      -0.914050   
1         -0.051544        1.705681        -0.611752       0.844922   
2          1.568358        1.541028        -1.337353      -1.500375   
3         -0.680447       -1.572975        -0.433695       1.431246   
4         -1.305539       -1.149373        -1.362114       0.258598   
5         -1.438942       -1.264592         0.954913      -0.914050   
6          0.958513        0.804822         1.648450       0.258598   
7         -1.633330        1.124524         1.661649       0.844922   
8          0.630721       -0.979793         1.296944       1.431246   
9         -1.248366        0.038253         0.379030      -0.327726   

   DefectRate  QualityScore  MaintenanceHours  DowntimePercentage  \
0    0.284267     -1.435903         -0.360401           -1.696637   
1   -1.473020      0.306939          1.240385            1.667488   
2    1.3476

In [93]:
df.head(10)

Unnamed: 0,ProductionVolume,ProductionCost,SupplierQuality,DeliveryDelay,DefectRate,QualityScore,MaintenanceHours,DowntimePercentage,InventoryTurnover,StockoutRate,WorkerProductivity,SafetyIncidents,EnergyConsumption,EnergyEfficiency,AdditiveProcessTime,AdditiveMaterialCost,DefectStatus
0,-1.320785,0.174673,-0.553077,-0.91405,0.284267,-1.435903,-0.360401,-1.696637,1.120811,1.057329,-0.873315,-1.585593,-0.493285,1.453584,0.030618,-0.542069,1
1,-0.051544,1.705681,-0.611752,0.844922,-1.47302,0.306939,1.240385,1.667488,1.406753,-0.430377,1.680553,0.831645,0.803883,-1.549139,1.389114,0.467869,1
2,1.568358,1.541028,-1.337353,-1.500375,1.347673,0.879958,-1.52461,-0.025252,-0.39588,-1.666767,0.485635,-0.894954,0.350222,1.689402,0.419865,0.830804,1
3,-0.680447,-1.572975,-0.433695,1.431246,-1.611198,-1.077143,-0.505928,1.51795,-1.048344,0.154648,1.196442,1.176965,1.442806,-1.002313,1.010619,-1.163435,1
4,-1.305539,-1.149373,-1.362114,0.258598,0.853976,0.223434,-0.360401,0.169975,0.357189,0.596292,-0.301354,0.831645,-1.21992,-0.311639,0.359555,0.568859,1
5,-1.438942,-1.264592,0.954913,-0.91405,0.889694,1.070991,1.094859,0.364368,0.820092,0.805454,-0.517411,0.831645,-1.517027,-1.561718,0.695718,-1.098329,1
6,0.958513,0.804822,1.64845,0.258598,1.55722,0.912634,-0.214875,0.733091,-1.27618,-0.371126,0.178769,1.176965,0.130013,0.293318,-0.22343,-0.952096,1
7,-1.63333,1.124524,1.661649,0.844922,-1.530981,1.032339,0.221703,-0.621073,1.013685,-1.43009,-0.233202,-0.549634,-1.720701,-0.054595,1.486535,0.111818,1
8,0.630721,-0.979793,1.296944,1.431246,0.333401,1.295325,-1.379084,0.688308,-1.009223,0.262372,0.7442,-0.204314,1.007924,0.574891,0.017458,-0.720464,1
9,-1.248366,0.038253,0.37903,-0.327726,-0.24721,1.496389,-1.670136,0.091853,-0.037023,-0.622498,-0.825439,0.486325,0.030694,0.148608,0.190112,0.559656,0


In [94]:
from sklearn.model_selection import train_test_split

# 分割數據為特徵和標籤
X = df.drop('DefectStatus', axis=1)
y = df['DefectStatus']

# 分割數據集為訓練集和測試集 (70%訓練, 30%測試)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [95]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# 定義參數網格
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

# 初始化 XGBoost 分類器
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# 初始化 GridSearchCV
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# 執行網格搜索
grid_search.fit(X_train, y_train)

# 輸出最佳參數
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}


In [96]:
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, f1_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

# 使用最佳參數初始化 XGBoost 分類器
best_xgb_classifier = grid_search.best_estimator_

# 使用最佳參數重新訓練模型
best_xgb_classifier.fit(X_train, y_train)

# 進行預測
y_pred = best_xgb_classifier.predict(X_test)
y_pred_proba = best_xgb_classifier.predict_proba(X_test)[:, 1]

# 計算評估指標
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Accuracy:', accuracy)
print('F1 Score:', f1)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', report)

Accuracy: 0.9567901234567902
F1 Score: 0.9749403341288783
Confusion Matrix:
 [[113  35]
 [  7 817]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.76      0.84       148
           1       0.96      0.99      0.97       824

    accuracy                           0.96       972
   macro avg       0.95      0.88      0.91       972
weighted avg       0.96      0.96      0.95       972



In [97]:
import plotly.express as px
import pandas as pd

# 提取特徵重要性
feature_importance = best_xgb_classifier.feature_importances_

# 創建特徵重要性數據框
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# 排序特徵重要性
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# 使用 Plotly 視覺化特徵重要性
fig = px.bar(feature_importance_df, x='Importance', y='Feature', orientation='h', 
             title='Feature Importance from XGBoost')
fig.update_layout(yaxis={'categoryorder':'total ascending'})

# 顯示圖表
fig.show()


In [98]:
from sklearn.metrics import roc_curve, roc_auc_score
import plotly.graph_objects as go

# 獲取真實標籤和預測概率
y_true = y_test
y_pred_proba = best_xgb_classifier.predict_proba(X_test)[:, 1]

# 計算 ROC 曲線
fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
roc_auc = roc_auc_score(y_true, y_pred_proba)

# 繪製 ROC 曲線
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (area = {roc_auc:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Guess', line=dict(dash='dash')))
fig.update_layout(title='Receiver Operating Characteristic (ROC) Curve', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate')
fig.show()