In [1]:
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    make_scorer
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from pycaret.classification import *

warnings.filterwarnings('ignore') # Suppresas warnings
RANDOM_STATE = 110

## 1. 데이터불러오기

In [2]:
RANDOM_STATE = 110

# Load data
train_df = pd.read_csv("train_pre.csv")
test_df = pd.read_csv("test_pre.csv")

In [3]:
# "OK" 값을 1로 변경
train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)

# 결측값을 0으로 채우기
train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)

In [4]:

train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].replace('OK', 1, inplace=True)

# 결측값을 0으로 채우기
train_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)
test_df['GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave'].fillna(0, inplace=True)


## 2. 데이터 전처리

### 2-2. 의미있는 'OK'값 처리

In [5]:
def process_ok_columns(df, col):
    # 'OK' 값을 1로 변경(인코딩)
    df[col].replace('OK', 1, inplace=True)

    # 결측값을 0으로 채우기
    df[col].fillna(0, inplace=True)

    # 'NG' 값을 0으로 변경(인코딩)
    df[col].replace('NG', 0, inplace=True)

cols = ['Chamber Temp. Judge Value_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave']
process_ok_columns(train_df, cols[0])
process_ok_columns(test_df,  cols[0])
process_ok_columns(train_df, cols[1])
process_ok_columns(test_df,  cols[1])

### 2-3. 여러 공정에 있는 동일한 컬럼을 1개로 통일

> - Model.Suffix (모든 공정)
- PalletID (모든 공정)
- Workorder (모든 공정)
- Production Qty (모든 공정)
- Receip No (모든 공정)

In [6]:
##############################################
### 1. Equipment 인코딩
# Equipment_Dam 컬럼에서 #1을 포함하는 값은 0으로, #2를 포함하는 값은 1로 변환
def convert_equipment_values(row):
    if '#1' in str(row):
        return 0
    elif '#2' in str(row):
        return 1
    return row  # #1 또는 #2를 포함하지 않는 경우 원래 값 유지

train_df['Equipment_Dam'] = train_df['Equipment_Dam'].apply(convert_equipment_values)
train_df['Equipment_Fill1'] = train_df['Equipment_Fill1'].apply(convert_equipment_values)
train_df['Equipment_Fill2'] = train_df['Equipment_Fill2'].apply(convert_equipment_values)

test_df['Equipment_Dam'] = test_df['Equipment_Dam'].apply(convert_equipment_values)
test_df['Equipment_Fill1'] = test_df['Equipment_Fill1'].apply(convert_equipment_values)
test_df['Equipment_Fill2'] = test_df['Equipment_Fill2'].apply(convert_equipment_values)

# 컬럼 그룹 정의
equipment_cols = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']
model_suffix_cols = [col for col in train_df.columns if "Model.Suffix" in col]
workorder_cols = [col for col in train_df.columns if "Workorder" in col]
receip_no_cols = [col for col in train_df.columns if "Receip No Collect Result" in col]
pallet_id_cols = [col for col in train_df.columns if "PalletID" in col]
production_qty_cols = [col for col in train_df.columns if "Production" in col]

# 컬럼 그룹을 딕셔너리로 관리
columns_groups = {
    'Model Suffix': model_suffix_cols,
    'Workorder': workorder_cols,
    'Receip No': receip_no_cols,
    'Pallet ID': pallet_id_cols,
    'Production Qty': production_qty_cols,
    'Equipment': equipment_cols  # 추가된 그룹
}


##############################################
### 2. 불일치 ROWS 제거
# 주어진 컬럼 그룹에서 서로 다른 값을 찾는 함수
def find_differences(data, columns):
    if not columns:
        return None  # 주어진 컬럼 그룹이 비어있는 경우 None 반환
    comparison_df = data[columns]
    diff_rows = comparison_df[comparison_df.nunique(axis=1) > 1]  # 행별로 유일한 값의 수를 계산, 1보다 크면 다름
    #print(f'✅ 불일치한 경우의 df shape: (len(rows), len(cols)) = {diff_rows.shape}}')
    print(f'✅ <불일치한 경우> 총 {len(diff_rows)}개 rows가 불일치함. 비교한 {len(diff_rows.columns)}개 columns 정보: {diff_rows.columns.to_list()}')
    return

# 행 제거 함수
def remove_inconsistent_rows(data, columns_groups):
    for key, cols in columns_groups.items():
        diff_rows = find_differences(data, cols)
        if diff_rows is not None:
            data.drop(diff_rows.index, inplace=True)
    return data

train_df = remove_inconsistent_rows(train_df, columns_groups)


##############################################
### 3. 중복 COLUMNS 제거
# => 마지막에 feature group 어차피 선택할거라 걍 skip
'''
remove_cols = ['Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2',
               'Equipment_Fill1', 'Equipment_Fill2',
               'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2',
               'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2',
               'Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2',
               'PalletID Collect Result_Fill1','PalletID Collect Result_Fill2']

# 결과 출력
print("삭제 전:")
print(train_df.shape)
print(test_df.shape)

# 삭제
train_df = train_df.drop(columns=remove_cols)
test_df  = test_df.drop(columns=remove_cols)

# 결과 출력
print("삭제 후:")
print(train_df.shape)
print(test_df.shape)
'''

✅ <불일치한 경우> 총 0개 rows가 불일치함. 비교한 4개 columns 정보: ['Model.Suffix_Dam', 'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2']
✅ <불일치한 경우> 총 0개 rows가 불일치함. 비교한 4개 columns 정보: ['Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2']
✅ <불일치한 경우> 총 4개 rows가 불일치함. 비교한 3개 columns 정보: ['Receip No Collect Result_Dam', 'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2']
✅ <불일치한 경우> 총 83개 rows가 불일치함. 비교한 9개 columns 정보: ['PalletID Collect Result_Dam', 'PalletID Unit Time_Dam', 'PalletID Judge Value_Dam', 'PalletID Collect Result_Fill1', 'PalletID Unit Time_Fill1', 'PalletID Judge Value_Fill1', 'PalletID Collect Result_Fill2', 'PalletID Unit Time_Fill2', 'PalletID Judge Value_Fill2']
✅ <불일치한 경우> 총 93개 rows가 불일치함. 비교한 9개 columns 정보: ['Production Qty Collect Result_Dam', 'Production Qty Unit Time_Dam', 'Production Qty Judge Value_Dam', 'Production Qty Collect Result_Fill1', 'Production Qty Unit Time_Fill1', 'Production Qty Judge Value_Fill1', 'Produ

'\nremove_cols = [\'Model.Suffix_AutoClave\', \'Model.Suffix_Fill1\', \'Model.Suffix_Fill2\',\n               \'Equipment_Fill1\', \'Equipment_Fill2\',\n               \'Workorder_AutoClave\', \'Workorder_Fill1\', \'Workorder_Fill2\',\n               \'Receip No Collect Result_Fill1\', \'Receip No Collect Result_Fill2\',\n               \'Production Qty Collect Result_Fill1\', \'Production Qty Collect Result_Fill2\',\n               \'PalletID Collect Result_Fill1\',\'PalletID Collect Result_Fill2\']\n\n# 결과 출력\nprint("삭제 전:")\nprint(train_df.shape)\nprint(test_df.shape)\n\n# 삭제\ntrain_df = train_df.drop(columns=remove_cols)\ntest_df  = test_df.drop(columns=remove_cols)\n\n# 결과 출력\nprint("삭제 후:")\nprint(train_df.shape)\nprint(test_df.shape)\n'

### 2-4. 고윳값이 1개인 컬럼 삭제
- 모든 rows에 동일한 값만 들어있는 column은 분류에 아무 영향을 안 줌

In [7]:
print("삭제 전:")
print(train_df.shape)
print(test_df.shape)


### 고유치가 1개인 컬럼 제거
# unique data가 1개 이하만 들어있는 컬럼
unique_counts = train_df.nunique()
tr_cols_to_drop = unique_counts[unique_counts <= 1].index

# unique data가 1개 이하만 들어있는 컬럼
unique_counts = test_df.nunique()
te_cols_to_drop = unique_counts[unique_counts <= 1].index

# 1개 이하만 들어있는 컬럼 삭제
train_df = train_df.drop(columns=tr_cols_to_drop)
test_df  = test_df.drop(columns=te_cols_to_drop)



### 장치#1, 장치#2에 따라 각각 고유값이 1개인 컬럼 제거
def remove_single_value_columns(df):
    # Equipment_Dam 값에 따른 데이터 분할
    group_0 = df[df['Equipment_Dam'] == 0]
    group_1 = df[df['Equipment_Dam'] == 1]

    # 유니크한 값이 1인 컬럼을 찾고 제거하는 함수
    def find_single_value_columns(data):
        columns_to_remove = []
        for col in data.columns:
            # Equipment_Dam 컬럼은 제외
            if col != 'Equipment_Dam' and len(data[col].unique()) == 1:
                columns_to_remove.append(col)
        return columns_to_remove

    # 각 그룹에 대해 유니크한 값이 1인 컬럼 찾기
    columns_to_remove_0 = find_single_value_columns(group_0)
    columns_to_remove_1 = find_single_value_columns(group_1)

    # 두 그룹에서 공통으로 제거할 컬럼 찾기
    common_columns_to_remove = list(set(columns_to_remove_0) & set(columns_to_remove_1))

    # 공통 컬럼 제거
    return df.drop(columns=common_columns_to_remove, inplace=False), common_columns_to_remove


train_df, train_removed_cols = remove_single_value_columns(train_df)
test_df = test_df.drop(columns=train_removed_cols, errors='ignore')


# 결과 출력
print("삭제 후:")
print(train_df.shape)
print(test_df.shape)

삭제 전:
(40506, 465)
(17361, 466)
삭제 후:
(40506, 116)
(17361, 116)


### 2-7. OK값 결측치 처리

In [8]:
# 결과 출력
print("삭제 전:")
print(train_df.shape)
print(test_df.shape)


### 1. 'OK' 값을 NaN으로 변환
train_df.replace('OK', np.nan, inplace=True)
test_df.replace('OK', np.nan, inplace=True)


### 2. 모든 값이 결측치인 컬럼 삭제
# 모든 값이 결측값인 컬럼을 제거하는 함수
def remove_all_missing_columns(data):
    # 결측값이 모든 행에 있는 컬럼만 True로 표시됩니다.
    is_all_missing = data.isnull().all()
    # True가 아닌 컬럼만 선택하여 데이터프레임을 새로 생성합니다.
    cleaned_data = data.loc[:, ~is_all_missing]
    return cleaned_data

train_df = remove_all_missing_columns(train_df)
test_df = remove_all_missing_columns(test_df)


# 결과 출력
print("삭제 후:")
print(train_df.shape)
print(test_df.shape)

삭제 전:
(40506, 116)
(17361, 116)
삭제 후:
(40506, 116)
(17361, 116)


In [9]:

train_df = train_df[['Production Qty Collect Result_Dam','Workorder_AutoClave', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1','DISCHARGED SPEED OF RESIN Collect Result_Fill1'
                     ,'target']]
test_df = test_df[['Production Qty Collect Result_Dam','Workorder_AutoClave', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
                     'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
                     'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1','DISCHARGED SPEED OF RESIN Collect Result_Fill1'
                     ]]

In [10]:
#Qty + Work
'''
train_df = pd.read_csv("train_pre.csv")
test_df = pd.read_csv("test_pre.csv")


train_df = train_df[['Production Qty Collect Result_Dam','Workorder_AutoClave','target']]
test_df = test_df[['Production Qty Collect Result_Dam','Workorder_AutoClave']]
'''

'\ntrain_df = pd.read_csv("train_pre.csv")\ntest_df = pd.read_csv("test_pre.csv")\n\n\ntrain_df = train_df[[\'Production Qty Collect Result_Dam\',\'Workorder_AutoClave\',\'target\']]\ntest_df = test_df[[\'Production Qty Collect Result_Dam\',\'Workorder_AutoClave\']]\n'

In [11]:
# Work + Qty
'''
train_df = pd.read_csv("train_pre.csv")
test_df = pd.read_csv("test_pre.csv")


train_df = train_df[['Workorder_AutoClave','Production Qty Collect Result_Dam','target']]
test_df = test_df[['Workorder_AutoClave','Production Qty Collect Result_Dam']]
'''

'\ntrain_df = pd.read_csv("train_pre.csv")\ntest_df = pd.read_csv("test_pre.csv")\n\n\ntrain_df = train_df[[\'Workorder_AutoClave\',\'Production Qty Collect Result_Dam\',\'target\']]\ntest_df = test_df[[\'Workorder_AutoClave\',\'Production Qty Collect Result_Dam\']]\n'

In [None]:
#train_df = train_df.drop(columns ="Unnamed: 0")
#test_df = test_df.drop(columns = "Unnamed: 0")
#test_df = test_df.drop(['Set ID', 'target'], axis=1)

## 3. 모델

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from pycaret.classification import setup, create_model, tune_model, blend_models, compare_models, predict_model
import numpy as np

# 데이터 준비 및 초기 설정
RANDOM_STATE = 110
train_split_df, valid_df = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df["target"],
    random_state=RANDOM_STATE,
)

y_train = train_split_df['target']
X_train = train_split_df.drop('target', axis=1)
y_valid = valid_df['target']
X_valid = valid_df.drop('target', axis=1)

# 결과 저장을 위한 리스트
results = []
from pycaret.classification import *

# PyCaret setup
clf = setup(data=X_train.join(y_train), target='target', session_id=RANDOM_STATE, n_jobs=4)

# 랜덤 포레스트 모델 생성 및 하이퍼파라미터 튜닝
model_list = []
for model_name in ['rf', 'lightgbm', 'catboost', 'xgboost', 'gbc']:
    
    if model_name == 'rf':
        # 랜덤 포레스트 모델 생성
        model = create_model(model_name, fold=10)
        
        # 중요하다고 여겨지는 하이퍼파라미터만 튜닝
        custom_grid = {
            'n_estimators':[130, 140,150, 160, 170],  # 트리의 개수 범위 확장 
            # [100, 200, 300, 400, 500]중 200이 최적 
            #[180, 200, 220, 250]중 180이 최적
            #지금은 170이 최적 
            'max_depth': [30, 35, 40],          # 트리의 최대 깊이 범위 확장 # [10, 15, 20, 25, 30]중 30이 최적 [30, 35, 40]중 30이최적
            'min_samples_split': [3],        # 현재 3이 최적인듯 보임
            'min_samples_leaf': [3],        # 마찬가지
            'max_features': ['sqrt', 'auto', None] 
        }
        
        tuned_model = tune_model(
            model, 
            fold=10, 
            n_iter=15,  # 여기를 작게하면 조합이 많아도 다 안찾고 조기종료 
            optimize='F1',
            custom_grid=custom_grid,
            search_library='scikit-learn',   
            search_algorithm='grid'       # Randomized Search 초기에 범위 많이 두고 탐색할때는 랜덤서치하시고 경우의 수 적어지면 grid 
        )
        
        # 최적의 하이퍼파라미터 출력
        best_params = tuned_model.get_params()
        print(f"최적의 랜덤 포레스트 하이퍼파라미터: {best_params}")
        
        model_list.append(tuned_model)
    else:
        # 나머지 모델은 자동으로 하이퍼파라미터 튜닝
        model = create_model(model_name, fold=10)
        tuned_model = tune_model(model, fold=10, n_iter=10, optimize='F1')
        model_list.append(tuned_model)

# 모델 블렌딩
blended_model = blend_models(estimator_list=model_list, fold=10, method='soft', optimize='F1')



# 모델 예측 및 F1 점수 계산
y_pred = predict_model(blended_model, data=X_valid)
best_f1 = 0
best_threshold = 0
for threshold in np.arange(0.50, 1.00, 0.001):
    y_valid_pred = ['Normal' if score >= threshold else 'AbNormal' for score in y_pred['prediction_score']]
    f1 = f1_score(y_valid, y_valid_pred, pos_label='AbNormal')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

# 결과 저장
results.append({
    'feature': X_valid.columns,
    'best_threshold': best_threshold,
    'best_f1': best_f1
})

# 결과 출력
for result in results:
    print(f"Feature: {result['feature']}, Best Threshold: {result['best_threshold']:.3f}, Best F1 Score: {result['best_f1']:.4f}")

Unnamed: 0,Description,Value
0,Session id,110
1,Target,target
2,Target type,Binary
3,Target mapping,"AbNormal: 0, Normal: 1"
4,Original data shape,"(32404, 7)"
5,Transformed data shape,"(32404, 7)"
6,Transformed train set shape,"(22682, 7)"
7,Transformed test set shape,"(9722, 7)"
8,Numeric features,5
9,Categorical features,1


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9211,0.6505,0.9211,0.9029,0.9112,0.1052,0.1094
1,0.9246,0.6391,0.9246,0.9035,0.9129,0.1057,0.1124
2,0.9321,0.6437,0.9321,0.9131,0.9207,0.1752,0.1882
3,0.9228,0.6116,0.9228,0.9066,0.914,0.1326,0.1371
4,0.9286,0.6176,0.9286,0.9109,0.9185,0.1641,0.1726
5,0.9255,0.6256,0.9255,0.9078,0.9156,0.1397,0.146
6,0.9246,0.7068,0.9246,0.9119,0.9176,0.1882,0.1922
7,0.9242,0.6243,0.9242,0.9089,0.9157,0.1587,0.1637
8,0.9224,0.5989,0.9224,0.9058,0.9133,0.1317,0.1362
9,0.9198,0.5939,0.9198,0.8988,0.9085,0.0696,0.0731


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9418,0.67,0.9418,0.9183,0.9184,0.0744,0.1376
1,0.9436,0.6398,0.9436,0.933,0.9194,0.0796,0.1758
2,0.9449,0.6647,0.9449,0.9303,0.9253,0.1593,0.2395
3,0.9409,0.6144,0.9409,0.9102,0.9168,0.0478,0.0915
4,0.9422,0.6427,0.9422,0.9208,0.9224,0.1293,0.1876
5,0.9431,0.6489,0.9431,0.9237,0.9216,0.1113,0.1824
6,0.9436,0.7147,0.9436,0.9264,0.9235,0.1448,0.2183
7,0.9431,0.6573,0.9431,0.9267,0.9198,0.0898,0.1724
8,0.9409,0.6375,0.9409,0.9165,0.9199,0.1047,0.1578
9,0.944,0.6558,0.944,0.9288,0.9231,0.1362,0.2186


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 45 candidates, totalling 450 fits
최적의 랜덤 포레스트 하이퍼파라미터: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 30, 'max_features': None, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 3, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 170, 'n_jobs': 4, 'oob_score': False, 'random_state': 110, 'verbose': 0, 'warm_start': False}


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9423,0.6749,0.9423,0.9231,0.9163,0.0397,0.1088
1,0.9436,0.632,0.9436,0.933,0.9194,0.0796,0.1758
2,0.9436,0.6554,0.9436,0.9272,0.9205,0.0905,0.1731
3,0.9427,0.5964,0.9427,0.9236,0.9169,0.04,0.1093
4,0.9431,0.6527,0.9431,0.9258,0.9188,0.0657,0.1447
5,0.9436,0.6475,0.9436,0.9286,0.9198,0.0789,0.1647
6,0.9453,0.704,0.9453,0.9483,0.9219,0.1084,0.2393
7,0.9431,0.6669,0.9431,0.9305,0.9184,0.0665,0.1559
8,0.9405,0.6479,0.9405,0.9096,0.9161,0.0474,0.0911
9,0.944,0.649,0.944,0.9471,0.9189,0.069,0.1891


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9423,0.6736,0.9423,0.9231,0.9163,0.0397,0.1088
1,0.9427,0.6396,0.9427,0.9273,0.9174,0.0532,0.1339
2,0.9431,0.658,0.9431,0.9278,0.918,0.0536,0.1344
3,0.9427,0.6019,0.9427,0.9271,0.9161,0.0273,0.095
4,0.9431,0.6642,0.9431,0.9278,0.918,0.0536,0.1344
5,0.9427,0.6508,0.9427,0.9218,0.9185,0.0645,0.1347
6,0.9431,0.7083,0.9431,0.9351,0.9176,0.0544,0.1489
7,0.9427,0.6596,0.9427,0.9273,0.9173,0.0532,0.1339
8,0.94,0.6566,0.94,0.9095,0.9166,0.058,0.1006
9,0.9431,0.6567,0.9431,0.9464,0.9168,0.042,0.1464


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9423,0.6814,0.9423,0.921,0.9186,0.0756,0.1453
1,0.9423,0.6342,0.9423,0.9212,0.9179,0.064,0.1341
2,0.9458,0.6598,0.9458,0.9375,0.9246,0.1422,0.2459
3,0.9422,0.5949,0.9422,0.9185,0.9183,0.0632,0.1262
4,0.9414,0.6482,0.9414,0.9137,0.9178,0.0608,0.1122
5,0.9427,0.6394,0.9427,0.9218,0.9185,0.0645,0.1347
6,0.9453,0.7004,0.9453,0.9429,0.9226,0.1195,0.2393
7,0.9427,0.6592,0.9427,0.9253,0.9181,0.0652,0.144
8,0.94,0.6488,0.94,0.9095,0.9166,0.058,0.1006
9,0.9436,0.6578,0.9436,0.9288,0.9208,0.1024,0.1895


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9409,0.6669,0.9409,0.915,0.9186,0.0831,0.1366
1,0.9431,0.6356,0.9431,0.9253,0.9213,0.1119,0.1888
2,0.9449,0.644,0.9449,0.9317,0.924,0.1388,0.2275
3,0.9405,0.5958,0.9405,0.9116,0.918,0.0699,0.1146
4,0.9422,0.6353,0.9422,0.9212,0.923,0.1394,0.196
5,0.9409,0.6319,0.9409,0.915,0.9196,0.0934,0.1431
6,0.9431,0.7026,0.9431,0.9248,0.9232,0.1432,0.2119
7,0.9427,0.6639,0.9427,0.9231,0.921,0.1104,0.1816
8,0.9409,0.6508,0.9409,0.9165,0.9199,0.1047,0.1578
9,0.9436,0.6452,0.9436,0.9278,0.9215,0.1134,0.1968


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9414,0.6686,0.9414,0.9153,0.9174,0.0615,0.1182
1,0.9427,0.632,0.9427,0.9253,0.9181,0.0652,0.144
2,0.9418,0.6514,0.9418,0.9185,0.9208,0.1069,0.1639
3,0.9409,0.5981,0.9409,0.9118,0.9175,0.0596,0.1064
4,0.9436,0.6338,0.9436,0.9259,0.9219,0.1128,0.1897
5,0.9427,0.6327,0.9427,0.9216,0.9192,0.0762,0.1459
6,0.944,0.6926,0.944,0.9288,0.9231,0.1362,0.2186
7,0.944,0.6634,0.944,0.9324,0.9211,0.1039,0.1995
8,0.9392,0.6552,0.9392,0.9085,0.9168,0.067,0.1051
9,0.944,0.6572,0.944,0.9324,0.9211,0.1039,0.1995


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9427,0.658,0.9427,0.946,0.9158,0.0282,0.1195
1,0.9431,0.634,0.9431,0.9351,0.9176,0.0544,0.1489
2,0.9444,0.6474,0.9444,0.9397,0.9203,0.0816,0.1907
3,0.9431,0.5904,0.9431,0.9323,0.9172,0.0412,0.1247
4,0.9436,0.6151,0.9436,0.931,0.919,0.067,0.1566
5,0.9444,0.6377,0.9444,0.9397,0.9203,0.0816,0.1907
6,0.9444,0.689,0.9444,0.9406,0.9206,0.094,0.2076
7,0.9436,0.6618,0.9436,0.9374,0.9186,0.0677,0.1705
8,0.9409,0.6373,0.9409,0.913,0.9171,0.0603,0.1117
9,0.9427,0.6326,0.9427,0.9273,0.9173,0.0532,0.1339


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9427,0.677,0.9427,0.9318,0.9166,0.0408,0.1242
1,0.9427,0.6386,0.9427,0.9318,0.9166,0.0408,0.1242
2,0.9444,0.6597,0.9444,0.9329,0.9217,0.1047,0.2004
3,0.9422,0.6031,0.9422,0.9178,0.9167,0.0389,0.0976
4,0.9444,0.668,0.9444,0.9397,0.9203,0.0816,0.1907
5,0.9427,0.6431,0.9427,0.9223,0.9177,0.0524,0.1225
6,0.944,0.708,0.944,0.935,0.9204,0.0925,0.194
7,0.9418,0.6585,0.9418,0.9168,0.9153,0.0261,0.0793
8,0.9436,0.6521,0.9436,0.9329,0.9194,0.0796,0.1758
9,0.9436,0.6545,0.9436,0.9468,0.9178,0.0556,0.1691


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9431,0.6826,0.9431,0.9464,0.9168,0.042,0.1464
1,0.9418,0.6398,0.9418,0.9172,0.9161,0.0386,0.0972
2,0.9436,0.6607,0.9436,0.931,0.919,0.067,0.1566
3,0.9427,0.6063,0.9427,0.9236,0.9169,0.04,0.1093
4,0.9431,0.6702,0.9431,0.9464,0.9164,0.0284,0.12
5,0.9431,0.6425,0.9431,0.9258,0.9188,0.0657,0.1447
6,0.9444,0.7043,0.9444,0.9475,0.9199,0.0823,0.2072
7,0.9418,0.6571,0.9418,0.9168,0.9153,0.0261,0.0793
8,0.9422,0.6515,0.9422,0.9217,0.9171,0.052,0.122
9,0.9427,0.6581,0.9427,0.946,0.9157,0.0282,0.1195


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9436,0.681,0.9436,0.9374,0.9186,0.0677,0.1705
1,0.9436,0.6428,0.9436,0.933,0.9194,0.0796,0.1758
2,0.9444,0.6625,0.9444,0.9301,0.9231,0.1266,0.213
3,0.9427,0.6079,0.9427,0.9223,0.9177,0.0524,0.1225
4,0.944,0.6571,0.944,0.9294,0.9215,0.1032,0.1904
5,0.9427,0.6472,0.9427,0.9216,0.9192,0.0762,0.1459
6,0.9449,0.7139,0.9449,0.9418,0.9216,0.1068,0.224
7,0.9436,0.6687,0.9436,0.9304,0.9201,0.0911,0.1824
8,0.9409,0.6547,0.9409,0.9141,0.9178,0.0718,0.1247
9,0.944,0.6608,0.944,0.935,0.9204,0.0925,0.194


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Feature: Index(['Production Qty Collect Result_Dam', 'Workorder_AutoClave',
       'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
       'DISCHARGED SPEED OF RESIN Collect Result_Fill1'],
      dtype='object'), Best Threshold: 0.874, Best F1 Score: 0.1909


##  valid 예측

In [18]:
best_f1   ## 이게 0.22 이상이 나와야 합니다!

0.2209695603156708

In [19]:
def analyze_class_distribution(y_pred):
    # 예측된 클래스의 유니크 값과 해당 값들의 개수 계산
    unique_classes, counts = np.unique(y_pred, return_counts=True)

    # 클래스별 개수 및 비율 계산
    total_count = len(y_pred)
    class_counts = dict(zip(unique_classes, counts))
    class_ratios = {cls: count / total_count for cls, count in class_counts.items()}

    # 결과를 딕셔너리로 반환
    results = {
        'class_counts': class_counts,
        'class_ratios': class_ratios
    }

    return results

In [22]:
analysis = analyze_class_distribution(y_valid_pred)

print("클래스별 개수:", analysis['class_counts'])
print("클래스별 비율:", analysis['class_ratios'])

클래스별 개수: {'AbNormal': 8102}
클래스별 비율: {'AbNormal': 1.0}


## 테스트 예측

In [26]:
# 모델이 예측
y_test_pred_original = predict_model(blended_model, data=test_df)

# Best Threshold 적용하여 최종 예측
y_test_pred = ['Normal' if score >= 0.88 else 'AbNormal' for score in y_test_pred_original['prediction_score']]

# 일관성 검사 적용해줘야함
Abnormal_rows = [64, 562, 1460, 1530, 1892, 2505, 2710, 3457, 3682, 3732, 4928, 4932, 6092,
 7001, 7287, 7666, 7836, 8253, 8898, 10989, 12439, 12585, 12844, 14756, 15180, 15406, 15811, 15964]


# 주어진 인덱스에 해당하는 값이 이미 "AbNormal"인지 확인하고, 아니라면 "AbNormal"로 변경
for idx in Abnormal_rows:
    if y_test_pred[idx] != "AbNormal":
        y_test_pred[idx] = "AbNormal"
    else:
        print(f'{idx} is passed')

64 is passed
562 is passed
1530 is passed
1892 is passed
2505 is passed
3457 is passed
3732 is passed
4928 is passed
7287 is passed
7666 is passed
10989 is passed
12439 is passed
12585 is passed
15180 is passed
15964 is passed


In [27]:
analysis = analyze_class_distribution(y_test_pred)

print("클래스별 개수:", analysis['class_counts'])
print("클래스별 비율:", analysis['class_ratios'])

클래스별 개수: {'AbNormal': 2076, 'Normal': 15285}
클래스별 비율: {'AbNormal': 0.11957836530153793, 'Normal': 0.880421634698462}


In [28]:
y_pred = y_test_pred

## 제출

In [29]:
sub_data = pd.read_csv('submission.csv')

# 수정된 결과를 CSV로 저장
sub_data['target'] = y_pred
sub_data.to_csv('submission.csv', index=False)

## 추론 결과 확인
- 생성한 모델에서 어떤 변수가 가장 영향을 주는지 확인할 필요가 있다.

In [None]:
import shap
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# 시스템에 있는 기본 폰트로 폰트 설정
font_list = fm.findSystemFonts(fontpaths=None, fontext='ttf')
if font_list:
    plt.rcParams['font.family'] = fm.FontProperties(fname=font_list[0]).get_name()

# 분석 결과 feature importance 확인
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_valid)

# feature importance plot
shap.summary_plot(shap_values, X_train, plot_type="bar",
                class_names= y_train.unique(), feature_names = X_train.columns)


print("-" * 80)
# visualize 
shap.summary_plot(shap_values, X_valid)