# Model #2

In [None]:
!pip install catboost imblearn

In [None]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import itertools

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [None]:
# Project root; every path below is built relative to it.
ROOT_DIR = "."
# NOTE(review): joining the root with "." keeps DATA_DIR pointing at the root
# itself, which is why the file names below carry the "data/" prefix.
DATA_DIR = os.path.join(ROOT_DIR, ".")

# Raw training and test tables.
train_data_v1 =  pd.read_csv(os.path.join(DATA_DIR, "data/train.csv"))
test_data_v1 = pd.read_csv(os.path.join(DATA_DIR, "data/test.csv"))

In [None]:

# Columns kept for Model #2: the selected feature columns followed by 'target'.
# Both the train and the test frame are later subset with this list.
lst2 = ['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam',
       'CURE END POSITION X Collect Result_Dam',
       'CURE SPEED Collect Result_Dam',
       'DISCHARGED SPEED OF RESIN Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
       'Dispense Volume(Stage1) Collect Result_Dam',
       'Dispense Volume(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
       'HEAD Standby Position X Collect Result_Dam',
       'Production Qty Collect Result_Dam', 'Receip No Collect Result_Dam',
       'Stage2 Circle1 Distance Speed Collect Result_Dam',
       'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam',
       '1st Pressure Collect Result_AutoClave',
       '1st Pressure 1st Pressure Unit Time_AutoClave',
       '2nd Pressure Collect Result_AutoClave',
       '3rd Pressure Collect Result_AutoClave',
       '3rd Pressure Unit Time_AutoClave',
       'Chamber Temp. Collect Result_AutoClave',
       'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1',
       'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
       'Dispense Volume(Stage1) Collect Result_Fill1',
       'Dispense Volume(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
       'Head Purge Position Z Collect Result_Fill1', 'Equipment_Fill2',
       'CURE END POSITION X Collect Result_Fill2',
       'CURE END POSITION Z Collect Result_Fill2',
       'CURE SPEED Collect Result_Fill2',
       'CURE STANDBY POSITION Z Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
       'WorkMode Collect Result_Fill2',
        'target']

In [None]:
# Unify the Equipment_xxx labels to '#1' / '#2' in both the train and test frames.
equipment_aliases = {
    'Equipment_Dam': {
        'Dam dispenser #1': '#1',
        'Dam dispenser #2': '#2',
    },
    'Equipment_Fill1': {
        'Fill1 dispenser #1': '#1',
        'Fill1 dispenser #2': '#2',
    },
    'Equipment_Fill2': {
        'Fill2 dispenser #1': '#1',
        'Fill2 dispenser #2': '#2',
    },
}

# Train frame first, then test frame; within each, Dam -> Fill1 -> Fill2.
for frame in (train_data_v1, test_data_v1):
    for column, aliases in equipment_aliases.items():
        frame[column] = frame[column].replace(aliases)

In [None]:
# Coordinate columns whose raw values are not all numeric.
outlier_ok = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

# Non-numeric entries become NaN through coercion and are then replaced by 0.
for frame in (train_data_v1, test_data_v1):
    frame[outlier_ok] = frame[outlier_ok].apply(pd.to_numeric, errors='coerce').fillna(0)

# Binary target: 0 for 'Normal', 1 for every other label.
train_data_v1['target'] = train_data_v1['target'].apply(lambda label: 0 if label == 'Normal' else 1)

# Restrict both frames to the selected columns.
train_data = train_data_v1[lst2]
test_data = test_data_v1[lst2]

In [None]:
# Columns treated as categorical by the resampling and model cells below.
category=['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam', 'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']

In [None]:

X = train_data.drop('target', axis=1)
y = train_data['target']

# Positional indices of the categorical columns (the form SMOTENC is given here)
categorical_features = [X.columns.get_loc(col) for col in category]

# Oversample the minority class with SMOTENC up to half the size of the majority class
smote_nc = SMOTENC(categorical_features=categorical_features, sampling_strategy=0.5, random_state=42)

# Undersample the majority class down to the size of the minority class (ratio 1.0)
undersample = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

# Chain SMOTENC and the undersampler in one pipeline
pipeline = Pipeline(steps=[('smote', smote_nc), ('undersample', undersample)])

# Resample the whole training set.
# NOTE(review): this runs before the cross-validation split in the later cells, so
# validation folds contain resampled rows — fold scores may be optimistic; confirm intended.
X_res, y_res = pipeline.fit_resample(X, y)

# Class distribution after resampling
print(f'Resampled dataset shape {Counter(y_res)}')

# Rebuild a single frame (features + target) from the resampled data
df_resampled = pd.concat([pd.DataFrame(X_res, columns=X.columns), pd.DataFrame(y_res, columns=['target'])], axis=1)
# print(df_resampled)
print(df_resampled['target'].value_counts())

Resampled dataset shape Counter({0: 19078, 1: 19078})
target
0    19078
1    19078
Name: count, dtype: int64


In [None]:
X = df_resampled.drop('target', axis=1)
y = df_resampled['target']
X_test = test_data.drop('target', axis=1)

# Stratified K-fold cross-validation setup
n_splits = 5  # number of folds (K)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions for every training row (input to the stacking model).
train_fold_predict_c = np.zeros((X.shape[0], 1))
# One column of test predictions per fold. Sized from n_splits (previously a
# hard-coded 5) so that changing K cannot overflow the buffer or leave zero columns.
val_predict_c = np.zeros((X_test.shape[0], n_splits))

f1_scores = []

# Train one CatBoost model per fold and collect its predictions
for cnt,(train_index, val_index) in enumerate(skf.split(X, y)):
    x_train, x_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        eval_metric='F1',
        random_seed=42,
        logging_level='Verbose',
        use_best_model=True
    )

    model.fit(
        x_train, y_train,
        cat_features=categorical_features,
        eval_set=(x_val, y_val),
        verbose=False
    )

    # Predictions on the held-out fold and on the test set
    y_pred = model.predict(x_val)
    train_fold_predict_c[val_index, : ] = y_pred.reshape(-1, 1)
    val_predict_c[:, cnt] = model.predict(X_test)
# Average the per-fold test predictions into a single stacking feature
val_predict_mean_c = np.mean(val_predict_c, axis=1).reshape(-1,1)


bestTest = 0.9357418643
bestIteration = 897

Shrink model to first 898 iterations.

bestTest = 0.9342159702
bestIteration = 794

Shrink model to first 795 iterations.

bestTest = 0.9374827871
bestIteration = 849

Shrink model to first 850 iterations.

bestTest = 0.9379632171
bestIteration = 949

Shrink model to first 950 iterations.

bestTest = 0.9388145315
bestIteration = 989

Shrink model to first 990 iterations.


In [None]:
from lightgbm import LGBMClassifier

X = df_resampled.drop('target', axis=1)
y = df_resampled['target']
X_test = test_data.drop('target', axis=1)

# Integer-encode the categorical columns. The code table is derived from the
# training column and reused for the test column: encoding each frame on its own
# would assign codes from each frame's own category set, so the same label could
# get different integers in train and test. Labels unseen in training map to -1.
for col in category:
    train_levels = pd.Categorical(X[col]).categories
    X[col] = pd.Categorical(X[col], categories=train_levels).codes
    X_test[col] = pd.Categorical(X_test[col], categories=train_levels).codes

# Stratified K-fold cross-validation setup
n_splits = 5  # number of folds (K)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions for every training row (input to the stacking model).
train_fold_predict_l = np.zeros((X.shape[0], 1))
# One column of test predictions per fold, sized from n_splits rather than a literal 5.
val_predict_l = np.zeros((X_test.shape[0], n_splits))

f1_scores = []

# Train one LightGBM model per fold and collect its predictions
for cnt,(train_index, val_index) in enumerate(skf.split(X, y)):
    x_train, x_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = LGBMClassifier(n_estimators=1000, num_leaves=64,learning_rate = 0.1, n_jobs=-1,verbose=-1, boost_from_average=False)

    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        eval_metric = 'f1',
    )

    # Predictions on the held-out fold and on the test set
    y_pred = model.predict(x_val)
    train_fold_predict_l[val_index, : ] = y_pred.reshape(-1, 1)
    val_predict_l[:, cnt] = model.predict(X_test)
# Average the per-fold test predictions into a single stacking feature
val_predict_mean_l = np.mean(val_predict_l, axis=1).reshape(-1,1)

In [None]:
# Level-1 feature matrices for the stacking model: LightGBM column first, CatBoost second.
new_train = np.hstack((train_fold_predict_l, train_fold_predict_c))
new_test = np.hstack((val_predict_mean_l, val_predict_mean_c))
print(new_train.shape,new_test.shape)

(38156, 2) (17361, 2)


In [None]:
# Meta-model (LightGBM) trained on the two stacked prediction columns.
# NOTE(review): the next cell rebinds both new_model and pred with a random forest,
# so this cell's predictions are not the ones written to the submission when the
# notebook is run top to bottom — confirm which meta-model is intended.
new_model =   LGBMClassifier(random_state = 42,n_estimators=1000, num_leaves=64,learning_rate = 0.1, n_jobs=-1,verbose=-1, boost_from_average=False)
new_model.fit(new_train, y)
pred = new_model.predict(new_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Meta-model (random forest) trained on the two stacked prediction columns;
# rebinds new_model and pred, replacing the values set by the previous cell.
new_model = RandomForestClassifier(random_state=42, n_estimators=1000)
new_model.fit(new_train, y)
pred = new_model.predict(new_test)

In [None]:
def apply_rules(df):
    """Return a boolean Series marking the rows of ``df`` that match a hand-written rule.

    A row is flagged when any of the following holds:
      * its 'Receip No Collect Result' columns hold more than one distinct value,
      * its 'Equipment' columns (excluding 'Equipment_AutoClave') hold more than
        one distinct value,
      * its 'Production Qty Collect Result' columns hold more than one distinct value,
      * its 'Workorder_Dam' is one of two listed work orders.

    Args:
        df: DataFrame containing 'Workorder_Dam' and the column families above.

    Returns:
        Boolean Series indexed like ``df``.
    """
    def _disagrees(columns):
        # True where the given columns do not all share a single value in the row.
        return df[columns].nunique(axis=1) > 1

    names = list(df.columns)
    receipt_cols = [name for name in names if 'Receip No Collect Result' in name]
    equipment_cols = [
        name for name in names
        if 'Equipment' in name and name != 'Equipment_AutoClave'
    ]
    quantity_cols = [name for name in names if 'Production Qty Collect Result' in name]

    listed_workorder = df['Workorder_Dam'].isin(["3KPXX094-0001", "4CPXX084-0001"])

    flagged = _disagrees(receipt_cols) | _disagrees(equipment_cols)
    flagged = flagged | _disagrees(quantity_cols)
    return flagged | listed_workorder
# Rows of the full test frame that match one of the hand-written rules.
condition = apply_rules(test_data_v1)
# Force every flagged row to class 1, overriding the meta-model's prediction.
# assumes pred is ordered like the rows of test_data_v1 — TODO confirm
pred[condition]=1

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 of predictions.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels; passed straight to the scikit-learn metric functions.

    Returns:
        None. The metrics are only printed.
    """
    label_order = [True, False]

    confusion = confusion_matrix(y_test, y_pred, labels=label_order)
    # Each entry pairs the output template with its metric; the metrics are
    # evaluated in this order before anything is printed.
    report = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    )

    print("오차행렬:\n", confusion)
    for template, value in report:
        print(template.format(value))

In [None]:
# Map the numeric predictions back to string labels: 1 -> 'AbNormal', otherwise 'Normal'.
pred = np.where(pred == 1, 'AbNormal', 'Normal')

df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))
df_sub["target"] = pred

# Save the submission file. The output directory is created first so the write
# does not fail on a fresh checkout where ./csv does not exist yet.
os.makedirs('./csv', exist_ok=True)
df_sub.to_csv(os.path.join('./csv', "model2_submission.csv"), index=False)
print('done')

done


# Model #5


In [None]:
from typing import Any, Tuple, List
import os
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from collections import Counter, defaultdict
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler, BorderlineSMOTE, ADASYN, KMeansSMOTE, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def apply_encoding(X: pd.DataFrame, category: List[str], encoding_type: str) -> pd.DataFrame:
    """Encode the categorical columns of ``X`` and return the resulting frame.

    Args:
        X: input features.
        category: names of the columns to encode.
        encoding_type: "onehot" replaces the columns with one-hot columns (first
            level dropped) appended after the remaining columns, with the row index
            reset; "label" replaces each column with integer codes on a copy of ``X``.
            Any other value returns ``X`` unchanged.

    Returns:
        The encoded DataFrame. The encoders are fitted on ``X`` itself on every call.
    """
    if encoding_type == "onehot":
        onehot = OneHotEncoder(sparse_output=False, drop='first')
        dummy_values = onehot.fit_transform(X[category])
        dummy_frame = pd.DataFrame(dummy_values, columns=onehot.get_feature_names_out(category))
        untouched = X.drop(columns=category).reset_index(drop=True)
        return pd.concat([untouched, dummy_frame.reset_index(drop=True)], axis=1)

    if encoding_type == "label":
        encoded = X.copy()
        for name in category:
            encoded[name] = LabelEncoder().fit_transform(encoded[name])
        return encoded

    return X

In [None]:
# Constants
DATA_DIR = './data'

# Load the raw training and test tables
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Columns kept for Model #5: the selected feature columns followed by 'target'.
# Both the train and the test frame are subset with this list in the next cell.
selected_features = [
    'Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam',
    'CURE END POSITION X Collect Result_Dam', 'CURE SPEED Collect Result_Dam',
    'DISCHARGED SPEED OF RESIN Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
    'Dispense Volume(Stage1) Collect Result_Dam',
    'Dispense Volume(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
    'HEAD Standby Position X Collect Result_Dam',
    'Production Qty Collect Result_Dam', 'Receip No Collect Result_Dam',
    'Stage2 Circle1 Distance Speed Collect Result_Dam',
    'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam',
    '1st Pressure Collect Result_AutoClave',
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Collect Result_AutoClave', '3rd Pressure Collect Result_AutoClave',
    '3rd Pressure Unit Time_AutoClave', 'Chamber Temp. Collect Result_AutoClave',
    'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1',
    'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    'Head Purge Position Z Collect Result_Fill1', 'Equipment_Fill2',
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2', 'CURE SPEED Collect Result_Fill2',
    'CURE STANDBY POSITION Z Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'WorkMode Collect Result_Fill2',
    'target'
]

In [None]:
# Take explicit copies of the column subsets: the following cells assign into these
# frames, and writing into a derived selection is what triggers pandas'
# SettingWithCopyWarning.
train_data_v1 = train_data[selected_features].copy()
test_data_v1 = test_data[selected_features].copy()

# Binarise the target (Normal: 0, anything else: 1)
train_data_v1.loc[:, 'target'] = (train_data_v1['target'] != 'Normal').astype(int)

In [None]:
# Columns that contain the literal 'OK' (the judge-value column is excluded and stays
# as it is). Series.eq compares element-wise and simply yields False for numeric
# columns, whereas `'OK' in series.unique()` compares a string against numeric arrays.
columns_to_replace = [
    col for col in train_data_v1
    if col != 'Chamber Temp. Judge Value_AutoClave' and train_data_v1[col].eq('OK').any()
]
# Coerce those columns to numbers: non-numeric entries become NaN and are then set to 0.
train_data_v1.loc[:, columns_to_replace] = train_data_v1[columns_to_replace].apply(pd.to_numeric, errors='coerce').fillna(0)
test_data_v1.loc[:, columns_to_replace] = test_data_v1[columns_to_replace].apply(pd.to_numeric, errors='coerce').fillna(0)

  columns_to_replace = [col for col in train_data_v1 if 'OK' in train_data_v1[col].unique() and col != 'Chamber Temp. Judge Value_AutoClave']


In [None]:
# Unify the Equipment_xxx labels to '#1' / '#2' in both the train and test frames.
equipment_aliases = {
    'Equipment_Dam': {
        'Dam dispenser #1': '#1',
        'Dam dispenser #2': '#2',
    },
    'Equipment_Fill1': {
        'Fill1 dispenser #1': '#1',
        'Fill1 dispenser #2': '#2',
    },
    'Equipment_Fill2': {
        'Fill2 dispenser #1': '#1',
        'Fill2 dispenser #2': '#2',
    },
}

# Train frame first, then test frame; within each, Dam -> Fill1 -> Fill2.
for frame in (train_data_v1, test_data_v1):
    for column, aliases in equipment_aliases.items():
        frame.loc[:, column] = frame[column].replace(aliases)

In [None]:
# Try to convert every column to a numeric dtype; columns that cannot be converted
# are returned unchanged (errors='ignore').
# NOTE(review): recent pandas releases deprecate errors='ignore' — confirm against the
# pinned pandas version and switch to an explicit try/except per column if needed.
train_data_v1 = train_data_v1.apply(pd.to_numeric, errors='ignore')
test_data_v1 = test_data_v1.apply(pd.to_numeric, errors='ignore')

In [None]:
# List the columns that contain at least one value that cannot be parsed as a number.
fully_numeric = train_data_v1.apply(
    lambda col: pd.to_numeric(col, errors='coerce').notna().all()
)
non_numeric_columns = train_data_v1.columns[~fully_numeric].tolist()
print(non_numeric_columns)

['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam', 'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']


In [None]:
from typing import Any, Tuple, List
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from collections import Counter, defaultdict
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler, BorderlineSMOTE, ADASYN, KMeansSMOTE, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def apply_encoding(X: pd.DataFrame, category: List[str], encoding_type: str) -> pd.DataFrame:
    """Encode the categorical columns of ``X`` and return the resulting frame.

    NOTE(review): this re-defines the identical ``apply_encoding`` from an earlier
    cell of this notebook; one of the two definitions can probably be removed.

    Args:
        X: input features.
        category: names of the columns to encode.
        encoding_type: "onehot" or "label"; any other value returns ``X`` unchanged.

    Returns:
        The encoded DataFrame. The encoders are fitted on ``X`` itself on every call.
    """
    if encoding_type == "onehot":
        # One-hot encode (first level of each column dropped) and append the new
        # columns after the remaining ones; the row index is reset.
        encoder = OneHotEncoder(sparse_output=False, drop='first')
        X_encoded = encoder.fit_transform(X[category])
        X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(category))
        X = pd.concat([X.drop(columns=category).reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)
    elif encoding_type == "label":
        # Label-encode each column on a copy so the caller's frame is not modified
        X = X.copy()
        for col in category:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
    return X

In [None]:
# Categorical columns = every object-dtype column of the training frame
category = train_data_v1.select_dtypes(include='object').columns.tolist()

# Configure SMOTEENN and RandomUnderSampler
# NOTE(review): SMOTEENN is imported from imblearn.combine, so "oversampler" is a
# loose name for it — confirm the intended resampling behaviour.
oversampler = SMOTEENN(random_state=42, sampling_strategy=0.5)
undersampler = RandomUnderSampler(random_state=42, sampling_strategy=1.0)

# Split features and target
X = train_data_v1.drop('target', axis=1)
y = train_data_v1['target']

# Apply one-hot encoding
X_encoded = apply_encoding(X, category, "onehot")

# Build the pipeline and resample the whole training set
# NOTE(review): resampling happens before the cross-validation split below, so the
# validation folds contain resampled rows — fold scores may be optimistic.
pipeline = Pipeline(steps=[('oversampler', oversampler), ('undersampler', undersampler)])
X_res, y_res = pipeline.fit_resample(X_encoded, y)

# Rebuild a single frame (features + target) from the resampled data
df_resampled = pd.concat([pd.DataFrame(X_res, columns=X_encoded.columns), pd.DataFrame(y_res, columns=['target'])], axis=1)
print(f"Resampled dataset shape: {Counter(y_res)}")
print(df_resampled['target'].value_counts())

# Split the resampled frame back into features and target
X_resampled = df_resampled.drop('target', axis=1)
y_resampled = df_resampled['target']

# Stratified K-fold cross-validation setup
n_splits = 4
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

f1_scores = []

# Train and evaluate one model per fold.
# After the loop, `model` refers to the model fitted on the last fold's training split.
for train_index, val_index in skf.split(X_resampled, y_resampled):
    x_train, x_val = X_resampled.iloc[train_index], X_resampled.iloc[val_index]
    y_train, y_val = y_resampled.iloc[train_index], y_resampled.iloc[val_index]

    # Configure and fit the CatBoostClassifier
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        eval_metric='F1',
        random_seed=42,
        logging_level='Verbose',
        use_best_model=True
    )

    model.fit(
        x_train, y_train,
        cat_features=None,  # not needed: the categorical columns are already one-hot encoded
        eval_set=(x_val, y_val),
        verbose=False
    )

    # Predict on the held-out fold and record its F1 score
    y_pred = model.predict(x_val)
    f1 = f1_score(y_val, y_pred)
    f1_scores.append(f1)
    print(f"Fold F1 Score: {f1}")

# Mean F1 score over all folds
mean_f1 = np.mean(f1_scores)
print(f"Mean F1 Score: {mean_f1}")

Resampled dataset shape: Counter({0: 14330, 1: 14330})
target
0    14330
1    14330
Name: count, dtype: int64

bestTest = 0.955198162
bestIteration = 974

Shrink model to first 975 iterations.
Fold F1 Score: 0.9551981619758759

bestTest = 0.943839791
bestIteration = 946

Shrink model to first 947 iterations.
Fold F1 Score: 0.9438397910317806

bestTest = 0.9447425671
bestIteration = 840

Shrink model to first 841 iterations.
Fold F1 Score: 0.9447425670775924

bestTest = 0.9392924734
bestIteration = 798

Shrink model to first 799 iterations.
Fold F1 Score: 0.9392924734313582
Mean F1 Score: 0.9457682483791519


In [None]:
# NOTE(review): re-reads the raw test CSV into test_data, while the prediction cell
# that follows works on test_data_v1 — confirm this reload is still needed.
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Test-set preprocessing
X_test = test_data_v1.drop('target', axis=1, errors='ignore')

# One-hot encode the test categoricals with an encoder fitted on the TRAINING
# features. Calling apply_encoding(X_test, ...) would fit a fresh encoder on the
# test set, whose category set (and therefore the dropped first level and the
# resulting column layout) can differ from what the model was trained on.
# handle_unknown='ignore' encodes categories unseen in training as all zeros.
train_fitted_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
train_fitted_encoder.fit(X[category])
X_test_onehot = pd.DataFrame(
    train_fitted_encoder.transform(X_test[category]),
    columns=train_fitted_encoder.get_feature_names_out(category),
)
X_test_encoded = pd.concat(
    [X_test.drop(columns=category).reset_index(drop=True), X_test_onehot],
    axis=1,
)
# Enforce exactly the training column layout; a missing column raises a KeyError
# here instead of silently feeding misaligned features to the model.
X_test_encoded = X_test_encoded[X_encoded.columns]

# Predict with the trained model (the one fitted on the last cross-validation fold)
y_test_pred = model.predict(X_test_encoded)

In [None]:
# Load the submission template
df_sub = pd.read_csv("submission.csv")

# Write the string labels in one assignment (1 -> "AbNormal", otherwise "Normal").
# Assigning the integers first and then overwriting the column through .loc sets
# string values into an integer column, which pandas flags as an incompatible-dtype set.
df_sub["target"] = np.where(np.ravel(y_test_pred) == 1, "AbNormal", "Normal")

In [None]:
# Save the submission file; create the output directory first so the write does
# not fail when ./csv does not exist yet.
os.makedirs("./csv", exist_ok=True)
df_sub.to_csv("./csv/model5_submission.csv", index=False)

# Model #1

In [None]:
import os
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
import itertools

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from sklearn.model_selection import StratifiedKFold, GridSearchCV



### CSV 파일 읽어오기

In [None]:
# Paths are resolved relative to the project root.
ROOT_DIR = "."
DATA_DIR = os.path.join(ROOT_DIR, "data")   # input CSV files
SAVE_DIR = os.path.join(ROOT_DIR, "csv")    # output directory
# Seed constant (not referenced in the cells visible in this part of the notebook).
RANDOM_STATE = 110

# Load the raw training table and report its shape
train_data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
print(train_data.shape)

(40506, 464)


### data 처리 v1

In [None]:
# Columns kept for Model #1: the selected feature columns followed by 'target'.
# Unlike the lists used by the other models, this one keeps all three stages of the
# resin time/volume columns and of the X/Y/Z head coordinates for Dam and Fill1.
lst3 = ['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam',
       'CURE END POSITION X Collect Result_Dam',
       'CURE SPEED Collect Result_Dam',

       'DISCHARGED SPEED OF RESIN Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'Dispense Volume(Stage1) Collect Result_Dam',
       'Dispense Volume(Stage2) Collect Result_Dam',
       'Dispense Volume(Stage3) Collect Result_Dam',

       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',

       'HEAD Standby Position X Collect Result_Dam',
       'Production Qty Collect Result_Dam', 'Receip No Collect Result_Dam',
       'Stage2 Circle1 Distance Speed Collect Result_Dam',
       'THICKNESS 2 Collect Result_Dam', 'THICKNESS 3 Collect Result_Dam',
       '1st Pressure Collect Result_AutoClave',
       '1st Pressure 1st Pressure Unit Time_AutoClave',
       '2nd Pressure Collect Result_AutoClave',
       '3rd Pressure Collect Result_AutoClave',
       '3rd Pressure Unit Time_AutoClave',
       'Chamber Temp. Collect Result_AutoClave',
       'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1',

       'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
       'Dispense Volume(Stage1) Collect Result_Fill1',
       'Dispense Volume(Stage2) Collect Result_Fill1',
       'Dispense Volume(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
       'Head Purge Position Z Collect Result_Fill1', 'Equipment_Fill2',
       'CURE END POSITION X Collect Result_Fill2',
       'CURE END POSITION Z Collect Result_Fill2',
       'CURE SPEED Collect Result_Fill2',
       'CURE STANDBY POSITION Z Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
       'WorkMode Collect Result_Fill2',
        'target']

In [None]:
# Take an explicit copy of the column subset: the following cells assign into this
# frame, and doing that on a derived selection raises pandas' SettingWithCopyWarning
# ("A value is trying to be set on a copy of a slice from a DataFrame").
train_data_v1 = train_data[lst3].copy()
print(train_data_v1.shape)

(40506, 61)


In [None]:
# Binarise the target: 'Normal' -> 0, every other label -> 1
train_data_v1['target'] = train_data_v1['target'].apply(lambda x: 0 if x == 'Normal' else 1)
# train_data_v1.to_csv(os.path.join(SAVE_DIR, "data_prep_1.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# In the columns that mix 'OK' with numbers, coerce the non-numeric entries to NaN
# and then fill them with 0 (the stored value ends up as 0, not null).
outlier_ok = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
              'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']
train_data_v1[outlier_ok] = train_data_v1[outlier_ok].apply(pd.to_numeric, errors='coerce').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Unify the Equipment_xxx labels to '#1' / '#2'.
equipment_aliases = {
    'Equipment_Dam': {
        'Dam dispenser #1': '#1',
        'Dam dispenser #2': '#2',
    },
    'Equipment_Fill1': {
        'Fill1 dispenser #1': '#1',
        'Fill1 dispenser #2': '#2',
    },
    'Equipment_Fill2': {
        'Fill2 dispenser #1': '#1',
        'Fill2 dispenser #2': '#2',
    },
}

for column, aliases in equipment_aliases.items():
    train_data_v1[column] = train_data_v1[column].replace(aliases)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Try to convert every column to a numeric dtype; columns that cannot be converted
# are returned unchanged (errors='ignore').
# NOTE(review): recent pandas releases deprecate errors='ignore' — confirm against the
# pinned pandas version and switch to an explicit try/except per column if needed.
train_data_v1 = train_data_v1.apply(pd.to_numeric, errors='ignore')

In [None]:
# HEAD NORMAL COORDINATE columns (X/Y/Z, Stage1-3, Dam then Fill1) handled by the
# helper functions below. The order matters: the list is zipped pairwise with
# modified_intervals_list when the transforms are applied.
columns_of_interest = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
       'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'
      ]

# One entry per column of columns_of_interest (same order): a list of inclusive
# (low, high) value ranges. Values inside a range are later shifted by that range's mean.
# NOTE(review): the bounds are hand-entered constants; their derivation is not shown here.
modified_intervals_list = [
[(152.4, 172.4), (538.5, 560.3)],
[(151.2, 174.5), (452.2, 475.5), (538.9, 562.1)],
[(149.5, 170.8), (452.2, 477.5), (541.1, 562.0)],
[(149.8, 172.9), (367.3, 387.5), (540.5, 564.0), (1259.0, 1281.8)],
[(367.0, 394.0), (1258.2, 1291.8)],
[(367.1, 393.1), (1259.0, 1283.2), (1373.9, 1404.2)],
[(272.15, 294.8), (367.3, 394.0), (1261.8, 1292.0)],
[(271.8, 276.78), (278.894, 286.8)],
[(271.8, 276.78), (278.894, 286.8)],
[(827.5, 848.4)],
[(146.1, 167.0), (448.2, 470.5), (827.4, 848.4)],
[(147.0, 168.0), (447.6, 468.8), (828.1, 848.4)],
[(146.1, 167.0), (420.2, 440.5), (828.1, 848.4), (1313.2, 1333.2)],
[(419.8, 441.1), (1312.5, 1342.8)],
[(419.5, 440.8), (1312.5, 1335.8)],
[(234.2, 254.6), (420.0, 441.1), (1312.8, 1342.7)],
[(234.2, 254.728)],
[(234.2, 254.728)]
]


def getmean(df, col, intervals):
    """Return the mean of ``df[col]`` within each inclusive ``(low, high)`` interval.

    Args:
        df: source DataFrame.
        col: name of the column to average.
        intervals: iterable of ``(low, high)`` pairs; both bounds are inclusive.

    Returns:
        A list with one mean per interval, in input order (NaN when no value
        falls inside the interval).
    """
    values = df[col]
    return [values[values.between(low, high)].mean() for low, high in intervals]

def change_to_dis(df, col, intervals, mean_values):
    """Shift the values of ``df[col]`` inside each interval by that interval's mean, in place.

    Intervals are processed in order; for each inclusive ``(low, high)`` pair the
    matching ``mean_values`` entry is subtracted from the values currently inside it.

    Args:
        df: DataFrame modified in place.
        col: name of the column to adjust.
        intervals: iterable of inclusive ``(low, high)`` pairs.
        mean_values: one offset per interval, paired positionally.

    Returns:
        None.
    """
    for bounds, offset in zip(intervals, mean_values):
        low, high = bounds
        inside = (df[col] >= low) & (df[col] <= high)
        df.loc[inside, col] = df.loc[inside, col] - offset

def scale_col(df, columns_of_interest, scale_factor):
    """Multiply the given columns of ``df`` by ``scale_factor``, in place.

    Args:
        df: DataFrame modified in place.
        columns_of_interest: list of column names to scale.
        scale_factor: multiplier applied to every selected value.

    Returns:
        None.
    """
    scaled = df[columns_of_interest].mul(scale_factor)
    df[columns_of_interest] = scaled

def euclidean_dis(df):
    """Replace the per-axis head coordinates with Euclidean-norm columns, in place.

    For each of Stage1-3 and for the Dam and Fill1 processes, adds
    ``'Euclidean Distance <process> (<stage>)'`` = sqrt(x**2 + y**2 + z**2) of the
    matching 'HEAD NORMAL COORDINATE {X,Y,Z} AXIS' columns, then drops those six
    coordinate columns for the stage.

    Args:
        df: DataFrame modified in place; must contain all 18 coordinate columns.

    Returns:
        None.
    """
    for stage in ('Stage1', 'Stage2', 'Stage3'):
        # Column names per process, in X, Y, Z order.
        axis_cols = {
            process: [
                f'HEAD NORMAL COORDINATE {axis} AXIS({stage}) Collect Result_{process}'
                for axis in ('X', 'Y', 'Z')
            ]
            for process in ('Dam', 'Fill1')
        }

        # Dam first, then Fill1, so the new columns are appended in that order.
        for process, (x_col, y_col, z_col) in axis_cols.items():
            df[f'Euclidean Distance {process} ({stage})'] = np.sqrt(
                df[x_col]**2 + df[y_col]**2 + df[z_col]**2
            )

        df.drop(axis_cols['Dam'] + axis_cols['Fill1'], axis=1, inplace=True)

In [None]:
# Remove the outlier rows. The filtered frame is copied explicitly because the
# helpers below modify it in place; writing into a filtered selection is what
# triggers pandas' SettingWithCopyWarning.
train_data_v1 = train_data_v1[train_data_v1['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'] != 225.85].copy()

# Turn the raw coordinates into offsets from the mean of the interval they fall in
for column, intervals in zip(columns_of_interest, modified_intervals_list):
    mean_values = getmean(train_data_v1, column, intervals)
    change_to_dis(train_data_v1, column, intervals, mean_values)
# Scale the offsets by a factor of 10
scale_col(train_data_v1, columns_of_interest, 10)
# Replace the per-axis columns with Euclidean-distance features
euclidean_dis(train_data_v1)

In [None]:
'''
       'DISCHARGED SPEED OF RESIN Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam',
       'Dispense Volume(Stage1) Collect Result_Dam',
       'Dispense Volume(Stage2) Collect Result_Dam',
       'Dispense Volume(Stage3) Collect Result_Dam',

              'DISCHARGED SPEED OF RESIN Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
       'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
       'Dispense Volume(Stage1) Collect Result_Fill1',
       'Dispense Volume(Stage2) Collect Result_Fill1',
       'Dispense Volume(Stage3) Collect Result_Fill1',
'''

def make_resin_feature(df):
  """Add per-stage resin features for the Dam and Fill1 processes in place.

  For each stage i in 1..3 two columns are appended to ``df``:
    * ``new_resin_dam_stage{i}``   = speed_dam * time_dam / volume_dam
    * ``new_resin_fill1_stage{i}`` = speed_fill1 * time_fill1 * 100 / volume_fill1 - 1100

  The source columns are left in place; nothing is returned.
  """
  speed_dam = df['DISCHARGED SPEED OF RESIN Collect Result_Dam']
  speed_fill1 = df['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
  for stage_no in (1, 2, 3):
    dam_time = df[f'DISCHARGED TIME OF RESIN(Stage{stage_no}) Collect Result_Dam']
    dam_volume = df[f'Dispense Volume(Stage{stage_no}) Collect Result_Dam']
    fill1_time = df[f'DISCHARGED TIME OF RESIN(Stage{stage_no}) Collect Result_Fill1']
    fill1_volume = df[f'Dispense Volume(Stage{stage_no}) Collect Result_Fill1']

    # Compute both features before writing either one back to the frame
    dam_feature = speed_dam * dam_time / dam_volume
    fill1_feature = speed_fill1 * fill1_time * 100 / fill1_volume - 1100

    df[f'new_resin_dam_stage{stage_no}'] = dam_feature
    df[f'new_resin_fill1_stage{stage_no}'] = fill1_feature
def drop_origin_regin(df):
  """Drop, in place, the raw resin columns of the Dam and Fill1 processes.

  Removed per process: the discharged-speed column plus the discharged-time
  and dispense-volume columns of Stage1..Stage3 (14 columns in total).
  Raises KeyError if any of them is missing from ``df``.
  """
  columns_to_drop = []
  for process in ('Dam', 'Fill1'):
    columns_to_drop.append(f'DISCHARGED SPEED OF RESIN Collect Result_{process}')
    columns_to_drop.extend(
        f'DISCHARGED TIME OF RESIN(Stage{stage_no}) Collect Result_{process}'
        for stage_no in (1, 2, 3)
    )
    columns_to_drop.extend(
        f'Dispense Volume(Stage{stage_no}) Collect Result_{process}'
        for stage_no in (1, 2, 3)
    )
  df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Add the derived resin features, drop the raw resin columns they were built
# from, and persist the prepared training frame.
make_resin_feature(train_data_v1)
drop_origin_regin(train_data_v1)
# NOTE(review): SAVE_DIR is not defined in the part of the notebook visible
# before this cell (the first visible definition is in the Model #6 section) —
# confirm it exists on a fresh kernel.
train_data_v1.to_csv(os.path.join(SAVE_DIR, "data_prep_resin.csv"), index=False)
'''
new_resin_dam_stage1
new_resin_dam_stage2
new_resin_dam_stage3
new_resin_fill1_stage1
new_resin_fill1_stage2
new_resin_fill1_stage3
'''

'\nnew_resin_dam_stage1\nnew_resin_dam_stage2\nnew_resin_dam_stage3\nnew_resin_fill1_stage1\nnew_resin_fill1_stage2\nnew_resin_fill1_stage3\n'

## train v1

In [None]:
# Find every column holding at least one value that cannot be parsed as a number
# (a missing value also counts, because it stays NaN after coercion)
non_numeric_columns = [
    name
    for name, values in train_data_v1.items()
    if not pd.to_numeric(values, errors='coerce').notna().all()
]
print(non_numeric_columns)

['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam', 'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1', 'Equipment_Fill2']


balanced sampling

In [None]:
# Balance the classes: oversample the minority class with SMOTENC, then
# undersample the majority class until both classes have the same size.
# NOTE(review): the whole training set is resampled here, before the
# cross-validation split in the next cell, so synthetic samples can end up in
# the validation folds and make the reported F1 optimistic — consider
# resampling inside each training fold instead.
category = non_numeric_columns
X = train_data_v1.drop('target', axis=1)
y = train_data_v1['target']

# Positional indices of the categorical columns within X
categorical_features = [X.columns.get_loc(col) for col in category]

# SMOTENC: grow the minority class to half the size of the majority class
smote_nc = SMOTENC(categorical_features=categorical_features, sampling_strategy=0.5, random_state=42)

# Undersampling: shrink the majority class to the size of the minority class
undersample = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

# Chain SMOTENC and undersampling in one pipeline
pipeline = Pipeline(steps=[('smote', smote_nc), ('undersample', undersample)])

# Apply the pipeline to resample the data
X_res, y_res = pipeline.fit_resample(X, y)

# Class distribution after resampling
print(f'Resampled dataset shape {Counter(y_res)}')

# Reassemble features and target into a single frame and check the result
df_resampled = pd.concat([pd.DataFrame(X_res, columns=X.columns), pd.DataFrame(y_res, columns=['target'])], axis=1)
# print(df_resampled)
print(df_resampled['target'].value_counts())

Resampled dataset shape Counter({0: 19078, 1: 19078})
target
0    19078
1    19078
Name: count, dtype: int64


In [None]:
# Train and evaluate CatBoost with stratified K-fold cross-validation on the
# resampled data.
X = df_resampled.drop('target', axis=1)
y = df_resampled['target']

# Categorical column indices: `categorical_features` from the resampling cell is reused
# categorical_features

# Stratified K-Fold cross-validation setup
n_splits = 4  # number of folds (K)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

f1_scores = []

# Train and evaluate one model per fold
for train_index, val_index in skf.split(X, y):
    x_train, x_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        eval_metric='F1',
        random_seed=42,
        logging_level='Verbose',
        use_best_model=True
    )

    # NOTE(review): the validation fold is used both as eval_set (to pick the
    # best iteration via use_best_model) and for the F1 reported below, so the
    # fold score is not an independent estimate.
    model.fit(
        x_train, y_train,
        cat_features=categorical_features,
        eval_set=(x_val, y_val),
        verbose=False
    )

    # Predict on the validation fold and score it
    y_pred = model.predict(x_val)
    f1 = f1_score(y_val, y_pred)
    f1_scores.append(f1)
    print(f"Fold F1 Score: {f1}")

# Mean F1 score over all folds.
# After the loop `model` refers to the model trained on the last fold only.
mean_f1 = np.mean(f1_scores)
print(f"Mean F1 Score: {mean_f1}")


bestTest = 0.9346965699
bestIteration = 701

Shrink model to first 702 iterations.
Fold F1 Score: 0.9346965699208444

bestTest = 0.9336858339
bestIteration = 996

Shrink model to first 997 iterations.
Fold F1 Score: 0.9336858338841154

bestTest = 0.9369646387
bestIteration = 906

Shrink model to first 907 iterations.
Fold F1 Score: 0.9369646386997584

bestTest = 0.938579235
bestIteration = 796

Shrink model to first 797 iterations.
Fold F1 Score: 0.9385792349726776
Mean F1 Score: 0.935981569369349


## test data 전처리

In [None]:
# Reload the raw test set.
# NOTE(review): the load cell at the top of this section reads
# os.path.join(DATA_DIR, "data/test.csv"); here the "data/" prefix is missing —
# confirm which DATA_DIR value is in effect when this cell runs.
test_data = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Select the modelling columns. The explicit .copy() makes test_data_v1 an
# independent frame, so the column assignments in the following cells no longer
# raise SettingWithCopyWarning.
test_data_v1 = test_data[lst3].copy()
print(test_data_v1.shape)

(17361, 61)


In [None]:
# Normal: 0, AbNormal: 1
# Every value other than the string 'Normal' (missing values included) maps to 1.
test_data_v1['target'] = test_data_v1['target'].apply(lambda x: 0 if x == 'Normal' else 1)
# train_data_v1.to_csv(os.path.join(SAVE_DIR, "data_prep_1.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Columns that mix numbers with non-numeric entries such as 'OK': anything that
# cannot be parsed as a number is coerced to NaN and then filled with 0
# (the values end up as 0, not as nulls).
outlier_ok = ['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
              'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2']
test_data_v1[outlier_ok] = test_data_v1[outlier_ok].apply(pd.to_numeric, errors='coerce').fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Normalise the Equipment_xxx values to '#1' / '#2'
for process in ('Dam', 'Fill1', 'Fill2'):
    equipment_col = f'Equipment_{process}'
    test_data_v1[equipment_col] = test_data_v1[equipment_col].replace({
        f'{process} dispenser #1': '#1',
        f'{process} dispenser #2': '#2',
    })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Try to convert every column to a numeric dtype; a column with any value that
# cannot be parsed is kept unchanged. This replaces
# pd.to_numeric(errors='ignore'), which is deprecated in recent pandas releases.
def _to_numeric_or_keep(col):
    """Return ``col`` converted to numeric, or ``col`` itself if parsing fails."""
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col

test_data_v1 = test_data_v1.apply(_to_numeric_or_keep)

In [None]:
# Turn the coordinates into distances from the mean of their interval.
# NOTE(review): the interval means are recomputed from the test data here, not
# reused from the training data — confirm this is intended.
for column, intervals in zip(columns_of_interest, modified_intervals_list):
  mean_values = getmean(test_data_v1, column, intervals)
  change_to_dis(test_data_v1, column, intervals, mean_values)

# Scaling
scale_col(test_data_v1, columns_of_interest, 10)
# Replace the per-axis coordinates with Euclidean distances
euclidean_dis(test_data_v1)

In [None]:
# Same resin feature engineering as for the training frame: add the derived
# features, then drop the raw resin columns.
make_resin_feature(test_data_v1)
drop_origin_regin(test_data_v1)

## train v1 predict

In [None]:
# Predict on exactly the columns the model was trained on, in training order.
# Passing the whole frame also handed the 'target' column to CatBoost as a
# feature; once 'target' holds the string labels written below, that fails with
# "Cannot convert 'Normal' to float" (see the recorded CatBoostError).
test_features = test_data_v1[model.feature_names_]
test_pred = model.predict(test_features)
test_pred = np.where(test_pred == 1, 'AbNormal', 'Normal')
test_data_v1['target'] = test_pred
# Rule-based override: a row whose Dam / Fill1 / Fill2 equipment differ is
# always labelled AbNormal
condition = (
    (test_data_v1['Equipment_Dam'] != test_data_v1['Equipment_Fill1']) |
    (test_data_v1['Equipment_Dam'] != test_data_v1['Equipment_Fill2']) |
    (test_data_v1['Equipment_Fill1'] != test_data_v1['Equipment_Fill2'])
)
test_data_v1.loc[condition, 'target'] = 'AbNormal'
test_pred = test_data_v1['target']

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=28]="Normal": Cannot convert 'b'Normal'' to float

In [None]:
# Read the submission template and fill in the predictions.
# test_pred is a Series, so the assignment aligns on the index —
# presumably both frames use the default RangeIndex; verify.
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))
df_sub["target"] = test_pred

# Save the submission file
df_sub.to_csv(os.path.join(SAVE_DIR, "model1_submission.csv"), index=False)
print('done')

done


# Model #6

In [None]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

In [None]:
# Paths for this section: raw data is read from ./data, generated CSV files are
# written to ./csv.
ROOT_DIR = "."
DATA_DIR = os.path.join(ROOT_DIR, "data")
SAVE_DIR = os.path.join(ROOT_DIR, "csv")

# Load data
train_data_origin = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_data_origin = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
# Submission template; its 'target' column is overwritten later
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))

In [None]:
# Drop columns in which every value is missing (null count equals the row count)
drop_cols = []
for column in train_data_origin.columns:
    if (train_data_origin[column].isnull().sum()) == len(train_data_origin):
        drop_cols.append(column)

# Report how many columns are entirely null
print(f'null 행: {len(drop_cols)}/{len(train_data_origin.columns)}')
train_data = train_data_origin.drop(drop_cols, axis=1).copy(deep=True)
# Encode the label: AbNormal -> 1, anything else -> 0
train_data["target"] = (train_data["target"] == 'AbNormal').astype(int)

# Keep the same columns (including 'target') in the test frame
test_data = test_data_origin[train_data.columns].copy(deep=True)

In [None]:
# Training columns that contain at least one missing value
has_null = train_data[train_data.columns[train_data.isnull().any()]]

# For the object-typed ones among them: replace the literal 'OK' with NaN and
# cast the column to float, in both the train and the test frame
for has_null_col in has_null.select_dtypes(include=['object']).columns:
    train_data[train_data[[has_null_col]] == 'OK'] = np.nan
    train_data[has_null_col] = train_data[has_null_col].astype('float')
    test_data[test_data[[has_null_col]] == 'OK'] = np.nan
    test_data[has_null_col] = test_data[has_null_col].astype('float')


# Show the distinct values of every column that had missing values
for col in train_data[has_null.columns].columns:
    print('train:', col, train_data[col].unique())
    print('test:', col, test_data[col].unique())

In [None]:
# Drop columns that hold at most one distinct non-null value in the training
# data (nunique ignores NaN by default); the same columns are dropped from test.
nunique_counts = train_data.nunique()
drop_cols = nunique_counts[nunique_counts <= 1].index
df_train = train_data.drop(drop_cols, axis=1)

df_test = test_data.drop(drop_cols, axis=1)

In [None]:
# Training columns that still contain missing values
has_null = df_train[df_train.columns[df_train.isnull().any()]]

# Fill the missing values of those columns with 0 in both frames and print the
# resulting distinct values.
# NOTE(review): the column list comes from the training frame only, so a column
# that has nulls only in the test frame is not filled here — verify.
for col in df_train[has_null.columns].columns:
    df_train[df_train[[col]].isnull()] = 0
    df_test[df_test[[col]].isnull()] = 0
    print('train:', col, df_train[col].unique())
    print('test:', col, df_test[col].unique())

In [None]:
# Absolute pairwise correlations between the numeric training columns.
# (The train+test concatenation that used to be built here was never read and
# was deleted again at the end of the cell, so it has been removed.)
# NOTE(review): the correlations are computed on train_data, i.e. before the
# null filling applied to df_train — confirm that this is intended.
corr_matrix = train_data.select_dtypes(['float', 'int', 'boolean']).corr().abs()

# Keep the strict upper triangle so every column pair is inspected only once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Drop every column whose |correlation| with an earlier column exceeds 0.94
to_drop = [column for column in upper.columns if any(upper[column] > 0.94)]
df_train = df_train.drop(columns=to_drop)
df_test = df_test.drop(columns=to_drop)

In [None]:

golden_features_1 = {
    "new_features": [
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "CURE SPEED Collect Result_Fill2",
            "operation": "sum",
            "score": 0.2121855674
        },
        {
            "feature1": "3rd Pressure Unit Time_AutoClave",
            "feature2": "WorkMode Collect Result_Fill2",
            "operation": "ratio",
            "score": 0.2126641102
        },
        {
            "feature1": "1st Pressure 1st Pressure Unit Time_AutoClave",
            "feature2": "WorkMode Collect Result_Fill2",
            "operation": "ratio",
            "score": 0.2127302559
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "3rd Pressure Unit Time_AutoClave",
            "operation": "ratio",
            "score": 0.2129125475
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "1st Pressure 1st Pressure Unit Time_AutoClave",
            "operation": "multiply",
            "score": 0.2129371268
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "CURE SPEED Collect Result_Fill2",
            "operation": "multiply",
            "score": 0.2129570359
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "1st Pressure 1st Pressure Unit Time_AutoClave",
            "operation": "ratio",
            "score": 0.2129659371
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "Dispense Volume(Stage2) Collect Result_Dam",
            "operation": "sum",
            "score": 0.2129880768
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "3rd Pressure Unit Time_AutoClave",
            "operation": "multiply",
            "score": 0.2130466599
        },
        {
            "feature1": "Stage2 Circle1 Distance Speed Collect Result_Dam",
            "feature2": "WorkMode Collect Result_Fill2",
            "operation": "diff",
            "score": 0.2131566118
        }
    ],
    "new_columns": [
        "WorkMode Collect Result_Fill2_sum_CURE SPEED Collect Result_Fill2",
        "3rd Pressure Unit Time_AutoClave_ratio_WorkMode Collect Result_Fill2",
        "1st Pressure 1st Pressure Unit Time_AutoClave_ratio_WorkMode Collect Result_Fill2",
        "WorkMode Collect Result_Fill2_ratio_3rd Pressure Unit Time_AutoClave",
        "WorkMode Collect Result_Fill2_multiply_1st Pressure 1st Pressure Unit Time_AutoClave",
        "WorkMode Collect Result_Fill2_multiply_CURE SPEED Collect Result_Fill2",
        "WorkMode Collect Result_Fill2_ratio_1st Pressure 1st Pressure Unit Time_AutoClave",
        "WorkMode Collect Result_Fill2_sum_Dispense Volume(Stage2) Collect Result_Dam",
        "WorkMode Collect Result_Fill2_multiply_3rd Pressure Unit Time_AutoClave",
        "Stage2 Circle1 Distance Speed Collect Result_Dam_diff_WorkMode Collect Result_Fill2"
    ],
    "ml_task": "binary_classification"
}
golden_features_2 = {
    "new_features": [
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "3rd Pressure Unit Time_AutoClave",
            "operation": "sum",
            "score": 0.2104471591
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "1st Pressure 1st Pressure Unit Time_AutoClave",
            "operation": "sum",
            "score": 0.210717613
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "Dispense Volume(Stage1) Collect Result_Dam",
            "operation": "sum",
            "score": 0.2111184784
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "CURE SPEED Collect Result_Fill2",
            "operation": "sum",
            "score": 0.2111915385
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "CURE END POSITION Z Collect Result_Fill2",
            "operation": "sum",
            "score": 0.2112053145
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "Dispense Volume(Stage1) Collect Result_Dam",
            "operation": "multiply",
            "score": 0.21127006
        },
        {
            "feature1": "HEAD Standby Position X Collect Result_Dam",
            "feature2": "WorkMode Collect Result_Fill2",
            "operation": "ratio",
            "score": 0.211271404
        },
        {
            "feature1": "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam",
            "feature2": "WorkMode Collect Result_Fill2",
            "operation": "ratio",
            "score": 0.211271404
        },
        {
            "feature1": "Stage2 Circle1 Distance Speed Collect Result_Dam",
            "feature2": "HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam",
            "operation": "multiply",
            "score": 0.2112727579
        },
        {
            "feature1": "WorkMode Collect Result_Fill2",
            "feature2": "HEAD Standby Position X Collect Result_Dam",
            "operation": "multiply",
            "score": 0.2112793565
        }
    ],
    "new_columns": [
        "WorkMode Collect Result_Fill2_sum_3rd Pressure Unit Time_AutoClave",
        "WorkMode Collect Result_Fill2_sum_1st Pressure 1st Pressure Unit Time_AutoClave",
        "WorkMode Collect Result_Fill2_sum_Dispense Volume(Stage1) Collect Result_Dam",
        "WorkMode Collect Result_Fill2_sum_CURE SPEED Collect Result_Fill2",
        "WorkMode Collect Result_Fill2_sum_CURE END POSITION Z Collect Result_Fill2",
        "WorkMode Collect Result_Fill2_multiply_Dispense Volume(Stage1) Collect Result_Dam",
        "HEAD Standby Position X Collect Result_Dam_ratio_WorkMode Collect Result_Fill2",
        "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam_ratio_WorkMode Collect Result_Fill2",
        "Stage2 Circle1 Distance Speed Collect Result_Dam_multiply_HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam",
        "WorkMode Collect Result_Fill2_multiply_HEAD Standby Position X Collect Result_Dam"
    ],
    "ml_task": "binary_classification"
}

In [None]:
import json

# Load the golden-feature definitions of two AutoML runs. This overwrites the
# golden_features_1 / golden_features_2 literals defined in the previous cell.
with open('AutoML_3/golden_features.json', 'rt') as f:
    golden_features_1 = json.loads(f.read())

with open('AutoML_6/golden_features.json', 'rt') as f:
    golden_features_2 = json.loads(f.read())

# Features present in both runs: same operation and the same feature pair, in
# either order. An entry of run 1 is appended once per matching entry of run 2.
# NOTE(review): the swapped-order match is also applied to 'ratio' and 'diff',
# which are not symmetric — confirm this is intended before changing it, since
# it determines which columns are generated.
common_golden_features = []
for new_feature_1 in golden_features_1['new_features']:
    for new_feature_2 in golden_features_2['new_features']:
        if new_feature_1['operation'] == new_feature_2['operation']:
            if (new_feature_1['feature1'] == new_feature_2['feature1'] and new_feature_1['feature2'] == new_feature_2['feature2']) or (new_feature_1['feature1'] == new_feature_2['feature2'] and new_feature_1['feature2'] == new_feature_2['feature1']):
                common_golden_features.append(new_feature_1)
# All features of both runs (duplicates are not removed)
union_golden_features = golden_features_1['new_features'] + golden_features_2['new_features']


In [None]:
def make_new_features(df: pd.DataFrame, new_features: list):
    """Add the described pairwise "golden" features to ``df`` in place.

    Args:
        df: DataFrame that holds the source columns; modified in place.
        new_features: Iterable of dicts with the keys ``"feature1"``,
            ``"feature2"`` and ``"operation"``. Supported operations are
            ``"diff"``, ``"ratio"``, ``"sum"`` and ``"multiply"``.

    Each new column is named ``"<feature1>_<operation>_<feature2>"``. For
    ``"ratio"``, rows whose denominator is 0 get the value 0.

    Raises:
        ValueError: if an entry names an unsupported operation (it used to be
            skipped silently, leaving the expected column missing).
        KeyError: if a source column is missing from ``df``.
    """
    for new_feature in new_features:
        first = new_feature["feature1"]
        second = new_feature["feature2"]
        operation = new_feature["operation"]
        new_col = "_".join([first, operation, second])

        if operation == "diff":
            df[new_col] = df[first] - df[second]
        elif operation == "ratio":
            a = np.array(df[first], dtype=float)
            b = np.array(df[second], dtype=float)
            # Divide only where the denominator is non-zero; the remaining
            # rows keep the 0 from the pre-filled output buffer
            df[new_col] = np.divide(a, b, out=np.zeros_like(a), where=b != 0)
        elif operation == "sum":
            df[new_col] = df[first] + df[second]
        elif operation == "multiply":
            df[new_col] = df[first] * df[second]
        else:
            raise ValueError(
                f"Unsupported operation {operation!r} for new feature {new_col!r}"
            )

In [None]:
# Add the golden features shared by both AutoML runs to the train and test frames
make_new_features(df_train, common_golden_features)
make_new_features(df_test, common_golden_features)

In [None]:
drop_features_1 = [
    "Workorder_AutoClave",
    "Dispense Volume(Stage1) Collect Result_Fill1",
    "Chamber Temp. Collect Result_AutoClave",
    "Dispense Volume(Stage2) Collect Result_Dam",
    "WorkMode Collect Result_Fill2_sum_CURE SPEED Collect Result_Fill2",
    "CURE SPEED Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1",
    "Workorder_Fill2",
    "WorkMode Collect Result_Fill2_multiply_1st Pressure 1st Pressure Unit Time_AutoClave",
    "Dispense Volume(Stage1) Collect Result_Dam",
    "THICKNESS 3 Collect Result_Dam",
    "1st Pressure 1st Pressure Unit Time_AutoClave_ratio_WorkMode Collect Result_Fill2",
    "HEAD Standby Position X Collect Result_Dam",
    "3rd Pressure Unit Time_AutoClave",
    "WorkMode Collect Result_Fill2_ratio_1st Pressure 1st Pressure Unit Time_AutoClave",
    "THICKNESS 2 Collect Result_Dam",
    "DISCHARGED SPEED OF RESIN Collect Result_Dam",
    "DISCHARGED SPEED OF RESIN Collect Result_Fill1",
    "WorkMode Collect Result_Fill2_multiply_3rd Pressure Unit Time_AutoClave",
    "WorkMode Collect Result_Fill2_multiply_CURE SPEED Collect Result_Fill2",
    "CURE END POSITION X Collect Result_Fill2",
    "CURE END POSITION Z Collect Result_Fill2",
    "WorkMode Collect Result_Fill2",
    "3rd Pressure Unit Time_AutoClave_ratio_WorkMode Collect Result_Fill2",
    "Model.Suffix_Fill1",
    "Model.Suffix_AutoClave",
    "Model.Suffix_Fill2",
    "WorkMode Collect Result_Fill2_ratio_3rd Pressure Unit Time_AutoClave",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
    "Stage2 Circle1 Distance Speed Collect Result_Dam",
    "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1",
    "CURE SPEED Collect Result_Dam",
    "Head Purge Position Z Collect Result_Fill1",
]

drop_features_2 = [
    "1st Pressure 1st Pressure Unit Time_AutoClave",
    "Chamber Temp. Collect Result_AutoClave",
    "Stage2 Circle1 Distance Speed Collect Result_Dam",
    "CURE SPEED Collect Result_Dam",
    "CURE END POSITION X Collect Result_Fill2",
    "DISCHARGED SPEED OF RESIN Collect Result_Dam",
    "THICKNESS 3 Collect Result_Dam",
    "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1",
]

In [None]:
import json


# Features listed for dropping in both lists / in either list, restricted to
# columns that actually exist in df_train
comm_drop_features = [
    feat
    for feat in drop_features_1
    if feat in drop_features_2 and feat in df_train.columns
]

union_drop_features = [
    feat
    for feat in set(drop_features_1 + drop_features_2)
    if feat in df_train.columns
]

In [None]:
# Drop every feature named in either drop list from both frames
df_train = df_train.drop(columns=union_drop_features)
df_test = df_test.drop(columns=union_drop_features)

In [None]:
class VotingModel():
    """Ensemble that averages the outputs of already fitted estimators."""

    def __init__(self, estimators):
        # Estimators exposing predict() and predict_proba()
        self.estimators = estimators

    def predict(self, X):
        """Majority vote over the estimators' predictions; a tie counts as 1."""
        votes = np.asarray([member.predict(X) for member in self.estimators])
        vote_share = votes.mean(axis=0)
        return (vote_share >= 0.5).astype(int)

    def predict_proba(self, X):
        """Element-wise mean of the estimators' predicted probabilities."""
        probabilities = np.asarray(
            [member.predict_proba(X) for member in self.estimators]
        )
        return probabilities.mean(axis=0)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer, MaxAbsScaler

# Split the training frame into target and features
y = df_train['target']
X = df_train.drop('target', axis=1)
# Object columns are treated as categorical, int/float columns as numeric
object_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include=['int', 'float']).columns

# Numeric columns are scaled by their maximum absolute value; categorical
# columns are one-hot encoded (categories unseen during fit are ignored)
numeric_transformer = MaxAbsScaler()
categorical_transformer = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
ct = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, object_cols)])
# Fit on the training features only; the test features are transformed later
ct.fit(X)

In [None]:
# Load the trained ensemble from disk.
# `VotingModel` is already defined in an earlier cell; the verbatim duplicate
# that used to live here was removed. The class must be defined before this
# cell runs — presumably the pickled object is a VotingModel, which pickle
# looks up by name while loading.
# NOTE(review): pickle.load can execute arbitrary code — only load model files
# from a trusted source.
# NOTE(review): MODEL_DIR is not defined in the part of the notebook visible
# here — confirm it exists on a fresh kernel.
with open(os.path.join(MODEL_DIR, "model6.pkl"), "rb") as f:
    model = pickle.load(f)

In [None]:
# Transform the test features with the fitted column transformer and predict
X_test = df_test.drop(columns=['target'])
X_test = ct.transform(X_test)
pred_y = model.predict(X_test)

In [None]:
# Store the 0/1 predictions and translate them to the submission labels.
# Only the 'target' column is relabelled: the former whole-frame mask
# (df_sub[df_sub == 1] = ...) would also have rewritten any other column that
# happens to contain 0 or 1.
df_sub['target'] = pred_y
df_sub['target'] = df_sub['target'].replace({1: 'AbNormal', 0: 'Normal'})
df_sub.to_csv(os.path.join(SAVE_DIR, 'model6_submission.csv'), index=False)

# Model Ensemble

In [None]:
# Load the submissions of models 1, 2, 5 and 6 and encode their labels as
# AbNormal -> 1, anything else -> 0
df_model1, df_model2, df_model5, df_model6 = (
    pd.read_csv(os.path.join(SAVE_DIR, f"model{model_no}_submission.csv")).assign(
        target=lambda sub: (sub["target"] == 'AbNormal').astype(int)
    )
    for model_no in (1, 2, 5, 6)
)

In [None]:
# Submissions that take part in the ensemble vote
df_list = [df_model1, df_model2, df_model5, df_model6]

In [None]:
def apply_rules(df):
    """Return a boolean Series marking the rows hit by the hand-written rules.

    A row is flagged when any of the following holds:
      * its 'Receip No Collect Result' columns disagree,
      * its 'Equipment' columns (excluding 'Equipment_AutoClave') disagree,
      * its 'Production Qty Collect Result' columns disagree,
      * its 'Workorder_Dam' is "3KPXX094-0001" or "4CPXX084-0001".
    """
    def _columns_containing(fragment):
        return [name for name in df.columns if fragment in name]

    def _disagree(columns):
        # More than one distinct (non-null) value across the given columns
        return df[columns].nunique(axis=1) > 1

    receip_columns = _columns_containing('Receip No Collect Result')
    equipment_columns = [
        name for name in _columns_containing('Equipment')
        if name != 'Equipment_AutoClave'
    ]
    qty_columns = _columns_containing('Production Qty Collect Result')

    flagged = _disagree(receip_columns)
    flagged = flagged | _disagree(equipment_columns)
    flagged = flagged | _disagree(qty_columns)
    flagged = flagged | df['Workorder_Dam'].isin(["3KPXX094-0001", "4CPXX084-0001"])
    return flagged

# Normalise the Equipment_xxx values to '#1' / '#2' so that the equipment
# columns of the different processes can be compared with each other
for process in ('Dam', 'Fill1', 'Fill2'):
    equipment_col = f'Equipment_{process}'
    test_data[equipment_col] = test_data[equipment_col].replace({
        f'{process} dispenser #1': '#1',
        f'{process} dispenser #2': '#2',
    })

# Rows of the test set that the hand-written rules force to AbNormal
condition = apply_rules(test_data)

In [None]:

# Count, per row, how many models predicted AbNormal (1)
votes = pd.Series(0, index=df_sub.index)
for model_sub in df_list:
    votes = votes + model_sub['target']
# A row becomes AbNormal when at least half of the models voted for it
df_sub['target'] = ((votes / len(df_list)) >= 0.5).astype('int')
# Rows flagged by the hand-written rules are always AbNormal
df_sub.loc[condition, 'target']=1

In [None]:
# Translate the 0/1 votes to the submission labels. Only the 'target' column
# is relabelled: the former whole-frame mask (df_sub[df_sub == 1] = ...) would
# also have rewritten any other column that happens to contain 0 or 1.
df_sub['target'] = df_sub['target'].replace({1: 'AbNormal', 0: 'Normal'})
# NOTE(review): this writes to 'submission.csv' in the working directory, the
# same relative path the template is read from via os.path.join(ROOT_DIR,
# "submission.csv") with ROOT_DIR = "." — a re-run then reads the overwritten file.
df_sub.to_csv('submission.csv', index=False)