In [None]:
import os
import pandas as pd
import numpy as np
import random
import warnings

from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

import catboost
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
warnings.filterwarnings(action='ignore')

In [None]:
def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    :param seed: integer seed forwarded to ``random.seed`` and ``np.random.seed``
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed for the whole notebook

In [None]:
# Read the raw train / test tables from the local data directory
train, test = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')

In [None]:
# Drop columns that contain nothing but NaN, independently for each frame
train, test = (frame.dropna(axis=1, how='all') for frame in (train, test))

In [None]:
# Columns that still contain at least one NaN, per frame
missing_train, missing_test = (
    frame.columns[frame.isna().any()] for frame in (train, test)
)
print(missing_train)
print(missing_test)


Index(['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
       'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
       'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'],
      dtype='object')
Index(['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
       'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
       'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
       'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fil

In [None]:
# Treat the stray 'OK' entries in the Stage1 X-axis coordinate columns as missing values
stage1_x_columns = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

for frame in (train, test):
    for column in stage1_x_columns:
        frame[column] = frame[column].replace('OK', np.nan)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def process_column(train, test, target_column, feature_prefix):
    """
    Impute the missing values of ``target_column`` with a RandomForest classifier.

    The distinct observed values of ``target_column`` are label-encoded and
    used as classes. The model is trained on the rows of ``train`` where the
    target is present, using the other eight HEAD NORMAL COORDINATE columns
    (X Stage2-3, Y Stage1-3, Z Stage1-3) that share ``feature_prefix``.
    A 20% hold-out accuracy and classification report are printed.

    Both frames are modified in place:
      * the feature columns and ``target_column`` are coerced to numeric
        (non-numeric entries become NaN);
      * only the rows whose ``target_column`` is NaN are filled with the
        model's prediction. Observed values are left untouched in BOTH
        ``train`` and ``test`` (previously every ``test`` value was replaced
        by a prediction, which made the two frames inconsistent).

    :param train: training DataFrame (mutated in place)
    :param test: test DataFrame (mutated in place)
    :param target_column: name of the column whose NaN values are imputed
    :param feature_prefix: suffix shared by the feature columns, e.g. 'Result_Dam'
    :return: None
    """
    # Every axis/stage combination except X/Stage1, in the order
    # X2, X3, Y1, Y2, Y3, Z1, Z2, Z3.
    feature_columns = [
        f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage}) Collect {feature_prefix}'
        for axis in ('X', 'Y', 'Z')
        for stage in (1, 2, 3)
        if (axis, stage) != ('X', 1)
    ]

    # Coerce features and target to numeric; unparsable entries become NaN
    for col in feature_columns + [target_column]:
        train[col] = pd.to_numeric(train[col], errors='coerce')
        test[col] = pd.to_numeric(test[col], errors='coerce')

    # Remember which rows need imputing before anything is overwritten
    train_missing_mask = train[target_column].isna()
    test_missing_mask = test[target_column].isna()

    # Rows with an observed target are the supervised training set
    train_known = train.loc[~train_missing_mask]

    # Encode the observed values as class ids without adding a helper column
    # to a filtered slice (which triggered SettingWithCopyWarning).
    le = LabelEncoder()
    X = train_known[feature_columns]
    y = le.fit_transform(train_known[target_column])

    # 80% train / 20% validation
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Hold-out evaluation
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Validation Accuracy for {target_column}: {accuracy * 100:.2f}%")

    # Pass the full label set explicitly so the report does not fail when a
    # rare class is absent from the validation fold.
    class_ids = list(range(len(le.classes_)))
    target_names = [str(label) for label in le.classes_]
    report = classification_report(
        y_valid,
        y_pred,
        labels=class_ids,
        target_names=target_names,
        zero_division=0,
    )
    print(f"Classification Report for {target_column}:\n", report)

    # Fill only the missing rows; skip prediction entirely when there are none
    # (predicting on an empty frame raises).
    for frame, missing_mask in ((train, train_missing_mask), (test, test_missing_mask)):
        if not missing_mask.any():
            continue
        predicted = model.predict(frame.loc[missing_mask, feature_columns])
        frame.loc[missing_mask, target_column] = le.inverse_transform(predicted)

# Impute the Stage1 X-axis coordinate for each process (train / test are already loaded above)
for process_suffix in ('Dam', 'Fill1', 'Fill2'):
    process_column(
        train,
        test,
        f'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_{process_suffix}',
        f'Result_{process_suffix}',
    )

# Inspect the result
print(train.head())
print(test.head())


Validation Accuracy for HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam: 96.99%
Classification Report for HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam:
               precision    recall  f1-score   support

       162.4       1.00      1.00      1.00       667
       548.5       0.00      0.00      0.00         9
       549.0       0.94      1.00      0.97      1487
       549.5       1.00      0.64      0.78       244
       550.0       1.00      1.00      1.00       493
       550.3       1.00      1.00      1.00       390

    accuracy                           0.97      3290
   macro avg       0.82      0.77      0.79      3290
weighted avg       0.97      0.97      0.97      3290

Validation Accuracy for HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1: 99.91%
Classification Report for HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1:
               precision    recall  f1-score   support

       837.5       0.00      0.00      0.00       

In [None]:
# Remove zero-variance features (columns with a single distinct value) from train
a = [c for c in train.columns if train[c].nunique() == 1]
print(len(train.columns))
train = train.drop(columns=a)
print(len(train.columns))

186
146


In [None]:
# Remove from test the features that are no longer present in train.
# Deciding "constant" on test independently can drop a different set of
# columns than train did, which would break the later test[train.columns]
# selection, so train is used as the single source of truth here.
# 'Set ID' exists only in test and is needed for the submission file.
ID_COLUMNS = ['Set ID']
a = [c for c in test.columns if c not in train.columns and c not in ID_COLUMNS]
print(len(test.columns))
test = test.drop(columns=a)
print(len(test.columns))

186
146


In [None]:
# Collect the names of the train columns that still contain NaN
nan_columns = train.columns[train.isna().any()].tolist()

print("NaN 값이 포함된 컬럼 리스트:")
print(nan_columns)
train[nan_columns]

NaN 값이 포함된 컬럼 리스트:
[]


0
1
2
3
4
...
40501
40502
40503
40504
40505


### Hyperparameter tuning with Optuna

In [None]:
# Install Optuna into the running kernel's environment (no version is pinned here).
%pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [None]:
import optuna
import numpy as np
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Basic configuration
RANDOM_STATE = 42
n_splits = 10

# Per-model results collected by the training loop
f1_scores_N = []
f1_scores_A = []
models = []
param_list = []

# Separate the two classes
df_abnormal = train[train["target"] == "AbNormal"]
df_normal = train[train["target"] == "Normal"]

# Shuffle the Normal rows and cut them into n_splits nearly equal chunks.
# The positional index is split instead of the DataFrame itself because
# np.array_split(DataFrame) goes through the deprecated DataFrame.swapaxes;
# the resulting partition is the same.
shuffled_normal = df_normal.sample(frac=1, random_state=RANDOM_STATE)
normal_splits = [
    shuffled_normal.iloc[chunk]
    for chunk in np.array_split(np.arange(len(shuffled_normal)), n_splits)
]

# Object-dtype feature columns are handed to CatBoost as categorical features
feature_dtypes = df_abnormal.drop(columns=["target"]).dtypes
categorical_feats = feature_dtypes[feature_dtypes == "object"].index.tolist()


def objective(trial, X_train, y_train):
    """Optuna objective: hold-out F1 of the 'AbNormal' class for one CatBoost configuration.

    :param trial: Optuna trial used to sample the hyperparameters
    :param X_train: feature frame for the current balanced split
    :param y_train: string labels ('AbNormal' / 'Normal') aligned with ``X_train``
    :return: F1 score with ``pos_label="AbNormal"`` on a 20% validation split
    """
    # Search space
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-2, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0)
    }

    # Stratify so the validation fold keeps the same class ratio as the
    # split; the objective is a single-class F1 and is sensitive to that ratio.
    X_train_part, X_val, y_train_part, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=y_train,
    )

    # Fit the candidate model on the training part only
    model = CatBoostClassifier(**params, eval_metric='F1', silent=True, random_state=RANDOM_STATE, cat_features=categorical_feats)
    model.fit(X_train_part, y_train_part)

    # Score on the held-out part
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, pos_label="AbNormal")

    return f1

# Train one tuned CatBoost model per Normal chunk (each chunk is combined with all AbNormal rows).
for i, normal_split in enumerate(normal_splits):
    # Combine every AbNormal row with this chunk of Normal rows
    df_split = pd.concat([df_abnormal, normal_split], axis=0).reset_index(drop=True)

    # Separate features and labels
    X_train = df_split.drop(columns=["target"])
    y_train = df_split["target"]

    # A fresh study per split: with a single shared study, study.best_params
    # would return the best trial seen on ANY earlier split, so later models
    # could simply reuse parameters tuned on different data.
    # The sampler is seeded so the sequence of suggested parameters is
    # repeatable (the number of trials still depends on the time budget).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE + i),
    )
    study.optimize(lambda trial: objective(trial, X_train, y_train), timeout=360)  # 6 minutes (360 s) per model

    # Refit on the whole split with the best parameters found for this split
    best_params = study.best_params
    param_list.append(best_params)
    model = CatBoostClassifier(**best_params, eval_metric='F1', silent=True, random_state=RANDOM_STATE, cat_features=categorical_feats)
    model.fit(X_train, y_train)

    # Keep the fitted model for the ensemble
    models.append(model)

    # F1 on the data the model was just trained on (in-sample, therefore optimistic)
    y_pred = model.predict(X_train)
    f1_N = f1_score(y_train, y_pred, pos_label="Normal")
    f1_A = f1_score(y_train, y_pred, pos_label="AbNormal")
    f1_scores_N.append(f1_N)
    f1_scores_A.append(f1_A)

# Report both per-class training F1 scores for every model
for i, (score_n, score_a) in enumerate(zip(f1_scores_N, f1_scores_A)):
    print(f"Model {i+1}: train F1 (Normal) = {score_n:.4f}, train F1 (AbNormal) = {score_a:.4f}")


[I 2024-08-26 22:08:50,957] A new study created in memory with name: no-name-bad31bde-cc61-4f70-bbea-98b06756d355
[I 2024-08-26 22:08:54,849] Trial 0 finished with value: 0.5 and parameters: {'iterations': 626, 'depth': 5, 'learning_rate': 0.05087810598446251, 'l2_leaf_reg': 7.6277413546671475, 'border_count': 156, 'random_strength': 1.1171293790263346, 'bagging_temperature': 0.003383175272832273}. Best is trial 0 with value: 0.5.
[I 2024-08-26 22:09:14,258] Trial 1 finished with value: 0.4792626728110599 and parameters: {'iterations': 728, 'depth': 10, 'learning_rate': 0.25735856844862903, 'l2_leaf_reg': 4.930130852410249, 'border_count': 176, 'random_strength': 9.730938293108405, 'bagging_temperature': 0.9080442172391525}. Best is trial 0 with value: 0.5.
[I 2024-08-26 22:09:16,987] Trial 2 finished with value: 0.4810744810744811 and parameters: {'iterations': 429, 'depth': 5, 'learning_rate': 0.10168499964458036, 'l2_leaf_reg': 4.071498983619261, 'border_count': 67, 'random_strength

In [None]:
# Add an empty placeholder 'target' column so test has every column name found in train.
test['target']= ''

In [None]:
import numpy as np

def ensemble_predict(models, X, threshold=0.5):
    """Soft-voting ensemble prediction for the 'AbNormal' class.

    Averages each model's predicted probability of the 'AbNormal' class and
    labels a row 'AbNormal' when that mean is strictly greater than
    ``threshold``; every other row is labelled 'Normal'.

    :param models: non-empty sequence of fitted classifiers exposing
        ``predict_proba`` and ``classes_`` (``classes_`` must contain 'AbNormal')
    :param X: feature rows accepted by every model's ``predict_proba``
    :param threshold: decision threshold on the mean 'AbNormal' probability
        (default 0.5, the previously hard-coded value)
    :return: NumPy array of 'AbNormal' / 'Normal' labels, one per row of ``X``
    :raises ValueError: if ``models`` is empty
    """
    if len(models) == 0:
        # The mean over zero models is NaN, which would silently label every row 'Normal'.
        raise ValueError("ensemble_predict requires at least one fitted model")

    predictions = np.zeros((len(X), len(models)))

    for i, model in enumerate(models):
        proba = model.predict_proba(X)
        # Class order can differ between models, so look up the 'AbNormal' column per model
        abnormal_class_index = list(model.classes_).index('AbNormal')
        predictions[:, i] = proba[:, abnormal_class_index]

    final_labels = np.where(predictions.mean(axis=1) > threshold, 'AbNormal', 'Normal')
    return final_labels


# Predict on the test set using feature columns only: the models were fitted on
# train without 'target', so the label column must not be passed to them.
feature_columns = [c for c in train.columns if c != "target"]
test_pred = ensemble_predict(models, test[feature_columns])

# Inspect the result
print(test_pred)


['Normal' 'Normal' 'Normal' ... 'Normal' 'Normal' 'Normal']


In [None]:
# 전체 Train 데이터에 대해 검증
def ensemble_predict(models, X):
    """Soft-voting ensemble that returns both binary flags and string labels.

    The 'AbNormal' probability of every model is averaged per row; a row is
    flagged when that mean is strictly greater than 0.5.

    :param models: fitted classifiers exposing ``predict_proba`` and ``classes_``
    :param X: feature rows accepted by every model's ``predict_proba``
    :return: tuple ``(final_prediction, final_labels)`` where
        ``final_prediction`` is an int array (1 = 'AbNormal', 0 = 'Normal') and
        ``final_labels`` holds the matching 'AbNormal' / 'Normal' strings
    """
    n_rows, n_models = len(X), len(models)
    abnormal_proba = np.zeros((n_rows, n_models))

    for column, clf in enumerate(models):
        # Class order is looked up per model rather than assumed
        class_order = list(clf.classes_)
        abnormal_proba[:, column] = clf.predict_proba(X)[:, class_order.index('AbNormal')]

    is_abnormal = abnormal_proba.mean(axis=1) > 0.5
    final_prediction = is_abnormal.astype(int)
    final_labels = np.where(is_abnormal, 'AbNormal', 'Normal')
    return final_prediction, final_labels

val_x = train.drop(columns=["target"])
val_y = train["target"]

# Score the ensemble on the full training set. These are in-sample scores
# (every row was seen by at least one model), so they are optimistic.
pred_, labels_ = ensemble_predict(models, val_x)
f1_N = f1_score(val_y, labels_, pos_label="Normal")
f1_A = f1_score(val_y, labels_, pos_label="AbNormal")

# The scores were previously computed but never displayed
print(f"Train F1 (Normal): {f1_N:.4f}")
print(f"Train F1 (AbNormal): {f1_A:.4f}")



In [None]:
# Show the class order reported by the most recently trained ensemble member
print(models[-1].classes_)


['AbNormal' 'Normal']


In [None]:
from collections import Counter

# Tally how many test rows were assigned to each label
counter = Counter()
counter.update(test_pred)
counter

Counter({'Normal': 14249, 'AbNormal': 3112})

In [None]:
# Load the submission template and fill in the IDs and the ensemble predictions
df_sub = pd.read_csv("submission.csv").assign(
    **{"Set ID": test["Set ID"], "target": test_pred}
)

# Save the submission file
df_sub.to_csv("submission_cat_10fold_optuna.csv", index=False)

In [None]:
# Print the hyperparameters of every trained model, one block per model
for i, model in enumerate(models, start=1):
    params = model.get_params()
    print(f"Model {i} parameters:")
    for param in params:
        value = params[param]
        print(f"  {param}: {value}")
    print()  # blank line between models


Model 1 parameters:
  iterations: 569
  learning_rate: 0.15856214298086496
  depth: 7
  l2_leaf_reg: 6.131096540886849
  border_count: 230
  silent: True
  random_strength: 8.998024173178548
  eval_metric: F1
  bagging_temperature: 0.5269922925770286
  random_state: 42
  cat_features: ['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam', 'Model.Suffix_AutoClave', 'Workorder_AutoClave', 'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1', 'Model.Suffix_Fill1', 'Workorder_Fill1', 'Equipment_Fill2', 'Model.Suffix_Fill2', 'Workorder_Fill2']

Model 2 parameters:
  iterations: 569
  learning_rate: 0.15856214298086496
  depth: 7
  l2_leaf_reg: 6.131096540886849
  border_count: 230
  silent: True
  random_strength: 8.998024173178548
  eval_metric: F1
  bagging_temperature: 0.5269922925770286
  random_state: 42
  cat_features: ['Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam', 'Model.Suffix_AutoClave', 'Workorder_AutoClave', 'Chamber Temp. Judge Value_AutoClave', 'Equipment_Fill1', 'M