In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

FDC 데이터 불러오기

In [7]:
df_final = pd.read_csv('./data/fdc_data.csv')

정규화

In [8]:
scaler = StandardScaler()

sensor_data = df_final.iloc[:,5:]
sensor_data_scaled = scaler.fit_transform(sensor_data)
df_sensor_scaled = pd.DataFrame(sensor_data_scaled, columns=sensor_data.columns)
df_fdc_scaled = pd.concat([df_final.iloc[:,:5].reset_index(drop=True), df_sensor_scaled.reset_index(drop=True)], axis=1)

라벨 추출

In [9]:
labels = df_fdc_scaled['PASS(1)/FAIL(0)'].values
labels = np.array(labels)

In [10]:
SEED = 42

In [11]:
fdc_train, fdc_temp, labels_train, labels_temp = train_test_split(
    df_sensor_scaled, labels, test_size=0.2, random_state=SEED
)

fdc_val, fdc_test, labels_val, labels_test = train_test_split(
    fdc_temp, labels_temp, test_size=0.25, random_state=SEED  # 0.25 of the remaining 20% = 5% of total
)

In [12]:
fdc_train.shape, fdc_val.shape, fdc_test.shape

((848, 216), (159, 216), (53, 216))

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# 평가 함수 정의
def print_result_classifier(y, pred):
    accuracy = accuracy_score(y, pred)
    precision = precision_score(y, pred)
    recall = recall_score(y, pred)
    f1 = f1_score(y, pred)
    
    print(f'Accuracy: {accuracy:.6f}')
    print(f'Precision: {precision:.6f}')
    print(f'Recall: {recall:.6f}')
    print(f'F1 Score: {f1:.6f}')

# 하이퍼파라미터 범위 설정
param_grid = {
    'n_estimators': [100, 500, 1000],          # 트리 개수
    'max_depth': [None, 64, 128],              # 최대 깊이
    'min_samples_split': [2, 5, 10],           # 노드 분할을 위한 최소 샘플 수
    'min_samples_leaf': [1, 2, 4],             # 리프 노드의 최소 샘플 수
    'max_features': ['sqrt', 'log2'],          # 각 분할에 사용할 특성 수
    'class_weight': ['balanced', None]         # 클래스 가중치 (불균형 데이터 처리)
}

# RandomForestClassifier와 GridSearchCV 설정
rf_classifier = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, 
                           scoring='f1', cv=5, verbose=2, n_jobs=-1)

# 훈련 데이터에 대해 하이퍼파라미터 최적화
grid_search.fit(fdc_train, labels_train)

# 최적의 하이퍼파라미터와 성능 출력
print("Best Hyperparameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# 최적화된 모델로 테스트 데이터 예측 및 결과 출력
best_rf_classifier = grid_search.best_estimator_
y_pred_rf = best_rf_classifier.predict(fdc_test)
print_result_classifier(labels_test, y_pred_rf)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Hyperparameters: {'class_weight': None, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best F1 Score: 0.8360671975848177
Accuracy: 0.679245
Precision: 0.666667
Recall: 0.866667
F1 Score: 0.753623


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def print_result_classifier(y, pred):
    accuracy = accuracy_score(y, pred)
    precision = precision_score(y, pred)
    recall = recall_score(y, pred)
    f1 = f1_score(y, pred)
    
    print(f'Accuracy: {accuracy:.6f}')
    print(f'Precision: {precision:.6f}')
    print(f'Recall: {recall:.6f}')
    print(f'F1 Score: {f1:.6f}')

# RandomForestClassifier로 변경
rf_classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_classifier.fit(fdc_train, labels_train)
y_pred_rf = rf_classifier.predict(fdc_test)

print_result_classifier(labels_test, y_pred_rf)

Accuracy: 0.698113
Precision: 0.666667
Recall: 0.933333
F1 Score: 0.777778


In [28]:
# 0과 1의 개수를 계산
unique, counts = np.unique(labels, return_counts=True)
label_counts = dict(zip(unique, counts))

print(f"0의 개수: {label_counts.get(0, 0)}")
print(f"1의 개수: {label_counts.get(1, 0)}")

0의 개수: 363
1의 개수: 697


In [36]:
# 0과 1의 개수를 계산
unique2, counts2 = np.unique(labels_test, return_counts=True)
label_counts2 = dict(zip(unique2, counts2))

print(f"0의 개수: {label_counts2.get(0, 0)}")
print(f"1의 개수: {label_counts2.get(1, 0)}")

0의 개수: 23
1의 개수: 30
