In [13]:
# 기존 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
import os

# 데이터 불러오기
if os.path.exists('semi.csv'):
    df = pd.read_csv('semi.csv')
else:
    raise FileNotFoundError("CSV 파일이 없습니다. 데이터를 업로드해주세요.")

# 데이터 확인
print(f"Shape of the dataset: {df.shape}")
print(df.info())

# Step 1: 결측값 처리
# 50% 이상 결측값이 있는 열은 삭제
threshold = len(df) * 0.5
df_cleaned = df.dropna(thresh=threshold, axis=1)

# 숫자형 열에 대해서만 결측값을 평균값으로 대체
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
df_cleaned[numeric_columns] = df_cleaned[numeric_columns].fillna(df_cleaned[numeric_columns].mean())

# Step 2: '

# 'Time' 열은 더 이상 필요 없으므로 삭제
df_cleaned = df_cleaned.drop(columns=['Time'])

# Step 3: 데이터 스케일링 (Pass/Fail을 제외한 나머지 열을 스케일링)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_cleaned.drop(columns=['Pass/Fail']))

# 스케일링된 데이터프레임 생성
df_scaled = pd.DataFrame(scaled_features, columns=df_cleaned.columns[:-1])
df_scaled['Pass/Fail'] = df_cleaned['Pass/Fail'].values

# 스케일링된 데이터 확인
print(df_scaled.head())

# Step 4: 학습 데이터와 테스트 데이터로 분할
X = df_scaled.drop(columns=['Pass/Fail'])
y = df_scaled['Pass/Fail']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = np.where(y_train == -1, 0, 1)
y_test = np.where(y_test == -1, 0, 1)



from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1234)
rfe = RFE(lr, n_features_to_select=300)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

pca = PCA(n_components=150)
X_train = pca.fit_transform(X_train_rfe)
X_test = pca.transform(X_test_rfe)







# Step 5: 모델 훈련 및 성능 평가 (XGBoost 모델을 예시로 사용)
model = KNeighborsClassifier(n_neighbors=3,metric='cosine',algorithm='brute',weights='uniform')
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]

threshold = 0.45
y_pred = (y_pred_proba >= threshold).astype(int)


# 성능 평가
accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score: {accuracy}")


Shape of the dataset: (1567, 592)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB
None
          0         1         2         3         4    5         6         7  \
0  0.224309  0.849725 -0.436273  0.033555 -0.050580  0.0 -0.563790  0.266269   
1  1.107136 -0.382910  1.017137  0.153067 -0.060045  0.0  0.198217  0.322244   
2 -1.114158  0.799102 -0.481289  0.686213 -0.047906  0.0 -0.906210  0.255074   
3 -0.350312 -0.198875 -0.051547 -1.106948 -0.051290  0.0  0.503246 -0.013602   
4  0.242143  0.087526  1.117387 -0.158919 -0.047492  0.0 -0.115382  0.187905   

          8         9  ...       577       582        583        584  \
0  0.509826  1.128417  ... -0.135520  0.118699  -0.204890  -0.093207   
1  0.456999  0.022582  ... -0.460054  0.530203   0.406679   0.444706   
2 -0.260907  0.327183  ... -0.590505 -1.262780   0.022264   0.014375   
3  0.343218 -0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[numeric_columns] = df_cleaned[numeric_columns].fillna(df_cleaned[numeric_columns].mean())


Balanced Accuracy Score: 0.5382183908045977


In [58]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# SVC를 위한 베이지안 최적화 설정
search_space = {
    'C': Real(1e-6, 1000.0, prior='log-uniform'),  # 정규화 매개변수
    'gamma': Real(1e-6, 1.0, prior='log-uniform'),  # 커널 계수
    'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),  # 커널 종류
    'degree': Integer(1, 5),  # 다항 커널을 사용할 경우 차수
    'class_weight': Categorical([None, 'balanced'])  # 클래스 가중치
}

# SVC 모델 초기화
model = SVC()

# BayesSearchCV 설정

opt = BayesSearchCV(
    model,
    search_space,
    n_iter=100,  # 최대 100번의 파라미터 조합을 테스트
    cv=2,  # 5-Fold Cross Validation
    scoring='f1',  # balanced accuracy score를 기준으로 최적화
    n_jobs=-1,  # 모든 CPU 코어 사용
    random_state=123  # 결과 재현성을 위한 random_state
)

# 모델 최적화 (여기서 X_train, y_train 사용)
opt.fit(X_train, y_train)

# 최적의 파라미터 출력
print(f"Best Parameters: {opt.best_params_}")

# 최적 파라미터를 사용하여 다시 모델을 학습 (전체 훈련 데이터를 사용)
best_model = opt.best_estimator_

# 최적 모델을 이용한 테스트셋 예측
y_pred = best_model.predict(X_test)

# 테스트셋에서의 balanced accuracy 계산
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


Best Parameters: OrderedDict([('C', 222.08332588348782), ('class_weight', 'balanced'), ('degree', 5), ('gamma', 0.0010624665872805631), ('kernel', 'sigmoid')])
Balanced Accuracy on Test Set: 0.6551724137931034


In [33]:
model = KNeighborsClassifier(n_neighbors=3,metric='cosine',algorithm='brute')
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]

threshold = 0.25
y_pred = (y_pred_proba >= threshold).astype(int)


# 성능 평가
accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score: {accuracy}")

Balanced Accuracy Score: 0.6619252873563218


In [57]:
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score

# SVC 모델 초기화 (확률 추정을 사용하기 위해 probability=True로 설정)
model = SVC(C=1.3, class_weight='balanced', degree=3, probability=True,kernel='linear')
model.fit(X_train, y_train)

# 확률 예측
y_pred_proba = model.predict_proba(X_test)

# 특정 임계값(threshold)을 적용한 예측
threshold = 0.3
y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)  # 이진 예측을 위한 변환

# 성능 평가
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


Balanced Accuracy on Test Set: 0.4982758620689655
