In [192]:

# Libraries for reading and manipulating data
import numpy as np
import pandas as pd

# Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for model management
import pickle
from sklearn.pipeline import Pipeline

# Libraries for data-preprocessing, model building and evaluation
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (KFold,
                                     LeaveOneOut,
                                     StratifiedKFold,
                                     cross_val_score,
                                     train_test_split,
                                     GridSearchCV,
                                     RandomizedSearchCV)

# Classifier models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Library to over-sample a given data
from imblearn.over_sampling import SMOTE
import os

# 데이터 불러오기
if os.path.exists('semi.csv'):
    df = pd.read_csv('semi.csv')
else:
    raise FileNotFoundError("CSV 파일이 없습니다. 데이터를 업로드해주세요.")

# 데이터 확인
print(f"Shape of the dataset: {df.shape}")
print(df.info())


def display_null_count(df):
    for attr in df.columns:
        print('-' * 50)
        print(f'{attr}')
        print('-' * 50)
        print('Null entry count:', df[attr].isnull().sum())
        print('Null entry proportion:', round(100 * df[attr].isnull().sum() / df.shape[0], 2), '%\n')


display_null_count(df[['Time', 'Pass/Fail']])





Shape of the dataset: (1567, 592)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB
None
--------------------------------------------------
Time
--------------------------------------------------
Null entry count: 0
Null entry proportion: 0.0 %

--------------------------------------------------
Pass/Fail
--------------------------------------------------
Null entry count: 0
Null entry proportion: 0.0 %



In [193]:
for attr in df.select_dtypes(include=['float64']).columns:
    null_proportion = (df[attr].isnull().sum() / df.shape[0])  # str(attr) 대신 attr 사용
    if null_proportion > 0.2:
        df = df.drop(attr, axis=1)
    elif null_proportion:
        df[attr] = df[attr].fillna(df[attr].mean())  # str(attr) 대신 attr 사용

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 560 entries, Time to Pass/Fail
dtypes: float64(558), int64(1), object(1)
memory usage: 6.7+ MB


In [194]:
std_0_cols = [col for col in df.select_dtypes(include=['float64']).columns if df[col].std() == 0]
df = df.drop(std_0_cols, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 444 entries, Time to Pass/Fail
dtypes: float64(442), int64(1), object(1)
memory usage: 5.3+ MB


In [195]:
df['Time'] = pd.to_datetime(df['Time'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 444 entries, Time to Pass/Fail
dtypes: datetime64[ns](1), float64(442), int64(1)
memory usage: 5.3 MB


In [196]:
# Attributes with very weak(<0.05) correlation with the target variable 'Pass/Fail'
target_corr = abs(df.corr()['Pass/Fail'])
weak_corr_features = target_corr[target_corr < 0.05]
df.drop(list(weak_corr_features.index), axis=1)

Unnamed: 0,14,21,22,26,28,32,33,38,40,56,...,510,511,542,543,551,554,557,573,575,Pass/Fail
0,7.955800,-5419.00,2916.50,1.7730,64.2333,83.3971,9.5126,86.9555,61.29,0.9317,...,64.6707,0.0000,0.1096,0.0078,0.78,0.3363,1.0297,0.3157,0.1026,-1
1,10.154800,-5441.50,2604.25,2.0143,68.4222,84.9052,9.7997,87.5241,78.25,0.9324,...,141.4365,0.0000,0.1096,0.0078,1.33,0.4989,1.7819,0.2653,0.0772,-1
2,9.515700,-5447.75,2701.75,2.0295,67.1333,84.7569,8.6590,84.7327,14.37,0.9139,...,240.7767,244.2748,0.1096,0.0078,0.85,0.3752,1.1386,0.1882,0.0640,1
3,9.605200,-5468.25,2648.25,2.0038,62.9333,84.9105,8.6789,86.6867,76.90,0.9139,...,113.5593,0.0000,0.1096,0.0078,39.33,17.4781,54.2917,0.1738,0.0525,-1
4,10.566100,-5476.25,2635.25,1.9912,62.8333,86.3269,8.7677,86.1468,76.39,0.9298,...,148.0663,0.0000,0.1096,0.0078,1.98,0.8311,2.5014,0.2224,0.0706,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,11.769200,-5418.75,2608.00,1.9540,71.1444,83.8405,8.7164,86.3672,81.21,0.9204,...,53.1915,235.7895,0.1180,0.0104,0.80,0.3141,1.0367,0.2363,0.0785,-1
1563,9.162000,-6408.75,2277.50,1.8023,72.8444,84.0623,8.9607,86.4051,79.43,0.9255,...,29.4372,700.0000,0.1120,0.0102,1.33,0.5058,1.8156,0.3891,0.1213,-1
1564,9.005371,-5153.25,2707.00,1.9435,71.2667,85.8638,8.1728,86.3506,82.03,0.9353,...,54.8330,0.0000,0.1119,0.0089,1.50,0.5605,2.0228,0.4154,0.1352,-1
1565,9.735400,-5271.75,2676.50,1.9880,70.5111,84.5602,9.1930,86.3130,81.13,0.9207,...,78.4993,456.4103,0.1180,0.0104,1.33,0.5058,1.8156,0.3669,0.1040,-1


In [197]:
# Attributes with high correlation(>0.95)
corr_attrs = list()
signal_df_corr = df.select_dtypes(include=['float64']).corr()
for i in range(len(signal_df_corr.columns)):
    for j in range(i + 1, len(signal_df_corr.columns)):
        if abs(signal_df_corr.iloc[i, j]) > 0.95:
            corr_attrs.append(signal_df_corr.columns[j])
corr_attrs = list(set(corr_attrs))
df = df.drop(corr_attrs, axis=1)

X = df.drop('Pass/Fail', axis=1)
Y = df['Pass/Fail']

In [198]:
# datetime64 타입의 열들을 제외한 피처들만 선택
X = df.select_dtypes(exclude=['datetime64']).iloc[:, :-1]  # 마지막 열 제외
Y = df.iloc[:, -1]  # 마지막 열(타겟 변수)

'''smote = SMOTE(random_state=1, k_neighbors=20)
X_bal, Y_bal = smote.fit_resample(X, Y)
Y_bal = pd.Series(Y_bal)'''
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE

X_bal = X
Y_bal = pd.Series(Y)
x_train, x_test, y_train, y_test = train_test_split(X_bal, Y_bal, test_size=0.30, random_state=1, stratify=Y_bal)
adasyn = ADASYN(sampling_strategy='minority', random_state=42, n_neighbors=5)
x_train, y_train = adasyn.fit_resample(x_train, y_train)

from sklearn.decomposition import PCA



In [199]:
std_scalar = StandardScaler().fit(x_train)
X_train = pd.DataFrame(std_scalar.transform(x_train), columns=x_train.columns)
X_test = pd.DataFrame(std_scalar.transform(x_test), columns=x_train.columns)

pca = PCA(n_components=150)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [200]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# SVC를 위한 베이지안 최적화 설정
search_space = {
    'C': Real(1e-6, 1000.0, prior='log-uniform'),  # 정규화 매개변수
    'gamma': Real(1e-6, 1.0, prior='log-uniform'),  # 커널 계수
    'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),  # 커널 종류
    'degree': Integer(1, 5),  # 다항 커널을 사용할 경우 차수
    'class_weight': Categorical([None, 'balanced'])  # 클래스 가중치
}

# SVC 모델 초기화
model = SVC()

# BayesSearchCV 설정

opt = BayesSearchCV(
    model,
    search_space,
    n_iter=30,  # 최대 100번의 파라미터 조합을 테스트
    cv=2,
    scoring='balanced_accuracy',  # balanced accuracy score를 기준으로 최적화
    n_jobs=-1,
    random_state=123  # 결과 재현성을 위한 random_state
)

# 모델 최적화 (여기서 X_train, y_train 사용)
opt.fit(X_train, y_train)

# 최적의 파라미터 출력
print(f"Best Parameters: {opt.best_params_}")

# 최적 파라미터를 사용하여 다시 모델을 학습 (전체 훈련 데이터를 사용)
best_model = opt.best_estimator_

y_pred = best_model.predict(X_test)

# 테스트셋에서의 balanced accuracy 계산
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


Best Parameters: OrderedDict([('C', 11.653724968169188), ('class_weight', None), ('degree', 2), ('gamma', 0.004711038388676721), ('kernel', 'rbf')])
Balanced Accuracy on Test Set: 0.49772727272727274


In [201]:
model = KNeighborsClassifier(n_neighbors=3, metric='cosine', algorithm='brute')
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]

threshold = 0.3
y_pred = (y_pred_proba >= threshold).astype(int)

# 성능 평가
accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score: {accuracy}")

Balanced Accuracy Score: 0.16129032258064516




In [202]:
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score

# SVC 모델 초기화 (확률 추정을 사용하기 위해 probability=True로 설정)


# 확률 예측
y_pred_proba = opt.best_estimator_.predict(X_test)

# 성능 평가
balanced_accuracy = balanced_accuracy_score(y_test, y_pred_proba)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


Balanced Accuracy on Test Set: 0.49772727272727274
