# Lab2

In [912]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import urllib.request
import urllib.request
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import os
from imblearn.over_sampling import SMOTE

print('-------------------')
print('|     lab2        |')
print('-------------------')

if os.path.exists('/semi.csv'):
    '''
    혹시 파일이 생기지 않는다면, 아래 두 줄의 스크립스틑 파이썬 .py파일로 만들어서 실행하면 됩니다. 
    '''
    url = "https://drive.google.com/uc?export=download&id=1XCU0eo2xZ03xhxJhdrCnVjduCoaBQ7kJ"
    urllib.request.urlretrieve(url, "semi.csv")  # save in a file
else:
    print('data already exist')


-------------------
|     lab2        |
-------------------
data already exist


In [913]:
df = pd.read_csv('semi.csv')

## 데이터 null값을 전체 컬럼에서 구합니다. 41951개의 Null data가 존재합니다.

In [914]:
df.isnull().sum().sum()


np.int64(41951)

## 데이터 info를 확인해보니 Object로 문자열로 인코딩된 데이터가 한 개 존재합니다.

In [915]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB


In [916]:
def remove_collinear_features(x, threshold = 0.5):
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []
    
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            # 절대값을 씌우는 이유는
            # corr 절대값이 높은거를 제거하면 되기 때문에
            val = abs(item.values)
            
            if val >= threshold:
                print(col.values[0], '|', row.values[0], '|', round(val[0][0], 2))
                drop_cols.append(col.values[0])
                
    drops = set(drop_cols)
    x = x.drop(columns=drops)
            
    return x


### 데이터 decribe를 확인해보니 std가 상당히 큰 걸 알 수 있습니다. 
### 또한 0 or 100으로만 이루어진 std =0 인 데이터가 존재합니다.

In [917]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,581,582,583,584,585,586,587,588,589,Pass/Fail
count,1561.0,1560.0,1553.0,1553.0,1553.0,1553.0,1553.0,1558.0,1565.0,1565.0,...,618.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1566.0,1567.0
mean,3014.452896,2495.850231,2200.547318,1396.376627,4.197013,100.0,101.112908,0.121822,1.462862,-0.000841,...,97.934373,0.500096,0.015318,0.003847,3.067826,0.021458,0.016475,0.005283,99.670066,-0.867262
std,73.621787,80.407705,29.513152,441.69164,56.35554,0.0,6.237214,0.008961,0.073897,0.015116,...,87.520966,0.003404,0.01718,0.00372,3.578033,0.012358,0.008808,0.002867,93.891919,0.49801
min,2743.24,2158.75,2060.66,0.0,0.6815,100.0,82.1311,0.0,1.191,-0.0534,...,0.0,0.4778,0.006,0.0017,1.1975,-0.0169,0.0032,0.001,0.0,-1.0
25%,2966.26,2452.2475,2181.0444,1081.8758,1.0177,100.0,97.92,0.1211,1.4112,-0.0108,...,46.1849,0.4979,0.0116,0.0031,2.3065,0.013425,0.0106,0.0033,44.3686,-1.0
50%,3011.49,2499.405,2201.0667,1285.2144,1.3168,100.0,101.5122,0.1224,1.4616,-0.0013,...,72.2889,0.5002,0.0138,0.0036,2.75765,0.0205,0.0148,0.0046,71.9005,-1.0
75%,3056.65,2538.8225,2218.0555,1591.2235,1.5257,100.0,104.5867,0.1238,1.5169,0.0084,...,116.53915,0.502375,0.0165,0.0041,3.295175,0.0276,0.0203,0.0064,114.7497,-1.0
max,3356.35,2846.44,2315.2667,3715.0417,1114.5366,100.0,129.2522,0.1286,1.6564,0.0749,...,737.3048,0.5098,0.4766,0.1045,99.3032,0.1028,0.0799,0.0286,737.3048,1.0


### NaN 즉, 발산해버린 센서 데이터도 존재하는 것을 볼 수 있습니다.

In [918]:
df.isna().sum().sum()

np.int64(41951)

## DuplicatedHandler

### 1. 데이터의 중복이 존재하는 것을 볼 수 있습니다.
### 2. 모든 컬럼을 돌면서 st, me, mx, mn : tuple 값을 stack에 넣고 iter합니다.
### 3. stack에 동일한 분포를 갖는 컬럼이 있으면 idx를 저장하고 drop at axis = 1을 통해 drop합니다.

In [919]:
def DuplicatedHandler(df: pd.DataFrame):
    features = range(590)
    stack = []
    idx = []
    for i, fe in enumerate(features):
        st = df[str(fe)].std()
        me = df[str(fe)].mean()
        mx = df[str(fe)].max()
        mn = df[str(fe)].min()
        if (st, me, mx, mn) in stack:
            idx.append(fe)
            print('duplicated!', i)
        else:
            stack.append((st, me, mx, mn))

    for i in idx:
        df = df.drop(str(i), axis=1)
    df = df.drop(['Time'], axis=1)
    return df


df = DuplicatedHandler(df)
df = remove_collinear_features(df)
feature = df.drop('Pass/Fail', axis=1)
target = df['Pass/Fail']
X_train, X_test, y_train, y_test = train_test_split(feature, target, \
                                                    test_size=0.2, random_state=11, stratify=target)

X_train, X_test, y_train, y_test = pd.DataFrame(X_train), pd.DataFrame(X_test), y_train, y_test

duplicated! 52
duplicated! 69
duplicated! 97
duplicated! 141
duplicated! 149
duplicated! 178
duplicated! 179
duplicated! 186
duplicated! 189
duplicated! 190
duplicated! 191
duplicated! 192
duplicated! 193
duplicated! 194
duplicated! 226
duplicated! 229
duplicated! 230
duplicated! 231
duplicated! 232
duplicated! 233
duplicated! 234
duplicated! 235
duplicated! 236
duplicated! 237
duplicated! 240
duplicated! 241
duplicated! 242
duplicated! 243
duplicated! 256
duplicated! 257
duplicated! 258
duplicated! 259
duplicated! 260
duplicated! 261
duplicated! 262
duplicated! 263
duplicated! 264
duplicated! 265
duplicated! 266
duplicated! 276
duplicated! 284
duplicated! 313
duplicated! 314
duplicated! 315
duplicated! 322
duplicated! 325
duplicated! 326
duplicated! 327
duplicated! 328
duplicated! 329
duplicated! 330
duplicated! 364
duplicated! 369
duplicated! 370
duplicated! 371
duplicated! 372
duplicated! 373
duplicated! 374
duplicated! 375
duplicated! 378
duplicated! 379
duplicated! 380
duplicated!

In [920]:
X_train

Unnamed: 0,0,1,2,3,4,5,8,9,10,11,...,563,564,565,571,572,578,579,581,582,583
425,3079.17,2405.56,2217.3777,1425.1041,1.7585,100.0,1.4794,-0.0198,-0.0004,0.9535,...,,,,2.2846,9.3600,0.0234,0.0073,31.3771,0.5080,0.0139
412,2989.85,2501.88,2197.2333,1435.1460,0.9740,100.0,1.5330,-0.0059,0.0228,0.9490,...,0.5802,6.11,0.1208,2.2688,11.1800,,,,0.4976,0.0184
115,3002.85,2502.05,2232.5889,1717.2750,1.6700,100.0,1.4518,0.0066,0.0151,0.9659,...,,,,2.2788,11.6000,0.0364,0.0166,45.6835,0.4995,0.0093
887,3007.75,2535.14,2216.5000,1111.5436,0.8373,100.0,1.4503,0.0149,0.0010,0.9580,...,0.5671,4.98,0.0877,2.3473,10.9300,,,,0.4948,0.0099
328,2894.04,2490.06,2207.0444,1330.6718,1.3076,100.0,1.5546,-0.0107,0.0072,0.9600,...,,,,1.8070,8.9200,,,,0.4963,0.0208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1332,3045.48,2408.85,2223.0444,1194.5986,1.2016,100.0,1.3838,-0.0125,0.0073,0.9726,...,0.7429,7.92,0.2796,2.0396,8.0900,,,,0.4990,0.0137
1447,2949.12,2553.24,2176.8000,1461.4374,0.8864,100.0,1.5576,-0.0205,0.0095,0.9777,...,0.5671,4.98,0.0877,2.2909,9.1299,,,,0.5031,0.0111
1412,3025.46,2516.06,2195.9778,1388.2869,1.5605,100.0,1.4298,0.0122,0.0176,0.9611,...,0.5671,4.98,0.0877,2.2909,9.1299,0.0378,0.0651,171.9936,0.4985,0.0169
270,2988.52,2291.92,2183.5777,1764.5386,1.7050,100.0,1.4305,0.0001,-0.0054,0.9615,...,,,,1.6889,9.8800,0.0274,0.0142,51.9067,0.4999,0.0095


### Create feature Handler
### 1. Null 값을 핸들링하는데, 데이터의 대부분이 비어있으면 (대략 1/3) 해당 컬럼을 drop합니다.
### 2. NaN value는 이전값 또는 이후 값으로 채웁니다. (이런 방식은 추천되지 않지만, 해당 데이터에서는 운이 좋게도 이런 방식으로 진행했을 때 데이터 무결성이 보장되는 것을 확인했습니다.)
### 3. zero 값으로 이루어진 column은 drop합니다.
### 4. smote를 통해 데이터 oversampling을 하는데 해당 알고리즘은 knn그룹 상에서 비슷한 데이터 분포를 가상으로 만들어내는 것으로 알고 있습니다.
### 5. log scaler을 처음에 사용하려 했지만 NaN으로 값이 발산하는 문제가 있기에 std_scaler를 사용하겠습니다.

In [921]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


def NullDropHandler(train, test):
    tmp_stack = []
    for column in train.columns:
        if train[column].isnull().sum() > 500:
            train = train.drop([column], axis=1)
            test = test.drop([column], axis=1)
            tmp_stack.append(column)
    return train, test


def NaNValueHandler(df: pd.DataFrame):
    df = df.ffill()
    df = df.bfill()
    return df


def drop_zero_columns(train, test):
    zero_columns = train.columns[(train.mean() == 0) & (train.max() == 0) & (train.min() == 0)]
    train = train.drop(zero_columns, axis=1)
    test = test.drop(zero_columns, axis=1)
    return train, test


def augment_data_with_smote(X, y):
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    return X_resampled, y_resampled


'''def LogTransform(train, test):
    numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
    # Use np.where to handle negative values before applying log
    train[numeric_features] = train[numeric_features].apply(lambda x: np.log1p(np.where(x < 0, 0, x)))
    test[numeric_features] = test[numeric_features].apply(lambda x: np.log1p(np.where(x < 0, 0, x)))
    return train, test'''


def DataHandler(X_train, X_test, y_train, y_test):
    X_train, X_test = NullDropHandler(X_train.copy(), X_test.copy())
    X_train = NaNValueHandler(X_train)
    X_test = NaNValueHandler(X_test)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_train, X_test = drop_zero_columns(pd.DataFrame(X_train), pd.DataFrame(X_test))
    X_train, y_train = augment_data_with_smote(X_train, y_train)

    return X_train, X_test, np.array(y_train).ravel(), np.array(y_test).ravel()


# Usage
X_train, X_test, y_train, y_test = DataHandler(X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = pd.DataFrame(X_train), pd.DataFrame(X_test), pd.DataFrame(y_train), pd.DataFrame(
    y_test)

In [922]:
print('is NaN data check : ', X_train.isna().sum().sum())
print('is NaN data check : ', X_test.isna().sum().sum())

is NaN data check :  0
is NaN data check :  0


In [923]:
X_train

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,146,147,148,149,150,151,152,153,154,155
0,0.881309,-1.100675,0.568447,0.040984,-0.049872,0.209377,-1.238595,-0.079477,-1.136553,-0.937733,...,-0.720581,-0.884802,1.053761,-0.823185,-0.110345,-0.304470,0.657738,-0.226864,2.315336,-0.078402
1,-0.325747,0.086066,-0.113297,0.062878,-0.062384,0.925505,-0.326870,2.429340,-1.608314,-1.002535,...,0.146397,0.893408,1.053761,-0.823185,-0.110345,-0.304470,0.601695,-0.206288,-0.714307,0.157986
2,-0.150067,0.088160,1.083238,0.677975,-0.051283,-0.159375,0.493027,1.596672,0.163412,-0.415426,...,-0.538443,-0.777107,1.053761,-0.823185,-0.110345,-0.304470,0.637165,-0.201539,-0.160814,-0.320042
3,-0.083850,0.495856,0.538743,-0.642640,-0.064564,-0.179416,1.037438,0.071917,-0.664792,-0.773402,...,-0.376947,-0.885298,0.215580,-0.930100,-0.545908,-0.722336,0.880136,-0.209114,-1.529980,-0.288524
4,-1.620508,-0.059566,0.218739,-0.164897,-0.057063,1.214093,-0.641710,0.742377,-0.455120,0.264198,...,-0.466802,-0.621768,0.215580,-0.930100,-0.545908,-0.722336,-1.036322,-0.231838,-1.093012,0.284059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,-1.098304,-0.006497,0.148130,-0.198164,-0.054654,0.081778,1.030151,-0.746428,-0.258622,0.844106,...,-0.464426,-0.425466,-1.058568,1.073903,1.237163,-0.128304,1.048743,-0.252008,0.846804,0.022656
2336,0.428660,-0.873879,-0.437387,-0.555109,-0.063702,-0.179439,-0.355186,0.270431,0.758173,-0.301445,...,-0.431059,0.956964,-1.458157,-0.469792,-0.300080,0.488629,0.912379,-0.218612,0.835660,-0.057084
2337,0.847122,-0.088783,-0.270408,-0.229245,-0.057170,-0.652952,-0.454887,-0.270698,0.119768,-2.008651,...,0.049044,0.511741,0.040958,-0.448858,0.011993,-0.870104,0.152377,-0.265283,-0.340871,-0.050825
2338,-1.118166,-0.328587,-0.403791,0.117403,-0.063614,0.557276,-0.938778,0.873380,0.506746,-0.590578,...,-0.498256,-0.601556,-2.002622,-0.159644,-0.939930,0.357896,-0.342407,-0.232328,1.220270,-0.164780


### target 값을 확인해서 category와 smote를 통한 데이터 증강을 확인합니다.

In [924]:
tmp = pd.DataFrame(y_train)
tmp.value_counts()

0 
-1    1170
 1    1170
Name: count, dtype: int64

### 0으로 이루어진 column index를 확인합니다. 
### index가 없으니 zero data 무결성이 보장되었습니다.

In [925]:
print(X_train.columns[(X_train.mean() == 0) & (X_train.max() == 0) & (X_train.min() == 0)])

Index([], dtype='int64')


### data scaler를 통한 노말라이즈 또한 잘 이루어진 것을 볼 수 있습니다.

In [926]:
X_train

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,146,147,148,149,150,151,152,153,154,155
0,0.881309,-1.100675,0.568447,0.040984,-0.049872,0.209377,-1.238595,-0.079477,-1.136553,-0.937733,...,-0.720581,-0.884802,1.053761,-0.823185,-0.110345,-0.304470,0.657738,-0.226864,2.315336,-0.078402
1,-0.325747,0.086066,-0.113297,0.062878,-0.062384,0.925505,-0.326870,2.429340,-1.608314,-1.002535,...,0.146397,0.893408,1.053761,-0.823185,-0.110345,-0.304470,0.601695,-0.206288,-0.714307,0.157986
2,-0.150067,0.088160,1.083238,0.677975,-0.051283,-0.159375,0.493027,1.596672,0.163412,-0.415426,...,-0.538443,-0.777107,1.053761,-0.823185,-0.110345,-0.304470,0.637165,-0.201539,-0.160814,-0.320042
3,-0.083850,0.495856,0.538743,-0.642640,-0.064564,-0.179416,1.037438,0.071917,-0.664792,-0.773402,...,-0.376947,-0.885298,0.215580,-0.930100,-0.545908,-0.722336,0.880136,-0.209114,-1.529980,-0.288524
4,-1.620508,-0.059566,0.218739,-0.164897,-0.057063,1.214093,-0.641710,0.742377,-0.455120,0.264198,...,-0.466802,-0.621768,0.215580,-0.930100,-0.545908,-0.722336,-1.036322,-0.231838,-1.093012,0.284059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,-1.098304,-0.006497,0.148130,-0.198164,-0.054654,0.081778,1.030151,-0.746428,-0.258622,0.844106,...,-0.464426,-0.425466,-1.058568,1.073903,1.237163,-0.128304,1.048743,-0.252008,0.846804,0.022656
2336,0.428660,-0.873879,-0.437387,-0.555109,-0.063702,-0.179439,-0.355186,0.270431,0.758173,-0.301445,...,-0.431059,0.956964,-1.458157,-0.469792,-0.300080,0.488629,0.912379,-0.218612,0.835660,-0.057084
2337,0.847122,-0.088783,-0.270408,-0.229245,-0.057170,-0.652952,-0.454887,-0.270698,0.119768,-2.008651,...,0.049044,0.511741,0.040958,-0.448858,0.011993,-0.870104,0.152377,-0.265283,-0.340871,-0.050825
2338,-1.118166,-0.328587,-0.403791,0.117403,-0.063614,0.557276,-0.938778,0.873380,0.506746,-0.590578,...,-0.498256,-0.601556,-2.002622,-0.159644,-0.939930,0.357896,-0.342407,-0.232328,1.220270,-0.164780


### train데이터의 shape을 확인해 smote의 적용이 잘 되었는지 확인합니다. 천 개의 데이터에서 2340개의 데이터로 oversampling 되었다는 것을 볼 수 있습니다.

In [927]:
X_train.shape

(2340, 152)

In [928]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

## 차원의 저주, high-d feature handling
### 1. 처음에는 xgboost를 통해 xgboost가 loss를 떨어뜨리기 위해 어떤 feature를 중요하게 봤는지 확인한 후 해당 feature에서 200개의 feature만 sampling 하기로 했지만, 이는 잘 작동하지 않았습니다.
### 2. 또한 feature selection에서 시간 지연을 방지하기 위해서 간단한 모델을 RFE를 통해 feature selection을 진행했습니다.
### 3. 이후에 PCA 차원축소를 같이 진행했습니다. 축소 후 분산을 확인해서 해당 데이터가 원본 데이터를 얼마나 잘 대변하는가? 를 확인하여 0.95를 가이드라인으로 잡았습니다. 이는 150입니다.

In [929]:
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

lr = LogisticRegression(random_state=1234)
rfe = RFE(lr, n_features_to_select=300)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

pca = PCA(n_components=150)
X_train = pca.fit_transform(X_train_rfe)
X_test = pca.transform(X_test_rfe)



In [930]:
y_train = np.where(y_train == -1, 0, 1)
y_test = np.where(y_test == -1, 0, 1)

### -1, 1의 target에서 0, 1의 target으로 re-handling합니다.

In [931]:
tmp = pd.DataFrame(y_train)
tmp.value_counts()

0
0    1170
1    1170
Name: count, dtype: int64

In [932]:
import numpy as np

# Convert the classes parameter to a numpy array

y_train = pd.DataFrame(y_train)

model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.05,
    min_child_weight=1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,

)

model.fit(X_train, y_train)


### cross validation을 통해서 train data가 모델에 적합한지 f1 score를 통해 확인합니다.

In [933]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
print(f'Cross-validated F1 scores: {scores}')

Cross-validated F1 scores: [0.98920086 1.         0.99785867 1.         1.        ]


### overfitting 방지를 위해 파라미터 수정을 조금 진행합니다.

In [934]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# KNeighborsClassifier를 위한 베이지안 최적화 설정
search_space = {
    'n_neighbors': Integer(1, 50),  # 이웃 수 (1 ~ 50 사이)
    'weights': Categorical(['uniform', 'distance']),  # 가중치 옵션
    'algorithm': Categorical(['auto', 'brute']),  # 알고리즘 선택
    'leaf_size': Integer(10, 100),  # 리프 사이즈 (10 ~ 100 사이)
    'p': Integer(1, 5), # Minkowski 거리에서 p 값 (1은 Manhattan 거리, 2는 Euclidean 거리)
    'metric': Categorical(['minkowski', 'hamming', 'dice', 'russellrao', 'rogerstanimoto', 
                           'l2', 'yule', 'cosine', 'euclidean', 'sokalmichener'])
}

# KNeighborsClassifier 초기화
model = KNeighborsClassifier()

# BayesSearchCV 설정
opt = BayesSearchCV(
    model,
    search_space,
    n_iter=100,  # 최대 30번의 파라미터 조합을 테스트
    cv=5,  # 5-Fold Cross Validation
    scoring='balanced_accuracy',  # balanced accuracy score를 기준으로 최적화
    n_jobs=-1,  # 모든 CPU 코어 사용
    random_state=123  # 결과 재현성을 위한 random_state
)

# 모델 최적화 (여기서 X_train, y_train 사용)
opt.fit(X_train, y_train.values.squeeze(axis=1))

# 최적의 파라미터 출력
print(f"Best Parameters: {opt.best_params_}")

# 최적 파라미터를 사용하여 다시 모델을 학습 (전체 훈련 데이터를 사용)
best_model = opt.best_estimator_

# 최적 모델을 이용한 테스트셋 예측
y_pred = best_model.predict(X_test)

# 테스트셋에서의 balanced accuracy 계산
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# SVC를 위한 베이지안 최적화 설정
search_space = {
    'C': Real(1e-6, 1000.0, prior='log-uniform'),  # 정규화 매개변수
    'gamma': Real(1e-6, 1.0, prior='log-uniform'),  # 커널 계수
    'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),  # 커널 종류
    'degree': Integer(1, 5),  # 다항 커널을 사용할 경우 차수
    'class_weight': Categorical([None, 'balanced'])  # 클래스 가중치
}

# SVC 모델 초기화
model = SVC()

# BayesSearchCV 설정

opt = BayesSearchCV(
    model,
    search_space,
    n_iter=100,  # 최대 100번의 파라미터 조합을 테스트
    cv=2,  # 5-Fold Cross Validation
    scoring='balanced_accuracy',  # balanced accuracy score를 기준으로 최적화
    n_jobs=-1,  # 모든 CPU 코어 사용
    random_state=123  # 결과 재현성을 위한 random_state
)

# 모델 최적화 (여기서 X_train, y_train 사용)
opt.fit(X_train, y_train)

# 최적의 파라미터 출력
print(f"Best Parameters: {opt.best_params_}")

# 최적 파라미터를 사용하여 다시 모델을 학습 (전체 훈련 데이터를 사용)
best_model = opt.best_estimator_

# 최적 모델을 이용한 테스트셋 예측
y_pred = best_model.predict(X_test)

# 테스트셋에서의 balanced accuracy 계산
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy on Test Set: {balanced_accuracy}")


In [935]:
model = KNeighborsClassifier(n_neighbors=2,metric='cosine',algorithm='brute',weights='uniform')
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)[:, 1]

threshold = 0.4
y_pred = (y_pred_proba >= threshold).astype(int)

  return self._fit(X, y)


In [936]:
from sklearn.metrics import classification_report, f1_score

print(balanced_accuracy_score(y_test, y_pred))


0.7038030229156509


In [None]:
import joblib

joblib.dump(model, 'best_xgboost_model.pkl')

## conclusion

### 1. 데이터의 분산이 크다보니 pass / fail 이 각각 특징적으로 갖는 feature 분포가 없습니다.
### 2. 해당 데이터 column이 어떤 특성을 갖는지 분석을 진행할 수 없어 feature selection에 한계가 있습니다.
### 3. 1번과 2번의 영향으로 f1스코어가 낮게 나옵니다. 또한 test데이터는 smote를 진행하지않아 이런 현상이 가중화됩니다.

### 이상입니다.

In [None]:
#Rmx