In [None]:
import pandas as pd
import random
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def seed_everything(seed):
    """Seed every random source this notebook touches with the same value.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's global random state.

    Args:
        seed: Integer seed forwarded to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(37)  # fix the seed for the whole notebook

In [None]:
# Load the competition train / test tables.
train_csv_path = './drive/MyDrive/dacon_train.csv'
test_csv_path = './drive/MyDrive/dacon_test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Columns that are not model inputs: row identifiers and the two targets.
id_cols = ['PRODUCT_ID', 'TIMESTAMP']
target_cols = ['Y_Class', 'Y_Quality']

# Targets: Y_Quality is the regression target, Y_Class the classification target.
train_y = train_df['Y_Quality']
train_y1 = train_df['Y_Class']

# Feature tables, with every missing value replaced by 0.
train_x = train_df.drop(columns=id_cols + target_cols).fillna(0)
test_x = test_df.drop(columns=id_cols).fillna(0)

train_x

In [None]:
from sklearn.preprocessing import StandardScaler #표준화

# Standardize the sensor features. The scaler is fitted on the training rows only
# and the fitted statistics are reused for the test rows.
feature_cols = train_x.filter(regex='X').columns.tolist()

std = StandardScaler()
std.fit(train_x[feature_cols])
train_x_std = std.transform(train_x[feature_cols])
test_x_std = std.transform(test_x[feature_cols])

# Write the scaled values back by column name. The previous positional write
# (`iloc[:, 2:]`) silently assumed that exactly two non-feature columns precede
# the features, which is unrelated to how the features were selected (regex 'X').
train_x.loc[:, feature_cols] = train_x_std
test_x.loc[:, feature_cols] = test_x_std

In [None]:
qual_col = ['LINE', 'PRODUCT_CODE']  # categorical columns to label-encode

for col in qual_col:
    # Fit on the training values only, then encode the training column.
    encoder = LabelEncoder().fit(train_x[col])
    train_x[col] = encoder.transform(train_x[col])

    # Values that occur only in the test column are appended to the encoder's
    # known classes so that transform() can encode them too.
    for value in np.unique(test_x[col]):
        if value in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, value)
    test_x[col] = encoder.transform(test_x[col])

train_x

In [None]:
# Hold out 20% of the training rows to measure model quality.
# The features and both targets are split in ONE call so that they are guaranteed
# to share the same row partition. Two separate calls only stay aligned as long as
# the seed and the row count happen to match.
x_train, x_test, y_train, y_test, y_train1, y_test1 = train_test_split(
    train_x, train_y, train_y1, test_size=0.2, random_state=37
)

In [None]:
pip install bayesian-optimization
pip install catboost

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from catboost import CatBoostClassifier, Pool
from catboost import CatBoostRegressor, Pool

# Hyperparameter tuning
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_test``.

    Both inputs are converted with ``np.array``. The result is
    ``mean(|(y_test - y_pred) / y_test|) * 100``. There is no guard against
    zeros in ``y_test``.

    Args:
        y_test: Array-like of reference values (the denominator).
        y_pred: Array-like of predicted values.

    Returns:
        The error expressed in percent.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

# Objective function for the Bayesian search.
# (The name is historical: the model being tuned is a CatBoostRegressor.)
def XGB_cv(max_depth, learning_rate, n_estimators, min_child_samples,
           bagging_temperature, random_strength, subsample, l2_leaf_reg,
           colsample_bylevel, silent=True):
    """Train one CatBoostRegressor on the hold-out split and score it.

    The optimizer proposes every hyperparameter as a float, so the integer-valued
    ones (``max_depth``, ``n_estimators``, ``min_child_samples``) are truncated
    with ``int`` before being handed to CatBoost.

    The model is fitted on the notebook-level ``x_train`` / ``y_train`` and
    evaluated on ``x_test`` / ``y_test``.

    Args:
        max_depth, learning_rate, n_estimators, min_child_samples,
        bagging_temperature, random_strength, subsample, l2_leaf_reg,
        colsample_bylevel: CatBoost hyperparameters under search.
        silent: When True (default) CatBoost's per-iteration training log is
            suppressed.

    Returns:
        ``r2 - rmse``. The search *maximizes* this value, so a higher R^2 and a
        lower RMSE both improve the score.
    """
    # Define the model.
    model = CatBoostRegressor(
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        bagging_temperature=bagging_temperature,
        random_strength=random_strength,
        min_child_samples=int(min_child_samples),
        subsample=subsample,
        colsample_bylevel=colsample_bylevel,
        l2_leaf_reg=l2_leaf_reg,
        loss_function='RMSE',
    )
    # Train the model.
    model.fit(x_train, y_train, verbose=not silent)

    # Predict on the hold-out rows.
    y_pred = model.predict(x_test)

    # Compute the metrics.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # RMSE is an error (lower is better) and the optimizer maximizes, so it must
    # enter with a negative sign. `r2 + rmse` rewarded models with a LARGER error.
    return r2 - rmse

# Import the BayesianOptimization class from the bayesian-optimization library
from bayes_opt import BayesianOptimization
import numpy as np

# Search space: (lower, upper) bound for every hyperparameter XGB_cv receives.
pbounds = dict(
    max_depth=(1, 7),
    learning_rate=(0.01, 0.2),
    n_estimators=(5000, 20000),
    min_child_samples=(1, 50),
    random_strength=(0, 200),
    bagging_temperature=(0.01, 255),
    subsample=(0.2, 1),
    colsample_bylevel=(0.2, 1),
    l2_leaf_reg=(1, 200),
)

# verbose=2: always print, verbose=1: print only when a new maximum is found,
# verbose=0: print nothing.
bo = BayesianOptimization(
    f=XGB_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=37,
)

# init_points: number of initial random-search evaluations.
# n_iter: number of optimization steps (how many input/target points to probe;
#         more steps give a more accurate result).
# acq: acquisition function; 'ei' selects Expected Improvement.
# xi: exploration strength (the original note lists 0.0 as the default).
# NOTE(review): whether maximize() accepts `acq` / `xi` depends on the installed
# bayesian-optimization version, and the install is not pinned. TODO confirm.
search_settings = dict(init_points=2, n_iter=20, acq='ei', xi=0.01)
bo.maximize(**search_settings)

# In the printed log, 'iter' is the iteration number, 'target' is the value of the
# objective function, and the remaining columns are the sampled inputs.
# (The original note about iterations that reach a new maximum was left unfinished.)

# Show the best parameters found.
print(bo.max)

In [None]:

# Hold-out evaluation: fit both stages on the 80% split and score them on the
# 20% split before predicting the real test set.
from sklearn.metrics import precision_score
from xgboost import XGBClassifier  # used below but never imported in this notebook

# Stage 1: regress Y_Quality from the features.
# - loss_function is 'RMSE', the same as in the tuning objective and in the final
#   submission model, so this score describes the model that is actually submitted
#   (this cell previously used 'MAE').
# - min_child_samples is truncated to an int, exactly as XGB_cv does while tuning.
model = CatBoostRegressor(
    l2_leaf_reg=30,
    bagging_temperature=8.338089576931216,
    colsample_bylevel=0.3705090418296811,
    learning_rate=0.05824132807180273,
    max_depth=5,
    min_child_samples=int(17.32446440935138),
    n_estimators=9847,
    random_strength=70.01987348280203,
    subsample=0.4185277051886087,
    loss_function='RMSE',
)
# Train the model.
model.fit(x_train, y_train)

y_pred2 = model.predict(x_test)
print(y_pred2)
print(r2_score(y_test, y_pred2))

# Stage 2: classify Y_Class from Y_Quality alone. The classifier is trained on the
# TRUE quality values of the training split and applied to the PREDICTED quality
# of the hold-out split. Both are reshaped into a single-feature column.
quality_train = np.array(y_train).reshape(-1, 1)
quality_pred = np.array(y_pred2).reshape(-1, 1)

xgboost1 = XGBClassifier(
    random_state=37,
    colsample_bytree=0.8202199218715005,
    learning_rate=0.011507229876304158,
    max_depth=4,
    min_child_weight=0.3320122820098772,
    n_estimators=6577,
    subsample=0.5896753878923751,
)
xgboost1.fit(quality_train, y_train1)
y_pred3 = xgboost1.predict(quality_pred)

print(y_pred3)
print(precision_score(y_test1, y_pred3, average='micro'))
print(precision_score(y_test1, y_pred3, average='macro'))
print(precision_score(y_test1, y_pred3, average='weighted'))

In [None]:

# Final run: fit both stages on ALL training rows and predict the real test set.
from sklearn.metrics import precision_score
from xgboost import XGBClassifier  # used below but never imported in this notebook

# Stage 1: regress Y_Quality from the features.
# min_child_samples is truncated to an int, exactly as XGB_cv does while tuning.
model = CatBoostRegressor(
    l2_leaf_reg=30,
    bagging_temperature=8.338089576931216,
    colsample_bylevel=0.3705090418296811,
    learning_rate=0.05824132807180273,
    max_depth=5,
    min_child_samples=int(17.32446440935138),
    n_estimators=9847,
    random_strength=70.01987348280203,
    subsample=0.4185277051886087,
    loss_function='RMSE',
)
# Train the model.
model.fit(train_x, train_y)

y_pred = model.predict(test_x)
print(y_pred)

# Stage 2: classify Y_Class from Y_Quality alone. The classifier is trained on the
# true training quality values and applied to the predicted test quality.
# Both are reshaped into a single-feature column.
quality_train = np.array(train_y).reshape(-1, 1)
quality_pred = np.array(y_pred).reshape(-1, 1)

xgboost1 = XGBClassifier(
    random_state=37,
    colsample_bytree=0.8202199218715005,
    learning_rate=0.011507229876304158,
    max_depth=4,
    min_child_weight=0.3320122820098772,
    n_estimators=6577,
    subsample=0.5896753878923751,
)
xgboost1.fit(quality_train, train_y1)
y_pred1 = xgboost1.predict(quality_pred)

In [None]:
# Fill the sample submission's Y_Class column with the final class predictions
# and write the result out as the submission file.
submit = pd.read_csv('./drive/MyDrive/sample_submission.csv').assign(Y_Class=y_pred1)
submit.to_csv('./baseline_submission26.csv', index=False)