In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [3]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.9.3-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 KB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.9.3 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [4]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, confusion_matrix
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier, CatBoostRegressor

In [5]:
dpath = 'drive/MyDrive/Seculayer/OSC/data/'

train = pd.read_csv(dpath + 'train.csv')
test = pd.read_csv(dpath + 'test.csv')

In [6]:
guide_list = ['AL',"CA","Y_LABEL"]

for item in test.columns:
    guide_list.append(item)

In [7]:
teacher_X = train[guide_list]


In [8]:
teacher_X = teacher_X.drop(['ID'],axis=1)

# teacher_y = real['Y_LABEL']

In [9]:
le1 = LabelEncoder()
le2 = LabelEncoder()

teacher_X['COMPONENT_ARBITRARY_cat'] = le1.fit_transform(teacher_X['COMPONENT_ARBITRARY'])
teacher_X['YEAR_cat'] = le2.fit_transform(teacher_X['YEAR'])
teacher_X.drop(['YEAR','COMPONENT_ARBITRARY'],axis=1,inplace=True)



In [10]:
categorical_features = ['COMPONENT_ARBITRARY_cat','YEAR_cat']

In [11]:
teacher_X.dropna(axis=1,inplace=True)

In [12]:
teacher_y = teacher_X['Y_LABEL']
train_3 = teacher_X.copy()
teacher_X.drop('Y_LABEL',axis=1,inplace=True)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(teacher_X,teacher_y,test_size=0.3,random_state=42,stratify = teacher_y)

In [None]:
def objective(trial : Trial) -> float :

    params_cat = {
        "random_state" : 39,
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 1),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "max_depth" : trial.suggest_int("max_depth", 3, 16)
  }
    
    model = CatBoostClassifier(**params_cat,eval_metric="F1")
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    cat_pred = model.predict(X_val)
    AUC = roc_auc_score(y_val, cat_pred)
    
    return AUC

In [None]:
# Optuna 초모수 작업 시작

sampler = TPESampler(seed = 2023)
study = optuna.create_study(
    study_name = "cat_parameter_opt",
    direction = "maximize",
    sampler = sampler)
study.optimize(objective, n_trials = 100)

In [None]:
# 가장 좋은 초모수와 성능 확인

print("Best Score :", study.best_value)
print("Best trial :", study.best_trial.params)

In [None]:
'learning_rate': 0.89449713934227, 'n_estimators': 992, 'max_depth': 5}

In [14]:
# 위의 초모수 적용하고, StratifiedKFold을 이용해 모델 적합 
import pickle

n_fold = 5
cv = StratifiedKFold(n_splits = n_fold, shuffle = True, random_state = 39)

cat_val = np.zeros((teacher_X.shape[0], 2))
cat_train = np.zeros((X_train.shape[0], 2))

for i, (i_trn, i_val) in enumerate(cv.split(teacher_X, teacher_y), 1):
    print(f'training model for CV #{i}')
    optuna_cat = CatBoostClassifier(
        random_state = 39,
        learning_rate = 0.89449713934227, 
        n_estimators = 992, 
        max_depth = 5,
        eval_metric="F1")

    optuna_cat.fit(teacher_X.loc[i_trn, :], teacher_y[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val, :] = optuna_cat.predict_proba(teacher_X.loc[i_val, :])
    cat_train += optuna_cat.predict_proba(X_train) / n_fold

    # 학습이 완료된 Teacher 모델 5개를 저장 
    with open('Teacher_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5


In [15]:
# Teacher model에서 구한 예측불량률을 train 데이터에 넣기

train_3['model1_prob'] = cat_val[:, 1]
print(train_3.shape)

(14095, 22)


In [16]:
test.columns

Index(['ID', 'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'ANONYMOUS_2', 'AG',
       'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V',
       'V40', 'ZN'],
      dtype='object')

In [17]:
guide_list = ['COMPONENT_ARBITRARY_cat','YEAR_cat']

In [18]:
student_list = []
for item in train_3.columns:
    if item in test.columns:
        student_list.append(item)
    if item in guide_list:
        student_list.append(item)

student_X  = train_3[student_list]
student_y = train_3['model1_prob']

In [19]:
X_train, X_val, y_train, y_val = train_test_split(student_X, student_y, test_size = 0.3, random_state = 39)


In [None]:
def objective(trial : Trial) -> float :

    params_cat = {
        "random_state" : 39,
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 1),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "max_depth" : trial.suggest_int("max_depth", 3, 16)
  }
    
    model = CatBoostRegressor(**params_cat)
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    cat_pred = model.predict(X_val)
    MAE = mean_absolute_error(y_val, cat_pred)
    
    return MAE

In [None]:
# Optuna 초모수 작업 시작

sampler = TPESampler(seed = 2023)
study = optuna.create_study(
    study_name = "cat_parameter_opt_reg",
    direction = "minimize",
    sampler = sampler)
study.optimize(objective, n_trials = 100)

In [None]:
'learning_rate': 0.08540413612790043, 'n_estimators': 678, 'max_depth': 13

In [20]:
test2 = test.drop(['ID'],axis=1)
test2['COMPONENT_ARBITRARY_cat'] = le1.transform(test2['COMPONENT_ARBITRARY'])
test2['YEAR_cat'] = le2.transform(test2['YEAR'])
test3 = test2.drop(['COMPONENT_ARBITRARY', 'YEAR'], axis = 1)
X_test = test3.copy()

In [21]:
import pickle

n_fold = 5
cv = KFold(n_splits = n_fold, shuffle = True, random_state = 39)

cat_val = np.zeros((student_X.shape[0]))
cat_test = np.zeros((X_test.shape[0]))

for i, (i_trn, i_val) in enumerate(cv.split(student_X, student_y), 1):
    print(f'training model for CV #{i}')
    optuna_cat = CatBoostRegressor(
        random_state = 39,
        learning_rate = 0.03431411924930179, 
        n_estimators = 333, 
        max_depth = 11)

    optuna_cat.fit(student_X.loc[i_trn, :], student_y[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val] = optuna_cat.predict(student_X.loc[i_val, :])
    cat_test += optuna_cat.predict(X_test) / n_fold

    # 학습이 완료된 Teacher 모델 5개를 저장 
    with open('Teacher_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5


In [22]:
scores = []
TP = []
FP = []
FN = []
TN = []
for threshold in range(50) :
    threshold = threshold / 50
    pred = cat_val
    pred = np.where(pred >= threshold, 1, 0)
    score = f1_score(teacher_y, pred)
    scores.append(score)
    TP.append(confusion_matrix(teacher_y, pred)[0][0])
    FN.append(confusion_matrix(teacher_y, pred)[0][1])
    FP.append(confusion_matrix(teacher_y, pred)[1][0])
    TN.append(confusion_matrix(teacher_y, pred)[1][1])
    

temp1 = pd.DataFrame(np.linspace(0, 0.98, 50), columns = ['threshold'])
temp2 = pd.DataFrame(scores, columns = ['score'])
temp3 = pd.DataFrame(TP, columns = ['TP'])
temp4 = pd.DataFrame(FP, columns = ['FP'])
temp5 = pd.DataFrame(FN, columns = ['FN'])
temp6 = pd.DataFrame(TN, columns = ['TN'])
scores = pd.concat([temp1, temp2, temp3, temp4, temp5, temp6], axis = 1)
scores

Unnamed: 0,threshold,score,TP,FP,FN,TN
0,0.0,0.159791,356,10,12536,1193
1,0.02,0.176947,2391,67,10501,1136
2,0.04,0.202165,5460,232,7432,971
3,0.06,0.228753,8249,448,4643,755
4,0.08,0.252157,10144,633,2748,570
5,0.1,0.263514,11268,774,1624,429
6,0.12,0.244576,11870,893,1022,310
7,0.14,0.216659,12243,978,649,225
8,0.16,0.191111,12467,1031,425,172
9,0.18,0.16289,12595,1070,297,133


In [23]:
scores = []
for threshold in range(100) :
    threshold = threshold / 100
    pred = cat_val
    pred = np.where(pred >= threshold, 1, 0)
    score = f1_score(teacher_y, pred)
    scores.append(score)

temp1 = pd.DataFrame(np.linspace(0, 0.99, 100), columns = ['threshold'])
temp2 = pd.DataFrame(scores, columns = ['score'])
scores = pd.concat([temp1, temp2], axis = 1)
scores.loc[: 50, :]

Unnamed: 0,threshold,score
0,0.0,0.159791
1,0.01,0.165997
2,0.02,0.176947
3,0.03,0.189615
4,0.04,0.202165
5,0.05,0.214661
6,0.06,0.228753
7,0.07,0.242614
8,0.08,0.252157
9,0.09,0.261514


In [24]:
scores.loc[scores['score'] == scores['score'].max(), :]

Unnamed: 0,threshold,score
10,0.1,0.263514


In [None]:
from collections import Counter
answer = np.zeros(cat_test.shape[0])

for i in range(cat_test.shape[0]) :
  if cat_test[i] >= 0.10 :
    answer[i] = 1
    
answer = answer.astype('int64')
print(Counter(answer))

Counter({0: 5191, 1: 850})


In [None]:
submission_preds = answer
submission = pd.read_csv(dpath+'sample_submission.csv')
submission['Y_LABEL'] = submission_preds
submission.to_csv(dpath + 'submission/KD_with_core_opt_submission.csv', index = False)

In [25]:
submission = pd.read_csv(dpath+'sample_submission.csv')
submission['Y_LABEL'] = cat_test
submission.to_csv(dpath + 'for_ensemble/KD_with_core_opt_submission.csv', index = False)