In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install interpret

In [None]:
pip install optuna

In [30]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import interpret 
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.data import Marginal


In [5]:
dpath = 'drive/MyDrive/Seculayer/OSC/data/'

train = pd.read_csv(dpath+'train.csv')
test = pd.read_csv(dpath+'test.csv')

In [11]:
X = train.dropna(axis=1)
X = X.drop(['ID'],axis=1)
le1 = LabelEncoder()
le2 = LabelEncoder()
X['COMPONENT_ARBITRARY_cat'] = le1.fit_transform(X['COMPONENT_ARBITRARY'])
X['YEAR_cat'] = le2.fit_transform(X['YEAR'])
X.drop(['YEAR','COMPONENT_ARBITRARY'],axis=1,inplace=True)

y = X['Y_LABEL']
X.drop('Y_LABEL',axis=1,inplace=True)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.3,stratify=y)

In [None]:
from interpret import show

marginal = Marginal().explain_data(X_train, y_train, name = 'Train Data')
show(marginal)

In [37]:
def objective(trial : Trial) -> float :

    params_cat = {
        "random_state" : 39,
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 1),
        "max_leaves" : trial.suggest_int("max_depth", 2, 16),
        "binning": trial.suggest_categorical("binning",["uniform","quantile","rounded_quantile"]),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds",50,500)
  }
    
    model = ExplainableBoostingClassifier(**params_cat)
    model.fit(X_train, y_train)

    cat_pred = model.predict(X_val)
    F1 = f1_score(y_val, cat_pred)
    
    return F1

In [38]:
# Optuna

sampler = TPESampler(seed = 2023)
study = optuna.create_study(
    study_name = "ebm_parameter_opt",
    direction = "maximize",
    sampler = sampler)
study.optimize(objective, n_trials = 100)

[32m[I 2023-02-07 05:45:06,890][0m A new study created in memory with name: ebm_parameter_opt[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.

[32m[I 2023-02-07 05:45:26,599][0m Trial 0 finished with value: 0.5736738703339883 and parameters: {'learning_rate': 0.009246234676674875, 'max_depth': 15, 'binning': 'uniform', 'early_stopping_rounds': 261}. Best is trial 0 with value: 0.5736738703339883.[0m

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use :func:`~optuna.trial.Trial.suggest_float` instead.

[32m[I 2023-02-07 05:46:48,378][0m Trial 1 finished with value: 0.6484517304189434 and parameters: {'learning_rate': 0.0011648472664387375, 'max_depth': 12, 'binning': 'quantile', 'early_stopping_rounds': 276}. Best is trial 

KeyboardInterrupt: ignored

In [40]:
ebm_param = {'learning_rate': 0.5537927528727201, 'max_leaves': 5, 'binning': 'rounded_quantile', 'early_stopping_rounds': 103}

ebm = ExplainableBoostingClassifier(**ebm_param,random_state=2023)
ebm.fit(X_train, y_train)


# Global Explanations: What the model learned overall
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [41]:
f1_score(y_val,ebm.predict(X_val))

0.667870036101083

In [69]:
test.columns

Index(['ID', 'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'ANONYMOUS_2', 'AG',
       'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V',
       'V40', 'ZN'],
      dtype='object')

AL, CA가 중요한 Feature임을 확인

In [68]:
importances = pd.DataFrame()
importances['feature'] = ebm_global.data()['names']
importances['score'] = ebm_global.data()['scores']
importances.sort_values('score',ascending=False)

Unnamed: 0,feature,score
4,AL,0.509173
39,AL & CA,0.328453
35,ANONYMOUS_1 & YEAR_cat,0.304052
8,CA,0.253538
42,CA & YEAR_cat,0.217952
41,B & YEAR_cat,0.213317
0,ANONYMOUS_1,0.196672
33,ANONYMOUS_1 & AL,0.183733
6,BA,0.172815
38,AL & BA,0.150174
