### autogluon, scikit-learn 설치

In [None]:
# Install AutoGluon; the notebook imports `autogluon.tabular` from it.
$ pip install autogluon

In [10]:
# Install scikit-learn; the notebook imports `sklearn.model_selection` and `sklearn.metrics` from it.
$ pip install scikit-learn



### AutoGluon (auto-ml) 코드 작성

In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import confusion_matrix
from typing import Any

def learn(data: pd.DataFrame, y_label: str, hyperparameters: dict = None, eval_metric: str = None) -> tuple[Any, Any]:
  """Train an AutoGluon tabular predictor on `data` and print an evaluation report.

  The frame is split 80/20 into train and test sets (stratified on the label,
  `random_state=0`), a `TabularPredictor` is fitted on the train split, and the
  leaderboard, model names, test-set evaluation, predicted class probabilities
  and feature importances are printed to stdout.

  Args:
    data: Full dataset, including the label column.
    y_label: Name of the label (target) column in `data`.
    hyperparameters: Optional AutoGluon model configuration forwarded to
      `TabularPredictor.fit`. `None` (the default) lets AutoGluon use its
      default model set.
    eval_metric: Optional metric name forwarded to `TabularPredictor`.
      `None` lets AutoGluon choose the metric.

  Returns:
    A `(predictor, test)` pair: the fitted `TabularPredictor` and the held-out
    test split as a `TabularDataset` (label column included).
  """
  # Build the train / test splits. Stratifying on the label keeps the class
  # ratio identical in both splits.
  y = data[y_label]
  X = data.drop(y_label, axis=1)
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=0)
  # AutoGluon expects features and label in a single frame, so re-attach the label.
  train_set = pd.concat([X_train, y_train], axis="columns")
  test_set = pd.concat([X_test, y_test], axis="columns")
  train = TabularDataset(train_set)
  test = TabularDataset(test_set)

  # Fit. `hyperparameters` must be forwarded here, otherwise a caller-supplied
  # model configuration is silently ignored and the default models are trained.
  predictor = TabularPredictor(label=y_label, eval_metric=eval_metric).fit(
    train, hyperparameters=hyperparameters, verbosity=0
  )  # optionally: presets="best_quality"
  # Metric names noted by the original author as usable for `eval_metric`:
  #   'accuracy', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_weighted',
  #   'roc_auc', 'roc_auc_ovo_macro', 'average_precision', 'precision', 'precision_macro',
  #   'precision_micro', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro',
  #   'recall_weighted', 'log_loss', 'pac_score'

  # Leaderboard computed on the *training* split: the `score_test` column of
  # this table is therefore the score on data the models were fitted on.
  ld_board = predictor.leaderboard(train, silent=True)
  print(f"\n\n<<<<<< 모델 훈련 성능 평가 >>>>>>:\n{ld_board}")

  # Names of the models that were trained.
  # NOTE(review): newer AutoGluon releases appear to deprecate `get_model_names()`
  # in favour of `model_names()` — confirm against the installed version.
  print(f"\n\n<<<<<< 유효한 모델 >>>>>>:\n{predictor.get_model_names()}")

  # Evaluate on the held-out test split.
  results = predictor.evaluate(test)
  print(f"\n\n<<<<<< 결과 평가 >>>>>>:\n{results}")

  # Predicted class probabilities for the test split.
  y_proba = predictor.predict_proba(test)
  print(f"\n\n<<<<<< 클래스 예측 확률(predict_proba) >>>>>>:\n{y_proba}")

  # Feature importance, computed separately on the train and test splits.
  print(f"\n\n<<<<<< Features importance >>>>>>:\n")
  feature_importance_train = predictor.feature_importance(train)
  print(f"\ntrain data:\n{feature_importance_train['importance']}")
  feature_importance_test = predictor.feature_importance(test)
  print(f"\ntest data:\n{feature_importance_test['importance']}")

  return predictor, test


### 1. 기본 Data set

In [123]:
import pandas as pd

# Experiment 1: baseline run on the standardized base dataset.
# The data directory is given relative to the notebook's working directory.
DATA_DIR = "../../data/"
# NOTE: read_pickle deserializes a pickle file — only load files from a trusted source.
data = pd.read_pickle(DATA_DIR + "base_dataset_standardized.pkl")
# Train with learn()'s defaults; "Attrition_Flag" is the label column.
predictor, test = learn(data=data, y_label="Attrition_Flag")

No path specified. Models will be saved in: "AutogluonModels/ag-20231031_100906/"




<<<<<< 모델 훈련 성능 평가 >>>>>>:
                  model  score_test  score_val  pred_time_test  pred_time_val  \
0         LightGBMLarge    0.997037   0.970407        1.272073       0.205945   
1            LightGBMXT    0.996667   0.980271        0.305037       0.028628   
2      RandomForestGini    0.996667   0.966708        0.414063       0.108564   
3      RandomForestEntr    0.996667   0.966708        0.569792       0.145321   
4               XGBoost    0.996420   0.972873        0.136970       0.020514   
5   WeightedEnsemble_L2    0.996050   0.982737        0.398439       0.049806   
6              CatBoost    0.993828   0.979038        0.022869       0.028585   
7        ExtraTreesEntr    0.993581   0.935882        0.606350       0.211877   
8        NeuralNetTorch    0.993087   0.959309        0.084328       0.029915   
9        ExtraTreesGini    0.993087   0.930949        0.771097       0.182830   
10      NeuralNetFastAI    0.992717   0.959309        0.271406       0.035336   

### 2. income 처리 데이터 셋

In [116]:
import pandas as pd

# Experiment 2: same pipeline on the "v2" dataset, which (per the section
# heading) has the income feature processed.
DATA_DIR = "../../data/"
# NOTE: read_pickle deserializes a pickle file — only load files from a trusted source.
data = pd.read_pickle(DATA_DIR + "v2_dataset_of_income_standardized.pkl")
# Train with learn()'s defaults; "Attrition_Flag" is the label column.
predictor, test = learn(data=data, y_label="Attrition_Flag")

No path specified. Models will be saved in: "AutogluonModels/ag-20231031_094411/"




<<<<<< 모델 훈련 성능 평가 >>>>>>:
                  model  score_test  score_val  pred_time_test  pred_time_val  \
0              LightGBM    0.997778   0.977805        0.566051       0.092062   
1            LightGBMXT    0.997161   0.977805        0.285928       0.083639   
2         LightGBMLarge    0.997037   0.970407        0.130469       0.018707   
3      RandomForestEntr    0.996791   0.967941        0.284635       0.099253   
4      RandomForestGini    0.996544   0.965475        0.284967       0.108710   
5              CatBoost    0.996420   0.980271        0.011359       0.004599   
6   WeightedEnsemble_L2    0.996173   0.981504        0.049985       0.014940   
7        ExtraTreesEntr    0.993458   0.934649        0.361577       0.146768   
8        ExtraTreesGini    0.993087   0.930949        0.370378       0.153493   
9        KNeighborsDist    0.991483   0.914920        0.244165       0.370733   
10       NeuralNetTorch    0.989137   0.954377        0.049387       0.027827   

### 3. 하이퍼파라미터 조정

In [124]:
import pandas as pd

# Experiment 3: base dataset again, this time with an explicit model configuration.
DATA_DIR = "../../data/"
# NOTE: read_pickle deserializes a pickle file — only load files from a trusted source.
data = pd.read_pickle(DATA_DIR + "base_dataset_standardized.pkl")

# Per-model-family configuration for AutoGluon. Each key is a model family;
# a list value requests one model per entry, and `ag_args.name_suffix` is
# appended to the model name so the variants can be told apart.
# NOTE(review): this looks like it mirrors AutoGluon's built-in default model
# set — confirm; if so, results are expected to match the default run.
hyperparameters = {
  "NN_TORCH": {},
  # Three GBM variants: extra-trees ("XT" suffix), default settings, and the "GBMLarge" entry.
  "GBM": [{"extra_trees": True, "ag_args": {"name_suffix": "XT"}}, {}, "GBMLarge"],
  "CAT": {},
  "XGB": {},
  "FASTAI": {},
  # RF: gini / entropy variants are limited to binary & multiclass problems;
  # the squared_error variant is limited to regression via `problem_types`.
  "RF": [
      {
        'criterion': 'gini',
        'ag_args': {
          'name_suffix': 'Gini',
          'problem_types': ['binary', 'multiclass']
        }
      },
      {
        'criterion': 'entropy',
        'ag_args': {
          'name_suffix': 'Entr',
          'problem_types': ['binary', 'multiclass']
        }
      },
      {
        'criterion': 'squared_error',
        'ag_args': {
          'name_suffix': 'MSE',
          'problem_types': ['regression']
        }
      }
    ],
  # XT: same three variants as RF above.
  "XT": [
    {
      'criterion': 'gini',
      'ag_args': {
        'name_suffix': 'Gini',
        'problem_types': ['binary', 'multiclass']
      }
    },
    {
      'criterion': 'entropy',
      'ag_args': {
        'name_suffix': 'Entr',
        'problem_types': ['binary', 'multiclass']
      }
    },
    {
      'criterion': 'squared_error',
      'ag_args': {
        'name_suffix': 'MSE',
        'problem_types': ['regression']
      }
    }
  ],
  # KNN: uniform-weighted and distance-weighted variants.
  "KNN": [
    {
      'weights': 'uniform',
      'ag_args': {
        'name_suffix': 'Unif'
      }
    },
    {
      'weights': 'distance',
      'ag_args': {
        'name_suffix': 'Dist'
      }
    }
  ]
}

# NOTE(review): verify that learn() forwards `hyperparameters` to
# TabularPredictor.fit — otherwise this configuration has no effect.
predictor, test = learn(data=data, y_label="Attrition_Flag", hyperparameters=hyperparameters)

No path specified. Models will be saved in: "AutogluonModels/ag-20231031_102305/"




<<<<<< 모델 훈련 성능 평가 >>>>>>:
                  model  score_test  score_val  pred_time_test  pred_time_val  \
0         LightGBMLarge    0.997037   0.970407        0.859966       0.122742   
1            LightGBMXT    0.996667   0.980271        0.274945       0.031891   
2      RandomForestGini    0.996667   0.966708        0.309961       0.125122   
3      RandomForestEntr    0.996667   0.966708        0.331422       0.120650   
4               XGBoost    0.996420   0.972873        0.096519       0.025472   
5   WeightedEnsemble_L2    0.996050   0.982737        0.344860       0.045206   
6              CatBoost    0.993828   0.979038        0.010763       0.010561   
7        ExtraTreesEntr    0.993581   0.935882        0.427711       0.202142   
8        NeuralNetTorch    0.993087   0.959309        0.055780       0.030151   
9        ExtraTreesGini    0.993087   0.930949        0.438573       0.208906   
10      NeuralNetFastAI    0.992717   0.959309        0.154189       0.029301   