In [7]:
import pandas as pd
import seaborn as sns
import missingno as msno
import math
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, QuantileTransformer
from scipy.stats import gaussian_kde
import scipy.stats as stats
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from sklearn.model_selection import ParameterGrid, StratifiedKFold
from sklearn.metrics import f1_score

import torch
from pytorch_tabnet.tab_model import TabNetClassifier

In [20]:
import warnings

# Silence the "Device used : cpu" notice.
warnings.filterwarnings("ignore", message="Device used : cpu")
# Silence the "Best weights from best epoch are automatically used!" notice.
warnings.filterwarnings("ignore", message="Best weights from best epoch are automatically used!")


dataset_link = [
    "../dataset/v2_seperate/v2_original.csv"
]

# Hyperparameter grid (adjust the values as needed). It does not depend on the
# dataset or the seed, so it is defined once instead of on every iteration.
param_grid = {
    "n_d": [8, 16],
    "n_a": [8, 16],
    "n_steps": [3, 5],
    "gamma": [1.3, 1.5],
    "lambda_sparse": [1e-3, 1e-4],
    "optimizer_params": [ {"lr": 2e-2}, {"lr": 1e-2} ]
}

# Training settings shared by the cross-validation fits and the final fit.
fit_kwargs = dict(
    eval_name=['valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)


def build_tabnet(params):
    """Create an untrained TabNetClassifier from one hyperparameter combination.

    Parameters
    ----------
    params : dict
        One entry of ``ParameterGrid(param_grid)``; must provide the keys
        ``n_d``, ``n_a``, ``n_steps``, ``gamma``, ``lambda_sparse`` and
        ``optimizer_params``.

    Returns
    -------
    TabNetClassifier
        A new classifier using Adam and sparsemax masks.
    """
    return TabNetClassifier(
        n_d=params["n_d"],
        n_a=params["n_a"],
        n_steps=params["n_steps"],
        gamma=params["gamma"],
        lambda_sparse=params["lambda_sparse"],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=params["optimizer_params"],
        mask_type='sparsemax'
    )


for link in dataset_link:
    df = pd.read_csv(link)

    # Separate the target from the features without mutating the frame in place.
    y = df['diagnosis_type']
    df = df.drop(columns=['diagnosis_type'])

    # NOTE(review): `x` is not used anywhere in this cell; it is kept only in
    # case a later cell relies on it -- confirm and remove if not.
    x = df.to_numpy()

    seed_li = [1557, 2356, 88488, 616, 821]
    seed_scores = []  # one held-out test F1 per seed

    #### TabNet ######################################
    ##################################################
    ##################################################

    for seed in seed_li:

        X_train, X_test, y_train, y_test = train_test_split(
            df, y, test_size=0.3, random_state=seed, stratify=y
        )

        # Missing values are replaced by 0 before the data is handed to TabNet.
        X_train = X_train.fillna(0)
        X_test = X_test.fillna(0)

        X_train_np = X_train.values
        y_train_np = y_train.values
        X_test_np  = X_test.values
        y_test_np  = y_test.values

        # Start below any attainable F1 so the first combination is always
        # accepted; starting at 0 left best_params as None whenever every
        # cross-validated score was exactly 0.
        best_score = float("-inf")
        best_params = None

        # Stratified 3-fold cross-validation on the training split.
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

        # Evaluate every hyperparameter combination in the grid.
        for params in ParameterGrid(param_grid):
            scores = []

            for train_index, val_index in skf.split(X_train_np, y_train_np):
                X_train_cv, X_val_cv = X_train_np[train_index], X_train_np[val_index]
                y_train_cv, y_val_cv = y_train_np[train_index], y_train_np[val_index]

                clf = build_tabnet(params)

                # Train with early stopping monitored on the validation fold.
                clf.fit(
                    X_train_cv, y_train_cv,
                    eval_set=[(X_val_cv, y_val_cv)],
                    **fit_kwargs
                )

                preds = clf.predict(X_val_cv)
                score = f1_score(y_val_cv, preds, average='weighted')
                scores.append(score)

            avg_score = np.mean(scores)

            if avg_score > best_score:
                best_score = avg_score
                best_params = params

        # Hold out part of the TRAINING split for early stopping. The test
        # split must not be passed as eval_set: early stopping and best-weight
        # selection would then be tuned on the very data the final F1 is
        # reported on, which inflates the score.
        class_counts = np.unique(y_train_np, return_counts=True)[1]
        X_fit_np, X_stop_np, y_fit_np, y_stop_np = train_test_split(
            X_train_np, y_train_np,
            test_size=0.2,
            random_state=seed,
            # Stratification needs at least two samples per class; fall back
            # to a plain random split otherwise.
            stratify=y_train_np if class_counts.min() >= 2 else None
        )

        best_clf = build_tabnet(best_params)

        best_clf.fit(
            X_fit_np, y_fit_np,
            eval_set=[(X_stop_np, y_stop_np)],
            **fit_kwargs
        )

        # The test split is touched exactly once, for the final score.
        preds = best_clf.predict(X_test_np)
        test_f1 = f1_score(y_test_np, preds, average='weighted')

        seed_scores.append(test_f1)

    # Average over however many seeds were actually run.
    result = sum(seed_scores) / len(seed_scores)
    print("****************************************************************")
    print(link, result)
    print("****************************************************************")

epoch 0  | loss: 1.0505  | valid_accuracy: 0.375   |  0:00:00s
epoch 1  | loss: 1.34179 | valid_accuracy: 0.41667 |  0:00:00s
epoch 2  | loss: 1.17926 | valid_accuracy: 0.54167 |  0:00:00s
epoch 3  | loss: 1.08057 | valid_accuracy: 0.54167 |  0:00:00s
epoch 4  | loss: 1.14925 | valid_accuracy: 0.29167 |  0:00:00s
epoch 5  | loss: 0.97042 | valid_accuracy: 0.41667 |  0:00:00s
epoch 6  | loss: 1.05943 | valid_accuracy: 0.375   |  0:00:00s
epoch 7  | loss: 0.97437 | valid_accuracy: 0.41667 |  0:00:00s
epoch 8  | loss: 0.91637 | valid_accuracy: 0.58333 |  0:00:00s
epoch 9  | loss: 0.87101 | valid_accuracy: 0.5     |  0:00:00s
epoch 10 | loss: 0.8319  | valid_accuracy: 0.5     |  0:00:00s
epoch 11 | loss: 0.82116 | valid_accuracy: 0.41667 |  0:00:00s
epoch 12 | loss: 0.79271 | valid_accuracy: 0.45833 |  0:00:00s
epoch 13 | loss: 0.77135 | valid_accuracy: 0.45833 |  0:00:00s
epoch 14 | loss: 0.66238 | valid_accuracy: 0.5     |  0:00:00s
epoch 15 | loss: 0.58586 | valid_accuracy: 0.5     |  0