<a href="https://colab.research.google.com/github/shedbarshemoth/ML-4-AI/blob/main/ver1_0_490_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### 패키지 임포트

In [1]:
#!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl
!pip install pytorch-tabnet==4.1.0 #위의 경로에 pytorch 파일이 없음, !pip install로 다운로드

Collecting pytorch-tabnet==4.1.0
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [2]:
#colab의 경우 해당 코드 돌려야 함
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#colab의 경우 해당 코드 돌려야 함
!pip install pytorch_tabnet
!pip install colorama
!pip install catboost

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6
Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
#패키지 임포트
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone, BaseEstimator, RegressorMixin
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

SEED = 42 #random seed 설정
n_splits = 5 #k fold cross valdation에서 데이터를 5개의 fold로 나눔

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



#### 데이터 임포트 및 코드 def

In [5]:
import random
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(2024) #seed 2024로 고정

In [6]:
def process_file(filename, dirname):
    #df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet')) #'part-0.parquet 파일명 존재하지 않음
    df = pd.read_parquet(os.path.join(dirname, filename)) #파일명 제거하여 수정
    #df.drop('step', axis=1, inplace=True) #sKeyError: "['step'] not found in axis
    if 'step' in df.columns: #조건부 삭제 코드 추가하여 수정
      df.drop('step', axis = 1, inplace = True)
     #step drop -> dataframe 비어 있는지 확인
    if df.empty:
        print(f"Warning: DataFrame for {filename} is empty after dropping 'step'. Returning NaN values.")
        return np.full(18, np.nan), filename.split('=')[1]  #empty dataframe에 NaN 삽입
    return df.describe().values.reshape(-1), filename.split('=')[1]

In [7]:
#디렉토리에 있는 파일 병렬 처리 -> 데이터 추출 -> 데이터프레임 반환
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    stats, indexes = zip(*results)

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [8]:
#오토인코더 모델 구현 : 인코딩으로 압축 후 디코딩으로 복원하는 비지도 학습 모델
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [9]:
#오토인코더 통해 데이터 차원 축소
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df) #정규화 후 저장

    data_tensor = torch.FloatTensor(df_scaled) #텐서 변환

    input_dim = data_tensor.shape[1] #데이터 열 개수 입력 차원으로 사용
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss() #손실함수
    optimizer = optim.Adam(autoencoder.parameters()) #옵티마이저

    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy() #오토인코더의 인코더 부분 -> 데이터 잠재 공간으로 변환

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded

In [10]:
#새로운 column 추가
def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1)
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']

    return df

In [18]:
#파일 경로 각자 설정해야 함
#train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
#test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
#sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/sample_submission.csv')

#파일 경로 각자 설정해야 함
#train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
#test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

train_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_train.parquet')
test_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_test.parquet')

100%|██████████| 996/996 [02:56<00:00,  5.63it/s]
100%|██████████| 2/2 [00:00<00:00,  4.39it/s]


In [19]:
train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [20]:
train_ts.head()

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,id
0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,269335.0,-0.478973,-0.037643,-0.215956,0.061542,-14.676989,0.0,41.468441,3876.515869,47582890000000.0,4.01578,1.0,46.020077,0.429458,0.518862,0.42268,0.129371,28.84729,0.0,180.024918,121.012596,17271380000000.0,1.998955,0.0,9.290177,-3.29879,-3.262288,-1.134395,0.0,-89.820724,0.0,0.0,3706.0,0.0,1.0,1.0,28.0,-0.81684,-0.353651,-0.545926,0.007793,-34.635059,0.0,1.5,3779.25,34062500000000.0,2.0,1.0,39.0,-0.578881,-0.006695,-0.211011,0.027122,-12.874933,0.0,5.495555,3841.0,47725000000000.0,4.0,1.0,45.0,-0.250647,0.300744,0.050556,0.068639,2.749033,0.0,21.666666,3970.0,61460000000000.0,6.0,1.0,55.0,1.159667,2.525316,1.802745,4.568309,89.673332,0.0,2659.666748,4179.0,86395000000000.0,7.0,1.0,63.0,0d01bbf2
1,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,412332.0,0.047866,0.003234,-0.249981,0.023465,-18.722479,0.216525,68.818016,3841.463379,43239370000000.0,3.809581,2.475539,24.915834,0.523361,0.440953,0.646356,0.052377,49.601421,0.407909,278.388855,165.153732,25006220000000.0,1.971711,0.499402,6.90102,-1.777734,-2.433394,-1.005808,0.0,-89.819664,0.0,0.0,3098.166748,0.0,1.0,2.0,13.0,-0.266012,-0.277724,-0.829161,1.1e-05,-56.70623,0.0,2.738918,3741.0,21475000000000.0,2.0,2.0,19.0,0.009822,0.008072,-0.383322,0.006272,-23.246984,0.0,7.405453,3807.0,43445000000000.0,4.0,2.0,25.0,0.445334,0.26108,0.160221,0.020526,9.357183,0.0,18.088059,3963.333252,64920000000000.0,6.0,3.0,31.0,1.859814,1.518311,1.510279,3.006919,89.322289,1.0,2648.0,4181.0,86395000000000.0,7.0,3.0,37.0,cefdb7fe
2,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,311959.0,-0.080044,0.058017,-0.269036,0.045412,-17.78392,0.0,132.968567,3874.709473,48575770000000.0,3.85157,3.0,48.147717,0.601244,0.59521,0.375176,0.095587,25.640087,0.0,434.704041,133.961487,17918520000000.0,1.98538,0.0,9.506028,-1.962057,-2.844661,-1.02151,0.0,-89.55394,0.0,0.0,3683.0,0.0,1.0,3.0,31.0,-0.605807,-0.445863,-0.536875,0.005877,-33.453754,0.0,2.168796,3771.0,34505000000000.0,2.0,3.0,41.0,-0.19,0.093646,-0.278828,0.02203,-16.665411,0.0,7.046413,3829.0,48365000000000.0,4.0,3.0,48.0,0.472721,0.594089,-0.049271,0.041659,-3.008855,0.0,29.298994,3971.25,63380000000000.0,6.0,3.0,56.0,1.148359,3.186745,2.724948,4.054967,89.521629,0.0,2648.5,4181.0,86395000000000.0,7.0,3.0,67.0,2ca2206f
3,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,384228.0,-0.088861,0.045154,-0.21227,0.004798,-21.601578,0.611183,9.674905,3838.082031,43472240000000.0,4.096849,1.0,36.297169,0.300376,0.371397,0.840793,0.02792,67.862091,0.480341,47.099369,145.080429,24958260000000.0,1.98691,0.0,6.425583,-2.163437,-3.142938,-1.001401,0.0,-89.842148,0.0,0.0,3098.166748,0.0,1.0,1.0,25.0,-0.345387,-0.088445,-0.992394,0.0,-88.35062,0.0,1.677741,3759.0,21830000000000.0,2.0,1.0,31.0,0.005507,0.002966,-0.636735,7.9e-05,-40.234131,1.0,6.103776,3818.0,43660000000000.0,4.0,1.0,36.0,0.027835,0.06728,0.786521,0.003858,52.15822,1.0,10.39494,3941.0,65210000000000.0,6.0,1.0,42.0,1.017271,1.381445,1.041023,4.491224,88.801147,1.0,1157.25,4152.0,86395000000000.0,7.0,1.0,47.0,58391429
4,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,278676.0,0.207518,-0.019419,0.053818,0.060862,1.239281,0.132412,121.166656,3840.667969,43202460000000.0,3.737114,3.0,8.007235,0.562727,0.493573,0.593929,0.177835,44.14262,0.335735,379.683319,167.202866,24844770000000.0,1.999294,0.0,4.674907,-1.204403,-3.650786,-1.004756,0.0,-89.868881,0.0,0.0,3098.166748,0.0,1.0,3.0,0.0,-0.120236,-0.298297,-0.36747,0.000333,-22.438588,0.0,3.777616,3724.0,21770000000000.0,2.0,3.0,4.0,0.282462,-0.00177,0.035146,0.01246,1.622942,0.0,9.893938,3824.0,43235000000000.0,3.0,3.0,8.0,0.669191,0.29312,0.526787,0.046701,31.213132,0.0,25.905516,3976.833252,64625000000000.0,5.0,3.0,12.0,1.796815,2.623967,3.006071,4.794266,89.624619,1.0,2636.75,4158.0,86395000000000.0,7.0,3.0,16.0,9d6b1410


In [21]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

#시계열 데이터 차원 축소
train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

Epoch [10/100], Loss: 0.6314]
Epoch [20/100], Loss: 0.6314]
Epoch [30/100], Loss: 0.6313]
Epoch [40/100], Loss: 0.4909]
Epoch [50/100], Loss: 0.4570]
Epoch [60/100], Loss: 0.4457]
Epoch [70/100], Loss: 0.4400]
Epoch [80/100], Loss: 0.4399]
Epoch [90/100], Loss: 0.4393]
Epoch [100/100], Loss: 0.4345]
Epoch [10/100], Loss: 0.9973]
Epoch [20/100], Loss: 0.5802]
Epoch [30/100], Loss: 0.4271]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]
Epoch [100/100], Loss: 0.4271]


In [22]:
train_ts_encoded.head()

Unnamed: 0,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,10.46349,0.0,4.400503,0.0,0.0,4.186581,0.0,0.0,0.0,0.0,0.0,4.471686,0.0,8.130355,0.0,1.809909,9.713943,0.0,15.744424,0.0,4.973973,1.497915,1.097611,0.0,8.180731,0.0,0.0,0.0,4.981907,0.0,0.0,1.381495,0.0,0.0,6.392432,0.0,0.0,0.0,4.126699,0.410915,0.0,0.0,0.380895,11.869916,0.0,1.894547,0.0,0.119831,7.520442,0.0,0.0,1.514828,0.0,3.649864,0.0,0.0,0.0,0.0,0.568618,10.520723
1,1.468686,0.0,1.109338,0.0,0.0,1.221228,1.291767,0.0,0.0,0.0,0.0,0.954009,1.180826,1.362354,0.0,0.0,2.734173,0.0,2.298082,0.0,0.602144,1.114163,0.96333,0.0,0.0,0.0,0.869869,0.0,3.226078,0.0,0.0,2.931674,2.247607,2.403712,2.48364,0.0,0.0,0.0,0.729417,0.959202,2.355773,0.0,3.038229,2.265924,0.0,0.0,0.568037,2.862,1.085435,0.0,0.0,2.963107,0.0,2.269294,0.768473,3.057898,0.0,0.0,0.0,0.65819
2,6.997195,0.0,1.033229,0.078371,1.171508,0.0,0.0,0.0,0.0,0.0,0.0,4.992122,3.416955,2.341736,0.0,0.022807,7.578375,0.0,3.107586,0.103344,3.431124,1.467633,1.518127,0.0,2.646826,0.0,2.869344,0.0,1.233231,0.0,0.0,0.0,0.283556,2.493599,5.87021,0.0,0.0,0.0,1.409631,2.652506,0.611295,0.0,6.745518,9.036569,0.0,1.052874,0.0,4.697539,3.199555,0.0,0.0,2.414349,0.276239,2.6212,7.356723,0.26562,0.0,0.0,0.0,5.121244
3,0.839009,5.535186,3.108105,0.0,0.0,9.279172,7.066889,0.0,0.0,0.0,0.0,0.0,0.0,8.472496,0.0,0.0,3.210153,0.625574,4.911438,0.0,3.748278,1.770332,0.0,1.872949,0.0,0.0,0.0,1.71089,8.687901,0.0,0.0,8.519537,6.289674,4.226508,0.616771,0.0,0.0,0.0,1.436654,0.0,5.876308,0.0,0.0,1.916134,0.0,0.0,7.373839,1.184644,1.590376,0.0,0.0,2.357945,0.0,3.225517,0.0,7.431235,0.0,0.0,0.0,0.0
4,2.169195,0.0,0.114022,1.370219,0.0,2.159742,0.0,0.0,0.0,0.0,1.705548,2.291415,1.644797,0.0,0.0,1.056784,2.21768,0.066694,1.149231,0.612412,0.0,0.207822,1.295403,0.0,1.230114,0.0,2.018633,1.158477,0.491336,0.0,0.0,0.0,0.229501,0.0,1.176376,0.0,0.0,0.0,2.302747,1.263108,1.121357,0.0,4.180768,3.568252,0.0,0.871848,0.0,1.651164,0.908166,0.0,1.946582,0.0,0.682209,0.0,3.894411,0.0,0.0,0.0,0.315038,0.844683


In [23]:
#열 이름 리스트로 변경
time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

#데이터 결합
train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

오토인코더의 성능을 올리는 쪽으로?

입력 데이터를 복원하는 성능

In [24]:
train

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.00,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.97400,39.4497,15.4107,27.0552,,,Fall,2.340,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.170,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.430,1923.44,15.59250,62.7757,14.0740,4.22033,18.82430,2.0,30.4041,16.77900,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,6.092597,0.000000,2.272971,1.109518,0.0,2.264499,0.000000,0.0,0.0,0.000000,0.000000,2.626785,0.000000,5.511967,0.0,0.0,12.365653,0.000000,6.763050,0.0,6.988305,5.028478,0.00000,0.000000,2.998503,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.794934,3.439999,3.073107,0.0,0.0,0.0,1.089302,4.201143,0.900546,0.0,0.463877,8.943865,0.0,0.275575,0.000000,2.776990,3.997134,0.0,0.000000,2.297152,0.0,1.051462,5.661705,0.000000,0.0,0.0,0.000000,4.753823
4,0016bb22,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,,71.0,70.0,104.0,,,,,Fall,16.0,0.0,18.0,1.0,19.9,2.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,Fall,3.0,4.52277,16.3642,1206.880,2051.70,19.46110,70.8117,14.0629,2.30138,11.58830,1.0,33.3709,17.97970,66.2889,29.7790,52.8320,,,Winter,3.260,Winter,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,27.0,60.0,78.0,118.0,,,,,Spring,0.0,0.0,,,,,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,Spring,,,,,,,,,,,,,,,,,,,Winter,2.340,,,,,,,,,,,,,,,,,,,,,,,,,,Winter,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,,79.0,99.0,116.0,,,,,Winter,15.0,1.0,18.5,2.0,15.8,2.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,Winter,2.0,4.41305,21.4438,1253.740,2005.99,20.48250,75.8033,14.8043,6.63952,33.99670,2.0,33.9805,21.34030,71.3903,28.7792,54.4630,,,Winter,2.729,Winter,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0,0.000000,0.000000,5.276361,0.000000,0.0,1.407226,1.742838,0.0,0.0,0.000000,1.192059,1.519034,2.801047,0.444616,0.0,0.0,1.905474,4.253047,7.909111,0.0,1.850417,0.000000,1.67693,0.000000,1.230853,0.0,2.631329,0.657377,3.319725,0.0,0.0,8.219189,3.281114,0.000000,3.470974,0.0,0.0,0.0,0.000000,0.000000,1.144167,0.0,9.881141,0.000000,0.0,0.000000,0.000000,5.066833,0.023382,0.0,2.422389,2.917506,0.0,2.626473,3.949580,7.145576,0.0,0.0,0.000000,1.579676
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,,59.0,61.0,113.0,,,,,Spring,,,,,,,,,,,,,,,Summer,4.0,6.66168,12.2372,1414.340,2970.12,26.53230,92.9092,13.0684,-0.83117,-5.90917,2.0,41.3715,25.00540,86.2475,45.4340,67.9038,,,Spring,3.300,Spring,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0,0.000000,10.574584,0.000000,2.257841,0.0,7.146640,2.655945,0.0,0.0,0.387349,0.000000,0.000000,0.000000,6.584021,0.0,0.0,0.523901,6.135649,3.894987,0.0,0.000000,2.379207,0.00000,3.507591,0.000000,0.0,0.000000,0.000000,4.336707,0.0,0.0,1.335013,8.694819,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.773384,0.0,3.421620,0.000000,0.0,0.000000,6.374635,0.000000,2.222562,0.0,2.251566,0.000000,0.0,2.610091,1.550163,2.309440,0.0,0.0,0.402866,0.000000


In [25]:
imputer = KNNImputer(n_neighbors=5) #결측값 채우기
numeric_cols = train.select_dtypes(include=['int32', 'int64', 'float64', 'int64']).columns #숫자형 열만 선택
imputed_data = imputer.fit_transform(train[numeric_cols]) #위에서 선택된 열로 KNN
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col] #숫자 열 아닌 거 복원

train = train_imputed

train = feature_engineering(train)
train = train.dropna(thresh=10, axis=0) #한 행에 결측치 10개 이상이면 삭제
test = feature_engineering(test)

In [26]:
train.drop('id', axis=1)
train

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,id,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,5.0,0.0,51.0,16.877316,46.00,50.8,23.0,61.2,86.4,110.6,4.0,5.8,27.0,0.0,0.0,17.56,1.8,16.18,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.668550,16.87920,932.4980,1492.000,8.255980,41.58620,13.81770,3.061430,9.213770,1.0,24.43490,8.895360,38.91770,19.54130,32.69090,1.9120,2.2220,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,48.4,62.2,3.0,2,00008ff9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.944840,18.356260,29.370079,0.424811,6.383063,0.643522,0.747453
1,9.0,0.0,70.0,14.035590,48.00,46.0,22.0,75.0,70.0,122.0,4.6,6.6,24.2,3.0,0.0,16.04,1.6,15.50,1.6,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.579490,14.03710,936.6560,1498.650,6.019930,42.02910,12.82540,1.211720,3.970850,1.0,21.03520,14.974000,39.44970,15.41070,27.05520,2.6260,2.3400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0,000fd460,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,126.320313,0.0,0.000000,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,10.0,1.0,71.0,16.648696,56.50,75.6,24.8,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.20,1.0,14.70,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.6,3.431454,19.10500,1106.4030,1889.264,17.199762,60.10940,14.83936,4.265620,17.650582,2.6,28.81348,14.096188,56.67794,27.61536,46.01322,2.0938,2.1700,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0,00105258,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,166.486961,20.0,33.297392,0.923872,0.840729,0.241670,1.231775,19528.656877,33346.609152,14.634960,24.990265,0.488767,6.473938,0.608640,0.626200
3,9.0,0.0,71.0,18.292347,56.00,81.6,25.4,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,14.50,1.6,16.92,2.2,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.841910,18.29430,1131.4300,1923.440,15.592500,62.77570,14.07400,4.220330,18.824300,2.0,30.40410,16.779000,58.93380,26.47980,45.99660,1.7980,2.4510,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1,00115b9f,6.092597,0.000000,2.272971,1.109518,0.0,2.264499,0.000000,0.0,0.0,0.000000,0.000000,2.626785,0.000000,5.511967,0.0,0.0,12.365653,0.000000,6.763050,0.0,6.988305,5.028478,0.00000,0.000000,2.998503,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.794934,3.439999,3.073107,0.0,0.0,0.0,1.089302,4.201143,0.900546,0.0,0.463877,8.943865,0.0,0.275575,0.000000,2.776990,3.997134,0.0,0.000000,2.297152,0.0,1.051462,5.661705,0.000000,0.0,0.0,0.000000,4.753823,164.631122,0.0,0.000000,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,18.0,1.0,69.4,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,4.4,8.4,17.8,12.8,0.2,28.48,2.0,28.80,2.0,1.4,0.0,10.1,0.6,9.5,0.6,10.7,0.8,2.4,4.382366,26.06698,1394.9880,2144.724,29.722340,90.84782,16.01834,10.048682,56.672180,2.4,35.37708,25.748480,86.46560,47.54038,65.09940,1.0400,2.0724,2.0,2.2,2.2,2.0,2.8,1.8,1.6,2.0,2.4,1.0,3.4,0.8,2.2,2.6,2.0,2.6,2.0,1.6,1.6,1.4,40.0,42.0,58.8,2.6,1,0016bb22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,480.845494,46.8,69.455460,2.174098,0.282649,0.177312,1.328209,79057.011034,121546.184578,11.159904,17.157792,0.741197,4.731007,0.520795,0.543432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,13.0,0.0,60.0,16.362460,59.50,82.4,25.0,71.0,70.0,104.0,4.8,7.2,23.8,16.0,0.0,18.00,1.0,19.90,2.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,3.0,4.522770,16.36420,1206.8800,2051.700,19.461100,70.81170,14.06290,2.301380,11.588300,1.0,33.37090,17.979700,66.28890,29.77900,52.83200,2.7338,3.2600,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,32.0,35.0,50.0,1.0,1,ff8a2de4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,212.711984,13.0,16.362460,0.708149,1.213543,0.198595,1.254711,13985.687504,23775.715110,14.646602,24.899272,0.500487,12.939628,0.641165,0.631642
3956,10.0,0.0,58.6,18.764678,53.50,76.4,27.0,60.0,78.0,118.0,4.4,6.2,23.6,0.0,0.0,17.20,1.8,16.52,1.4,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,3.0,2.940418,18.27962,1010.4646,1785.136,12.685996,49.89076,14.06412,4.215510,15.749254,2.4,25.20942,11.995308,46.95032,21.39602,37.89540,2.2040,2.3400,1.8,2.2,3.0,0.6,2.2,1.0,0.2,1.0,0.4,1.6,1.2,0.2,0.4,0.4,1.4,0.8,2.0,0.8,0.8,0.4,22.4,38.6,54.8,0.0,0,ffa9794a,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,187.646781,0.0,0.000000,0.861574,0.893002,0.267664,1.238945,15914.063643,28114.560289,13.225976,23.365654,0.399926,5.075547,0.496013,0.665237
3957,11.0,0.0,68.0,21.441500,60.00,109.8,28.6,79.0,99.0,116.0,4.6,6.2,32.4,15.0,1.0,18.50,2.0,15.80,2.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,2.0,4.413050,21.44380,1253.7400,2005.990,20.482500,75.80330,14.80430,6.639520,33.996700,2.0,33.98050,21.340300,71.39030,28.77920,54.46300,2.7338,2.7290,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,31.0,56.0,77.0,0.0,1,ffcd4dbd,0.000000,0.000000,5.276361,0.000000,0.0,1.407226,1.742838,0.0,0.0,0.000000,1.192059,1.519034,2.801047,0.444616,0.0,0.0,1.905474,4.253047,7.909111,0.0,1.850417,0.000000,1.67693,0.000000,1.230853,0.0,2.631329,0.657377,3.319725,0.0,0.0,8.219189,3.281114,0.000000,3.470974,0.0,0.0,0.0,0.000000,0.000000,1.144167,0.0,9.881141,0.000000,0.0,0.000000,0.000000,5.066833,0.023382,0.0,2.422389,2.917506,0.0,2.626473,3.949580,7.145576,0.0,0.0,0.000000,1.579676,235.856500,0.0,0.000000,1.585386,0.435463,0.195299,1.310804,42623.022658,68197.040233,11.418397,18.269490,0.479653,4.334530,0.496020,0.623919
3958,13.0,0.0,70.0,12.235895,70.70,87.0,27.6,59.0,61.0,113.0,3.8,4.6,25.0,19.0,0.6,23.18,2.0,24.90,2.2,3.8,0.4,9.8,0.4,10.3,0.8,11.6,1.0,4.0,6.661680,12.23720,1414.3400,2970.120,26.532300,92.90920,13.06840,-0.831170,-5.909170,2.0,41.37150,25.005400,86.24750,45.43400,67.90380,2.7338,3.3000,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,19.0,33.0,47.0,1.0,0,ffed1dd5,0.000000,10.574584,0.000000,2.257841,0.0,7.146640,2.655945,0.0,0.0,0.387349,0.000000,0.000000,0.000000,6.584021,0.0,0.0,0.523901,6.135649,3.894987,0.0,0.000000,2.379207,0.00000,3.507591,0.000000,0.0,0.000000,0.000000,4.336707,0.0,0.0,1.335013,8.694819,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.773384,0.0,3.421620,0.000000,0.0,0.000000,6.374635,0.000000,2.222562,0.0,2.251566,0.000000,0.0,2.610091,1.550163,2.309440,0.0,0.0,0.402866,0.000000,159.066638,13.0,12.235895,-0.482886,-2.211546,0.140658,1.270142,-8357.575498,-17550.944000,16.256782,34.139310,0.642631,-54.662704,0.780503,0.609266


In [35]:
test.drop('id', axis=1)
test

Unnamed: 0,id,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW
0,00008ff9,5,0,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453
1,000fd460,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492
2,00105258,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,166.486961,20.0,33.297392,,,,,,,,,,,,
3,00115b9f,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,2.160089,0.0,7.706036,3.293011,0.0,0.0,0.0,9.24861,0.0,0.0,8.699675,0.0,0.0,0.0,11.390141,0.0,0.0,0.0,3.792442,0.0,6.055555,0.0,0.0,7.310084,0.02966,0.0,2.325594,0.0,10.364971,0.0,0.0,0.0,9.426815,13.8441,0.0,0.0,2.213969,11.190788,0.0,5.166083,0.0,1.452391,8.020641,8.339259,11.755312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.680251,0.0,0.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008
4,0016bb22,18,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,001f3379,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,40.0,56.0,0.0,2.543467,7.007909,0.677505,0.0,0.0,6.224926,3.997221,0.0,6.664527,0.0,0.0,0.0,9.906938,0.0,0.0,10.784865,0.0,4.87749,0.0,0.0,0.0,0.0,8.736187,0.0,8.766219,0.0,7.982283,0.0,0.0,0.0,8.15222,0.0,0.0,0.0,6.082332,0.0,2.942147,0.0,0.0,0.0,0.0,6.656182,0.0,0.852644,0.154119,0.0,0.0,3.341939,7.363288,0.0,0.0,0.0,0.0,0.0,8.228198,6.006187,0.211595,6.557204,0.0,0.321733,289.639376,0.0,0.0,2.251718,0.24551,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399
6,0038ba98,10,0,,19.66076,55.0,84.6,,123.0,83.0,163.0,,,,9.0,1.0,,,,,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,,3.67,27.0,40.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,196.607603,30.0,58.982281,1.085954,0.688428,0.232422,1.259274,24254.01858,38806.51514,13.426241,21.482033,0.476285,5.278294,0.558169,0.654233
7,0068a485,10,1,,16.861286,59.25,84.2,27.0,71.0,90.0,116.0,,,,0.0,0.0,12.6,2.0,11.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,,1.27,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,168.612865,20.0,33.722573,0.963488,0.837623,0.200275,1.265839,19172.581896,30676.066044,14.014727,22.423515,0.485536,8.840947,0.599486,0.565344
8,0069fbed,15,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.0,,,,,,,,,,,,,
9,0083e397,19,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [36]:
len(test.columns) #PCIAT 문항만큼 차이남

124

In [37]:
#test - train data columns 개수 차이 -> 특정 열 선택하여 feature로 사용
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']

featuresCols += time_series_cols #시계열 데이터 추가

train = train[featuresCols] #특정 열만 선택
train = train.dropna(subset='sii') #sii에 결측치 있으면 행 제거

featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW']

featuresCols += time_series_cols
test = test[featuresCols] #테스트 데이터에도 똑같이 적용

In [38]:
train #124 columns

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,5.0,0.0,51.0,16.877316,46.00,50.8,23.0,61.2,86.4,110.6,4.0,5.8,27.0,0.0,0.0,17.56,1.8,16.18,1.4,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.668550,16.87920,932.4980,1492.000,8.255980,41.58620,13.81770,3.061430,9.213770,1.0,24.43490,8.895360,38.91770,19.54130,32.69090,1.9120,2.2220,48.4,62.2,3.0,2,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.944840,18.356260,29.370079,0.424811,6.383063,0.643522,0.747453,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,9.0,0.0,70.0,14.035590,48.00,46.0,22.0,75.0,70.0,122.0,4.6,6.6,24.2,3.0,0.0,16.04,1.6,15.50,1.6,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.579490,14.03710,936.6560,1498.650,6.019930,42.02910,12.82540,1.211720,3.970850,1.0,21.03520,14.974000,39.44970,15.41070,27.05520,2.6260,2.3400,46.0,64.0,0.0,0,126.320313,0.0,0.000000,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10.0,1.0,71.0,16.648696,56.50,75.6,24.8,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.20,1.0,14.70,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2.6,3.431454,19.10500,1106.4030,1889.264,17.199762,60.10940,14.83936,4.265620,17.650582,2.6,28.81348,14.096188,56.67794,27.61536,46.01322,2.0938,2.1700,38.0,54.0,2.0,0,166.486961,20.0,33.297392,0.923872,0.840729,0.241670,1.231775,19528.656877,33346.609152,14.634960,24.990265,0.488767,6.473938,0.608640,0.626200,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,9.0,0.0,71.0,18.292347,56.00,81.6,25.4,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,14.50,1.6,16.92,2.2,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.841910,18.29430,1131.4300,1923.440,15.592500,62.77570,14.07400,4.220330,18.824300,2.0,30.40410,16.779000,58.93380,26.47980,45.99660,1.7980,2.4510,31.0,45.0,0.0,1,164.631122,0.0,0.000000,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,6.092597,0.000000,2.272971,1.109518,0.0,2.264499,0.000000,0.0,0.0,0.000000,0.000000,2.626785,0.000000,5.511967,0.0,0.0,12.365653,0.000000,6.763050,0.0,6.988305,5.028478,0.00000,0.000000,2.998503,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.794934,3.439999,3.073107,0.0,0.0,0.0,1.089302,4.201143,0.900546,0.0,0.463877,8.943865,0.0,0.275575,0.000000,2.776990,3.997134,0.0,0.000000,2.297152,0.0,1.051462,5.661705,0.000000,0.0,0.0,0.000000,4.753823
4,18.0,1.0,69.4,26.713639,64.14,125.0,33.6,70.6,74.2,125.8,4.4,8.4,17.8,12.8,0.2,28.48,2.0,28.80,2.0,1.4,0.0,10.1,0.6,9.5,0.6,10.7,0.8,2.4,4.382366,26.06698,1394.9880,2144.724,29.722340,90.84782,16.01834,10.048682,56.672180,2.4,35.37708,25.748480,86.46560,47.54038,65.09940,1.0400,2.0724,42.0,58.8,2.6,1,480.845494,46.8,69.455460,2.174098,0.282649,0.177312,1.328209,79057.011034,121546.184578,11.159904,17.157792,0.741197,4.731007,0.520795,0.543432,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,13.0,0.0,60.0,16.362460,59.50,82.4,25.0,71.0,70.0,104.0,4.8,7.2,23.8,16.0,0.0,18.00,1.0,19.90,2.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,3.0,4.522770,16.36420,1206.8800,2051.700,19.461100,70.81170,14.06290,2.301380,11.588300,1.0,33.37090,17.979700,66.28890,29.77900,52.83200,2.7338,3.2600,35.0,50.0,1.0,1,212.711984,13.0,16.362460,0.708149,1.213543,0.198595,1.254711,13985.687504,23775.715110,14.646602,24.899272,0.500487,12.939628,0.641165,0.631642,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3956,10.0,0.0,58.6,18.764678,53.50,76.4,27.0,60.0,78.0,118.0,4.4,6.2,23.6,0.0,0.0,17.20,1.8,16.52,1.4,4.0,0.0,0.0,0.0,0.0,0.0,12.0,1.0,3.0,2.940418,18.27962,1010.4646,1785.136,12.685996,49.89076,14.06412,4.215510,15.749254,2.4,25.20942,11.995308,46.95032,21.39602,37.89540,2.2040,2.3400,38.6,54.8,0.0,0,187.646781,0.0,0.000000,0.861574,0.893002,0.267664,1.238945,15914.063643,28114.560289,13.225976,23.365654,0.399926,5.075547,0.496013,0.665237,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3957,11.0,0.0,68.0,21.441500,60.00,109.8,28.6,79.0,99.0,116.0,4.6,6.2,32.4,15.0,1.0,18.50,2.0,15.80,2.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,2.0,4.413050,21.44380,1253.7400,2005.990,20.482500,75.80330,14.80430,6.639520,33.996700,2.0,33.98050,21.340300,71.39030,28.77920,54.46300,2.7338,2.7290,56.0,77.0,0.0,1,235.856500,0.0,0.000000,1.585386,0.435463,0.195299,1.310804,42623.022658,68197.040233,11.418397,18.269490,0.479653,4.334530,0.496020,0.623919,0.000000,0.000000,5.276361,0.000000,0.0,1.407226,1.742838,0.0,0.0,0.000000,1.192059,1.519034,2.801047,0.444616,0.0,0.0,1.905474,4.253047,7.909111,0.0,1.850417,0.000000,1.67693,0.000000,1.230853,0.0,2.631329,0.657377,3.319725,0.0,0.0,8.219189,3.281114,0.000000,3.470974,0.0,0.0,0.0,0.000000,0.000000,1.144167,0.0,9.881141,0.000000,0.0,0.000000,0.000000,5.066833,0.023382,0.0,2.422389,2.917506,0.0,2.626473,3.949580,7.145576,0.0,0.0,0.000000,1.579676
3958,13.0,0.0,70.0,12.235895,70.70,87.0,27.6,59.0,61.0,113.0,3.8,4.6,25.0,19.0,0.6,23.18,2.0,24.90,2.2,3.8,0.4,9.8,0.4,10.3,0.8,11.6,1.0,4.0,6.661680,12.23720,1414.3400,2970.120,26.532300,92.90920,13.06840,-0.831170,-5.909170,2.0,41.37150,25.005400,86.24750,45.43400,67.90380,2.7338,3.3000,33.0,47.0,1.0,0,159.066638,13.0,12.235895,-0.482886,-2.211546,0.140658,1.270142,-8357.575498,-17550.944000,16.256782,34.139310,0.642631,-54.662704,0.780503,0.609266,0.000000,10.574584,0.000000,2.257841,0.0,7.146640,2.655945,0.0,0.0,0.387349,0.000000,0.000000,0.000000,6.584021,0.0,0.0,0.523901,6.135649,3.894987,0.0,0.000000,2.379207,0.00000,3.507591,0.000000,0.0,0.000000,0.000000,4.336707,0.0,0.0,1.335013,8.694819,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.773384,0.0,3.421620,0.000000,0.0,0.000000,6.374635,0.000000,2.222562,0.0,2.251566,0.000000,0.0,2.610091,1.550163,2.309440,0.0,0.0,0.402866,0.000000


In [39]:
test

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,5,0,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.94484,18.35626,29.370079,0.424811,6.383063,0.643522,0.747453,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0,126.320313,0.0,0.0,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.17,38.0,54.0,2.0,166.486961,20.0,33.297392,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,164.631122,0.0,0.0,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,2.160089,0.0,7.706036,3.293011,0.0,0.0,0.0,9.24861,0.0,0.0,8.699675,0.0,0.0,0.0,11.390141,0.0,0.0,0.0,3.792442,0.0,6.055555,0.0,0.0,7.310084,0.02966,0.0,2.325594,0.0,10.364971,0.0,0.0,0.0,9.426815,13.8441,0.0,0.0,2.213969,11.190788,0.0,5.166083,0.0,1.452391,8.020641,8.339259,11.755312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.680251,0.0,0.0,0.0
4,18,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,40.0,56.0,0.0,289.639376,0.0,0.0,2.251718,0.24551,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399,2.543467,7.007909,0.677505,0.0,0.0,6.224926,3.997221,0.0,6.664527,0.0,0.0,0.0,9.906938,0.0,0.0,10.784865,0.0,4.87749,0.0,0.0,0.0,0.0,8.736187,0.0,8.766219,0.0,7.982283,0.0,0.0,0.0,8.15222,0.0,0.0,0.0,6.082332,0.0,2.942147,0.0,0.0,0.0,0.0,6.656182,0.0,0.852644,0.154119,0.0,0.0,3.341939,7.363288,0.0,0.0,0.0,0.0,0.0,8.228198,6.006187,0.211595,6.557204,0.0,0.321733
6,10,0,,19.66076,55.0,84.6,,123.0,83.0,163.0,,,,9.0,1.0,,,,,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,,3.67,27.0,40.0,3.0,196.607603,30.0,58.982281,1.085954,0.688428,0.232422,1.259274,24254.01858,38806.51514,13.426241,21.482033,0.476285,5.278294,0.558169,0.654233,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,10,1,,16.861286,59.25,84.2,27.0,71.0,90.0,116.0,,,,0.0,0.0,12.6,2.0,11.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,,1.27,,,2.0,168.612865,20.0,33.722573,0.963488,0.837623,0.200275,1.265839,19172.581896,30676.066044,14.014727,22.423515,0.485536,8.840947,0.599486,0.565344,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,15,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,30.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,19,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [61]:
len(test.columns) #sii가 없음

123

In [62]:
#결측치 제거
if np.any(np.isinf(train)):
    train = train.replace([np.inf, -np.inf], np.nan)

In [63]:
#평가지표
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

In [64]:
#연속형 데이터를 임계값 기준으로 정수형 label로 변환하는 함수
#회귀 모델 예측값을 범주형으로 변환하는 데에 사용, 여기서는 sii 분류
def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

#값이 첫 번쨰 임계값보다 작으면 0 ~ 그 외 3

In [65]:
#범주형으로 변환 후 결과 확인
def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

In [66]:
def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii'] #target variable

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED) #stratified k fold

    train_S = []
    test_S = [] #각 fold qwk 저장

    oof_non_rounded = np.zeros(len(y), dtype=float) #각 fold 검증 데이터 예측한 연속형 값
    oof_rounded = np.zeros(len(y), dtype=int) #연속형 반올림 -> 정수형으로
    test_preds = np.zeros((len(test_data), n_splits)) #fold 별로 저장

    #교차 검증, train / val로 split -> clone으로 모델 새로 생성 및 학습
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        #위에랑 마찬가지, 예측값 정수로 변환
        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        #qwk 계산
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        #테스트 데이터 예측
        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    #임계값 최적화, qwk 점수 기준으로 oof 정수로 변환하기 위한 최적의 임계값 찾기 위함
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    #최적화된 oof
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    #테스트 데이터의 각 fold 예측값 평균으로 변환 -> 정수로 변환
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [67]:
#tabnet을 scikit-learn 호환 모델로 만드는 클래스
class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs) #회귀 모델
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median') #결측치 중앙값으로 처리
        self.best_model_path = 'best_tabnet_model.pt'

    def fit(self, X, y):
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, 'values'):
            y = y.values

        X_train, X_valid, y_train, y_valid = train_test_split(X_imputed, y, test_size=0.2, random_state=SEED)

        history = self.model.fit(
            X_train = X_train,
            y_train = y_train.reshape(-1, 1),
            eval_set = [(X_valid, y_valid.reshape(-1, 1))],
            eval_name = ['valid'],
            eval_metric = ['mse'],
            max_epochs = 200,
            patience = 20,
            batch_size = 1024,
            virtual_batch_size = 128,
            num_workers = 0,
            drop_last = False,
            callbacks = [
                TabNetPretrainedModelCheckpoint(
                    filepath = self.best_model_path,
                    monitor = 'valid_mse', #검증 성능
                    mode = 'min',
                    save_best_only = True,
                    verbose = True
                )
            ]
        )

        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)

        return self

    #tabnet 모델로 예측값 반환
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = self.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))

        return result

TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

In [68]:
#tabnet 모델 중 평가 지표 모니터링하며 가장 좋은 성능 보이는 모델을 저장하는 클래스
class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', save_best_only=True, verbose=1):
        super().__init__()
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')

    def on_train_begin(self, logs=None):
        self.model = self.trainer

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return

        if (self.mode == 'min' and current < self.best) or (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)

일반 의사결정 나무와 다르게 축에 평행한 분할 대신 데이터를 다양한 각도로 분할하는 경사 분할을 사용

-> 선형 결합을 통해 데이터를 나누므로, 복잡한 데이터 분포를 잘 학습시킬 수 있음

In [69]:
#oblique decision tree의 노드 정의, 각 노드는 데이터를 선형 분할
class ObliqueDecisionTreeNode:
    def __init__(self, depth=0, max_depth=5, alpha=0.01, min_samples_split=2, min_samples_leaf=1,
                 min_impurity_decrease=0.0, ccp_alpha=0.0, max_leaf_nodes=None, leaf_count=[0]):
        self.left = None
        self.right = None
        self.depth = depth
        self.max_depth = max_depth
        self.alpha = alpha
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_impurity_decrease = min_impurity_decrease
        self.ccp_alpha = ccp_alpha
        self.max_leaf_nodes = max_leaf_nodes
        self.leaf_count = leaf_count
        self.is_leaf = False
        self.coefficients = None
        self.bias = None
        self.prediction = None

    def fit(self, X, y):
        if (self.depth >= self.max_depth or len(y) < self.min_samples_split or
            (self.max_leaf_nodes and self.leaf_count[0] >= self.max_leaf_nodes)):
            self.is_leaf = True
            self.prediction = np.mean(y)
            self.leaf_count[0] += 1
            return

        def objective(params):
            coefficients, bias = params[:-1], params[-1]
            predictions = X @ coefficients + bias
            left_mask = predictions < 0
            right_mask = ~left_mask

            if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                return float('inf')

            left_variance = np.var(y[left_mask]) if np.sum(left_mask) > 1 else 0
            right_variance = np.var(y[right_mask]) if np.sum(right_mask) > 1 else 0
            weighted_variance = (np.sum(left_mask) * left_variance + np.sum(right_mask) * right_variance) / len(y)
            regularization = self.alpha * np.sum(coefficients ** 2)
            return weighted_variance + regularization

        initial_params = np.append(np.random.randn(X.shape[1]), 0)
        result = minimize(objective, initial_params, method="BFGS")
        self.coefficients, self.bias = result.x[:-1], result.x[-1]

        split = (X @ self.coefficients + self.bias) < 0
        left_X, right_X = X[split], X[~split]
        left_y, right_y = y[split], y[~split]

        if self.min_impurity_decrease > 0:
            current_impurity = np.var(y)
            left_impurity = np.var(left_y) if len(left_y) > 0 else 0
            right_impurity = np.var(right_y) if len(right_y) > 0 else 0
            weighted_impurity = (len(left_y) * left_impurity + len(right_y) * right_impurity) / len(y)
            impurity_decrease = current_impurity - weighted_impurity

            if impurity_decrease < self.min_impurity_decrease:
                self.is_leaf = True
                self.prediction = np.mean(y)
                self.leaf_count[0] += 1
                return

        if len(left_y) == 0 or len(right_y) == 0:
            self.is_leaf = True
            self.prediction = np.mean(y)
            self.leaf_count[0] += 1
            return

        self.left = ObliqueDecisionTreeNode(depth=self.depth + 1, max_depth=self.max_depth, alpha=self.alpha,
                                            min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf,
                                            min_impurity_decrease=self.min_impurity_decrease, ccp_alpha=self.ccp_alpha,
                                            max_leaf_nodes=self.max_leaf_nodes, leaf_count=self.leaf_count)
        self.left.fit(left_X, left_y)

        self.right = ObliqueDecisionTreeNode(depth=self.depth + 1, max_depth=self.max_depth, alpha=self.alpha,
                                             min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf,
                                             min_impurity_decrease=self.min_impurity_decrease, ccp_alpha=self.ccp_alpha,
                                             max_leaf_nodes=self.max_leaf_nodes, leaf_count=self.leaf_count)
        self.right.fit(right_X, right_y)

    def predict(self, X):
        if self.is_leaf:
            return self.prediction
        decision = (X @ self.coefficients + self.bias) < 0
        if decision:
            return self.left.predict(X)
        else:
            return self.right.predict(X)

In [70]:
#위의 노드로 만들어진 선형 결합을 기반으로 분할하는 회귀 모델 정의
class ObliqueDecisionTreeRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, max_depth=5, alpha=0.01, n_components=20, min_samples_split=2, min_samples_leaf=1, verbose=0,
                 max_leaf_nodes=None, min_impurity_decrease=0.0, explained_variance_ratio=None, ccp_alpha=0.0,
                 early_stopping_rounds=None, random_state=None):
        self.max_depth = max_depth
        self.alpha = alpha
        self.n_components = n_components
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.verbose = verbose
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.explained_variance_ratio = explained_variance_ratio
        self.ccp_alpha = ccp_alpha
        self.early_stopping_rounds = early_stopping_rounds
        self.random_state = random_state

        self.root = ObliqueDecisionTreeNode(max_depth=max_depth, alpha=alpha,
                                             min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                             min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha,
                                             max_leaf_nodes=max_leaf_nodes)
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')

    def fit(self, X, y):
        X_imputed = self.imputer.fit_transform(X)

        #결측치 처리
        if self.random_state is not None:
            np.random.seed(self.random_state)

        #pca(주성분분석)
        if self.explained_variance_ratio is not None:
            self.pca = PCA(n_components=self.n_components)
            self.pca.fit(X_imputed)
            explained_variance = np.cumsum(self.pca.explained_variance_ratio_)
            self.n_components = np.searchsorted(explained_variance, self.explained_variance_ratio) + 1
            self.pca = PCA(n_components=self.n_components)

        X_train, X_valid, y_train, y_valid = train_test_split(X_imputed, y, test_size=0.2, random_state=self.random_state)

        X_train_scaled = self.scaler.fit_transform(X_train)
        X_train_reduced = self.pca.fit_transform(X_train_scaled)

        self.root.fit(X_train_reduced, y_train)

        y_valid_pred = self.predict(X_valid)
        validation_mse = mean_squared_error(y_valid, y_valid_pred)
        print("Validation MSE:", validation_mse)

        if self.early_stopping_rounds is not None:
            pass

        return self

    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        X_scaled = self.scaler.transform(X_imputed)
        X_reduced = self.pca.transform(X_scaled)
        return np.array([self.root.predict(x) for x in X_reduced])

ODT_Params = {
    'max_depth': 5,
    'alpha': 0.01,
    'n_components': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_leaf_nodes': 30,
    'min_impurity_decrease': 0.01,
    'explained_variance_ratio': 0.95,
    'ccp_alpha': 0.01,
    'random_state': SEED,
    'early_stopping_rounds': 10,
    'verbose': 1,
}

In [71]:
#앙상블 모델 구성 -> 모델들의 예측을 결합하여 최종 예측 생성

Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'cpu',
}


XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    #'tree_method': 'gpu_hist' : tree method 매개변수를 gpu_hist -> hist로 변경, GPU 대신 CPU 기반 알고리즘 사용
    'tree_method': 'hist'
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    #'task_type': 'GPU' : task tpye GPU -> GPU에서 CPU로 변경
    'task_type': 'CPU'
}


In [72]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params)
ODT_Model = ObliqueDecisionTreeRegressor(**ODT_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model),
    #('odt', ODT_Model),
], weights=[4.0,4.0,5.0,4.0]) #가중치

In [73]:
Submission1 = TrainML(voting_model, test)

# Save submission
# Submission1.to_csv('submission.csv', index=False)

Training Folds: 100%|██████████| 5/5 [02:56<00:00, 35.34s/it]

Mean Train QWK --> 0.7194
Mean Validation QWK ---> 0.4672
----> || Optimized QWK SCORE :: [36m[1m 0.529[0m





In [74]:
Submission1

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,0


#### submission 2

In [77]:
#파일 경로 각자 설정해야 함
#train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
#test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
#sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/sample_submission.csv')

#파일 경로 각자 설정해야 함
#train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
#test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

train_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_train.parquet')
test_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_test.parquet')

def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    stats, indexes = zip(*results)

    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season',
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c:
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)

    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}

# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params)
ODT_Model = ObliqueDecisionTreeRegressor(**ODT_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    #('tabnet', TabNet_Model),
    #('odt', ODT_Model),
])

# Train the ensemble model
Submission2 = TrainML(voting_model, test)

Training Folds: 100%|██████████| 5/5 [01:58<00:00, 23.69s/it]

Mean Train QWK --> 0.7595
Mean Validation QWK ---> 0.3862





----> || Optimized QWK SCORE :: [36m[1m 0.450[0m


In [78]:
Submission2

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


#### submission 3

In [79]:
#파일 경로 각자 설정해야 함
#train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
#test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
#sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

train = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/sample_submission.csv')

#파일 경로 각자 설정해야 함
#train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
#test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

train_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_train.parquet')
test_ts = load_time_series('/content/drive/MyDrive/child-mind-institute-problematic-internet-use/series_test.parquet')

featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season',
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train, train_ts, how="left", on='id')
test = pd.merge(test, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

def update(df):
    global cat_c
    for c in cat_c:
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)

    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, test_data):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb',    Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb',    Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat',    Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf',     Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb',     Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))])),
    #('tabnet', Pipeline(steps=[('imputer', imputer), ('regressor', TabNetWrapper(**TabNet_Params))])),
    #('odt',    Pipeline(steps=[('imputer', imputer), ('regressor', ObliqueDecisionTreeRegressor(**ODT_Params))])),
])

Submission3 = TrainML(ensemble, test)

Training Folds: 100%|██████████| 5/5 [04:18<00:00, 51.77s/it]

Mean Train QWK --> 0.9187
Mean Validation QWK ---> 0.3735





----> || Optimized QWK SCORE :: [36m[1m 0.442[0m


In [80]:
Submission3 = pd.DataFrame({
    'id': sample['id'],
    'sii': Submission3
})

Submission3

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


#### combine submission

In [81]:
sub1 = Submission1
sub2 = Submission2
sub3 = Submission3

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii']
})

In [82]:
combined

Unnamed: 0,id,sii_1,sii_2,sii_3
0,00008ff9,1,1,1
1,000fd460,0,0,0
2,00105258,1,0,0
3,00115b9f,0,0,0
4,0016bb22,0,1,1
5,001f3379,1,1,1
6,0038ba98,1,0,0
7,0068a485,0,0,0
8,0069fbed,1,1,1
9,0083e397,0,0,0


In [83]:
def majority_vote(row):
    return row.mode()[0] #최빈값

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")

Majority voting completed and saved to 'Final_Submission.csv'


In [84]:
final_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0
