In [7]:
# ======================================================================
# 🚀 최종 마스터 파이프라인: 데이터 정제부터 모델 튜닝까지 한 번에!
# ======================================================================
import joblib
import pandas as pd
import numpy as np
import time
import os
import subprocess
import warnings
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
try:
    import optuna
    OPTUNA_AVAILABLE = True
except ImportError:
    OPTUNA_AVAILABLE = False
    
warnings.filterwarnings('ignore')

# =======================================================
# 🧹 1단계: 데이터 정제 (Flow-Packet 동기화 중복 제거)
# =======================================================
print("="*80)
print("🧹 1단계: 데이터 정제 (Flow-Packet 동기화 중복 제거)")
print("="*80)

# --- 1-1: 원본 데이터 로딩 ---
print("📂 원본 데이터 로딩...")
try:
    flow_data_orig = joblib.load("task2_data/train_flow_data.pkl")
    # 🚨 중요: 이 단계는 메모리를 일시적으로 많이 사용할 수 있습니다. (대청소를 위한 일회성 투자)
    all_packets_orig = []
    packet_files = [f"task2_data/train_packet_data_{i}.pkl" for i in range(50000, 650000, 50000)]
    for file_path in packet_files:
        if os.path.exists(file_path):
            packets_chunk = joblib.load(file_path)
            all_packets_orig.extend(packets_chunk)
    print(f"✅ 원본 Flow/Packet 데이터 로딩 완료")
except FileNotFoundError:
    print("❌ 원본 파일을 찾을 수 없습니다! 데이터 경로를 확인해주세요.")
    exit()

# --- 1-2: Flow 데이터 기준 중복 인덱스 식별 ---
print("\n🔍 Flow 데이터 기준 중복 인덱스 식별...")
initial_rows = len(flow_data_orig)
# .index를 통해 원본 인덱스를 그대로 보존하는 것이 핵심
non_duplicate_indices = flow_data_orig.drop_duplicates().index
final_rows = len(non_duplicate_indices)
print(f"✅ 유효 인덱스 {final_rows:,}개 확보 (제거될 중복: {initial_rows - final_rows:,}개)")

# --- 1-3: 유효 인덱스를 사용하여 데이터 동기화 정제 ---
print("\n🔄 데이터 동기화 정제...")
# Flow 데이터와 Packet 데이터를 동일한 순서로 재정렬하고, 인덱스를 0부터 새로 부여
flow_data = flow_data_orig.loc[non_duplicate_indices].reset_index(drop=True)
all_packets = [all_packets_orig[i] for i in non_duplicate_indices]
print(f"✅ 동기화 완료! 최종 정제 데이터: {len(flow_data):,}개")
# --- 1-4: 정제된 데이터 저장 ---
joblib.dump(flow_data, "task2_data/train_flow_data_cleaned.pkl")
joblib.dump(all_packets, "task2_data/train_packet_data_cleaned.pkl")
print("✅ 정제된 데이터 저장 완료")

🧹 1단계: 데이터 정제 (Flow-Packet 동기화 중복 제거)
📂 원본 데이터 로딩...
✅ 원본 Flow/Packet 데이터 로딩 완료

🔍 Flow 데이터 기준 중복 인덱스 식별...
✅ 유효 인덱스 381,559개 확보 (제거될 중복: 218,441개)

🔄 데이터 동기화 정제...
✅ 동기화 완료! 최종 정제 데이터: 381,559개
✅ 정제된 데이터 저장 완료


In [2]:
# =======================================================
# 🎯 2단계: 대표 샘플 생성 (층화 추출)
# =======================================================
print("\n" + "="*80)
print("🎯 2단계: 대표 샘플 생성 (층화 추출)")
print("="*80)

# 👈 빠르고 간단한 테스트를 원하시면 이 숫자를 50000으로 줄이세요.
target_total_samples = 100000  
print(f"🎯 목표 총 샘플 수: {target_total_samples:,}개")

# 복잡한 for 루프 없이, 전체 데이터에서 바로 샘플링
stratify_key = flow_data['duration_class'].astype(str) + '_' + flow_data['volume_class'].astype(str)
stratified_sampler = StratifiedShuffleSplit(n_splits=1, train_size=target_total_samples, random_state=42)
indices = np.arange(len(flow_data))
# sampled_indices는 이제 0부터 시작하는 '위치 인덱스'입니다.
sampled_indices, _ = next(stratified_sampler.split(indices, stratify_key))
print(f"✅ 샘플링 완료! 총 {len(sampled_indices):,}개 대표 샘플 인덱스 확보.")


🎯 2단계: 대표 샘플 생성 (층화 추출)
🎯 목표 총 샘플 수: 100,000개
✅ 샘플링 완료! 총 100,000개 대표 샘플 인덱스 확보.


In [3]:
# =======================================================
# 🔧 3단계: 고급 특징 엔지니어링
# =======================================================
print("\n" + "="*80)
print("🔧 3단계: 고급 특징 엔지니어링")
print("="*80)

def extract_packet_features(packets):
    """ 패킷들로부터 고급 특징 추출 (최대 3개 패킷만 사용) """
    features = {}
    num_packets = min(3, len(packets))
    packets = packets.iloc[:num_packets]
    
    numeric_cols = packets.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        features[f'first_{col}'] = packets.iloc[0][col] if col in packets.columns else 0
        features[f'second_{col}'] = packets.iloc[1][col] if col in packets.columns and len(packets) > 1 else 0
    
    if 'ip_len' in packets.columns:
        ip_lens = packets['ip_len'].values
        features['ip_len_mean_13'] = np.mean(ip_lens); features['ip_len_std_13'] = np.std(ip_lens) if len(ip_lens) > 1 else 0
        features['ip_len_max_13'] = np.max(ip_lens); features['ip_len_min_13'] = np.min(ip_lens)
        features['ip_len_range_13'] = np.max(ip_lens) - np.min(ip_lens); features['ip_len_median_13'] = np.median(ip_lens)
        if len(ip_lens) >= 3:
            diffs = np.diff(ip_lens)
            features['ip_len_trend'] = 1 if np.mean(diffs) > 0 else (-1 if np.mean(diffs) < 0 else 0)
            features['ip_len_volatility'] = np.std(diffs) if len(diffs) > 1 else 0
        else:
            features['ip_len_trend'] = 0; features['ip_len_volatility'] = 0
    
    if 'packet_capture_time' in packets.columns:
        try:
            times = pd.to_datetime(packets['packet_capture_time']); time_diffs = np.diff(times).astype('timedelta64[us]').astype(float)
            if len(time_diffs) > 0:
                features['inter_time_mean_13'] = np.mean(time_diffs); features['inter_time_std_13'] = np.std(time_diffs) if len(time_diffs) > 1 else 0
                features['inter_time_max_13'] = np.max(time_diffs); features['inter_time_min_13'] = np.min(time_diffs)
                features['timing_consistency'] = np.std(time_diffs) / (np.mean(time_diffs) + 1)
            else:
                for key in ['inter_time_mean_13', 'inter_time_std_13', 'inter_time_max_13', 'inter_time_min_13', 'timing_consistency']: features[key] = 0
        except:
            for key in ['inter_time_mean_13', 'inter_time_std_13', 'inter_time_max_13', 'inter_time_min_13', 'timing_consistency']: features[key] = 0
    
    if 'tcp_len' in packets.columns and 'ip_len' in packets.columns:
        tcp_lens = packets['tcp_len'].values; features['tcp_len_mean_13'] = np.mean(tcp_lens)
        features['tcp_len_std_13'] = np.std(tcp_lens) if len(tcp_lens) > 1 else 0; features['tcp_len_sum_13'] = np.sum(tcp_lens)
        total_ip = np.sum(packets['ip_len']); total_tcp = np.sum(tcp_lens); features['tcp_efficiency_13'] = total_tcp / max(total_ip, 1)
    
    if 'tcp_flags' in packets.columns:
        flags = packets['tcp_flags'].values
        features['has_syn'] = int(any(flag & 0x02 for flag in flags)); features['has_ack'] = int(any(flag & 0x10 for flag in flags))
        features['has_fin'] = int(any(flag & 0x01 for flag in flags)); features['has_rst'] = int(any(flag & 0x04 for flag in flags))
        features['has_psh'] = int(any(flag & 0x08 for flag in flags))
        if len(flags) >= 3:
            first_syn = (flags[0] & 0x02) != 0; second_syn_ack = (flags[1] & 0x12) == 0x12; third_ack = (flags[2] & 0x10) != 0
            features['is_handshake_complete'] = int(first_syn and second_syn_ack and third_ack)
            has_fin_ack = any((flag & 0x11) == 0x11 for flag in flags); features['is_graceful_close'] = int(has_fin_ack)
        else:
            features['is_handshake_complete'] = 0; features['is_graceful_close'] = 0
        features['flag_diversity'] = len(set(flags)); psh_count = sum(1 for flag in flags if flag & 0x08); features['push_frequency'] = psh_count / len(flags)
    
    if 'ip_len' in packets.columns and len(packets) >= 3:
        sizes = packets['ip_len'].values; increases = sum(1 for i in range(1, len(sizes)) if sizes[i] > sizes[i-1]); decreases = sum(1 for i in range(1, len(sizes)) if sizes[i] < sizes[i-1])
        features['size_increase_count'] = increases; features['size_decrease_count'] = decreases; features['size_stability'] = sum(1 for i in range(1, len(sizes)) if sizes[i] == sizes[i-1])
    
    if 'tcp_len' in packets.columns and 'ip_len' in packets.columns:
        features['tcp_to_ip_ratio_first'] = packets.iloc[0]['tcp_len'] / max(packets.iloc[0]['ip_len'], 1)
        if len(packets) > 1: features['tcp_to_ip_ratio_second'] = packets.iloc[1]['tcp_len'] / max(packets.iloc[1]['ip_len'], 1)
    
    features['inter_packet_time_us'] = features.get('inter_time_mean_13', 0); features['ip_len_diff'] = features.get('second_ip_len', 0) - features.get('first_ip_len', 0)
    return features

def extract_advanced_features_from_cleaned_data(indices, packet_data_list):
    print("🚀 고급 특징 추출 시작...")
    features_list = []
    for i, idx in enumerate(indices):
        try:
            packet_df = packet_data_list[idx]
            if isinstance(packet_df, pd.DataFrame) and not packet_df.empty:
                valid_packets = packet_df.dropna()
                if len(valid_packets) >= 1:
                    features = extract_packet_features(valid_packets)
                    features_list.append(features)
            if (i + 1) % 10000 == 0:
                print(f"     진행률: {i + 1:,} / {len(indices):,}개 완료")
        except Exception:
            continue
    return features_list

advanced_features_list = extract_advanced_features_from_cleaned_data(sampled_indices, all_packets)
advanced_features_df = pd.DataFrame(advanced_features_list).fillna(0)
# iloc으로 샘플링된 flow 데이터를 가져오고, 인덱스를 리셋하여 특징 데이터와 완벽히 맞춤
valid_flow_data = flow_data.iloc[sampled_indices].reset_index(drop=True)
print(f"\n✅ 특징 추출 완료!")
print(f"✓ 최종 특징 행렬 크기: {advanced_features_df.shape}")
print(f"✓ 최종 타겟 데이터 크기: {valid_flow_data.shape}")


🔧 3단계: 고급 특징 엔지니어링
🚀 고급 특징 추출 시작...
     진행률: 10,000 / 100,000개 완료
     진행률: 20,000 / 100,000개 완료
     진행률: 30,000 / 100,000개 완료
     진행률: 40,000 / 100,000개 완료
     진행률: 50,000 / 100,000개 완료
     진행률: 60,000 / 100,000개 완료
     진행률: 70,000 / 100,000개 완료
     진행률: 80,000 / 100,000개 완료
     진행률: 90,000 / 100,000개 완료
     진행률: 100,000 / 100,000개 완료

✅ 특징 추출 완료!
✓ 최종 특징 행렬 크기: (100000, 44)
✓ 최종 타겟 데이터 크기: (100000, 11)


In [8]:
# =======================================================
# 🚀 4단계: 모델 튜닝 및 평가
# =======================================================
print("\n" + "="*80)
print("🚀 4단계: 5개 모델 100회 하이퍼파라미터 튜닝")
print("="*80)

# --- 4-1: 환경 설정 ---
def check_gpu_advanced():
    gpu_status = {}
    
    # NVIDIA GPU 확인
    try: 
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=5)
        gpu_status['nvidia'] = result.returncode == 0
        print(f"🔍 NVIDIA-SMI: {'✅ 감지됨' if gpu_status['nvidia'] else '❌ 미감지'}")
    except: 
        gpu_status['nvidia'] = False
        print(f"🔍 NVIDIA-SMI: ❌ 실행 실패")
    
    # CatBoost GPU 테스트 (더 안전한 방식)
    try: 
        import catboost
        print(f"🔍 CatBoost 버전: {catboost.__version__}")
        # 매우 작은 더미 데이터로 테스트
        test_cat = CatBoostClassifier(task_type='GPU', iterations=1, verbose=False, allow_writing_files=False)
        test_cat.fit([[1, 2], [3, 4]], [0, 1])
        gpu_status['catboost'] = True
        print(f"🔍 CatBoost GPU: ✅ 작동 확인")
    except Exception as e:
        gpu_status['catboost'] = False
        print(f"🔍 CatBoost GPU: ❌ 실패 ({str(e)[:50]}...)")
    
    # XGBoost GPU 테스트 (더 안전한 방식)
    try: 
        print(f"🔍 XGBoost 버전: {xgb.__version__}")
        test_xgb = xgb.XGBClassifier(tree_method='gpu_hist', n_estimators=1, verbosity=0)
        test_xgb.fit(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
        gpu_status['xgboost'] = True
        print(f"🔍 XGBoost GPU: ✅ 작동 확인")
    except Exception as e:
        gpu_status['xgboost'] = False
        print(f"🔍 XGBoost GPU: ❌ 실패 ({str(e)[:50]}...)")
        # CPU 모드로 대체 테스트
        try:
            test_xgb_cpu = xgb.XGBClassifier(tree_method='hist', n_estimators=1, verbosity=0)
            test_xgb_cpu.fit(np.array([[1, 2], [3, 4]]), np.array([0, 1]))
            print(f"🔍 XGBoost CPU: ✅ CPU 모드는 정상")
        except:
            print(f"🔍 XGBoost CPU: ❌ CPU 모드도 실패")
    
    # LightGBM GPU 테스트 (더 안전한 방식)
    try: 
        print(f"🔍 LightGBM 버전: {lgb.__version__}")
        test_lgb = lgb.LGBMClassifier(device='gpu', n_estimators=1, verbose=-1, force_row_wise=True)
        test_lgb.fit([[1, 2], [3, 4]], [0, 1])
        gpu_status['lightgbm'] = True
        print(f"🔍 LightGBM GPU: ✅ 작동 확인")
    except Exception as e:
        gpu_status['lightgbm'] = False
        print(f"🔍 LightGBM GPU: ❌ 실패 ({str(e)[:50]}...)")
        # CPU 모드로 대체 테스트
        try:
            test_lgb_cpu = lgb.LGBMClassifier(device='cpu', n_estimators=1, verbose=-1)
            test_lgb_cpu.fit([[1, 2], [3, 4]], [0, 1])
            print(f"🔍 LightGBM CPU: ✅ CPU 모드는 정상")
        except:
            print(f"🔍 LightGBM CPU: ❌ CPU 모드도 실패")
    
    return gpu_status

gpu_status = check_gpu_advanced()
print(f"🖥️ GPU 상태: NVIDIA {'✅' if gpu_status['nvidia'] else '❌'} | CatBoost {'✅' if gpu_status['catboost'] else '❌'} | XGBoost {'✅' if gpu_status['xgboost'] else '❌'} | LightGBM {'✅' if gpu_status['lightgbm'] else '❌'}")
if not OPTUNA_AVAILABLE:
    print("❌ Optuna 없음 - 랜덤 서치로 대체합니다. (pip install optuna 권장)")

# --- 4-2: 데이터 분할 및 가중치 계산 ---
X = advanced_features_df
y_duration = valid_flow_data['duration_class']
y_volume = valid_flow_data['volume_class']
X_train, X_test, y_duration_train, y_duration_test = train_test_split(X, y_duration, test_size=0.2, random_state=42, stratify=y_duration)
_, _, y_volume_train, y_volume_test = train_test_split(X, y_volume, test_size=0.2, random_state=42, stratify=y_volume)
print(f"\n📊 데이터 준비: 학습 {X_train.shape[0]:,}개 | 검증 {X_test.shape[0]:,}개 | 특징 {X_train.shape[1]:,}개")

duration_classes = np.unique(y_duration_train); duration_weights = compute_class_weight('balanced', classes=duration_classes, y=y_duration_train)
duration_class_weights = {int(cls): weight for cls, weight in zip(duration_classes, duration_weights)}
volume_classes = np.unique(y_volume_train); volume_weights = compute_class_weight('balanced', classes=volume_classes, y=y_volume_train)
volume_class_weights = {int(cls): weight for cls, weight in zip(volume_classes, volume_weights)}
print(f"⚖️ 클래스 가중치 계산 완료")

# --- 4-3: Optuna 튜닝 클래스 정의 ---
results = {'model': [], 'task': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1_weighted': [], 'f1_macro': []}
def evaluate_model(model, X_test, y_test, model_name, task_name):
    y_pred = model.predict(X_test)
    f1_w = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    results['model'].append(model_name); results['task'].append(task_name)
    results['accuracy'].append(accuracy_score(y_test, y_pred)); results['precision'].append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
    results['recall'].append(recall_score(y_test, y_pred, average='weighted', zero_division=0)); results['f1_weighted'].append(f1_w)
    results['f1_macro'].append(f1_score(y_test, y_pred, average='macro', zero_division=0))
    print(f"  > 최종 검증 F1(weighted): {f1_w:.4f}")

class OptunaHyperparameterSearch:
    def __init__(self, model_class, param_space, n_trials=100, cv=3, random_state=42, use_gpu=False):
        self.model_class, self.param_space, self.n_trials, self.cv, self.random_state, self.use_gpu = model_class, param_space, n_trials, cv, random_state, use_gpu
        self.best_params_, self.best_estimator_ = None, None

    def objective(self, trial):
        params = {name: trial.suggest_categorical(name, values) for name, values in self.param_space.items()}
        model_params = {'random_state': self.random_state}
        
        # 모델별 GPU 설정 및 확인 메시지
        if self.model_class.__name__ == 'CatBoostClassifier': 
            model_params.update({
                'task_type': 'GPU' if self.use_gpu else 'CPU', 
                'verbose': False,
                'allow_writing_files': False
            })
            if trial.number == 0:  # 첫 번째 시도에서만 출력
                print(f"    🖥️ CatBoost {'GPU' if self.use_gpu else 'CPU'} 모드로 실행")
                
        elif self.model_class.__name__ == 'XGBClassifier': 
            model_params.update({
                'tree_method': 'gpu_hist' if self.use_gpu else 'hist', 
                'eval_metric': 'mlogloss', 
                'n_jobs': -1 if not self.use_gpu else 1,
                'verbosity': 1 if self.use_gpu and trial.number == 0 else 0  # GPU 첫 시도만 verbose
            })
            if trial.number == 0:
                print(f"    🖥️ XGBoost {'GPU(gpu_hist)' if self.use_gpu else 'CPU(hist)'} 모드로 실행")
                
        elif self.model_class.__name__ == 'LGBMClassifier': 
            model_params.update({
                'device': 'gpu' if self.use_gpu else 'cpu', 
                'verbose': -1, 
                'n_jobs': -1 if not self.use_gpu else 1,
                'force_row_wise': True if self.use_gpu else False
            })
            if trial.number == 0:
                print(f"    🖥️ LightGBM {'GPU' if self.use_gpu else 'CPU'} 모드로 실행")
                
        elif self.model_class.__name__ in ['RandomForestClassifier', 'ExtraTreesClassifier']: 
            model_params.update({'n_jobs': -1})
            if trial.number == 0:
                print(f"    🖥️ {self.model_class.__name__} CPU 멀티코어 모드로 실행")
        
        model = self.model_class(**{**model_params, **params})
        
        skf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state)
        scores = []
        for train_idx, val_idx in skf.split(self.X, self.y):
            X_train_fold, y_train_fold = self.X.iloc[train_idx], self.y.iloc[train_idx]
            X_val_fold, y_val_fold = self.X.iloc[val_idx], self.y.iloc[val_idx]
            
            fit_params = {}
            if self.class_weights and self.model_class.__name__ == 'XGBClassifier':
                fit_params['sample_weight'] = np.array([self.class_weights[cls] for cls in y_train_fold])
            
            model.fit(X_train_fold, y_train_fold, **fit_params)
            scores.append(f1_score(y_val_fold, model.predict(X_val_fold), average='weighted', zero_division=0))
        return np.mean(scores)
    
    def fit(self, X, y, class_weights=None):
        self.X, self.y, self.class_weights = X, y, class_weights
        
        if OPTUNA_AVAILABLE:
            study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=self.random_state))
            study.optimize(self.objective, n_trials=self.n_trials, show_progress_bar=True)
            self.best_params_ = study.best_params
        else: # Optuna 미설치 시 랜덤 파라미터 1개로만 테스트
            self.best_params_ = {name: values[0] for name, values in self.param_space.items()}

        # 최적 모델 생성 시에도 GPU 사용 확인
        model_params = {'random_state': self.random_state, **self.best_params_}
        if self.model_class.__name__ == 'CatBoostClassifier': 
            model_params.update({
                'task_type': 'GPU' if self.use_gpu else 'CPU', 
                'verbose': False, 
                'class_weights': self.class_weights,
                'allow_writing_files': False
            })
            print(f"    ✅ 최적 CatBoost 모델 {'GPU' if self.use_gpu else 'CPU'} 모드로 학습 중...")
            
        elif self.model_class.__name__ == 'LGBMClassifier' or self.model_class.__name__ in ['RandomForestClassifier', 'ExtraTreesClassifier']: 
            model_params.update({'class_weight': 'balanced'})
            if self.model_class.__name__ == 'LGBMClassifier':
                model_params.update({
                    'device': 'gpu' if self.use_gpu else 'cpu',
                    'verbose': -1,
                    'force_row_wise': True if self.use_gpu else False
                })
                print(f"    ✅ 최적 LightGBM 모델 {'GPU' if self.use_gpu else 'CPU'} 모드로 학습 중...")
            else:
                print(f"    ✅ 최적 {self.model_class.__name__} 모델 CPU 멀티코어 모드로 학습 중...")
        
        elif self.model_class.__name__ == 'XGBClassifier':
            model_params.update({
                'tree_method': 'gpu_hist' if self.use_gpu else 'hist',
                'eval_metric': 'mlogloss',
                'n_jobs': -1 if not self.use_gpu else 1
            })
            print(f"    ✅ 최적 XGBoost 모델 {'GPU(gpu_hist)' if self.use_gpu else 'CPU(hist)'} 모드로 학습 중...")
        
        self.best_estimator_ = self.model_class(**model_params)
        
        fit_params = {}
        if self.class_weights and self.model_class.__name__ == 'XGBClassifier':
            fit_params['sample_weight'] = np.array([self.class_weights[cls] for cls in y])

        self.best_estimator_.fit(X, y, **fit_params)
        return self

# --- 4-4: 모델별 튜닝 실행 ---
param_spaces = {
    'CatBoost': {'iterations': [200, 400, 600], 'depth': [6, 8, 10, 12], 'learning_rate': [0.03, 0.05, 0.1], 'l2_leaf_reg': [3, 5, 7], 'border_count': [128, 254]},
    'XGBoost': {'n_estimators': [200, 400, 600], 'max_depth': [6, 8, 10], 'learning_rate': [0.03, 0.05, 0.1], 'subsample': [0.8, 0.9], 'colsample_bytree': [0.8, 0.9]},
    'LightGBM': {'n_estimators': [200, 400, 600], 'max_depth': [8, 10, 12, -1], 'learning_rate': [0.03, 0.05, 0.1], 'num_leaves': [50, 100, 200]},
    'RandomForest': {'n_estimators': [200, 300], 'max_depth': [15, 20, 25, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]},
    'ExtraTrees': {'n_estimators': [200, 300], 'max_depth': [15, 20, 25, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2, 4]}
}
models_to_tune = [
    (CatBoostClassifier, 'CatBoost', gpu_status['catboost']),
    (xgb.XGBClassifier, 'XGBoost', gpu_status['xgboost']),
    (lgb.LGBMClassifier, 'LightGBM', gpu_status['lightgbm']),
    (RandomForestClassifier, 'RandomForest', False),
    (ExtraTreesClassifier, 'ExtraTrees', False)
]

for model_class, name, use_gpu in models_to_tune:
    print(f"\n--- {name} 튜닝 시작 ---")
    # Duration 분류
    print(f"🎯 {name} - Duration 분류 튜닝...")
    search_duration = OptunaHyperparameterSearch(model_class, param_spaces[name], n_trials=30, use_gpu=use_gpu).fit(X_train, y_duration_train, class_weights=duration_class_weights if name == 'CatBoost' or name == 'XGBoost' else None)
    evaluate_model(search_duration.best_estimator_, X_test, y_duration_test, name, 'Duration')
    
    # Volume 분류
    print(f"📦 {name} - Volume 분류 튜닝...")
    search_volume = OptunaHyperparameterSearch(model_class, param_spaces[name], n_trials=30, use_gpu=use_gpu).fit(X_train, y_volume_train, class_weights=volume_class_weights if name == 'CatBoost' or name == 'XGBoost' else None)
    evaluate_model(search_volume.best_estimator_, X_test, y_volume_test, name, 'Volume')

# =======================================================
# 🏆 5단계: 최종 성능 비교 결과
# =======================================================
print("\n" + "="*80)
print("🏆 최종 성능 비교 결과")
print("="*80)
final_results_df = pd.DataFrame(results)

print("\n🎯 Duration 분류 결과 (F1-weighted 기준 정렬):")
duration_results = final_results_df[final_results_df['task'] == 'Duration'].sort_values('f1_weighted', ascending=False)
print(duration_results[['model', 'accuracy', 'precision', 'recall', 'f1_weighted', 'f1_macro']].round(4).to_string(index=False))

print("\n📦 Volume 분류 결과 (F1-weighted 기준 정렬):")
volume_results = final_results_df[final_results_df['task'] == 'Volume'].sort_values('f1_weighted', ascending=False)
print(volume_results[['model', 'accuracy', 'precision', 'recall', 'f1_weighted', 'f1_macro']].round(4).to_string(index=False))

print("\n🎉 전체 파이프라인 실행 완료! 🎉")


🚀 4단계: 5개 모델 100회 하이퍼파라미터 튜닝
🔍 NVIDIA-SMI: ✅ 감지됨
🔍 CatBoost 버전: 1.2.8
🔍 CatBoost GPU: ✅ 작동 확인
🔍 XGBoost 버전: 3.0.2
🔍 XGBoost GPU: ✅ 작동 확인
🔍 LightGBM 버전: 4.6.0
🔍 LightGBM GPU: ✅ 작동 확인
🖥️ GPU 상태: NVIDIA ✅ | CatBoost ✅ | XGBoost ✅ | LightGBM ✅


[I 2025-08-08 00:57:03,797] A new study created in memory with name: no-name-41a42d4d-fc8d-4029-b987-f7ad9c4856c4



📊 데이터 준비: 학습 80,000개 | 검증 20,000개 | 특징 44개
⚖️ 클래스 가중치 계산 완료

--- CatBoost 튜닝 시작 ---
🎯 CatBoost - Duration 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ CatBoost GPU 모드로 실행
[I 2025-08-08 00:57:09,505] Trial 0 finished with value: 0.723792040542015 and parameters: {'iterations': 400, 'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 5, 'border_count': 128}. Best is trial 0 with value: 0.723792040542015.
[I 2025-08-08 00:58:03,978] Trial 1 finished with value: 0.7428328444289712 and parameters: {'iterations': 600, 'depth': 10, 'learning_rate': 0.1, 'l2_leaf_reg': 3, 'border_count': 128}. Best is trial 1 with value: 0.7428328444289712.
[I 2025-08-08 00:58:09,420] Trial 2 finished with value: 0.7323445701079838 and parameters: {'iterations': 200, 'depth': 8, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'border_count': 128}. Best is trial 1 with value: 0.7428328444289712.
[I 2025-08-08 00:58:37,811] Trial 3 finished with value: 0.7318817045089978 and parameters: {'iterations': 200, 'depth': 10, 'learning_rate': 0.03, 'l2_leaf_reg': 3, 'border_count': 254}. Best is trial 1 with value: 0.7428328444289712.
[I 2025-08-08 00:59:30,962] Trial 

[I 2025-08-08 01:12:57,981] A new study created in memory with name: no-name-a065a697-cc61-4e97-95a6-ca3f2dc5c73c


  > 최종 검증 F1(weighted): 0.7365
📦 CatBoost - Volume 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ CatBoost GPU 모드로 실행
[I 2025-08-08 01:13:03,536] Trial 0 finished with value: 0.27950702747130424 and parameters: {'iterations': 400, 'depth': 6, 'learning_rate': 0.03, 'l2_leaf_reg': 5, 'border_count': 128}. Best is trial 0 with value: 0.27950702747130424.
[I 2025-08-08 01:13:44,126] Trial 1 finished with value: 0.30851405916522756 and parameters: {'iterations': 600, 'depth': 10, 'learning_rate': 0.1, 'l2_leaf_reg': 3, 'border_count': 128}. Best is trial 1 with value: 0.30851405916522756.
[I 2025-08-08 01:13:49,650] Trial 2 finished with value: 0.28307999137592055 and parameters: {'iterations': 200, 'depth': 8, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'border_count': 128}. Best is trial 1 with value: 0.30851405916522756.
[I 2025-08-08 01:14:08,510] Trial 3 finished with value: 0.2834522906767665 and parameters: {'iterations': 200, 'depth': 10, 'learning_rate': 0.03, 'l2_leaf_reg': 3, 'border_count': 254}. Best is trial 1 with value: 0.30851405916522756.
[I 2025-08-08 01:15:05,10

[I 2025-08-08 01:54:27,991] A new study created in memory with name: no-name-8abc749e-7171-4c69-9a22-ffd1836d6305


  > 최종 검증 F1(weighted): 0.2892

--- XGBoost 튜닝 시작 ---
🎯 XGBoost - Duration 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ XGBoost GPU(gpu_hist) 모드로 실행
[I 2025-08-08 01:54:39,935] Trial 0 finished with value: 0.7423599178402891 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 0 with value: 0.7423599178402891.
[I 2025-08-08 01:54:49,572] Trial 1 finished with value: 0.7428363892173658 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9}. Best is trial 1 with value: 0.7428363892173658.
[I 2025-08-08 01:55:22,204] Trial 2 finished with value: 0.7388657451858998 and parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.9}. Best is trial 1 with value: 0.7428363892173658.
[I 2025-08-08 01:55:51,913] Trial 3 finished with value: 0.7465722160656864 and parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.03, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 3 with value: 0.746

[I 2025-08-08 02:10:56,248] A new study created in memory with name: no-name-2f2a7d5d-02ae-4b73-b959-5f17fc51d7e5


  > 최종 검증 F1(weighted): 0.7476
📦 XGBoost - Volume 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ XGBoost GPU(gpu_hist) 모드로 실행
[I 2025-08-08 02:11:09,398] Trial 0 finished with value: 0.2817110886727891 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 0 with value: 0.2817110886727891.
[I 2025-08-08 02:11:21,767] Trial 1 finished with value: 0.288779443450038 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.9}. Best is trial 1 with value: 0.288779443450038.
[I 2025-08-08 02:11:59,225] Trial 2 finished with value: 0.3097595480466848 and parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.9}. Best is trial 2 with value: 0.3097595480466848.
[I 2025-08-08 02:12:34,950] Trial 3 finished with value: 0.29848980151380383 and parameters: {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.03, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 2 with value: 0.3097

[I 2025-08-08 02:35:53,732] A new study created in memory with name: no-name-541806ce-4afa-4bc1-9459-c236d4d55507


  > 최종 검증 F1(weighted): 0.3108

--- LightGBM 튜닝 시작 ---
🎯 LightGBM - Duration 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ LightGBM GPU 모드로 실행
[I 2025-08-08 02:37:05,177] Trial 0 finished with value: 0.7483528522640758 and parameters: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.03, 'num_leaves': 100}. Best is trial 0 with value: 0.7483528522640758.
[I 2025-08-08 02:38:10,626] Trial 1 finished with value: 0.7497274558786571 and parameters: {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.03, 'num_leaves': 200}. Best is trial 1 with value: 0.7497274558786571.
[I 2025-08-08 02:41:18,493] Trial 2 finished with value: 0.7423721485850493 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.05, 'num_leaves': 200}. Best is trial 1 with value: 0.7497274558786571.
[I 2025-08-08 02:43:18,542] Trial 3 finished with value: 0.7371370694626286 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.1, 'num_leaves': 100}. Best is trial 1 with value: 0.7497274558786571.
[I 2025-08-08 02:43:40,333] Trial 4 finished with value: 0.7474054858757061 and paramet

[I 2025-08-08 03:20:08,824] A new study created in memory with name: no-name-800d215e-67d5-48cb-9317-be537b402947


  > 최종 검증 F1(weighted): 0.7500
📦 LightGBM - Volume 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ LightGBM GPU 모드로 실행
[I 2025-08-08 03:21:11,961] Trial 0 finished with value: 0.29952613379519205 and parameters: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.03, 'num_leaves': 100}. Best is trial 0 with value: 0.29952613379519205.
[I 2025-08-08 03:22:04,188] Trial 1 finished with value: 0.2960047858418456 and parameters: {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.03, 'num_leaves': 200}. Best is trial 0 with value: 0.29952613379519205.
[I 2025-08-08 03:24:44,092] Trial 2 finished with value: 0.3110139738558872 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.05, 'num_leaves': 200}. Best is trial 2 with value: 0.3110139738558872.
[I 2025-08-08 03:26:43,492] Trial 3 finished with value: 0.31179303336611713 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.1, 'num_leaves': 100}. Best is trial 3 with value: 0.31179303336611713.
[I 2025-08-08 03:27:03,854] Trial 4 finished with value: 0.29175511400553183 and p

[I 2025-08-08 04:23:14,977] A new study created in memory with name: no-name-c105d745-6f22-42d5-9699-5dbf7cb85628


  > 최종 검증 F1(weighted): 0.3083

--- RandomForest 튜닝 시작 ---
🎯 RandomForest - Duration 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ RandomForestClassifier CPU 멀티코어 모드로 실행
[I 2025-08-08 04:23:23,012] Trial 0 finished with value: 0.7442127425783842 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7442127425783842.
[I 2025-08-08 04:23:29,895] Trial 1 finished with value: 0.7475066349609256 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.7475066349609256.
[I 2025-08-08 04:23:39,443] Trial 2 finished with value: 0.7500187193449156 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.7500187193449156.
[I 2025-08-08 04:23:47,344] Trial 3 finished with value: 0.7431485453366805 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.7500187193449156.
[I 2025-08-08 04:23:57,682] Trial 4 finished

[I 2025-08-08 04:28:21,759] A new study created in memory with name: no-name-74a7c484-9a57-4ca8-9f66-b95c70afdc9c


  > 최종 검증 F1(weighted): 0.7475
📦 RandomForest - Volume 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ RandomForestClassifier CPU 멀티코어 모드로 실행
[I 2025-08-08 04:28:31,535] Trial 0 finished with value: 0.28289182044771677 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.28289182044771677.
[I 2025-08-08 04:28:43,870] Trial 1 finished with value: 0.31119328477861535 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.31119328477861535.
[I 2025-08-08 04:28:56,407] Trial 2 finished with value: 0.29371532998985916 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.31119328477861535.
[I 2025-08-08 04:29:06,268] Trial 3 finished with value: 0.2823500800192861 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.31119328477861535.
[I 2025-08-08 04:29:21,018] Trial 4 f

[I 2025-08-08 04:35:03,625] A new study created in memory with name: no-name-79e2efe2-0661-4182-9b73-a2dc900efe8e


  > 최종 검증 F1(weighted): 0.3137

--- ExtraTrees 튜닝 시작 ---
🎯 ExtraTrees - Duration 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ ExtraTreesClassifier CPU 멀티코어 모드로 실행
[I 2025-08-08 04:35:06,814] Trial 0 finished with value: 0.7142492325902485 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7142492325902485.
[I 2025-08-08 04:35:09,608] Trial 1 finished with value: 0.747986609992512 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.747986609992512.
[I 2025-08-08 04:35:13,318] Trial 2 finished with value: 0.7380436013004689 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.747986609992512.
[I 2025-08-08 04:35:16,197] Trial 3 finished with value: 0.7110120958349491 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.747986609992512.
[I 2025-08-08 04:35:20,403] Trial 4 finished with 

[I 2025-08-08 04:36:37,680] A new study created in memory with name: no-name-94c43307-2971-451b-85dd-c4446bdbea4d


  > 최종 검증 F1(weighted): 0.7459
📦 ExtraTrees - Volume 분류 튜닝...


  0%|          | 0/30 [00:00<?, ?it/s]

    🖥️ ExtraTreesClassifier CPU 멀티코어 모드로 실행
[I 2025-08-08 04:36:40,738] Trial 0 finished with value: 0.2760962942151801 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.2760962942151801.
[I 2025-08-08 04:36:44,425] Trial 1 finished with value: 0.3068880949206834 and parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.3068880949206834.
[I 2025-08-08 04:36:48,162] Trial 2 finished with value: 0.2882815173323199 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.3068880949206834.
[I 2025-08-08 04:36:51,108] Trial 3 finished with value: 0.275740437679236 and parameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.3068880949206834.
[I 2025-08-08 04:36:55,902] Trial 4 finished wi

[I 2025-08-08 02:05:16,485] Trial 19 finished with value: 0.747744586489528 and parameters: {'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.03, 'subsample': 0.8, 'colsample_bytree': 0.8}. Best is trial 19 with value: 0.747744586489528. 이게 xg,[I 2025-08-08 02:50:12,172] Trial 10 finished with value: 0.7501720911448091 and parameters: {'n_estimators': 200, 'max_depth': 12, 'learning_rate': 0.03, 'num_leaves': 200}. Best is trial 10 with value: 0.7501720911448091.이게 light,[I 2025-08-08 04:24:22,548] Trial 7 finished with value: 0.750454297201291 and parameters: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 7 with value: 0.750454297201291.이게 랜덤포레스트

In [10]:
# LightGBM 튜닝된 값으로 모델 학습 및 저장
import joblib

# gpu_status가 정의되어 있지 않으면 CPU로 강제 지정
try:
    _gpu_status = gpu_status
except NameError:
    gpu_status = {'lightgbm': False}

# 예시: 튜닝된 best_params를 직접 입력 (실제 튜닝 결과에 맞게 수정)
lgb_best_params = {
    'n_estimators': 200,
    'max_depth': 12,
    'learning_rate': 0.03,
    'num_leaves': 200,
    'device': 'gpu' if gpu_status.get('lightgbm', False) else 'cpu',
    'verbose': -1,
    'force_row_wise': True if gpu_status.get('lightgbm', False) else False,
    'random_state': 42
}

# Duration 분류 모델 학습
lgb_duration = lgb.LGBMClassifier(**lgb_best_params)
lgb_duration.fit(X_train, y_duration_train)
joblib.dump(lgb_duration, 'model/lgb_duration_model.pkl')
print('✅ LightGBM Duration 모델 저장 완료: model/lgb_duration_model.pkl')

# Volume 분류 모델 학습
lgb_volume = lgb.LGBMClassifier(**lgb_best_params)
lgb_volume.fit(X_train, y_volume_train)
joblib.dump(lgb_volume, 'model/lgb_volume_model.pkl')
print('✅ LightGBM Volume 모델 저장 완료: model/lgb_volume_model.pkl')

NameError: name 'X_train' is not defined