In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.base import clone
from joblib import dump, load
from tqdm import tqdm
import os

# Korean (Hangul) font setup for matplotlib.
# Candidate font files are tried in order; the first one that exists on this
# machine is used. Previously only the Windows path was active, so the cell
# failed on any machine without that file.
font_candidates = [
    'C:/Windows/Fonts/malgun.ttf',  # Windows
    '/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # Ubuntu
]
font_path = next((path for path in font_candidates if os.path.exists(path)), None)
if font_path is not None:
    font_name = fm.FontProperties(fname=font_path).get_name()
    plt.rc('font', family=font_name)
else:
    # No candidate font found: keep matplotlib's default font rather than
    # aborting the notebook. Hangul labels may not render correctly.
    font_name = None
    print("한글 폰트를 찾을 수 없어 기본 폰트를 사용합니다.")


In [None]:
# Pipeline settings read by the functions in this notebook.
#   input_file / output_folder : workbook to read and folder for saved models.
#   keywords                   : iterated by main(); each one is used both as
#                                the sheet name and as the target column.
#   window_sizes               : not referenced by the code visible here.
#   *_range                    : hyperparameter grids for find_best_params().
CONFIG = dict(
    input_file='0703_total.xlsx',
    output_folder='plk_folder_05_1',
    keywords=[
        "auto", "construct", "capital_market", "chemicals", "equipment",
        "transport", "semi", "bank", "steel",
        "telecom", "staples", "discretionary", "kospi",
    ],
    window_sizes=[20, 60],
    n_estimators_range=range(5, 21, 5),  # reduced hyperparameter search range
    max_depth_range=range(5, 21, 5),  # reduced hyperparameter search range
)

def load_data(file_path, sheet_name):
    """Load one worksheet from an Excel file and apply basic cleaning.

    The first row is used as the header and the first column as the index.
    The index is parsed as datetimes and re-formatted to 'YYYY-MM-DD' strings,
    then every row containing a missing value is dropped.

    Args:
        file_path: Path of the Excel workbook.
        sheet_name: Worksheet to read.

    Returns:
        The cleaned DataFrame, or None when any step fails (the error is
        printed rather than raised).
    """
    try:
        frame = pd.read_excel(file_path, sheet_name=sheet_name, header=0, index_col=0)
        parsed_dates = pd.to_datetime(frame.index)
        frame.index = parsed_dates.strftime('%Y-%m-%d')
        cleaned = frame.dropna()
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this sheet.
        print(f"데이터 로드 중 오류 발생: {e}")
        return None
    return cleaned

def prepare_data(df, keyword):
    """Split a DataFrame into standardized train/test features and targets.

    Args:
        df: DataFrame holding the feature columns and the target column.
        keyword: Name of the target column.

    Returns:
        Tuple (X_train_scaled, X_test_scaled, y_train, y_test). The feature
        parts are the StandardScaler outputs; the targets are left unscaled.
    """
    target = df[keyword]
    features = df.drop(columns=[keyword])

    # 80/20 split with a fixed seed.
    split = train_test_split(features, target, test_size=0.2, random_state=0)
    features_train, features_test, target_train, target_test = split

    # Scaling statistics come from the training part only and are then
    # applied to the test part.
    standardizer = StandardScaler()
    scaled_train = standardizer.fit_transform(features_train)
    scaled_test = standardizer.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test


In [None]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, n_estimators, max_depth):
    """Fit a random forest and score it with cross-validation and a test set.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.

    Returns:
        Tuple (fitted model, mean 5-fold CV accuracy on the training data,
        test accuracy, macro-averaged test recall, macro-averaged test precision).
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_leaf_nodes=300,
        n_jobs=-1,
        random_state=0,
    )
    forest.fit(X_train, y_train)

    # 5-fold cross-validation (fold count kept small to save time).
    fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='accuracy')
    mean_cv_accuracy = fold_scores.mean()

    predictions = forest.predict(X_test)
    test_metrics = (
        accuracy_score(y_test, predictions),
        recall_score(y_true=y_test, y_pred=predictions, average='macro'),
        precision_score(y_true=y_test, y_pred=predictions, average='macro'),
    )

    return (forest, mean_cv_accuracy, *test_metrics)

def find_best_params(X_train, y_train, n_estimators_range, max_depth_range):
    """Grid-search n_estimators and max_depth by mean 5-fold CV accuracy.

    Args:
        X_train, y_train: Training features and labels.
        n_estimators_range: Iterable of candidate tree counts.
        max_depth_range: Iterable of candidate maximum depths.

    Returns:
        Dict with keys 'n_estimators' and 'max_depth' for the best-scoring
        combination (the first one encountered wins ties). An empty dict is
        returned only when no combination yields a comparable score, e.g.
        when either range is empty.
    """
    # Start below any achievable accuracy so that a combination scoring
    # exactly 0.0 is still selected instead of returning an empty dict.
    best_score = float("-inf")
    best_params = {}

    for n_estimators in tqdm(n_estimators_range, desc="n_estimators 진행 중"):
        for max_depth in max_depth_range:
            rfc = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                         max_leaf_nodes=300, n_jobs=-1, random_state=0)
            # 5-fold cross-validation (fold count kept small to save time).
            score = cross_val_score(rfc, X_train, y_train, cv=5, scoring='accuracy').mean()

            if score > best_score:
                best_score = score
                best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

    return best_params

def plot_feature_importance(model, feature_names, keyword, top_n=5):
    """Draw a bar chart of the model's most important features.

    Args:
        model: Fitted estimator exposing `feature_importances_`.
        feature_names: Sequence of names, indexed in the same order as the
            columns the model was trained on.
        keyword: Label appended to the chart title.
        top_n: Maximum number of features to show (default 5). Fewer bars are
            drawn when the model has fewer features, instead of raising a
            shape-mismatch error.
    """
    importances = model.feature_importances_
    # Clamp so that models with fewer than `top_n` features still plot.
    n_shown = max(0, min(top_n, len(importances)))
    # Indices of the features sorted by descending importance, truncated.
    top_indices = np.argsort(importances)[::-1][:n_shown]

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title(f'Top {n_shown} Feature Importance - {keyword}')
    positions = list(range(n_shown))
    ax.bar(positions, importances[top_indices])
    ax.set_xticks(positions)
    ax.set_xticklabels([feature_names[i] for i in top_indices], rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

def early_stopping_model(X_train, y_train, X_test, y_test, patience=5):
    """Grow a random forest one tree count at a time with early stopping.

    Forests with n_estimators = 1..100 are fitted in turn and scored by
    accuracy on (X_test, y_test). The search stops once `patience`
    consecutive sizes fail to beat the best score seen so far.

    NOTE(review): the model size is selected on the same test set that the
    caller later uses for evaluation, so those metrics are optimistic.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for selection.
        patience: Number of consecutive non-improving sizes tolerated.

    Returns:
        A fitted RandomForestClassifier configured with the best-scoring
        n_estimators (or the default configuration if no size scored above 0).
    """
    best_score = 0
    best_model = RandomForestClassifier(random_state=0)  # fallback if nothing improves
    no_improvement = 0

    model = RandomForestClassifier(random_state=0)
    for i in range(1, 101):
        model.set_params(n_estimators=i)
        model.fit(X_train, y_train)
        score = accuracy_score(y_test, model.predict(X_test))

        if score > best_score:
            best_score = score
            # clone() copies the parameters only; the copy is NOT fitted.
            best_model = clone(model)
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print(f"조기 종료: {i} 에포크에서 종료")
            break

    # Always refit: best_model is an unfitted clone. Previously the refit ran
    # only when patience was exhausted, so a search that ran through all 100
    # sizes returned an unfitted estimator and predict() failed.
    best_model.fit(X_train, y_train)

    return best_model



In [None]:
def main():
    """Train, evaluate, save and plot one early-stopping model per keyword.

    For every keyword in CONFIG['keywords'] the sheet of that name is loaded
    from CONFIG['input_file'], the column of that name is used as the target,
    and the resulting model is written to CONFIG['output_folder'].
    Keywords whose data cannot be loaded or used are skipped.
    """
    os.makedirs(CONFIG['output_folder'], exist_ok=True)

    for keyword in tqdm(CONFIG['keywords'], desc="키워드 처리 진행 중"):
        print(f"\n처리 중: {keyword}")

        df = load_data(CONFIG['input_file'], keyword)
        if df is None:
            continue
        # Skip sheets that are unusable instead of crashing the whole run.
        if df.empty or keyword not in df.columns:
            print(f"데이터가 비어 있거나 '{keyword}' 열이 없어 건너뜁니다.")
            continue

        # Down-sample to 10% when the sheet is large, to keep training fast.
        df_sampled = df.sample(frac=0.1, random_state=0) if len(df) > 10000 else df

        X_train, X_test, y_train, y_test = prepare_data(df_sampled, keyword)

        # The grid-search result is reported only; the saved model below
        # comes from early_stopping_model().
        best_params = find_best_params(X_train, y_train, CONFIG['n_estimators_range'], CONFIG['max_depth_range'])
        print(f"최적 파라미터: {best_params}")

        # Train the early-stopping model.
        early_stopping_rfc = early_stopping_model(X_train, y_train, X_test, y_test)

        y_pred = early_stopping_rfc.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
        precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')

        print(f"조기 종료 모델 - {keyword}: 정확도: {accuracy:.4f}, 재현율: {recall:.4f}, 정밀도: {precision:.4f}")

        # Persist the model.
        model_path = os.path.join(CONFIG['output_folder'], f"{keyword}_early_stopping_model.pkl")
        dump(early_stopping_rfc, model_path)

        # Feature-importance plot. The model was trained without the target
        # column, so the target must be removed from the names as well;
        # passing df.columns would shift or mislabel the bars.
        feature_names = df.columns.drop(keyword)
        plot_feature_importance(early_stopping_rfc, feature_names, keyword)

if __name__ == "__main__":
    main()
